public abstract class WikipediaStandardReaderBase extends WikipediaReaderBase
Modifier and Type | Field and Description |
---|---|
protected long |
currentArticleIndex |
protected long |
nrOfArticles |
protected boolean |
outputPlainText |
protected int |
pageBuffer |
protected String |
pageIdFile |
protected String[] |
pageIdParamArray |
protected Iterator<de.tudarmstadt.ukp.wikipedia.api.Page> |
pageIter |
protected String |
pageNameFile |
protected String[] |
pageNameParamArray |
static String |
PARAM_OUTPUT_PLAIN_TEXT
Whether the reader outputs plain text or wiki markup.
|
static String |
PARAM_PAGE_BUFFER
The page buffer size (#pages) of the page iterator.
|
static String |
PARAM_PAGE_ID_LIST
Defines an array of page ids of the pages that should be retrieved.
|
static String |
PARAM_PAGE_TITLE_LIST
Defines an array of page titles of the pages that should be retrieved.
|
static String |
PARAM_PATH_TO_PAGE_ID_LIST
Defines the path to a file containing a line-separated list of
page ids of the pages that should be retrieved.
|
static String |
PARAM_PATH_TO_PAGE_TITLE_LIST
Defines the path to a file containing a line-separated list of
page titles of the pages that should be retrieved.
|
protected de.tudarmstadt.ukp.wikipedia.parser.mediawiki.MediaWikiParser |
parser |
Fields inherited from class WikipediaReaderBase: dbconfig, PARAM_CREATE_DATABASE_CONFIG_ANNOTATION, PARAM_DB, PARAM_HOST, PARAM_LANGUAGE, PARAM_PASSWORD, PARAM_USER, wiki
Constructor and Description |
---|
WikipediaStandardReaderBase() |
Modifier and Type | Method and Description |
---|---|
protected String |
getDocumentText(de.tudarmstadt.ukp.wikipedia.api.Page page) |
void |
getNext(org.apache.uima.jcas.JCas jcas) |
de.tudarmstadt.ukp.wikipedia.api.Page |
getPage() |
protected abstract String |
getPlainDocumentText(de.tudarmstadt.ukp.wikipedia.api.Page page) |
org.apache.uima.util.Progress[] |
getProgress() |
boolean |
hasNext() |
void |
initialize(org.apache.uima.UimaContext context) |
protected abstract boolean |
isValidPage(de.tudarmstadt.ukp.wikipedia.api.Page page) |
close, getLogger, getNext, initialize
destroy, getCasInitializer, getProcessingResourceMetaData, initialize, isConsuming, reconfigure, setCasInitializer, typeSystemInit
getConfigParameterValue, getConfigParameterValue, setConfigParameterValue, setConfigParameterValue
getCasManager, getMetaData, getRelativePathResolver, getResourceManager, getUimaContext, getUimaContextAdmin, setLogger, setMetaData
Methods inherited from class java.lang.Object: clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
public static final String PARAM_OUTPUT_PLAIN_TEXT
protected boolean outputPlainText
public static final String PARAM_PAGE_BUFFER
protected int pageBuffer
public static final String PARAM_PATH_TO_PAGE_ID_LIST
protected String pageIdFile
public static final String PARAM_PATH_TO_PAGE_TITLE_LIST
protected String pageNameFile
public static final String PARAM_PAGE_ID_LIST
protected String[] pageIdParamArray
public static final String PARAM_PAGE_TITLE_LIST
protected String[] pageNameParamArray
protected long currentArticleIndex
protected long nrOfArticles
protected Iterator<de.tudarmstadt.ukp.wikipedia.api.Page> pageIter
protected de.tudarmstadt.ukp.wikipedia.parser.mediawiki.MediaWikiParser parser
public void initialize(org.apache.uima.UimaContext context) throws org.apache.uima.resource.ResourceInitializationException
Overrides: initialize in class WikipediaReaderBase
Throws: org.apache.uima.resource.ResourceInitializationException
public boolean hasNext() throws IOException, org.apache.uima.collection.CollectionException
Throws: IOException, org.apache.uima.collection.CollectionException
public void getNext(org.apache.uima.jcas.JCas jcas) throws IOException, org.apache.uima.collection.CollectionException
Overrides: getNext in class WikipediaReaderBase
Throws: IOException, org.apache.uima.collection.CollectionException
protected abstract boolean isValidPage(de.tudarmstadt.ukp.wikipedia.api.Page page) throws de.tudarmstadt.ukp.wikipedia.api.exception.WikiTitleParsingException
Throws: de.tudarmstadt.ukp.wikipedia.api.exception.WikiTitleParsingException
public org.apache.uima.util.Progress[] getProgress()
Specified by: getProgress in interface org.apache.uima.collection.base_cpm.BaseCollectionReader
Overrides: getProgress in class WikipediaReaderBase
protected String getDocumentText(de.tudarmstadt.ukp.wikipedia.api.Page page)
protected abstract String getPlainDocumentText(de.tudarmstadt.ukp.wikipedia.api.Page page)
public de.tudarmstadt.ukp.wikipedia.api.Page getPage()
Copyright © 2007–2018 Ubiquitous Knowledge Processing (UKP) Lab, Technische Universität Darmstadt. All rights reserved.