public class CoreNlpSegmenter extends SegmenterBase
Modifier and Type | Field and Description |
---|---|
static String |
PARAM_BOUNDARIES_TO_DISCARD
The set of regular expressions for sentence boundary tokens that should be discarded.
|
static String |
PARAM_BOUNDARY_MULTI_TOKEN_REGEX |
static String |
PARAM_BOUNDARY_TOKEN_REGEX
The set of boundary tokens.
|
static String |
PARAM_HTML_ELEMENTS_TO_DISCARD
These are elements like "p" or "sent", which will be wrapped into regex for approximate XML
matching.
|
static String |
PARAM_NEWLINE_IS_SENTENCE_BREAK
Strategy for treating newlines as sentence breaks.
|
static String |
PARAM_TOKEN_REGEXES_TO_DISCARD
The set of regular expressions for sentence boundary tokens that should be discarded.
|
PARAM_LANGUAGE, PARAM_STRICT_ZONING, PARAM_WRITE_FORM, PARAM_WRITE_SENTENCE, PARAM_WRITE_TOKEN, PARAM_ZONE_TYPES
Constructor and Description |
---|
CoreNlpSegmenter() |
Modifier and Type | Method and Description |
---|---|
void |
initialize(org.apache.uima.UimaContext aContext) |
protected void |
process(org.apache.uima.jcas.JCas aJCas,
String aText,
int aZoneBegin) |
createSentence, createToken, createToken, createToken, getLanguage, getLocale, getZoneTypes, isEmpty, isStrictZoning, isWriteSentence, isWriteToken, limit, process, trim, trimChar
getRequiredCasInterface, process
getCasInstancesRequired, hasNext, next
public static final String PARAM_BOUNDARY_TOKEN_REGEX
WordToSentenceProcessor.WordToSentenceProcessor(java.lang.String, java.lang.String, java.util.Set<java.lang.String>, java.util.Set<java.lang.String>, java.lang.String, edu.stanford.nlp.process.WordToSentenceProcessor.NewlineIsSentenceBreak, edu.stanford.nlp.ling.tokensregex.SequencePattern<? super IN>, java.util.Set<java.lang.String>, boolean, boolean)
,
Constant Field Values
public static final String PARAM_BOUNDARY_MULTI_TOKEN_REGEX
public static final String PARAM_HTML_ELEMENTS_TO_DISCARD
public static final String PARAM_BOUNDARIES_TO_DISCARD
WordToSentenceProcessor.DEFAULT_SENTENCE_BOUNDARIES_TO_DISCARD
,
Constant Field Values
public static final String PARAM_NEWLINE_IS_SENTENCE_BREAK
public static final String PARAM_TOKEN_REGEXES_TO_DISCARD
public void initialize(org.apache.uima.UimaContext aContext) throws org.apache.uima.resource.ResourceInitializationException
initialize
in interface org.apache.uima.analysis_component.AnalysisComponent
initialize
in class org.apache.uima.fit.component.JCasAnnotator_ImplBase
org.apache.uima.resource.ResourceInitializationException
protected void process(org.apache.uima.jcas.JCas aJCas, String aText, int aZoneBegin) throws org.apache.uima.analysis_engine.AnalysisEngineProcessException
process
in class SegmenterBase
org.apache.uima.analysis_engine.AnalysisEngineProcessException
Copyright © 2007–2018 Ubiquitous Knowledge Processing (UKP) Lab, Technische Universität Darmstadt. All rights reserved.