DKPro Core - TreeTagger Part-of-speech tagging & parsing without reader or writer

Embedding

This is an example of how to use the DKPro Core TreeTaggerPosTagger component with a manually downloaded TreeTagger executable and model.

/**
* SYNOPSIS: treetagger.groovy <executable> <model>
*
* EXAMPLE: ./treetagger.groovy /usr/local/bin/tree-tagger english-par-linux-3.2-utf8.bin
*
* Annotates an English text with part-of-speech tags using TreeTagger.
*/

@Grab(group='de.tudarmstadt.ukp.dkpro.core',
module='de.tudarmstadt.ukp.dkpro.core.treetagger-asl',
version='1.7.0')
@Grab(group='de.tudarmstadt.ukp.dkpro.core',
module='de.tudarmstadt.ukp.dkpro.core.tokit-asl',
version='1.7.0')
@Grab(group='de.tudarmstadt.ukp.dkpro.core',
module='de.tudarmstadt.ukp.dkpro.core.io.text-asl',
version='1.7.0')

import static org.apache.uima.fit.util.JCasUtil.select;
import static org.apache.uima.fit.pipeline.SimplePipeline.iteratePipeline;
import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription;
import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription;

import de.tudarmstadt.ukp.dkpro.core.io.text.StringReader;
import de.tudarmstadt.ukp.dkpro.core.tokit.BreakIteratorSegmenter;
import de.tudarmstadt.ukp.dkpro.core.treetagger.TreeTaggerPosTagger;

import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;

// Both command-line arguments are mandatory. Fail with a usage hint instead of
// an ArrayIndexOutOfBoundsException when one of them is missing.
if (args.length < 2) {
    System.err.println "Usage: treetagger.groovy <executable> <model>"
    System.exit(1)
}

// Path to the TreeTagger executable (first command-line argument)
def ttExecutable = args[0]
// Location of the TreeTagger model file (second command-line argument)
def ttModel = args[1]
// Value handed to the tagger as the encoding of the model
def ttModelEncoding = "UTF-8"
// Tagset identifier; part of the POS mapping file name
def ttTagset = "ptb"
// Language of the document; also part of the POS mapping file name
def language = "en"

// Assemble pipeline: reader -> segmenter -> TreeTagger POS tagger

// Reader that supplies a single fixed sentence as the document text
def reader = createReaderDescription(StringReader,
    StringReader.PARAM_DOCUMENT_TEXT, "The quick brown fox jumps over the lazy dog.",
    StringReader.PARAM_LANGUAGE, language)

// Segmenter with its default configuration
def segmenter = createEngineDescription(BreakIteratorSegmenter)

// Mapping file location, derived from the language and the tagset.
// String + GString concatenation yields a plain java.lang.String.
def posMappingLocation = "classpath:/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/" +
    "${language}-${ttTagset}-pos.map"

// Tagger configured with the user-supplied executable and model
def tagger = createEngineDescription(TreeTaggerPosTagger,
    TreeTaggerPosTagger.PARAM_EXECUTABLE_PATH, ttExecutable,
    TreeTaggerPosTagger.PARAM_MODEL_LOCATION, ttModel,
    TreeTaggerPosTagger.PARAM_MODEL_ENCODING, ttModelEncoding,
    TreeTaggerPosTagger.PARAM_POS_MAPPING_LOCATION, posMappingLocation)

def pipeline = iteratePipeline(reader, segmenter, tagger)

// Run the pipeline and print one "<token text>TAB<POS value>" line per token
pipeline.each { document ->
    select(document, Token).each { token ->
        println "${token.coveredText}\t${token.pos.posValue}"
    }
}

Example output:

The DT
quick JJ
brown JJ
fox NN
jumps NNS
over IN
the DT
lazy JJ
dog NN
. SENT