DKPro Core - Fully mixed pipeline

Analytics

Reads all text files (*.txt) in the specified folder and, for each sentence, prints every token together with its part-of-speech tag, lemma, and named entity labels in a tab-separated format, followed by the constituent tree of the sentence as a bracketed structure.

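Call the script as pipeline <inputfolder> <language>, where <inputfolder> is the folder containing the text files and <language> is the language code of the texts, e.g. pipeline input en.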

@Grab(group='de.tudarmstadt.ukp.dkpro.core', 
      module='de.tudarmstadt.ukp.dkpro.core.io.text-asl', 
      version='1.5.0')
@Grab(group='de.tudarmstadt.ukp.dkpro.core', 
      module='de.tudarmstadt.ukp.dkpro.core.opennlp-asl', 
      version='1.5.0')
@Grab(group='de.tudarmstadt.ukp.dkpro.core', 
      module='de.tudarmstadt.ukp.dkpro.core.matetools-gpl', 
      version='1.5.0')
@Grab(group='de.tudarmstadt.ukp.dkpro.core', 
      module='de.tudarmstadt.ukp.dkpro.core.clearnlp-asl', 
      version='1.5.0')
@Grab(group='de.tudarmstadt.ukp.dkpro.core', 
      module='de.tudarmstadt.ukp.dkpro.core.berkeleyparser-gpl', 
      version='1.5.0')
@Grab(group='de.tudarmstadt.ukp.dkpro.core', 
      module='de.tudarmstadt.ukp.dkpro.core.stanfordnlp-gpl', 
      version='1.5.0')

import static org.apache.uima.fit.pipeline.SimplePipeline.*;
import static org.apache.uima.fit.util.JCasUtil.*;
import static org.apache.uima.fit.factory.CollectionReaderFactory.*;
import static org.apache.uima.fit.factory.AnalysisEngineFactory.*;

import de.tudarmstadt.ukp.dkpro.core.io.text.*;
import de.tudarmstadt.ukp.dkpro.core.opennlp.*;
import de.tudarmstadt.ukp.dkpro.core.matetools.*;
import de.tudarmstadt.ukp.dkpro.core.clearnlp.*;
import de.tudarmstadt.ukp.dkpro.core.berkeleyparser.*;
import de.tudarmstadt.ukp.dkpro.core.stanfordnlp.*;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.*;
import de.tudarmstadt.ukp.dkpro.core.api.ner.type.*;
import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.*;

// Assemble and run pipeline.
//
// Expects two command line parameters:
//   args[0] - folder that is searched for text files (reader PARAM_PATH)
//   args[1] - language code set on the reader (PARAM_LANGUAGE), e.g. "en"
//
// Without this guard a missing parameter only surfaces as an
// ArrayIndexOutOfBoundsException from args[...], which gives no usage hint.
if (args.length < 2) {
  System.err.println "Usage: pipeline <inputfolder> <language>, e.g. pipeline input en"
  System.exit(1)
}

// The reader is followed by the analysis components in the order in which
// they are applied: segmentation, part-of-speech tagging, lemmatization,
// constituency parsing, and named entity recognition. The value returned by
// iteratePipeline is iterated over further down to print the results.
def pipeline = iteratePipeline(
  createReaderDescription(TextReader,
    TextReader.PARAM_PATH, args[0],     //first command line parameter
    TextReader.PARAM_LANGUAGE, args[1], //second command line parameter
    TextReader.PARAM_PATTERNS, "[+]*.txt"), //include pattern: all *.txt files
  createEngineDescription(OpenNlpSegmenter),
  createEngineDescription(MatePosTagger),
  createEngineDescription(ClearNlpLemmatizer),
  createEngineDescription(BerkeleyParser,
    // enabled so that PennTree annotations can be printed per sentence
    BerkeleyParser.PARAM_WRITE_PENN_TREE, true),
  createEngineDescription(StanfordNamedEntityRecognizer));

// Print the analysis results of every processed document, sentence by sentence.
pipeline.each { jcas ->
  select(jcas, Sentence).each { sentence ->
    // Column header, repeated in front of the tokens of each sentence
    println "Token\tPOS\tLemma\tNamed entity";

    // One tab-separated row per token covered by the sentence
    for (def token : selectCovered(Token, sentence)) {
      def form = token.coveredText
      def posTag = token.pos.posValue
      def lemma = token.lemma.value
      print "${form}\t${posTag}\t${lemma}\t";

      // Values of all named entity annotations covering this token,
      // rendered as a list (empty list when there is none)
      def entities = selectCovering(NamedEntity, token)*.value
      print "${entities}\n";
    }

    // Bracketed tree(s) of the sentence, separated from the table by a blank line
    for (def tree : selectCovered(PennTree, sentence)) {
      println "\n${tree.pennTree}"
    }
  }
}

Example output:

Token   POS Lemma   Named entity
Jim NNP jim [PERSON]
bought  VBD buy []
300 CD  0   []
shares  NNS share   []
of  IN  of  []
Acme    NNP acme    [ORGANIZATION]
Corp.   NNP corp.   [ORGANIZATION]
in  IN  in  []
2006    CD  0   []
.   .   .   []

(ROOT (S (NP (NNP Jim)) (VP (VBD bought) (NP (NP (CD 300) (NNS shares))
(PP (IN of) (NP (NNP Acme) (NNP Corp.)))) (PP (IN in) (NP (CD 2006)))) (. .)))