import com.aliasi.sentences.MedlineSentenceModel; import com.aliasi.sentences.SentenceModel; import com.aliasi.tokenizer.IndoEuropeanTokenizerFactory; import com.aliasi.tokenizer.TokenizerFactory; import com.aliasi.tokenizer.Tokenizer; import com.aliasi.util.Files; import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.Iterator; import java.util.Set; /** Use SentenceModel to find sentence boundaries in text */ public class SentenceBoundaryDemo { static final TokenizerFactory TOKENIZER_FACTORY = new IndoEuropeanTokenizerFactory(); static final SentenceModel SENTENCE_MODEL = new MedlineSentenceModel(); public static void main(String[] args) throws IOException { File file = new File(args[0]); String text = Files.readFromFile(file); System.out.println("INPUT TEXT: "); System.out.println(text); ArrayList tokenList = new ArrayList(); ArrayList whiteList = new ArrayList(); Tokenizer tokenizer = TOKENIZER_FACTORY.tokenizer(text.toCharArray(),0,text.length()); tokenizer.tokenize(tokenList,whiteList); System.out.println(tokenList.size() + " TOKENS"); System.out.println(whiteList.size() + " WHITESPACES"); String[] tokens = new String[tokenList.size()]; String[] whites = new String[whiteList.size()]; tokenList.toArray(tokens); whiteList.toArray(whites); int[] sentenceBoundaries = SENTENCE_MODEL.boundaryIndices(tokens,whites); System.out.println(sentenceBoundaries.length + " SENTENCE END TOKEN OFFSETS"); if (sentenceBoundaries.length < 1) { System.out.println("No sentence boundaries found."); return; } int sentStartTok = 0; int sentEndTok = 0; for (int i = 0; i < sentenceBoundaries.length; ++i) { sentEndTok = sentenceBoundaries[i]; System.out.println("SENTENCE "+(i+1)+": "); for (int j=sentStartTok; j<=sentEndTok; j++) { System.out.print(tokens[j]+whites[j+1]); } System.out.println(); sentStartTok = sentEndTok+1; } } }