import com.aliasi.tokenizer.IndoEuropeanTokenizerFactory; import com.aliasi.tokenizer.TokenizerFactory; import com.aliasi.lm.TokenizedLM; import com.aliasi.util.Files; import com.aliasi.util.ScoredObject; import com.aliasi.util.AbstractExternalizable; import java.io.File; import java.io.IOException; public class InterestingPhrases { private static int NGRAM = 3; private static int MIN_COUNT = 5; private static int MAX_NGRAM_REPORTING_LENGTH = 2; private static int NGRAM_REPORTING_LENGTH = 2; private static int MAX_COUNT = 100; private static File BACKGROUND_DIR = new File("../../data/rec.sport.hockey/train"); private static File FOREGROUND_DIR = new File("../../data/rec.sport.hockey/test"); public static void main(String[] args) throws IOException { IndoEuropeanTokenizerFactory tokenizerFactory = new IndoEuropeanTokenizerFactory(); System.out.println("Training background model"); TokenizedLM backgroundModel = buildModel(tokenizerFactory, NGRAM, BACKGROUND_DIR); backgroundModel.sequenceCounter().prune(3); System.out.println("\nAssembling collocations in Training"); ScoredObject[] coll = backgroundModel.collocations(NGRAM_REPORTING_LENGTH, MIN_COUNT,MAX_COUNT); System.out.println("\nCollocations in Order of Significance:"); report(coll); System.out.println("Training foreground model"); TokenizedLM foregroundModel = buildModel(tokenizerFactory, NGRAM, FOREGROUND_DIR); foregroundModel.sequenceCounter().prune(3); System.out.println("\nAssembling New Terms in Test vs. Training"); ScoredObject[] newTerms = foregroundModel.newTerms(NGRAM_REPORTING_LENGTH, MIN_COUNT, MAX_COUNT, backgroundModel); System.out.println("\nNew Terms in Order of Signficance:"); report(newTerms); System.out.println("\nDone."); } private static TokenizedLM buildModel(TokenizerFactory tokenizerFactory, int ngram, File directory) throws IOException { String[] trainingFiles = directory.list(); TokenizedLM model = new TokenizedLM(tokenizerFactory, ngram); System.out.println("Training on "+directory); for (int j = 0; j < trainingFiles.length; ++j) { String text = Files.readFromFile(new File(directory, trainingFiles[j])); model.train(text); } return model; } private static void report (ScoredObject[] nGrams) { for (int i=0; i