import com.aliasi.classify.Classification; import com.aliasi.classify.Classified; import com.aliasi.classify.DynamicLMClassifier; import com.aliasi.lm.NGramProcessLM; import com.aliasi.util.AbstractExternalizable; import com.aliasi.util.Strings; import java.io.*; public class TrainLanguageId { // java TrainLanguageId :dir // :file // :int // :int public static void main(String[] args) throws Exception { File dataDir = new File(args[0]); if (!dataDir.isDirectory()) { String msg = "Set first argument to the data directory." + " Found dataDir=" + dataDir; throw new IllegalArgumentException(msg); } File modelFile = new File(args[1]); int nGram = Integer.valueOf(args[2]); int numChars = Integer.valueOf(args[3]); System.out.println("nGram=" + nGram + " numChars=" + numChars); int minCount = args.length > 4 ? Integer.valueOf(args[4]) : 10; String[] categories = dataDir.list(); DynamicLMClassifier classifier = DynamicLMClassifier .createNGramProcess(categories,nGram); char[] csBuf = new char[numChars]; for (int i = 0; i < categories.length; ++i) { String category = categories[i]; System.out.println("Training category=" + category); File trainingFile = new File(new File(dataDir,category), category + ".txt"); FileInputStream fileIn = new FileInputStream(trainingFile); InputStreamReader reader = new InputStreamReader(fileIn,Strings.UTF8); reader.read(csBuf); String text = new String(csBuf,0,numChars); Classification c = new Classification(category); Classified classified = new Classified(text,c); classifier.handle(classified); reader.close(); } // prune substring counts by eliminating counts below 10 for (String cat : categories) classifier.languageModel(cat).substringCounter().prune(minCount); System.out.println("\nCompiling model to file=" + modelFile); AbstractExternalizable.compileTo(classifier,modelFile); } }