import com.aliasi.chunk.CharLmHmmChunker; import com.aliasi.corpus.parsers.GeneTagParser; import com.aliasi.hmm.HmmCharLmEstimator; import com.aliasi.tokenizer.IndoEuropeanTokenizerFactory; import com.aliasi.tokenizer.TokenizerFactory; import com.aliasi.util.AbstractExternalizable; import com.aliasi.util.Streams; import java.io.File; import java.io.IOException; public class TrainGeneTag { static final int MAX_N_GRAM = 8; static final int NUM_CHARS = 256; static final double LM_INTERPOLATION = MAX_N_GRAM; // default behavior // java TrainGeneTag public static void main(String[] args) throws IOException { File corpusFile = new File(args[0]); File modelFile = new File(args[1]); System.out.println("Setting up Chunker Estimator"); TokenizerFactory factory = IndoEuropeanTokenizerFactory.INSTANCE; HmmCharLmEstimator hmmEstimator = new HmmCharLmEstimator(MAX_N_GRAM,NUM_CHARS,LM_INTERPOLATION); CharLmHmmChunker chunkerEstimator = new CharLmHmmChunker(factory,hmmEstimator); System.out.println("Setting up Data Parser"); GeneTagParser parser = new GeneTagParser(); parser.setHandler(chunkerEstimator); System.out.println("Training with Data from File=" + corpusFile); parser.parse(corpusFile); System.out.println("Compiling and Writing Model to File=" + modelFile); AbstractExternalizable.compileTo(chunkerEstimator,modelFile); } }