import com.aliasi.lm.NGramProcessLM; import com.aliasi.spell.CompiledSpellChecker; import com.aliasi.spell.SpellChecker; import com.aliasi.spell.TrainSpellChecker; import com.aliasi.spell.FixedWeightEditDistance; import com.aliasi.tokenizer.IndoEuropeanTokenizerFactory; import com.aliasi.tokenizer.TokenizerFactory; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.analysis.WhitespaceAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.queryParser.ParseException; import org.apache.lucene.queryParser.QueryParser; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.Query; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.store.Directory; import org.apache.lucene.store.NativeFSLockFactory; import org.apache.lucene.store.SimpleFSDirectory; import org.apache.lucene.util.Version; import com.aliasi.util.Files; import com.aliasi.util.Streams; import java.io.*; public class QuerySpellCheck { private static final File MODEL_FILE = new File("SpellCheck.model"); private static final int NGRAM_LENGTH = 5; static final File DATA = new File("../../data/rec.sport.hockey/train"); static final File LUCENE_INDEX_DIR = new File("lucene"); static final StandardAnalyzer ANALYZER = new StandardAnalyzer(Version.LUCENE_30); static final double MATCH_WEIGHT = -0.0; static final double DELETE_WEIGHT = -4.0; static final double INSERT_WEIGHT = -1.0; static final double SUBSTITUTE_WEIGHT = -2.0; static final double TRANSPOSE_WEIGHT = -2.0; static final int MAX_HITS = 100; static final String TEXT_FIELD = "text"; public static void main(String[] args) throws IOException, ClassNotFoundException, ParseException { System.out.print(INTRO); // collect argument objects System.out.println(" CONFIGURATION:"); System.out.println(" Model File: " + MODEL_FILE); System.out.println(" N-gram Length: " + NGRAM_LENGTH); FixedWeightEditDistance fixedEdit = new FixedWeightEditDistance(MATCH_WEIGHT, DELETE_WEIGHT, INSERT_WEIGHT, SUBSTITUTE_WEIGHT, TRANSPOSE_WEIGHT); NGramProcessLM lm = new NGramProcessLM(NGRAM_LENGTH); TokenizerFactory tokenizerFactory = IndoEuropeanTokenizerFactory.INSTANCE; TrainSpellChecker sc = new TrainSpellChecker(lm,fixedEdit,tokenizerFactory); FSDirectory fsDir = new SimpleFSDirectory(LUCENE_INDEX_DIR, new NativeFSLockFactory()); IndexWriter luceneIndexWriter = new IndexWriter(fsDir, ANALYZER, IndexWriter.MaxFieldLength.LIMITED); if (!DATA.isDirectory()) { System.out.println("Could not find training directory=" + DATA); System.out.println("Have you unpacked the data?"); } String[] filesToIndex = DATA.list(); for (int i = 0; i < filesToIndex.length; ++i) { System.out.println(" File=" + DATA+"/"+filesToIndex[i]); String charSequence = Files.readFromFile(new File(DATA,filesToIndex[i]),"ISO-8859-1"); sc.handle(charSequence); Document luceneDoc = new Document(); Field textField = new Field(TEXT_FIELD, charSequence, Field.Store.YES, Field.Index.ANALYZED); luceneDoc.add(textField); luceneIndexWriter.addDocument(luceneDoc); } System.out.println("Writing model to file=" + MODEL_FILE); writeModel(sc,MODEL_FILE); System.out.println("Writing lucene index to =" + LUCENE_INDEX_DIR); luceneIndexWriter.close(); // read compiled model from model file System.out.println("Reading model from file=" + MODEL_FILE); CompiledSpellChecker compiledSC = readModel(MODEL_FILE); IndexSearcher luceneSearcher = new IndexSearcher(fsDir); System.out.print(TEST_INTRO); testModelLoop(compiledSC,luceneSearcher); System.out.print(GOODBYE); } private static void writeModel(TrainSpellChecker sc, File MODEL_FILE) throws IOException { // create object output stream from file FileOutputStream fileOut = new FileOutputStream(MODEL_FILE); BufferedOutputStream bufOut = new BufferedOutputStream(fileOut); ObjectOutputStream objOut = new ObjectOutputStream(bufOut); // write the spell checker to the file sc.compileTo(objOut); // close the resources Streams.closeOutputStream(objOut); Streams.closeOutputStream(bufOut); Streams.closeOutputStream(fileOut); } private static CompiledSpellChecker readModel(File file) throws ClassNotFoundException, IOException { // create object input stream from file FileInputStream fileIn = new FileInputStream(file); BufferedInputStream bufIn = new BufferedInputStream(fileIn); ObjectInputStream objIn = new ObjectInputStream(bufIn); // read the spell checker CompiledSpellChecker sc = (CompiledSpellChecker) objIn.readObject(); // close the resources and return result Streams.closeInputStream(objIn); Streams.closeInputStream(bufIn); Streams.closeInputStream(fileIn); return sc; } private static void testModelLoop(SpellChecker sc, IndexSearcher searcher) throws IOException,ParseException { // create buffered character reader from system in InputStreamReader isReader = new InputStreamReader(System.in); BufferedReader bufReader = new BufferedReader(isReader); QueryParser queryParser = new QueryParser(Version.LUCENE_30, TEXT_FIELD, ANALYZER); while (true) { // collect query or end if null System.out.print("\n>"); System.out.flush(); String queryString = bufReader.readLine(); if (queryString == null || queryString.length() == 0) break; Query query = queryParser.parse(queryString); TopDocs results = searcher.search(query,MAX_HITS); System.out.println("Found " + results.totalHits + " document(s) that matched query '" + queryString + "':"); // compute alternative spelling String bestAlternative = sc.didYouMean(queryString); if (bestAlternative.equals(queryString)) { System.out.println(" No spelling correction found."); } else { try { Query alternativeQuery = queryParser.parse(bestAlternative); TopDocs results2 = searcher.search(alternativeQuery,MAX_HITS); System.out.println("Found " + results2.totalHits + " document(s) matching best alt='" + bestAlternative + "':"); } catch (ParseException e) { System.out.println("Best alternative not valid query."); System.out.println("Alternative=" + bestAlternative); } } } } private static final String INTRO = '\n' + "Alias-i Query Spell Checker Demo" + '\n' + "PHASE I: TRAINING" + '\n'; private static final String TEST_INTRO = '\n' + "PHASE II: CORRECTION" + '\n' + " Constructing Spell Checker from File" + '\n' + '\n' + "Enter query ." + '\n' + " Use just to exit." + '\n'; private static final String GOODBYE = '\n' + " Detected empty line." + '\n' + " Ending test." + '\n'; }