import com.aliasi.util.Files; import com.aliasi.util.Strings; import java.io.File; import java.util.Set; import java.util.TreeSet; public class MobyHyphenCorpus { public static void main(String[] args) throws Exception { File dataInFile = new File(args[0]); File dataOutFile = new File(args[1]); System.out.println("Raw Input=" + dataInFile); System.out.println("Hyphenation Output=" + dataOutFile); String[] hyphenations = Files.readLinesFromFile(dataInFile,"ASCII"); System.out.println("Found #input lines=" + hyphenations.length); Set hyphenationSet = new TreeSet(); for (int i = 0; i < hyphenations.length; ++i) for (String hyphenation : hyphenations[i].split(" ")) hyphenationSet.add(hyphenation.replaceAll("\uFFFD"," ") .trim() .replaceAll("\\s+"," ") .toLowerCase()); String[] finalHyphenations = hyphenationSet.toArray(new String[0]); System.out.println("Found #hyphenations=" + finalHyphenations.length); String dataOut = Strings.concatenate(finalHyphenations,"\n"); Files.writeStringToFile(dataOut,dataOutFile,Strings.UTF8); } }