import com.aliasi.util.Files; import com.aliasi.util.ObjectToSet; import com.aliasi.util.Strings; import java.io.File; import java.util.HashSet; import java.util.TreeSet; import java.util.Set; public class CelexSyllableCorpus { public static void main(String[] args) throws Exception { File inputFile = new File(args[0]); File outputFile = new File(args[1]); // read input data String text = Files.readFromFile(inputFile,"ASCII"); String[] lines = text.split("\\n"); Set hyphenationSet = new TreeSet(); for (String line : lines) extractLine(line,hyphenationSet); System.out.println("# syllabifications=" + hyphenationSet.size()); // report duplicates ObjectToSet wordToHyphenationSetMap = new ObjectToSet(); for (String hyphenation : hyphenationSet) { String word = hyphenation.replaceAll(" ",""); wordToHyphenationSetMap.addMember(word,hyphenation); } System.out.println("# unique words=" + wordToHyphenationSetMap.size()); for (Set hyphenationSetForWord : wordToHyphenationSetMap.values()) if (hyphenationSetForWord.size() > 1) System.out.println(hyphenationSetForWord); // generate output file Set finalHyphenationSet = new TreeSet(); for (Set hyphenations : wordToHyphenationSetMap.values()) finalHyphenationSet.addAll(hyphenations); StringBuilder sb = new StringBuilder(); for (String hyphenatedWord : finalHyphenationSet) { if (sb.length() > 0) sb.append('\n'); sb.append(hyphenatedWord); } Files.writeStringToFile(sb.toString(),outputFile,Strings.UTF8); } static void extractLine(String line, Set hyphenationSet) { String[] fields = line.split("\\\\"); if (fields.length < 9) return; if (fields[1].indexOf(' ') >= 0 || fields[1].indexOf('-') >= 0) return; // remove compounds for (int i = 8; i < fields.length; i += 4) { String hyphenatedWord = fields[i]; if (hyphenatedWord.length() > 0) { String wordHyphenation = hyphenatedWord.replaceAll("\\]\\["," ").replaceAll("\\[","").replaceAll("\\]","").replaceAll("@","+"); hyphenationSet.add(wordHyphenation); } } } // could use this to push back into IPA, but still wind up with compound symbols // LHS is all the symbols of length more than one static String toIpa(String word) { // escape " with \\x22 and ^ with \\x5E return word.replaceAll("tS","?") .replaceAll("dZ","?") .replaceAll("N,","?") .replaceAll("m,","?") .replaceAll("n,","?") .replaceAll("l,","?") .replaceAll("r*","?") .replaceAll("i:","?") .replaceAll("A:","?") .replaceAll("O:","?") .replaceAll("u:","?") .replaceAll("3:","?") .replaceAll("eI","?") .replaceAll("aI","?") .replaceAll("OI","?") .replaceAll("@U","?") .replaceAll("aU","?") .replaceAll("I@","?") .replaceAll("E@","?") .replaceAll("U@","?") .replaceAll("&~","?") .replaceAll("A~:","?") .replaceAll("&~:","?") .replaceAll("O~:","?"); } static char[] DIACRITICS = new char[] { '#', '`', '\"', '^', ',', '~', '@' }; // for German // $ -> ss (Eszett) }