import com.aliasi.util.Files;
import com.aliasi.util.Strings;
import java.io.File;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Command-line tool that pulls headword entries out of a raw dictionary
 * text file and writes the distinct, sorted entries to an output file,
 * one entry per line.
 *
 * <p>Usage: {@code java WebstersHyphensCorpus <rawFile> <outFile>}
 *
 * <p>Each headword is lower-cased and split on spaces and hyphens. In every
 * resulting piece the marks {@code "}, {@code *} and {@code `} are replaced
 * by a single space, so an accepted entry consists of letter runs separated
 * by single spaces (plus apostrophes).
 */
public class WebstersHyphensCorpus {

    /**
     * Locates one headword element; group 2 holds the headword text.
     *
     * <p>NOTE(review): the pattern previously read {@code "()+(.*?)"}, which can
     * only ever match the empty string, so no headword was extracted. It looks
     * like the markup was stripped from the literal. It is restored here on the
     * assumption that the input tags headwords as {@code <hw>...</hw>};
     * confirm against the actual input file.
     */
    private static final Pattern HEADWORD_PATTERN = Pattern.compile("(<hw>)+(.*?)</hw>");

    /** Separators between the words of a multi-word headword: a space or a hyphen. */
    private static final Pattern WORD_DELIMITER_PATTERN = Pattern.compile(" |-");

    /** The marks {@code "} (0x22), {@code *} (0x2A) and {@code `} (0x60). */
    private static final Pattern MARK_PATTERN = Pattern.compile("\\x22|\\x2A|\\x60");

    /** A run of one or more whitespace characters. */
    private static final Pattern WHITESPACE_RUN_PATTERN = Pattern.compile("\\s+");

    /**
     * Reads the raw file, extracts and normalizes the headwords, reports how
     * many pieces were rejected and retained, and writes the retained entries.
     *
     * @param args {@code args[0]} is the path of the raw input file (read as
     *     ASCII); {@code args[1]} is the path of the output file (written as UTF-8)
     * @throws IllegalArgumentException if fewer than two arguments are given
     * @throws Exception if reading the input or writing the output fails
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 2) {
            throw new IllegalArgumentException(
                "Usage: java WebstersHyphensCorpus <rawFile> <outFile>");
        }
        File rawFile = new File(args[0]);
        File outFile = new File(args[1]);
        String data = Files.readFromFile(rawFile,"ASCII");

        Matcher matcher = HEADWORD_PATTERN.matcher(data);
        // TreeSet: removes duplicates and keeps the entries in natural String order.
        Set<String> entrySet = new TreeSet<String>();
        int numBadEntries = 0;
        while (matcher.find()) {
            // Locale.ROOT keeps lower-casing independent of the default locale
            // (e.g. Turkish would map 'I' to a dotless i).
            String entry = matcher.group(2).toLowerCase(Locale.ROOT);
            for (String subEntry : WORD_DELIMITER_PATTERN.split(entry)) {
                // Turn each mark into a space, then trim and collapse the spaces.
                String marksAsSpaces = MARK_PATTERN.matcher(subEntry).replaceAll(" ").trim();
                String splitEntry = WHITESPACE_RUN_PATTERN.matcher(marksAsSpaces).replaceAll(" ");
                if (splitEntry.isEmpty()) {
                    // Leading or adjacent delimiters yield empty pieces; they are
                    // not words, so they are neither retained nor counted as bad.
                    continue;
                }
                if (goodWord(splitEntry)) {
                    entrySet.add(splitEntry);
                } else {
                    System.out.println("BAD: " + splitEntry);
                    ++numBadEntries;
                }
            }
        }
        System.out.println("# rejected entries=" + numBadEntries);
        System.out.println("# retained entries=" + entrySet.size());

        String[] entries = entrySet.toArray(new String[0]);
        String dataOut = Strings.concatenate(entries,"\n");
        Files.writeStringToFile(dataOut,outFile,Strings.UTF8);
        System.out.println("DONE.");
    }

    /**
     * Tests whether a candidate entry contains only acceptable characters.
     *
     * @param w the candidate entry
     * @return {@code true} if every character of {@code w} is a letter, a
     *     space or an apostrophe (so also for the empty string);
     *     {@code false} otherwise
     */
    static boolean goodWord(String w) {
        for (int i = 0; i < w.length(); ++i) {
            char c = w.charAt(i);
            boolean allowed = Character.isLetter(c) || c == ' ' || c == '\'';
            if (!allowed) {
                return false;
            }
        }
        return true;
    }
}