/*
* LingPipe v. 3.9
* Copyright (C) 2003-2010 Alias-i
*
* This program is licensed under the Alias-i Royalty Free License
* Version 1 WITHOUT ANY WARRANTY, without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the Alias-i
* Royalty Free License Version 1 for more details.
*
* You should have received a copy of the Alias-i Royalty Free License
* Version 1 along with this program; if not, visit
* http://alias-i.com/lingpipe/licenses/lingpipe-license-1.txt or contact
* Alias-i, Inc. at 181 North 11th Street, Suite 401, Brooklyn, NY 11211,
* +1 (718) 290-9170.
*/
package com.aliasi.chunk;
import com.aliasi.tokenizer.TokenCategorizer;
import com.aliasi.tokenizer.TokenizerFactory;
import com.aliasi.corpus.TagHandler;
import com.aliasi.corpus.ObjectHandler;
import com.aliasi.corpus.ChunkHandlerAdapter;
import com.aliasi.util.AbstractExternalizable;
import com.aliasi.util.Compilable;
import com.aliasi.util.ObjectToCounterMap;
import com.aliasi.util.Strings;
import java.io.ObjectInput;
import java.io.ObjectOutput;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
* A TrainTokenShapeChunker is used to train a token and
* shape-based chunker.
*
*
Estimation is based on a joint model of tags
* T1,...,TN and tokens W1,...,WN, which is
* approximated with a limited history and smoothed using linear
* interpolation.
*
*
* By the chain rule:
* * P(W1,...,WN,T1,...TN) * = P(W1,T1) * P(W2,T2|W1,T1) * P(W3,T3|W1,W2,T1,T2) * * ... * P(WN,TN|W1,...,WN-1,T1,...,TN-1) ** The longer contexts are approximated with the two previous * tokens and one previous tag. * * P(WN,TN|W1,...,WN-1,T1,...,TN-1) * ~ P(WN,TN|WN-2,WN-1,TN-1) ** The shorter contexts are padded with tags and tokens for the * beginning of a stream, and an addition end-of-stream symbol is * trained after the last symbol in the input. * The joint model is further decomposed into a conditional tag model * and a conditional token model by the chain rule: * * P(WN,TN|WN-2,WN-1,TN-1) * = P(TN|WN-2,WN-1,TN-1) * * P(WN|WN-2,WN-1,TN-1,TN) ** The token model is further approximated as: * * P(WN|WN-2,WN-1,TN-1,TN) * ~ P(WN|WN-1,interior(TN-1),TN) ** where interior(TN-1) is the interior
* version of a tag; for instance:
*
* interior("ST_PERSON").equals("PERSON")
* interior("PERSON").equals("PERSON")
*
* This performs what is known as "model tying", and it
* amounts to sharing the models for the two contexts.
* The tag model is also approximated by tying start
* and interior tag histories:
* * P(TN|WN-2,WN-1,TN-1) * ~ P(TN|WN-2,WN-1,interior(TN-1)) ** The tag and token models are themselves simple * linear interpolation models, with smoothing parameters defined * by the Witten-Bell method. The order * of contexts for the token model is: *
* P(WN|TN,interior(TN-1),WN-1)
* ~ lambda(TN,interior(TN-1),WN-1) * P_ml(WN|TN,interior(TN-1),WN-1)
* + (1-lambda(")) * P(WN|TN,interior(TN-1))
*
* P(WN|TN,interior(TN-1))
* ~ lambda(TN,interior(TN-1)) * P_ml(WN|TN,interior(TN-1))
* + (1-lambda(")) * P(WN|TN)
*
* P(WN|TN) ~ lambda(TN) * P_ml(WN|TN)
* + 1-lambda(") * UNIFORM_ESTIMATE
*
*
* The last step is degenerate in that SUM_W P(W|T) =
* INFINITY, because there are infinitely many possible tokens,
* and each is assigned the uniform estimate. To fix this, a model
* would be needed of character sequences that ensured SUM_W
* P(W|T) = 1.0. (The steps to do the final uniform estimate
* are handled by the compiled estimator.)
* * The tag estimator is smoothed by: *
* P(TN|interior(TN-1),WN-1,WN-2)
* ~ lambda(interior(TN-1),WN-1,WN-2) * P_ml(TN|interior(TN-1),WN-1,WN-2)
* + (1-lambda(")) * P(TN|interior(TN-1),WN-1)
*
* P(TN|interior(TN-1),WN-1)
* ~ lambda(interior(TN-1),WN-1) * P_ml(TN|interior(TN-1),WN-1)
* + (1-lambda(")) * P_ml(TN|interior(TN-1))
*
*
* Note that the smoothing stops at estimating a tag in terms
* of the previous tags. This guarantees that only bigram tag
* sequences seen in the training data get non-zero probability
* under the estimator.
*
*
* |
Sequences of training pairs are added via the {@link
* #handle(String[],String[],String[])} or the {@link
* #handle(Chunking)} methods.
*
* @author Bob Carpenter
* @version 3.9
* @since LingPipe1.0
*/
public class TrainTokenShapeChunker
implements TagHandler,
ObjectHandler4.0, the
* number of tokens to 3,000,000, the
* known minimum token count to 8, and the min tag and
* token count for pruning to 1.
*
* @param categorizer Token categorizer for unknown tokens.
* @param factory Tokenizer factory for creating tokenizers.
*/
public TrainTokenShapeChunker(TokenCategorizer categorizer,
TokenizerFactory factory) {
this(categorizer,factory,
8, 1, 1);
}
/**
* Construct a trainer for a token/shape chunker based on
* the specified token categorizer, tokenizer factory and
* numerical parameters. The parameters are described in
* detail in the class documentation above.
*
* @param categorizer Token categorizer for unknown tokens.
* @param factory Tokenizer factory for tokenizing data.
* @param knownMinTokenCount Number of instances required for
* a token to count as known for unknown training.
* @param minTokenCount Minimum token count for token contexts to
* survive after pruning.
* @param minTagCount Minimum count for tag contexts to survive
* after pruning.
*/
public TrainTokenShapeChunker(TokenCategorizer categorizer,
TokenizerFactory factory,
int knownMinTokenCount,
int minTokenCount,
int minTagCount) {
mTokenCategorizer = categorizer;
mTokenizerFactory = factory;
mKnownMinTokenCount = knownMinTokenCount;
mMinTokenCount = minTokenCount;
mMinTagCount = minTagCount;
mTrainableEstimator = new TrainableEstimator(categorizer);
}
/**
* Trains the underlying estimator on the specified BIO-encoded
* chunk tagging.
*
* @param tokens Sequence of tokens to train.
* @param whitespaces Sequence of whitespaces (ignored).
* @param tags Sequence of tags to train.
* @throws IllegalArgumentException If the tags and tokens are
* different lengths.
* @throws NullPointerException If any of the tags or tokens are null.
*/
public void handle(String[] tokens, String[] whitespaces, String[] tags) {
if (tokens.length != tags.length) {
String msg = "Tokens and tags must be same length."
+ " Found tokens.length=" + tokens.length
+ " tags.length=" + tags.length;
throw new IllegalArgumentException(msg);
}
for (int i = 0; i < tokens.length; ++i) {
if (tokens[i] == null || tags[i] == null) {
String msg = "Tags and tokens must not be null."
+ " Found tokens[" + i + "]=" + tokens[i]
+ " tags[" + i + "]=" + tags[i];
throw new NullPointerException(msg);
}
mTokenList.add(tokens[i]);
mTagList.add(tags[i]);
}
}
/**
* Add the specified chunking as a training event.
*
* @param chunking Chunking for training.
*/
public void handle(Chunking chunking) {
ObjectHandler