/**
 * Builds a CharJaccard by handing the superclass constructor an
 * NGramTokenizer created with the arguments (1, 1, false) around
 * SimpleTokenizer.DEFAULT_TOKENIZER.
 * NOTE(review): presumably the two 1s are the minimum and maximum n-gram
 * size, i.e. single-character tokens -- confirm against NGramTokenizer.
 */
public CharJaccard()
{
    super(new NGramTokenizer(1, 1, false, SimpleTokenizer.DEFAULT_TOKENIZER));
}

/** Returns the fixed label "[CharJaccard]". */
public String toString()
{
    return "[CharJaccard]";
}
/**
 * Return the character n-grams of a string.
 *
 * The input is first split by the inner tokenizer. Each resulting token is
 * wrapped as "^" + value + "$", and every substring of the wrapped form
 * whose length lies between minNGramSize and maxNGramSize (inclusive) is
 * interned through the inner tokenizer and emitted, ordered by start
 * position and then by length. When keepOldTokens is set, the wrapped
 * token itself is interned (through this tokenizer) and emitted before its
 * n-grams.
 *
 * @param input the string to tokenize
 * @return the emitted tokens, in the order described above
 */
public Token[] tokenize(String input)
{
    Token[] initialTokens = innerTokenizer.tokenize(input);
    List tokens = new ArrayList();
    for (int i = 0; i < initialTokens.length; i++) {
        String str = "^" + initialTokens[i].getValue() + "$";
        if (keepOldTokens) {
            tokens.add(intern(str));
        }
        for (int lo = 0; lo < str.length(); lo++) {
            for (int len = minNGramSize; len <= maxNGramSize; len++) {
                // substring's end index is exclusive, so lo+len may equal
                // str.length(). The previous strict '<' dropped every
                // n-gram ending on the final character, which meant the
                // "$" end marker never appeared in any emitted n-gram.
                if (lo + len <= str.length()) {
                    tokens.add(innerTokenizer.intern(str.substring(lo, lo + len)));
                }
            }
        }
    }
    return (Token[]) tokens.toArray(new Token[tokens.size()]);
}
/** Test routine */ public static void main(String[] argv) { NGramTokenizer tokenizer = NGramTokenizer.DEFAULT_TOKENIZER; //NGramTokenizer tokenizer = new NGramTokenizer(1,1,false,SimpleTokenizer.DEFAULT_TOKENIZER); int n = 0; for (int i=0; i<argv.length; i++) { System.out.println("argument "+i+": '"+argv[i]+"'"); Token[] tokens = tokenizer.tokenize(argv[i]); for (int j=0; j<tokens.length; j++) { System.out.println("token "+(++n)+":" +" id="+tokens[j].getIndex() +" value: '"+tokens[j].getValue()+"'"); } } } }
public static void makeDistMetric(List list, StringDistance tfidf, StringDistance triGramDistanceMetric) { List nodes = new ArrayList(); Iterator iter = list.iterator(); while (iter.hasNext()) { nodes.add((Citation)iter.next()); } NGramTokenizer nGramTokenizer = new NGramTokenizer(3,3,false, new SimpleTokenizer(true, true)); ArrayList allStrings = new ArrayList(); tfidf = new TFIDF (); //softtfidf = new SoftTFIDF(new JaroWinkler(), 0.8); triGramDistanceMetric = new TFIDF(nGramTokenizer); for (int i=0; i < nodes.size(); i++) { Citation c = (Citation)nodes.get(i); allStrings.addAll(c.getAllStringsWrapped()); } tfidf.accumulateStatistics(allStrings.iterator()); triGramDistanceMetric.accumulateStatistics(allStrings.iterator()); //softtfidf.accumulateStatistics(allStrings.iterator()); }
private static void makeDistMetric(List list) { List nodes = new ArrayList(); Iterator iter = list.iterator(); while (iter.hasNext()) { nodes.add((Citation)iter.next()); } NGramTokenizer nGramTokenizer = new NGramTokenizer(3,3,false, new SimpleTokenizer(true, true)); ArrayList allStrings = new ArrayList(); tfidf = new TFIDF (); //softtfidf = new SoftTFIDF(new JaroWinkler(), 0.8); triGramDistanceMetric = new TFIDF(nGramTokenizer); for (int i=0; i < nodes.size(); i++) { Citation c = (Citation)nodes.get(i); allStrings.addAll(c.getAllStringsWrapped()); } tfidf.accumulateStatistics(allStrings.iterator()); triGramDistanceMetric.accumulateStatistics(allStrings.iterator()); //softtfidf.accumulateStatistics(allStrings.iterator()); }
private static TFIDF getDistanceMetric (ArrayList allnodes) { //make distance metrics TFIDF tfidf = new TFIDF(); NGramTokenizer nGramTokenizer = new NGramTokenizer(3,3,false, new SimpleTokenizer(true, true)); TFIDF ret = new TFIDF (nGramTokenizer); CitationUtils.makeDistMetric(allnodes, tfidf, ret); return ret; }
// Create a default TFIDF and a second TFIDF built on
// NGramTokenizer(3,3,false, SimpleTokenizer(true, true)), then pass both,
// together with allPaperTrainingNodes, to CitationUtils.makeDistMetric.
TFIDF tfidf = new TFIDF(); NGramTokenizer nGramTokenizer = new NGramTokenizer(3,3,false, new SimpleTokenizer(true, true)); TFIDF triGramDistanceMetric = new TFIDF (nGramTokenizer); CitationUtils.makeDistMetric(allPaperTrainingNodes, tfidf, triGramDistanceMetric);
new NGramTokenizer(3,3,false, new SimpleTokenizer(true, true)); triGramDistanceMetric = new TFIDF (nGramTokenizer); CitationUtils.makeDistMetric(allnodes, tfidf, triGramDistanceMetric);
new NGramTokenizer(3,3,false, new SimpleTokenizer(true, true)); triGramDistanceMetric = new TFIDF (nGramTokenizer); CitationUtils.makeDistMetric(allnodes, tfidf, triGramDistanceMetric);
new NGramTokenizer(3,3,false, new SimpleTokenizer(true, true)); triGramDistanceMetric = new TFIDF (nGramTokenizer); CitationUtils.makeDistMetric(allnodes, tfidf, triGramDistanceMetric);
new NGramTokenizer(3,3,false, new SimpleTokenizer(true, true)); triGramDistanceMetric = new TFIDF (nGramTokenizer); CitationUtils.makeDistMetric(allnodes, tfidf, triGramDistanceMetric);
new NGramTokenizer(3,3,false, new SimpleTokenizer(true, true)); triGramDistanceMetric = new TFIDF (nGramTokenizer); CitationUtils.makeDistMetric(allnodes, tfidf, triGramDistanceMetric);
new NGramTokenizer(3,3,false, new SimpleTokenizer(true, true)); triGramDistanceMetric = new TFIDF (nGramTokenizer); CitationUtils.makeDistMetric(allnodes, tfidf, triGramDistanceMetric);
new NGramTokenizer(3,3,false, new SimpleTokenizer(true, true)); triGramDistanceMetric = new TFIDF (nGramTokenizer); CitationUtils.makeDistMetric(allnodes, tfidf, triGramDistanceMetric);
new NGramTokenizer(3,3,false, new SimpleTokenizer(true, true)); triGramDistanceMetric = new TFIDF (nGramTokenizer); CitationUtils.makeDistMetric(allnodes, tfidf, triGramDistanceMetric);
new NGramTokenizer(3,3,false, new SimpleTokenizer(true, true)); triGramDistanceMetric = new TFIDF (nGramTokenizer); CitationUtils.makeDistMetric(allnodes, tfidf, triGramDistanceMetric);
/**
 * Assembles the feature-extraction pipeline for paper nodes.
 *
 * Two metrics are prepared from the nodes: the one returned by
 * CitationUtils.computeDistanceMetric (cast to
 * AbstractStatisticalTokenDistance) and a TFIDF over
 * NGramTokenizer(3,3,false, SimpleTokenizer(true, true)) that is passed
 * through CitationUtils.makeDistMetric. The pipes are then chained in a
 * SerialPipes in the order listed below.
 *
 * @param nodes nodes handed to both CitationUtils calls
 * @return the assembled SerialPipes
 */
public Pipe getPaperPipe (ArrayList nodes)
{
    AbstractStatisticalTokenDistance distanceMetric =
        (AbstractStatisticalTokenDistance) CitationUtils.computeDistanceMetric(nodes);

    TFIDF wordTfidf = new TFIDF();
    NGramTokenizer trigramTokenizer =
        new NGramTokenizer(3, 3, false, new SimpleTokenizer(true, true));
    TFIDF trigramTfidf = new TFIDF(trigramTokenizer);
    CitationUtils.makeDistMetric(nodes, wordTfidf, trigramTfidf);

    // Stage order is significant: it is the order SerialPipes receives.
    Pipe[] stages = new Pipe[] {
        new ExactFieldMatchPipe(Citation.corefFields),
        new PageMatchPipe(),
        new YearsWithinFivePipe(),
        new FieldStringDistancePipe(trigramTfidf, Citation.corefFields, "trigramTFIDF"),
        new GlobalPipe(distanceMetric),
        new AuthorPipe(distanceMetric),
        new HeuristicPipe(Citation.corefFields),
        new InterFieldPipe(),
        new NodePair2FeatureVector(),
        new Target2Label(),
    };
    return new SerialPipes(stages);
}
/**
 * Builds the SerialPipes used for paper nodes.
 *
 * The metric from CitationUtils.computeDistanceMetric feeds GlobalPipe and
 * AuthorPipe. A TFIDF over NGramTokenizer(3,3,false,
 * SimpleTokenizer(true, true)), passed through
 * CitationUtils.makeDistMetric beforehand, feeds the "trigramTFIDF"
 * FieldStringDistancePipe.
 *
 * @param nodes nodes handed to both CitationUtils calls
 * @return the assembled pipeline
 */
private static Pipe getPaperPipe (ArrayList nodes)
{
    AbstractStatisticalTokenDistance distanceMetric;
    distanceMetric =
        (AbstractStatisticalTokenDistance) CitationUtils.computeDistanceMetric(nodes);

    TFIDF plainMetric = new TFIDF();
    TFIDF trigramMetric =
        new TFIDF(new NGramTokenizer(3, 3, false, new SimpleTokenizer(true, true)));
    CitationUtils.makeDistMetric(nodes, plainMetric, trigramMetric);

    // Pipes are created and chained in exactly this order.
    Pipe[] chain = {
        new ExactFieldMatchPipe(Citation.corefFields),
        new PageMatchPipe(),
        new YearsWithinFivePipe(),
        new FieldStringDistancePipe(trigramMetric, Citation.corefFields, "trigramTFIDF"),
        new GlobalPipe(distanceMetric),
        new AuthorPipe(distanceMetric),
        new HeuristicPipe(Citation.corefFields),
        new InterFieldPipe(),
        new NodePair2FeatureVector(),
        new Target2Label(),
    };
    Pipe assembled = new SerialPipes(chain);
    return assembled;
}