/**
 * Creates the metric on top of an {@code NGramTokenizer} configured with
 * {@code (1, 1, false)} that wraps {@code SimpleTokenizer.DEFAULT_TOKENIZER}.
 * NOTE(review): presumably this yields single-character tokens, matching the
 * class name - confirm against NGramTokenizer's parameter order.
 */
public CharJaccard()
{
    super(
        new NGramTokenizer(
            1,
            1,
            false,
            SimpleTokenizer.DEFAULT_TOKENIZER));
}

/**
 * @return the fixed label {@code "[CharJaccard]"}
 */
public String toString()
{
    return "[CharJaccard]";
}
public static void makeDistMetric(List list, StringDistance tfidf, StringDistance triGramDistanceMetric) { List nodes = new ArrayList(); Iterator iter = list.iterator(); while (iter.hasNext()) { nodes.add((Citation)iter.next()); } NGramTokenizer nGramTokenizer = new NGramTokenizer(3,3,false, new SimpleTokenizer(true, true)); ArrayList allStrings = new ArrayList(); tfidf = new TFIDF (); //softtfidf = new SoftTFIDF(new JaroWinkler(), 0.8); triGramDistanceMetric = new TFIDF(nGramTokenizer); for (int i=0; i < nodes.size(); i++) { Citation c = (Citation)nodes.get(i); allStrings.addAll(c.getAllStringsWrapped()); } tfidf.accumulateStatistics(allStrings.iterator()); triGramDistanceMetric.accumulateStatistics(allStrings.iterator()); //softtfidf.accumulateStatistics(allStrings.iterator()); }
private static void makeDistMetric(List list) { List nodes = new ArrayList(); Iterator iter = list.iterator(); while (iter.hasNext()) { nodes.add((Citation)iter.next()); } NGramTokenizer nGramTokenizer = new NGramTokenizer(3,3,false, new SimpleTokenizer(true, true)); ArrayList allStrings = new ArrayList(); tfidf = new TFIDF (); //softtfidf = new SoftTFIDF(new JaroWinkler(), 0.8); triGramDistanceMetric = new TFIDF(nGramTokenizer); for (int i=0; i < nodes.size(); i++) { Citation c = (Citation)nodes.get(i); allStrings.addAll(c.getAllStringsWrapped()); } tfidf.accumulateStatistics(allStrings.iterator()); triGramDistanceMetric.accumulateStatistics(allStrings.iterator()); //softtfidf.accumulateStatistics(allStrings.iterator()); }
private static TFIDF getDistanceMetric (ArrayList allnodes) { //make distance metrics TFIDF tfidf = new TFIDF(); NGramTokenizer nGramTokenizer = new NGramTokenizer(3,3,false, new SimpleTokenizer(true, true)); TFIDF ret = new TFIDF (nGramTokenizer); CitationUtils.makeDistMetric(allnodes, tfidf, ret); return ret; }
// Builds a default TFIDF plus a second TFIDF over an NGramTokenizer(3,3,false,...)
// wrapping SimpleTokenizer(true, true), then passes both to
// CitationUtils.makeDistMetric along with allPaperTrainingNodes.
// NOTE(review): whether tfidf / triGramDistanceMetric hold accumulated
// statistics afterwards depends on makeDistMetric training the objects it is
// handed rather than replacements - verify against that method.
TFIDF tfidf = new TFIDF(); NGramTokenizer nGramTokenizer = new NGramTokenizer(3,3,false, new SimpleTokenizer(true, true)); TFIDF triGramDistanceMetric = new TFIDF (nGramTokenizer); CitationUtils.makeDistMetric(allPaperTrainingNodes, tfidf, triGramDistanceMetric);
// Each line below repeats the same three steps: construct an
// NGramTokenizer(3,3,false,...) over SimpleTokenizer(true, true), assign
// triGramDistanceMetric a TFIDF built from nGramTokenizer, and call
// CitationUtils.makeDistMetric(allnodes, tfidf, triGramDistanceMetric).
// NOTE(review): the leading `new NGramTokenizer(...)` is not bound to a name
// on its own line; presumably it is the right-hand side of an assignment to
// nGramTokenizer that begins on a preceding line - confirm.
// NOTE(review): the sequence appears nine times verbatim; presumably these
// are separate call sites that could share one helper - confirm.
new NGramTokenizer(3,3,false, new SimpleTokenizer(true, true)); triGramDistanceMetric = new TFIDF (nGramTokenizer); CitationUtils.makeDistMetric(allnodes, tfidf, triGramDistanceMetric);
new NGramTokenizer(3,3,false, new SimpleTokenizer(true, true)); triGramDistanceMetric = new TFIDF (nGramTokenizer); CitationUtils.makeDistMetric(allnodes, tfidf, triGramDistanceMetric);
new NGramTokenizer(3,3,false, new SimpleTokenizer(true, true)); triGramDistanceMetric = new TFIDF (nGramTokenizer); CitationUtils.makeDistMetric(allnodes, tfidf, triGramDistanceMetric);
new NGramTokenizer(3,3,false, new SimpleTokenizer(true, true)); triGramDistanceMetric = new TFIDF (nGramTokenizer); CitationUtils.makeDistMetric(allnodes, tfidf, triGramDistanceMetric);
new NGramTokenizer(3,3,false, new SimpleTokenizer(true, true)); triGramDistanceMetric = new TFIDF (nGramTokenizer); CitationUtils.makeDistMetric(allnodes, tfidf, triGramDistanceMetric);
new NGramTokenizer(3,3,false, new SimpleTokenizer(true, true)); triGramDistanceMetric = new TFIDF (nGramTokenizer); CitationUtils.makeDistMetric(allnodes, tfidf, triGramDistanceMetric);
new NGramTokenizer(3,3,false, new SimpleTokenizer(true, true)); triGramDistanceMetric = new TFIDF (nGramTokenizer); CitationUtils.makeDistMetric(allnodes, tfidf, triGramDistanceMetric);
new NGramTokenizer(3,3,false, new SimpleTokenizer(true, true)); triGramDistanceMetric = new TFIDF (nGramTokenizer); CitationUtils.makeDistMetric(allnodes, tfidf, triGramDistanceMetric);
new NGramTokenizer(3,3,false, new SimpleTokenizer(true, true)); triGramDistanceMetric = new TFIDF (nGramTokenizer); CitationUtils.makeDistMetric(allnodes, tfidf, triGramDistanceMetric);
/**
 * Assembles the feature-extraction pipeline for paper nodes.
 *
 * <p>Two metrics feed the pipeline: the result of
 * {@code CitationUtils.computeDistanceMetric(nodes)}, cast to
 * {@code AbstractStatisticalTokenDistance}, and a trigram {@code TFIDF} that
 * is passed through {@code CitationUtils.makeDistMetric} first.
 *
 * @param nodes nodes handed to both CitationUtils calls
 * @return a {@code SerialPipes} running the ten stages listed in the body,
 *         in that order
 */
public Pipe getPaperPipe (ArrayList nodes)
{
    AbstractStatisticalTokenDistance globalMetric =
        (AbstractStatisticalTokenDistance) CitationUtils.computeDistanceMetric(nodes);

    TFIDF wordTfidf = new TFIDF();
    TFIDF trigramTfidf =
        new TFIDF(new NGramTokenizer(3, 3, false, new SimpleTokenizer(true, true)));
    CitationUtils.makeDistMetric(nodes, wordTfidf, trigramTfidf);

    // Stage order is significant: the last two stages convert the node pair
    // and its target, so they stay at the end.
    Pipe[] stages = {
        new ExactFieldMatchPipe(Citation.corefFields),
        new PageMatchPipe(),
        new YearsWithinFivePipe(),
        new FieldStringDistancePipe(trigramTfidf, Citation.corefFields, "trigramTFIDF"),
        new GlobalPipe(globalMetric),
        new AuthorPipe(globalMetric),
        new HeuristicPipe(Citation.corefFields),
        new InterFieldPipe(),
        new NodePair2FeatureVector(),
        new Target2Label(),
    };
    return new SerialPipes(stages);
}
/**
 * Builds the serial pipe applied to paper node pairs.
 *
 * <p>Uses the metric returned by {@code CitationUtils.computeDistanceMetric}
 * (cast to {@code AbstractStatisticalTokenDistance}) for the global and
 * author stages, and a trigram {@code TFIDF}, passed through
 * {@code CitationUtils.makeDistMetric}, for the field string-distance stage.
 *
 * @param nodes nodes handed to both CitationUtils calls
 * @return the assembled {@code SerialPipes}
 */
private static Pipe getPaperPipe (ArrayList nodes)
{
    AbstractStatisticalTokenDistance corpusMetric =
        (AbstractStatisticalTokenDistance) CitationUtils.computeDistanceMetric(nodes);

    TFIDF plainTfidf = new TFIDF();
    NGramTokenizer trigramTokenizer =
        new NGramTokenizer(3, 3, false, new SimpleTokenizer(true, true));
    TFIDF trigramMetric = new TFIDF(trigramTokenizer);
    CitationUtils.makeDistMetric(nodes, plainTfidf, trigramMetric);

    return new SerialPipes(
        new Pipe[] {
            // field-level comparisons
            new ExactFieldMatchPipe(Citation.corefFields),
            new PageMatchPipe(),
            new YearsWithinFivePipe(),
            new FieldStringDistancePipe(trigramMetric, Citation.corefFields, "trigramTFIDF"),
            // comparisons driven by the corpus-wide metric
            new GlobalPipe(corpusMetric),
            new AuthorPipe(corpusMetric),
            new HeuristicPipe(Citation.corefFields),
            new InterFieldPipe(),
            // conversion stages
            new NodePair2FeatureVector(),
            new Target2Label(),
        });
}