/** Jaccard distance applied to single characters: strings are compared as
    bags of 1-grams, obtained by wrapping a unigram NGramTokenizer around the
    default word tokenizer. */
public CharJaccard() { super(new NGramTokenizer(1,1,false,SimpleTokenizer.DEFAULT_TOKENIZER)); }
/** Identify this metric in printed output. */
public String toString() { return "[CharJaccard]"; }
/** Intern a string as a Token, delegating to the inner tokenizer's
    shared token table so ids stay consistent across tokenizers. */
public Token intern(String s) { return innerTokenizer.intern(s); }
/** Command-line smoke test: tokenize each argument with the default
    tokenizer and print every token's id and value. */
public static void main(String[] argv) {
    SimpleTokenizer tokenizer = DEFAULT_TOKENIZER;
    int count = 0;
    for (int a = 0; a < argv.length; a++) {
        System.out.println("argument " + a + ": '" + argv[a] + "'");
        Token[] toks = tokenizer.tokenize(argv[a]);
        for (int t = 0; t < toks.length; t++) {
            count++;
            System.out.println("token " + count + ":" + " id=" + toks[t].getIndex() + " value: '" + toks[t].getValue() + "'");
        }
    }
}
}
/** Return the n-gram tokenization of a string.  Each word produced by the
 * inner tokenizer is bracketed as "^word$" (so n-grams can encode
 * start/end position), and every substring of length minNGramSize to
 * maxNGramSize of that bracketed form is emitted as a token.  If
 * keepOldTokens is set, the whole bracketed word is also emitted. */
public Token[] tokenize(String input) {
    Token[] initialTokens = innerTokenizer.tokenize(input);
    List tokens = new ArrayList();
    for (int i=0; i<initialTokens.length; i++) {
        Token tok = initialTokens[i];
        // Bracket the word so boundary n-grams are distinguishable.
        String str = "^"+tok.getValue()+"$";
        if (keepOldTokens) tokens.add( intern(str) );
        for (int lo=0; lo<str.length(); lo++) {
            for (int len=minNGramSize; len<=maxNGramSize; len++) {
                // BUG FIX: condition was lo+len < str.length(), which dropped
                // every n-gram ending at the last character -- the '$' end
                // marker added above never appeared in any emitted n-gram,
                // even though '^' did.  substring(lo, lo+len) is legal for
                // lo+len == str.length(), so use <=.
                if (lo+len<=str.length()) {
                    tokens.add( innerTokenizer.intern( str.substring(lo,lo+len) ));
                }
            }
        }
    }
    return (Token[]) tokens.toArray(new Token[tokens.size()]);
}
/** Test routine */ public static void main(String[] argv) { NGramTokenizer tokenizer = NGramTokenizer.DEFAULT_TOKENIZER; //NGramTokenizer tokenizer = new NGramTokenizer(1,1,false,SimpleTokenizer.DEFAULT_TOKENIZER); int n = 0; for (int i=0; i<argv.length; i++) { System.out.println("argument "+i+": '"+argv[i]+"'"); Token[] tokens = tokenizer.tokenize(argv[i]); for (int j=0; j<tokens.length; j++) { System.out.println("token "+(++n)+":" +" id="+tokens[j].getIndex() +" value: '"+tokens[j].getValue()+"'"); } } } }
private static TFIDF getDistanceMetric (ArrayList allnodes) { //make distance metrics TFIDF tfidf = new TFIDF(); NGramTokenizer nGramTokenizer = new NGramTokenizer(3,3,false, new SimpleTokenizer(true, true)); TFIDF ret = new TFIDF (nGramTokenizer); CitationUtils.makeDistMetric(allnodes, tfidf, ret); return ret; }
/** Tokenize s and return the tokens' values as a List (see makeList). */
public List getTokens(String s) {
    Token[] toks = tokenizer.tokenize(s);
    return makeList(toks);
}
/** Copy the string value of each Token in the array into a new List,
    preserving order. */
private List makeList(Token[] tokens) {
    List values = new ArrayList();
    int k = 0;
    while (k < tokens.length) {
        values.add(tokens[k].getValue());
        k++;
    }
    return values;
}
/** Preprocess a string into a bag of its tokens. */
public StringWrapper prepare(String s) {
    Token[] toks = tokenizer.tokenize(s);
    return new BagOfTokens(s, toks);
}
/** Intern s as a Token, lower-casing it first when case is ignored. */
private Token internSomething(String s) {
    String key = s;
    if (ignoreCase) {
        key = s.toLowerCase();
    }
    return intern(key);
}
/** Return the canonical Token for s, creating one (with the next
    available id) and registering it in the token map on first sight. */
public Token intern(String s) {
    Token existing = (Token) tokMap.get(s);
    if (existing != null) {
        return existing;
    }
    Token fresh = new Token(++nextId, s);
    tokMap.put(s, fresh);
    return fresh;
}
/** Set up the normalizer: build the stop-word table from STOP_WORDS and
    create the tokenizer used by the norm/getTokens methods. */
public CitationNormalizer() { makeStopTable(STOP_WORDS); tokenizer = new SimpleTokenizer(true, true); }
/** Return the first token of s that is exactly four digits (e.g. a year),
    or the empty string if no such token exists. */
public String getFourDigitString(String s) {
    List tokens = makeList(tokenizer.tokenize(s));
    int n = tokens.size();
    for (int i = 0; i < n; i++) {
        String tok = (String) tokens.get(i);
        if (tok.matches("[0-9][0-9][0-9][0-9]")) {
            return tok;
        }
    }
    return "";
}
/** Preprocess a string into a bag of its tokens; per the class contract,
    the tokens receive Mixture weights. */
public StringWrapper prepare(String s) {
    Token[] tokenized = tokenizer.tokenize(s);
    return new BagOfTokens(s, tokenized);
}
/** A neighbor of some token: records the neighbor's string value, its
    closeness score, and the document frequency of its interned Token. */
public TokenNeighbor(String tokVal,double score) { this.tokVal=tokVal; this.score=score; this.freq = getDocumentFrequency(tokenizer.intern(tokVal)); } // sort by score, closest first
/** Normalize s: tokenize, drop stop words, apply substitutions, and glue
    the surviving tokens back into a single string. */
public String norm1(String s) {
    return tokensToString(
        makeSubstitutions(
            removeStopWords(
                makeList(tokenizer.tokenize(s)))));
}
/** Preprocess a string by tokenizing it into a BagOfTokens. */
public StringWrapper prepare(String s) {
    Token[] parts = tokenizer.tokenize(s);
    return new BagOfTokens(s, parts);
}
/** Wrap a string as a bag of the tokens found in it. */
public StringWrapper prepare(String s) {
    Token[] found = tokenizer.tokenize(s);
    return new BagOfTokens(s, found);
}
/** Preprocess a string by finding tokens and giving each token weight W
 * such that W is the smoothed probability of the token appearing in the
 * document. */
final public StringWrapper prepare(String s) {
    BagOfTokens bag = new BagOfTokens(s, tokenizer.tokenize(s));
    // Total of the bag's initial weights (presumably raw term frequencies --
    // confirm in BagOfTokens); captured before any weight is overwritten.
    double totalWeight = bag.getTotalWeight();
    // Replace each token's initial weight with its smoothed probability.
    // Note: setWeight is called while iterating the bag's own token
    // iterator; tokenIterator must tolerate in-place weight updates.
    for (Iterator i=bag.tokenIterator(); i.hasNext(); ) {
        Token tok = (Token)i.next();
        double freq = bag.getWeight(tok);
        bag.setWeight( tok, smoothedProbability(tok, freq, totalWeight) );
    }
    return bag;
}