/**
 * Constructs a pair from the two given strings.
 *
 * <p>Initialization is delegated to {@code set(left, right)}.
 *
 * @param left the left element
 * @param right the right element
 */
public PairOfStrings(String left, String right) {
  this.set(left, right);
}
/**
 * Emits two records for every pair of adjacent tokens in the input line.
 *
 * <p>The line is split with {@code Tokenizer.tokenize}. For each adjacent pair
 * {@code (left, right)} the mapper writes {@code (left, right) -> ONE} followed by
 * {@code (left, "*") -> ONE}. The shared {@code BIGRAM} object is reused for every write.
 *
 * @param key the input key (not used)
 * @param value the line of text to process
 * @param context the context the records are written to
 * @throws IOException if {@code context.write} fails
 * @throws InterruptedException if {@code context.write} is interrupted
 */
@Override
public void map(LongWritable key, Text value, Context context)
    throws IOException, InterruptedException {
  List<String> tokens = Tokenizer.tokenize(value.toString());
  int tokenCount = tokens.size();

  // Starting at index 1 means a line with fewer than two tokens emits nothing.
  for (int pos = 1; pos < tokenCount; pos++) {
    String left = tokens.get(pos - 1);
    String right = tokens.get(pos);

    // The bigram itself.
    BIGRAM.set(left, right);
    context.write(BIGRAM, ONE);

    // The (left, "*") record for the same left token.
    BIGRAM.set(left, "*");
    context.write(BIGRAM, ONE);
  }
}
} // closes the enclosing class, whose header lies above this excerpt
/**
 * Emits one record for every ordered pair of distinct token positions that lie within
 * {@code window} positions of each other on the input line.
 *
 * <p>The line is split with {@code Tokenizer.tokenize}. For each position {@code i}, every
 * position {@code j != i} with {@code i - window <= j <= i + window} (clamped to the token
 * list) produces {@code (tokens[i], tokens[j]) -> ONE}. The shared {@code PAIR} object is
 * reused for every write.
 *
 * @param key the input key (not used)
 * @param value the line of text to process
 * @param context the context the records are written to
 * @throws IOException if {@code context.write} fails
 * @throws InterruptedException if {@code context.write} is interrupted
 */
@Override
public void map(LongWritable key, Text value, Context context)
    throws IOException, InterruptedException {
  List<String> tokens = Tokenizer.tokenize(value.toString());
  int total = tokens.size();

  for (int center = 0; center < total; center++) {
    // Clamp the neighbourhood to the valid index range [0, total).
    int from = Math.max(center - window, 0);
    int to = Math.min(center + window + 1, total);
    String focus = tokens.get(center);

    for (int other = from; other < to; other++) {
      // A position is never paired with itself.
      if (other != center) {
        PAIR.set(focus, tokens.get(other));
        context.write(PAIR, ONE);
      }
    }
  }
}
} // closes the enclosing class, whose header lies above this excerpt
@Override public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { String line = value.toString(); String prev = null; StringTokenizer itr = new StringTokenizer(line); while (itr.hasMoreTokens()) { String cur = itr.nextToken(); // Emit only if we have an actual bigram. if (prev != null) { // Simple way to truncate tokens that are too long. if (cur.length() > 100) { cur = cur.substring(0, 100); } if (prev.length() > 100) { prev = prev.substring(0, 100); } BIGRAM.set(prev, cur); context.write(BIGRAM, ONE); BIGRAM.set(prev, "*"); context.write(BIGRAM, ONE); } prev = cur; } } }
@Override public void map(LongWritable key, Text line, Context context) throws IOException, InterruptedException { String text = line.toString(); String[] terms = text.split("\\s+"); for (int i = 0; i < terms.length; i++) { String term = terms[i]; // skip empty tokens if (term.length() == 0) continue; for (int j = i - window; j < i + window + 1; j++) { if (j == i || j < 0) continue; if (j >= terms.length) break; // skip empty tokens if (terms[j].length() == 0) continue; PAIR.set(term, terms[j]); context.write(PAIR, ONE); } } } }