/**
 * Exercises {@code adjustOrPutValue} on a key that is already present and on one that is not:
 * the test expects the present key's value to grow by the increment argument and the absent
 * key to be stored with the new-value argument.
 */
@Test
public void testAdjustOrPutValue() {
  OpenObjectIntHashMap<String> numbers = new OpenObjectIntHashMap<String>();
  numbers.put("Eleven", 11);
  numbers.put("Twelve", 12);
  numbers.put("Thirteen", 13);
  numbers.put("Fourteen", 14);

  // "Eleven" is already mapped to 11, so the expected result is 11 + 3
  numbers.adjustOrPutValue("Eleven", 1, 3);
  assertEquals(14, numbers.get("Eleven"));

  // "Fifteen" has no mapping yet, so the expected result is the new value 1
  numbers.adjustOrPutValue("Fifteen", 1, 3);
  assertEquals(1, numbers.get("Fifteen"));
}
/**
 * Verifies that {@code get} returns the value previously stored under a key.
 */
@Test
public void testGet() {
  OpenObjectIntHashMap<String> numbers = new OpenObjectIntHashMap<String>();
  numbers.put("Eleven", 11);
  numbers.put("Twelve", 12);

  int expected = 11;
  assertEquals(expected, numbers.get("Eleven"));
}
/**
 * Prints one tab-separated report row for a tag: the tag itself, its total, the value stored
 * for it in {@code tagCorrect}, and that value as a formatted percentage of the total.
 *
 * @param tag   tag being reported
 * @param total total associated with the tag
 * @return always {@code true}
 */
@Override
public boolean apply(String tag, int total) {
  int correct = tagCorrect.get(tag);
  // the multiplication happens in int; only the division is carried out in float
  float percentCorrect = (correct * 100) / (float) total;
  String row = tag + "\t" + total + "\t" + correct + "\t" + nf.format(percentCorrect);
  out.println(row);
  return true;
}
});
/**
 * Turns a term-to-id mapping into an array that is indexed by term id.
 *
 * @param termIdMap mapping from term to its integer id
 * @return array of length (largest id + 1) with every term stored at the index given by its
 *         id; positions that no term maps to remain {@code null}, and an empty map yields an
 *         empty array
 */
public static String[] invertDictionary(OpenObjectIntHashMap<String> termIdMap) {
  // first pass: find the largest id so the array can be sized to hold it
  int largestId = -1;
  for (String term : termIdMap.keys()) {
    int id = termIdMap.get(term);
    if (id > largestId) {
      largestId = id;
    }
  }

  // second pass: place each term at the slot of its id
  String[] termsById = new String[largestId + 1];
  for (String term : termIdMap.keys()) {
    termsById[termIdMap.get(term)] = term;
  }
  return termsById;
}
/**
 * Looks up the integer id recorded for an item.
 *
 * @param item item to look up
 * @return the value held in {@code map} for {@code item}, or {@code -1} when {@code map} has
 *         no such key
 */
@Override
public int getId(TYPE item) {
  if (!map.containsKey(item)) {
    return -1;
  }
  return map.get(item);
}
/**
 * Turns a term-to-id mapping into an array that is indexed by term id.
 *
 * @param termIdMap mapping from term to its integer id
 * @return array of length (largest id + 1) with every term stored at the index given by its
 *         id; positions that no term maps to remain {@code null}, and an empty map yields an
 *         empty array
 */
public static String[] invertDictionary(OpenObjectIntHashMap<String> termIdMap) {
  // first pass: find the largest id so the array can be sized to hold it
  int largestId = -1;
  for (String term : termIdMap.keys()) {
    int id = termIdMap.get(term);
    if (id > largestId) {
      largestId = id;
    }
  }

  // second pass: place each term at the slot of its id
  String[] termsById = new String[largestId + 1];
  for (String term : termIdMap.keys()) {
    termsById[termIdMap.get(term)] = term;
  }
  return termsById;
}
/**
 * Turns a term-to-id mapping into an array that is indexed by term id.
 *
 * @param termIdMap mapping from term to its integer id
 * @return array of length (largest id + 1) with every term stored at the index given by its
 *         id; positions that no term maps to remain {@code null}, and an empty map yields an
 *         empty array
 */
public static String[] invertDictionary(OpenObjectIntHashMap<String> termIdMap) {
  // first pass: find the largest id so the array can be sized to hold it
  int largestId = -1;
  for (String term : termIdMap.keys()) {
    int id = termIdMap.get(term);
    if (id > largestId) {
      largestId = id;
    }
  }

  // second pass: place each term at the slot of its id
  String[] termsById = new String[largestId + 1];
  for (String term : termIdMap.keys()) {
    termsById[termIdMap.get(term)] = term;
  }
  return termsById;
}
/**
 * Re-keys an instance by the integer index of its label.
 * <p>
 * The label is the second element obtained by splitting the incoming key text with
 * {@code SLASH}. When {@code labelIndex} has an entry for that label the instance is written
 * out under the corresponding index; otherwise only the {@code SKIPPED_INSTANCES} counter is
 * incremented.
 *
 * @param labelText key text from which the label is extracted
 * @param instance  vector passed through unchanged as the output value
 * @param ctx       context used for output and counters
 */
@Override
protected void map(Text labelText, VectorWritable instance, Context ctx)
    throws IOException, InterruptedException {
  String[] keyParts = SLASH.split(labelText.toString());
  String label = keyParts[1];

  if (!labelIndex.containsKey(label)) {
    // no index for this label: count the instance as skipped and emit nothing
    ctx.getCounter(Counter.SKIPPED_INSTANCES).increment(1);
    return;
  }

  ctx.write(new IntWritable(labelIndex.get(label)), instance);
}
}
/**
 * Re-keys an instance by the integer index of its label.
 * <p>
 * The label is the second element obtained by splitting the incoming key text with
 * {@code SLASH}. When {@code labelIndex} has an entry for that label the instance is written
 * out under the corresponding index; otherwise only the {@code SKIPPED_INSTANCES} counter is
 * incremented.
 *
 * @param labelText key text from which the label is extracted
 * @param instance  vector passed through unchanged as the output value
 * @param ctx       context used for output and counters
 */
@Override
protected void map(Text labelText, VectorWritable instance, Context ctx)
    throws IOException, InterruptedException {
  String[] keyParts = SLASH.split(labelText.toString());
  String label = keyParts[1];

  if (!labelIndex.containsKey(label)) {
    // no index for this label: count the instance as skipped and emit nothing
    ctx.getCounter(Counter.SKIPPED_INSTANCES).increment(1);
    return;
  }

  ctx.write(new IntWritable(labelIndex.get(label)), instance);
}
}
/**
 * Re-keys an instance by the integer index of its label.
 * <p>
 * The label is the second element obtained by splitting the incoming key text with
 * {@code SLASH}. When {@code labelIndex} has an entry for that label the instance is written
 * out under the corresponding index; otherwise only the {@code SKIPPED_INSTANCES} counter is
 * incremented.
 *
 * @param labelText key text from which the label is extracted
 * @param instance  vector passed through unchanged as the output value
 * @param ctx       context used for output and counters
 */
@Override
protected void map(Text labelText, VectorWritable instance, Context ctx)
    throws IOException, InterruptedException {
  String[] keyParts = SLASH.split(labelText.toString());
  String label = keyParts[1];

  if (!labelIndex.containsKey(label)) {
    // no index for this label: count the instance as skipped and emit nothing
    ctx.getCounter(Counter.SKIPPED_INSTANCES).increment(1);
    return;
  }

  ctx.write(new IntWritable(labelIndex.get(label)), instance);
}
}
@Override protected void cleanup(Context context) throws IOException, InterruptedException { // sort terms in descending order of their collection frequency List<String> temp = cfs.keys(); String[] terms = temp.toArray(new String[temp.size()]); Arrays.sort(terms, new Comparator<String>() { @Override public int compare(String t, String u) { return cfs.get(u) - cfs.get(t); } }); // assign term identifiers OpenObjectIntHashMap<String> tids = new OpenObjectIntHashMap<String>(); for (int i = 0; i < terms.length; i++) { tids.put(terms[i], (i + 1)); } // sort terms in lexicographic order and produce output Arrays.sort(terms); for (String term : terms) { outKey.set(term); outValue.set(cfs.get(term) + "\t" + dfs.get(term) + "\t" + tids.get(term)); context.write(outKey, outValue); } } }
/**
 * Returns the integer recorded in {@code map} for an item, registering the item first when it
 * is not yet a key: a new item is appended to {@code list} and mapped to its position there.
 *
 * @param item item to add or look up
 * @return the value {@code map} holds for {@code item} after the call
 */
@Override
@Synchronized
public int add(@NonNull TYPE item) {
  if (map.containsKey(item)) {
    return map.get(item);
  }

  // not seen before: append, then remember the index of the freshly appended element
  list.add(item);
  int lastIndex = list.size() - 1;
  map.put(item, lastIndex);
  return map.get(item);
}
@Override protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { String[] items = value.toString().split(itemSeparator); OpenObjectIntHashMap<String> itemCounts = new OpenObjectIntHashMap<String>(); // ignore pos 0 which contains a sequence identifier for (int i = 1; i < items.length; i++) { // update counts of items String item = items[i]; itemCounts.adjustOrPutValue(item, +1, +1); } // emit item and frequency for (String term : itemCounts.keys()) { outKey.set(term); outValue.set(itemCounts.get(term)); context.write(outKey, outValue); } } }
/**
 * Counts term occurrences across all lines of the input value and collects a (term, count)
 * pair for every distinct term.
 * <p>
 * The value is split into lines on {@code "\n"} and each line into terms on runs of
 * whitespace.
 *
 * @param k1    input key (not used)
 * @param v1    text whose terms are counted
 * @param oc    collector receiving the pairs
 * @param rprtr reporter (not used)
 */
@Override
public void map(LongWritable k1, Text v1, OutputCollector<Text, IntWritable> oc, Reporter rprtr)
    throws IOException {
  OpenObjectIntHashMap<String> termFrequencies = new OpenObjectIntHashMap<String>();

  String[] sentences = v1.toString().split("\n");
  for (String sentence : sentences) {
    for (String token : sentence.split("\\s+")) {
      termFrequencies.adjustOrPutValue(token, +1, +1);
    }
  }

  for (String token : termFrequencies.keys()) {
    int frequency = termFrequencies.get(token);
    outKey.set(token);
    outValue.set(frequency);
    oc.collect(outKey, outValue);
  }
}
@Override protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { int sid = 0; for (String sentence : value.toString().split("\n")) { // do nothing, if maximum number of documents has been seen if (--maxdocs < 0) { return; } long did = key.hashCode() * 10000L + (long) sid++;; OpenObjectIntHashMap<String> wordCounts = new OpenObjectIntHashMap<String>(); for (String term : sentence.split("\\s+")) { wordCounts.adjustOrPutValue(term, +1, +1); } for (String term : wordCounts.keys()) { outKey.set(term); outValue.setDId(did); outValue.setOffsets(new int[]{wordCounts.get(term)}); context.write(outKey, outValue); } } } }
@Override protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { String[] items = value.toString().split(itemSeparator); IntArrayList itemIds = new IntArrayList(); String sequenceId = items[0]; long id = 0; try { id = Long.parseLong(sequenceId); } catch (NumberFormatException e) { // if this is not a number, hash it id = sequenceId.hashCode(); } // ignore pos 0 which contains a sequence identifier for (int i = 1; i < items.length; i++) { // System.out.println(itemTIdMap.get(items[i])); itemIds.add(itemTIdMap.get(items[i])); } if (itemIds.size() > 0) { outKey.set(id); outValue.setContents(itemIds.toArray(new int[0])); // System.out.println(itemIds.toString()); context.write(outKey, outValue); } } }
@SuppressWarnings({ "unchecked", "rawtypes" }) @Override protected void map(LongWritable key, Text value, Mapper.Context context) throws IOException, InterruptedException { int sid = 0; for (String sentence : value.toString().split("\n")) { // do nothing, if maximum number of documents has been seen if (--maxdocs < 0) { return; } IntArrayList tids = new IntArrayList(); for (String term : sentence.split("\\s+")) { tids.add(termTIdMap.get(term)); } long did = key.hashCode() * 10000L + (long) sid++; outKey.set(did); outValue.setContents(tids.toArray(new int[0])); context.write(outKey, outValue); } } }
/**
 * Read a dictionary in {@link SequenceFile} generated by
 * {@link org.apache.mahout.vectorizer.DictionaryVectorizer}
 * <p>
 * The returned array is indexed by term index. Its length is the larger of
 * (highest index read + 1) and the number of distinct terms, so a dictionary whose indices are
 * not the dense range {@code 0..n-1} no longer overflows the array; slots that no term maps to
 * are left {@code null}.
 *
 * @param conf
 *          configuration used to read the sequence files
 * @param filePattern
 *          {@code <PATH TO DICTIONARY>/dictionary.file-*}
 * @return array holding each term at the position given by its index
 */
public static String[] loadTermDictionary(Configuration conf, String filePattern) {
  OpenObjectIntHashMap<String> dict = new OpenObjectIntHashMap<String>();
  int maxIndex = -1;
  for (Pair<Text,IntWritable> record
      : new SequenceFileDirIterable<Text,IntWritable>(new Path(filePattern), PathType.GLOB, null, null, true,
                                                      conf)) {
    int index = record.getSecond().get();
    dict.put(record.getFirst().toString(), index);
    if (index > maxIndex) {
      maxIndex = index;
    }
  }

  // Size by the highest index rather than the entry count: the array is addressed by index, and
  // the indices read are not guaranteed to be contiguous from zero.
  String[] dictionary = new String[Math.max(maxIndex + 1, dict.size())];
  for (String feature : dict.keys()) {
    dictionary[dict.get(feature)] = feature;
  }
  return dictionary;
}