/**
 * Static factory for a {@link HashNgramMap} backed by implicit word hash maps,
 * sized from the per-word n-gram counts.
 *
 * @param values container storing the value associated with each n-gram
 * @param opts configuration options passed through to the map
 * @param numNgramsForEachWord per-order n-gram counts for each word, used to size the implicit maps
 * @param reversed whether n-grams are stored in reversed order
 * @return the newly constructed map
 */
public static <T> HashNgramMap<T> createImplicitWordHashNgramMap(final ValueContainer<T> values, final ConfigOptions opts,
    final LongArray[] numNgramsForEachWord, final boolean reversed) {
    return new HashNgramMap<T>(values, opts, numNgramsForEachWord, reversed);
}
/**
 * Computes the hash key for the n-gram in {@code ngram[startPos..endPos)} by walking its
 * context encoding one word at a time (in reversed order when {@code reversed} is set).
 *
 * @return the combined (head word, context offset) key, or -1 if some context is absent
 */
private long getKey(final int[] ngram, final int startPos, final int endPos) {
    final int contextLength = endPos - startPos - 1;
    long contextOffset = 0;
    for (int order = 0; order < contextLength; ++order) {
        final int pos = reversed ? (endPos - order - 1) : (startPos + order);
        contextOffset = getOffsetForContextEncoding(contextOffset, order - 1, ngram[pos], null);
        if (contextOffset == -1L) {
            // context not stored, so the full n-gram cannot have a key
            return -1;
        }
    }
    return combineToKey(headWord(ngram, startPos, endPos), contextOffset);
}
/**
 * Looks up the stored offset for the n-gram in {@code ngram[startPos..endPos)}.
 *
 * @param ngram word ids
 * @param startPos inclusive start index
 * @param endPos exclusive end index
 * @return the offset of the n-gram, or -1 if it is not stored (out-of-vocabulary word,
 *         order too high, missing context, or no map for that order)
 */
private long getOffsetFromRawNgram(final int[] ngram, final int startPos, final int endPos) {
    if (containsOutOfVocab(ngram, startPos, endPos)) { return -1; }
    final int order = endPos - startPos - 1;
    if (order >= getMaxNgramOrder()) { return -1; }
    final long hashKey = getKey(ngram, startPos, endPos);
    if (hashKey < 0) { return -1; }
    final HashMap mapForOrder = getMap(order);
    return mapForOrder == null ? -1 : mapForOrder.getOffset(hashKey);
}
/**
 * Gets the offset of the context (prefix) for an n-gram represented by {@code offset}.
 *
 * @param offset offset of the n-gram
 * @param ngramOrder order of the n-gram (0 = unigram)
 * @return the context offset, or -1 for unigrams (which have no prefix)
 */
public long getPrefixOffset(final long offset, final int ngramOrder) {
    return ngramOrder == 0 ? -1 : contextOffsetOf(getKey(offset, ngramOrder));
}
/**
 * Re-packs {@code key} into a narrower key: the word id is shifted above the low
 * {@code numSuffixBits} bits, which hold the suffix (context) offset.
 * (The previous javadoc listed the wrong parameters; the only input is {@code key}.)
 *
 * @param key the full key from the backing {@code ngramMap}
 * @return the shrunk (word, suffix-offset) key
 */
private final long shrinkKey(final long key) {
    final long word = ngramMap.wordOf(key);
    final long suffix = ngramMap.contextOffsetOf(key);
    return (word << numSuffixBits) | suffix;
}
/**
 * Inserts the n-gram {@code ngram[startPos..endPos)} with value {@code val}.
 * Lazily initializes the hash map for this order, and rehashes first when an
 * explicit map has reached the maximum load factor, so the returned offset
 * stays valid afterwards.
 *
 * @return the offset at which the n-gram is stored, or -1 if its context
 *         could not be found (key computation failed)
 */
@Override
public long put(final int[] ngram, int startPos, int endPos, final T val) {
    final int ngramOrder = endPos - startPos - 1;
    HashMap map = maps[ngramOrder];
    if (map == null) {
        // first n-gram of this order: create the map with its configured initial capacity
        initMap(initCapacities[ngramOrder], ngramOrder);
        map = maps[ngramOrder];
    }
    if (map instanceof ExplicitWordHashMap && map.getLoadFactor() >= maxLoadFactor) {
        // grow by 1.5x before inserting; re-read the map since rehash replaces it
        rehash(ngramOrder, map.getCapacity() * 3 / 2);
        map = maps[ngramOrder];
    }
    final long key = getKey(ngram, startPos, endPos);
    if (key < 0) return -1L;
    long oldSize = map.size();
    final long index = map.put(key);
    final long suffixIndex = getSuffixOffset(ngram, startPos, endPos);
    // size() > oldSize tells the value container whether the key was newly inserted
    values.add(ngram, startPos, endPos, ngramOrder, index, contextOffsetOf(key), wordOf(key), val, suffixIndex, map.size() > oldSize);
    return index;
}
// NOTE(review): fragment of a rehash routine — the enclosing method header is not visible
// in this chunk, so the code is left unchanged and only annotated.
// Build the replacement map, carrying over the explicit maps below the changed order.
final HashNgramMap<T> newMap = new HashNgramMap<T>(newValues, opts, newCapacities, reversed, Arrays.copyOf(explicitMaps, changedNgramOrder));
final ExplicitWordHashMap newHashMap = (ExplicitWordHashMap) newMap.getHashMapForOrder(ngramOrder);
final T val = values.getScratchValue();
// Decode the old key back into its word sequence so it can be re-keyed under the new map.
final int[] scratchArray = new int[ngramOrder + 1];
getNgramFromContextEncodingHelp(contextOffsetOf(key), ngramOrder - 1, wordOf(key), scratchArray);
final long newKey = newMap.getKey(scratchArray, 0, scratchArray.length);
assert newKey >= 0 : "Failure for old n-gram " + Arrays.toString(scratchArray);
// Re-insert into the new hash map and copy the value across.
final long index = newHashMap.put(newKey);
assert index >= 0;
final long suffixIndex = storeSuffixOffsets ? newMap.getSuffixOffset(scratchArray, 0, scratchArray.length) : -1L;
assert !storeSuffixOffsets || suffixIndex >= 0 : "Could not find suffix offset for " + Arrays.toString(scratchArray);
final boolean addWorked = newMap.values.add(scratchArray, 0, scratchArray.length, ngramOrder, index, contextOffsetOf(newKey), wordOf(newKey), val, suffixIndex, true);
assert addWorked;
/**
 * Returns the target-side translations stored for the source phrase
 * {@code src[startPos..endPos)}, or an empty list if the phrase is not in the model.
 *
 * @param src source-side word ids
 * @param startPos inclusive start index
 * @param endPos exclusive end index
 * @return list of translations with their feature vectors and target words
 */
public List<TargetSideTranslation> getTranslations(final int[] src, final int startPos, final int endPos) {
    final long offsetForNgram = map.getOffsetForNgramInModel(src, startPos, endPos);
    if (offsetForNgram < 0) return Collections.emptyList();
    // scratch receives the offsets/orders of all (src ||| trg) entries for this source phrase
    final TargetTranslationsValues scratch = new PhraseTableValueContainer.TargetTranslationsValues();
    map.getValues().getFromOffset(offsetForNgram, endPos - startPos - 1, scratch);
    final List<TargetSideTranslation> ret = new ArrayList<TargetSideTranslation>();
    for (int i = 0; i < scratch.targetTranslationOffsets.length; ++i) {
        final FeaturePhraseTableValues features = new PhraseTableValueContainer.FeaturePhraseTableValues(null);
        final long currOffset = scratch.targetTranslationOffsets[i];
        final int currOrder = scratch.targetTranslationOrders[i];
        map.getValues().getFromOffset(currOffset, currOrder, features);
        if (features.features == null) {
            // entry has no feature vector; skipped (pre-existing TODO in original code)
            Logger.warn("Should probably fix");
            continue;
        }
        final TargetSideTranslation tr = new TargetSideTranslation();
        tr.features = Arrays.copyOf(features.features, features.features.length);
        // locate the separator word that splits source words from target words
        int sepIndex = 0;
        final int[] srcAndTrg = map.getNgramForOffset(currOffset, currOrder);
        for (; sepIndex < srcAndTrg.length; ++sepIndex) {
            if (srcAndTrg[sepIndex] == ((PhraseTableValueContainer) map.getValues()).getSeparatorWord()) {
                break;
            }
        }
        // everything after the separator is the target phrase
        tr.trgWords = Arrays.copyOfRange(srcAndTrg, sepIndex + 1, srcAndTrg.length);
        assert tr.trgWords.length > 0;
        ret.add(tr);
    }
    return ret;
}
/**
 * Returns the word stored at order 0 on the context chain of the n-gram at
 * {@code offset} — i.e. the word reached by repeatedly following context offsets
 * down to the unigram map. (Iterative form of the original recursion.)
 *
 * @param offset offset of the n-gram
 * @param ngramOrder order of the n-gram (0 = unigram)
 * @return the word id found at order 0
 */
public int getFirstWordForOffset(final long offset, final int ngramOrder) {
    long currOffset = offset;
    for (int order = ngramOrder; order > 0; --order) {
        currOffset = contextOffsetOf(getMap(order).getKey(currOffset));
    }
    return wordOf(getMap(0).getKey(currOffset));
}
/**
 * Decodes the words of an n-gram from its context encoding into {@code scratch}.
 * The head {@code word} is written first (position depends on {@code reversed}), then
 * the chain of context keys is followed to recover the remaining words.
 * (The previous javadoc had a spurious {@code @return}; this method is void.)
 *
 * @param contextOffset offset of the n-gram's context
 * @param contextOrder order of the context; -1 indicates a unigram
 * @param word the head word of the n-gram
 * @param scratch output array (length contextOrder + 2) that receives the word ids
 */
private void getNgramFromContextEncodingHelp(final long contextOffset, final int contextOrder, final int word, final int[] scratch) {
    if (contextOrder < 0) {
        // unigram: only the head word to write
        scratch[0] = word;
    } else {
        long contextOffset_ = contextOffset;
        int word_ = word;
        // head word goes at the front when reversed, at the back otherwise
        scratch[reversed ? 0 : (scratch.length - 1)] = word_;
        for (int i = 0; i <= contextOrder; ++i) {
            final int ngramOrder = contextOrder - i;
            // each key yields one word plus the offset of the next (shorter) context
            final long key = getKey(contextOffset_, ngramOrder);
            contextOffset_ = contextOffsetOf(key);
            word_ = wordOf(key);
            scratch[reversed ? (i + 1) : (scratch.length - i - 2)] = word_;
        }
    }
}
/**
 * Shared insertion path: fetches (lazily creating) the map for this order, rehashes
 * first if an explicit map is over the load-factor limit, then delegates to the
 * key-taking overload of {@code putHelp}.
 *
 * @param ngram word ids
 * @param startPos inclusive start index
 * @param endPos exclusive end index
 * @param val value to associate with the n-gram
 * @param forcedNew when true, skips the rehash check and forces a fresh insert
 * @return the offset of the stored n-gram, or -1 if its key could not be computed
 */
private long putHelp(final int[] ngram, final int startPos, final int endPos, final T val, final boolean forcedNew) {
    final int order = endPos - startPos - 1;
    HashMap mapForOrder = getHashMapForOrder(order);
    final boolean overloaded = mapForOrder instanceof ExplicitWordHashMap && mapForOrder.getLoadFactor() >= maxLoadFactor;
    if (!forcedNew && overloaded) {
        // grow by 1.5x, then re-fetch since rehash replaces the map
        rehash(order, mapForOrder.getCapacity() * 3 / 2, 1);
        mapForOrder = getHashMapForOrder(order);
    }
    final long key = getKey(ngram, startPos, endPos);
    return key < 0 ? -1L : putHelp(mapForOrder, ngram, startPos, endPos, key, val, forcedNew);
}
/**
 * Reconstructs the words of the n-gram stored at {@code offset} into {@code ret} by
 * following the chain of context offsets from the highest order down to the unigram.
 *
 * @param offset offset of the n-gram
 * @param ngramOrder order of the n-gram (0 = unigram)
 * @param ret output array of length ngramOrder + 1, filled in place
 * @return {@code ret}, for convenience
 */
public int[] getNgramForOffset(final long offset, final int ngramOrder, final int[] ret) {
    long currOffset = offset;
    for (int step = 0; step <= ngramOrder; ++step) {
        final long key = getMap(ngramOrder - step).getKey(currOffset);
        currOffset = contextOffsetOf(key);
        // words come out head-first; placement depends on storage direction
        ret[reversed ? step : (ngramOrder - step)] = wordOf(key);
    }
    return ret;
}
/**
 * Rebuilds all hash maps, giving the map at {@code changedNgramOrder} the new capacity
 * and re-inserting every stored n-gram into a fresh {@link HashNgramMap}, then swaps
 * the rebuilt maps and values back into this instance.
 *
 * @param changedNgramOrder the order whose map needs a different capacity
 * @param newCapacity the capacity for that order's new map
 */
private void rehash(final int changedNgramOrder, final long newCapacity) {
    final ValueContainer<T> newValues = values.createFreshValues();
    final long[] newCapacities = new long[maps.length];
    Arrays.fill(newCapacities, -1L);
    // carry existing capacities over, substituting the new one for the changed order;
    // stops at the first uninitialized order (maps are filled contiguously from 0)
    for (int ngramOrder = 0; ngramOrder < maps.length; ++ngramOrder) {
        if (maps[ngramOrder] == null) break;
        newCapacities[ngramOrder] = ngramOrder == changedNgramOrder ? newCapacity : maps[ngramOrder].getCapacity();
    }
    final HashNgramMap<T> newMap = new HashNgramMap<T>(newValues, opts, newCapacities, reversed);
    // walk every slot of every existing map and re-insert occupied keys into the new map
    for (int ngramOrder = 0; ngramOrder < maps.length; ++ngramOrder) {
        final HashMap currMap = maps[ngramOrder];
        if (currMap == null) continue;
        for (long actualIndex = 0; actualIndex < currMap.getCapacity(); ++actualIndex) {
            final long key = currMap.getKey(actualIndex);
            if (currMap.isEmptyKey(key)) continue;
            // decode the stored key back into its raw word sequence, then re-add with its value
            final int[] ngram = getNgramFromContextEncoding(AbstractNgramMap.contextOffsetOf(key), ngramOrder - 1, AbstractNgramMap.wordOf(key));
            final T val = values.getScratchValue();
            values.getFromOffset(actualIndex, ngramOrder, val);
            newMap.put(ngram, 0, ngram.length, val);
        }
    }
    // adopt the rebuilt maps/values; setMap re-points the value container at this map
    System.arraycopy(newMap.maps, 0, maps, 0, newMap.maps.length);
    values.setFromOtherValues(newValues);
    values.setMap(this);
}
/**
 * Follows the word chain starting at ({@code contextOffset}, {@code word}) until the
 * separator word is reached, then records a pointer from the source-side phrase found
 * there to the target-side phrase stored at {@code offset}.
 * (Parameter names in the previous javadoc were corrected to match the signature.)
 *
 * @param ngramOrder order of the full (source + separator + target) phrase
 * @param offset offset of the full phrase entry
 * @param contextOffset offset of the phrase's context
 * @param word the head word of the phrase
 */
private void addPointerToTargetSidePhrase(final int ngramOrder, final long offset, final long contextOffset, final int word) {
    int currWord = word;
    long srcPhraseOffset = contextOffset;
    int srcPhraseOrder = ngramOrder - 1;
    // advance through words until we cross the separator between source and target sides
    while (currWord != separatorWord) {
        currWord = map.getNextWord(srcPhraseOffset, srcPhraseOrder);
        srcPhraseOffset = map.getNextContextOffset(srcPhraseOffset, srcPhraseOrder);
        srcPhraseOrder--;
    }
    // valueIndexes stores -(index + 1) for source phrases; undo that encoding here
    final long valueIndex = -valueIndexes[srcPhraseOrder].get(srcPhraseOffset) - 1;
    final ArrayList<CustomWidthArray> targetTranslationPointersHere = targetTranslations[srcPhraseOrder];
    targetTranslationPointersHere.get((int) valueIndex).add(combineOrderAndOffset(ngramOrder, offset));
}
/**
 * Warning: does not rehash if the load factor is exceeded; callers must invoke
 * rehashIfNecessary explicitly so that previously returned offsets remain valid.
 * Do not use this method unless you really know what you're doing.
 *
 * @param ngram word ids
 * @param startPos inclusive start index
 * @param endPos exclusive end index
 * @param contextOffset precomputed offset of the n-gram's context
 * @param suffixOffset precomputed offset of the n-gram's suffix
 * @param val value to associate with the n-gram
 * @return the offset at which the n-gram was stored
 */
public long putWithOffsetAndSuffix(final int[] ngram, final int startPos, final int endPos, final long contextOffset, final long suffixOffset, final T val) {
    final int order = endPos - startPos - 1;
    final HashMap mapForOrder = getHashMapForOrder(order);
    final long key = combineToKey(ngram[endPos - 1], contextOffset);
    return putHelpWithSuffixIndex(mapForOrder, ngram, startPos, endPos, key, val, false, suffixOffset);
}
/**
 * Warning: does not rehash if the load factor is exceeded; callers must invoke
 * rehashIfNecessary explicitly so that previously returned offsets remain valid.
 * Do not use this method unless you really know what you're doing.
 *
 * @param ngram word ids
 * @param startPos inclusive start index
 * @param endPos exclusive end index
 * @param contextOffset precomputed offset of the n-gram's context
 * @param val value to associate with the n-gram
 * @return the offset at which the n-gram was stored
 */
public long putWithOffset(final int[] ngram, final int startPos, final int endPos, final long contextOffset, final T val) {
    final int order = endPos - startPos - 1;
    final HashMap mapForOrder = getHashMapForOrder(order);
    final long key = combineToKey(ngram[endPos - 1], contextOffset);
    return putHelp(mapForOrder, ngram, startPos, endPos, key, val, false);
}
/**
 * Looks up the stored offset for the n-gram in {@code ngram[startPos..endPos)}.
 *
 * @param ngram word ids
 * @param startPos inclusive start index
 * @param endPos exclusive end index
 * @return the offset of the n-gram, or -1 if it is not stored (out-of-vocabulary word,
 *         order too high, no map for that order, or missing context)
 */
private long getOffsetFromRawNgram(final int[] ngram, final int startPos, final int endPos) {
    if (containsOutOfVocab(ngram, startPos, endPos)) return -1;
    final int ngramOrder = endPos - startPos - 1;
    if (ngramOrder >= maps.length) return -1;
    final HashMap currMap = maps[ngramOrder];
    // BUG FIX: the map for this order may not be initialized yet; the sibling
    // implementation guards this case, but here it previously dereferenced null.
    if (currMap == null) return -1;
    final long key = getKey(ngram, startPos, endPos);
    if (key < 0) return -1;
    return currMap.getOffset(key);
}
/**
 * Creates a callback that accumulates Kneser-Ney counts for n-grams up to order
 * {@code maxOrder}.
 *
 * @param outputFile writer that receives the resulting LM
 * @param wordIndexer indexer mapping words to int ids
 * @param maxOrder highest n-gram order to store; must be strictly less than MAX_ORDER
 * @param opts configuration; {@code kneserNeyMinCounts} must be monotonically non-decreasing
 * @throws IllegalArgumentException if {@code maxOrder} is too large or the min counts
 *         are not monotonic
 */
public KneserNeyLmReaderCallback(PrintWriter outputFile, WordIndexer<W> wordIndexer, int maxOrder, ConfigOptions opts) {
    this.outputFile = outputFile;
    this.lmOrder = maxOrder;
    if (maxOrder >= MAX_ORDER)
        // Use the actual limit instead of a hard-coded 10 (also fixes the "Reguested" typo).
        throw new IllegalArgumentException("Requested n-grams of order " + maxOrder + " but only orders below " + MAX_ORDER + " are supported");
    this.opts = opts;
    double last = Double.NEGATIVE_INFINITY;
    for (double c : opts.kneserNeyMinCounts) {
        if (c < last)
            throw new IllegalArgumentException("Please ensure that ConfigOptions.kneserNeyMinCounts is monotonic (value was " + Arrays.toString(opts.kneserNeyMinCounts) + ")");
        last = c; // BUG FIX: 'last' was never updated before, so the monotonicity check could never fire
    }
    this.wordIndexer = wordIndexer;
    final KneseryNeyCountValueContainer values = new KneseryNeyCountValueContainer(lmOrder);
    ngrams = HashNgramMap.createExplicitWordHashNgramMap(values, new ConfigOptions(), lmOrder, false);
}
/**
 * Gets the offset of the context (prefix) for an n-gram represented by {@code offset}.
 *
 * @param offset offset of the n-gram
 * @param ngramOrder order of the n-gram (0 = unigram)
 * @return the context offset, or -1 for unigrams (which have no prefix)
 */
public long getPrefixOffset(long offset, int ngramOrder) {
    if (ngramOrder == 0) {
        return -1;
    }
    final long key = getKey(offset, ngramOrder);
    return AbstractNgramMap.contextOffsetOf(key);
}