/**
 * Returns the total number of n-grams summed across all orders.
 *
 * Fix: the method is declared to return {@code long}, but the original body
 * returned {@code (int) size}, silently truncating totals above
 * Integer.MAX_VALUE. We now return the untruncated value and keep the
 * warning for callers that may still downcast the result.
 *
 * @return total n-gram count across every order
 */
public long size() {
    long size = 0;
    for (NgramsForOrderIterableWrapper<W, V> map : ngramsForOrder) {
        size += map.size();
    }
    if (size > Integer.MAX_VALUE)
        Logger.warn(NgramMapWrapper.class.getSimpleName()
            + " doesn't like maps with size greater than Integer.MAX_VALUE");
    return size;
}
}
/**
 * Callback invoked for each parsed n-gram line; inserts the n-gram into the
 * backing map and warns (at most 10 times) when insertion fails.
 *
 * Fixes: the sentinel assignment {@code warnCount = -1} was dead code in the
 * original — the guard was {@code warnCount > 10}, but warnCount is only
 * incremented while it is below 10, so it could never exceed 10. The guard is
 * now {@code >= 10} so the sentinel actually disables further checks. Also
 * corrects typos in the warning message ("probabcly" -> "probably",
 * "suff" -> "suffix").
 *
 * @param ngram    word ids of the n-gram
 * @param startPos start index (inclusive) into {@code ngram}
 * @param endPos   end index (exclusive) into {@code ngram}
 * @param v        value to associate with the n-gram
 * @param words    original text of the line, used in the warning message
 */
@Override
public void call(final int[] ngram, int startPos, int endPos, final V v, final String words) {
    final long add = map.put(ngram, startPos, endPos, v);
    if (add < 0) {
        // Warn for the first 10 failures only; afterwards warnCount is set
        // to the sentinel -1 so the (warnCount >= 0) check short-circuits.
        if (warnCount >= 0 && warnCount < 10) {
            Logger.warn("Could not add line " + words
                + "\nThis is probably because the prefix or suffix of the n-grams was not already in the map. This will be fixed in an upcoming release.");
            warnCount++;
        }
        if (warnCount >= 10) warnCount = -1;
    }
}
/**
 * Returns the total number of n-grams summed across all orders.
 *
 * Fix: the method is declared to return {@code long}, but the original body
 * returned {@code (int) size}, silently truncating totals above
 * Integer.MAX_VALUE. We now return the untruncated value and keep the
 * warning for callers that may still downcast the result.
 *
 * @return total n-gram count across every order
 */
public long size() {
    long size = 0;
    for (final NgramsForOrderIterableWrapper<W, V> map : ngramsForOrder) {
        size += map.size();
    }
    if (size > Integer.MAX_VALUE)
        Logger.warn(NgramMapWrapper.class.getSimpleName()
            + " doesn't like maps with size greater than Integer.MAX_VALUE");
    return size;
}
}
/**
 * Reads a count-sorted vocabulary file (one tab-separated record per line,
 * word in the first column) and adds each word to the indexer, then appends
 * the special symbols via {@code addSpecialSymbols}.
 *
 * Fixes: removed the dead {@code catch (NumberFormatException)} — nothing in
 * the body parses a number, so that unchecked exception could never arise
 * from parsing here. Marked {@code wordIndexer} final for consistency with
 * the sibling copy of this method.
 *
 * @param wordIndexer     indexer to populate with vocabulary words
 * @param sortedVocabPath path to the count-sorted vocab file; a warning is
 *                        logged when its basename differs from the expected
 *                        {@code sortedVocabFile} name
 */
public static <W> void addToIndexer(final WordIndexer<W> wordIndexer, final String sortedVocabPath) {
    if (!(new File(sortedVocabPath).getName().equals(sortedVocabFile))) {
        Logger.warn("You have specified that " + sortedVocabPath
            + " is the count-sorted vocab file for Google n-grams, but it is usually named " + sortedVocabFile);
    }
    try {
        for (final String line : Iterators.able(IOUtils.lineIterator(sortedVocabPath))) {
            final String[] parts = line.split("\t");
            final String word = parts[0];
            wordIndexer.getOrAddIndexFromString(word);
        }
    } catch (final IOException e) {
        // Preserve the cause so I/O failures remain diagnosable.
        throw new RuntimeException(e);
    }
    addSpecialSymbols(wordIndexer);
}
/**
 * Reads a count-sorted vocabulary file (one tab-separated record per line,
 * word in the first column) and adds each word to the indexer, then appends
 * the special symbols via {@code addSpecialSymbols}.
 *
 * Fix: removed the dead {@code catch (NumberFormatException)} — nothing in
 * the body parses a number, so that unchecked exception could never arise
 * from parsing here.
 *
 * @param wordIndexer     indexer to populate with vocabulary words
 * @param sortedVocabPath path to the count-sorted vocab file; a warning is
 *                        logged when its basename differs from the expected
 *                        {@code sortedVocabFile} name
 */
public static <W> void addToIndexer(final WordIndexer<W> wordIndexer, final String sortedVocabPath) {
    if (!(new File(sortedVocabPath).getName().equals(sortedVocabFile))) {
        Logger.warn("You have specified that " + sortedVocabPath
            + " is the count-sorted vocab file for Google n-grams, but it is usually named " + sortedVocabFile);
    }
    try {
        for (final String line : Iterators.able(IOUtils.lineIterator(sortedVocabPath))) {
            final String[] parts = line.split("\t");
            final String word = parts[0];
            wordIndexer.getOrAddIndexFromString(word);
        }
    } catch (final IOException e) {
        // Preserve the cause so I/O failures remain diagnosable.
        throw new RuntimeException(e);
    }
    addSpecialSymbols(wordIndexer);
}
// Fragment (mid-loop): parse one feature value and skip it when non-finite.
// Fix: use a primitive float instead of a boxed Float — the box existed only
// to call the instance isInfinite()/isNaN(); the static Float checks avoid
// the needless autoboxing.
final float val = Float.parseFloat(featStrings[i]);
if (Float.isInfinite(val) || Float.isNaN(val)) {
    Logger.warn("Non-finite feature: " + featStrings[i]);
    continue;
// Fragment (mid-loop): parse one feature value and skip it when non-finite.
// Fix: use a primitive float instead of a boxed Float — the box existed only
// to call the instance isInfinite()/isNaN(); the static Float checks avoid
// the needless autoboxing. Also marked the local final.
final float val = Float.parseFloat(featStrings[i]);
if (Float.isInfinite(val) || Float.isNaN(val)) {
    Logger.warn("Non-finite feature: " + featStrings[i]);
    continue;
// Fragment clipped mid-method: loads the sorted vocab into the indexer, then
// (in the enclosing if/else, not fully visible here) warns when no n-gram
// files matched the expected filename regex.
addToIndexer(wordIndexer, sortedVocabPath); } else if (ngramFiles.length == 0) { Logger.warn("Did not find any files matching expected regex " + regex);
/**
 * Looks up all target-side translations stored for the source phrase spanned
 * by {@code src[startPos, endPos)}.
 *
 * Fix: if a stored entry contained no separator word, the linear scan left
 * {@code sepIndex == srcAndTrg.length} and
 * {@code Arrays.copyOfRange(srcAndTrg, length + 1, length)} threw an
 * ArrayIndexOutOfBoundsException. Such malformed entries are now warned
 * about and skipped instead of crashing the lookup. The loop-invariant
 * separator id is also hoisted out of the scan.
 *
 * @param src      word ids containing the source phrase
 * @param startPos start index (inclusive) of the phrase in {@code src}
 * @param endPos   end index (exclusive) of the phrase in {@code src}
 * @return translations for the phrase; empty when the phrase is not in the
 *         model or every stored entry was unusable
 */
public List<TargetSideTranslation> getTranslations(final int[] src, final int startPos, final int endPos) {
    final long offsetForNgram = map.getOffsetForNgramInModel(src, startPos, endPos);
    if (offsetForNgram < 0) return Collections.emptyList();
    final TargetTranslationsValues scratch = new PhraseTableValueContainer.TargetTranslationsValues();
    map.getValues().getFromOffset(offsetForNgram, endPos - startPos - 1, scratch);
    final List<TargetSideTranslation> ret = new ArrayList<TargetSideTranslation>();
    // Hoisted: the separator id does not change across iterations.
    final int separatorWord = ((PhraseTableValueContainer) map.getValues()).getSeparatorWord();
    for (int i = 0; i < scratch.targetTranslationOffsets.length; ++i) {
        final FeaturePhraseTableValues features = new PhraseTableValueContainer.FeaturePhraseTableValues(null);
        final long currOffset = scratch.targetTranslationOffsets[i];
        final int currOrder = scratch.targetTranslationOrders[i];
        map.getValues().getFromOffset(currOffset, currOrder, features);
        if (features.features == null) {
            Logger.warn("Should probably fix");
            continue;
        }
        final TargetSideTranslation tr = new TargetSideTranslation();
        tr.features = Arrays.copyOf(features.features, features.features.length);
        int sepIndex = 0;
        final int[] srcAndTrg = map.getNgramForOffset(currOffset, currOrder);
        for (; sepIndex < srcAndTrg.length; ++sepIndex) {
            if (srcAndTrg[sepIndex] == separatorWord) {
                break;
            }
        }
        // Guard: a missing separator previously made copyOfRange throw; skip
        // the malformed entry instead.
        if (sepIndex >= srcAndTrg.length) {
            Logger.warn("No separator word found in phrase-table entry; skipping");
            continue;
        }
        tr.trgWords = Arrays.copyOfRange(srcAndTrg, sepIndex + 1, srcAndTrg.length);
        assert tr.trgWords.length > 0;
        ret.add(tr);
    }
    return ret;
}
/**
 * Retrieves every target-side translation stored for the source phrase
 * spanned by {@code src[startPos, endPos)}. Entries whose feature vector is
 * missing are warned about and skipped.
 *
 * @param src      word ids containing the source phrase
 * @param startPos start index (inclusive) of the phrase in {@code src}
 * @param endPos   end index (exclusive) of the phrase in {@code src}
 * @return translations for the phrase, or an empty list when the phrase is
 *         not present in the model
 */
public List<TargetSideTranslation> getTranslations(int[] src, int startPos, int endPos) {
    final long phraseOffset = map.getOffsetForNgramInModel(src, startPos, endPos);
    if (phraseOffset < 0) return Collections.emptyList();
    final TargetTranslationsValues translations = new PhraseTableValueContainer.TargetTranslationsValues();
    map.getValues().getFromOffset(phraseOffset, endPos - startPos - 1, translations);
    final List<TargetSideTranslation> results = new ArrayList<TargetSideTranslation>();
    final int numTranslations = translations.targetTranslationOffsets.length;
    for (int t = 0; t < numTranslations; ++t) {
        final long offset = translations.targetTranslationOffsets[t];
        final int order = translations.targetTranslationOrders[t];
        final FeaturePhraseTableValues featureScratch = new PhraseTableValueContainer.FeaturePhraseTableValues(null);
        map.getValues().getFromOffset(offset, order, featureScratch);
        if (featureScratch.features == null) {
            Logger.warn("Should probably fix");
            continue;
        }
        final TargetSideTranslation translation = new TargetSideTranslation();
        translation.features = Arrays.copyOf(featureScratch.features, featureScratch.features.length);
        // Scan for the separator that splits source words from target words.
        final int[] srcAndTrg = map.getNgramForOffset(offset, order);
        int sep = 0;
        while (sep < srcAndTrg.length
            && srcAndTrg[sep] != ((PhraseTableValueContainer) map.getValues()).getSeparatorWord()) {
            ++sep;
        }
        translation.trgWords = Arrays.copyOfRange(srcAndTrg, sep + 1, srcAndTrg.length);
        assert translation.trgWords.length > 0;
        results.add(translation);
    }
    return results;
}