// Sanity check: the in-memory vocabulary size must match the number of lines in the
// augmented CSV file, otherwise the model and the CSV are out of sync.
// NOTE(review): deliberately throws a raw Exception — per the trailing comment, the
// enclosing catch clause is expected to take over; confirm that catch handles this path.
if (wordVectors.vocab().words().size() != FileUtils.readLines(augmentedCSVFile).size()) { throw new Exception("Something went wrong"); // Continue in catch clause
/**
 * Writes the model to DATEXIS binary format.
 *
 * <p>For every non-null word in the vocabulary, writes the word as UTF followed by its
 * vector via {@code Nd4j.write}. The streams are closed by try-with-resources.
 *
 * @param vec word vectors to serialize
 * @param outputStream destination stream (wrapped in a buffer; closed on return)
 * @throws IOException if writing to the stream fails
 */
private static void writeBinaryModel(WordVectors vec, OutputStream outputStream) throws IOException {
  int words = 0;
  try (BufferedOutputStream buf = new BufferedOutputStream(outputStream);
       DataOutputStream writer = new DataOutputStream(buf)) {
    for (Object word : vec.vocab().words()) {
      if (word == null) continue;
      INDArray wordVector = vec.getWordVectorMatrix((String) word);
      // Parameterized logging: the message is only assembled when TRACE is enabled,
      // instead of concatenating strings for every word unconditionally.
      log.trace("Write: {} (size {})", word, wordVector.length());
      writer.writeUTF((String) word);
      Nd4j.write(wordVector, writer);
      words++;
    }
    writer.flush();
  }
  log.info("Wrote {} words with size {}", words, vec.lookupTable().layerSize());
}
/**
 * Verifies that a converted binary model round-trips: the unknown-word vector must be
 * stable and identical to the stored one, and every vocabulary word must vectorize to
 * the same floats as the original model.
 *
 * @param wv original word vectors
 * @param binaryTarget path of the converted binary model
 * @throws IOException if the binary model cannot be loaded
 */
@SuppressWarnings("unchecked")
public static void verify(WordVectors wv, Path binaryTarget) throws IOException {
  BinaryVectorizer binary = BinaryVectorizer.load(binaryTarget);
  if (binary.contains(UNK)) {
    System.out.printf("Unknown word is contained in vocabulary!%n");
  }

  int layerSize = wv.lookupTable().layerSize();
  float[] unkFirst = makeUnk(layerSize).data().asFloat();
  float[] unkSecond = makeUnk(layerSize).data().asFloat();
  float[] unkPersisted = binary.vectorize(UNK);

  // Generating UNK twice must yield the same vector, and it must match what was stored.
  if (!Arrays.equals(unkFirst, unkSecond)) {
    System.out.printf("Unstable generated unknown word%n");
  }
  if (!Arrays.equals(unkFirst, unkPersisted)) {
    System.out.printf("Vectors differ for unknown word%n");
  }

  // Compare every vocabulary word between the original and the converted model.
  for (String word : (Collection<String>) wv.vocab().words()) {
    float[] expected = ArrayUtil.toFloats(wv.getWordVector(word));
    float[] actual = binary.vectorize(word);
    if (!Arrays.equals(expected, actual)) {
      System.out.printf("Vectors differ for word [%s]%n", word);
    }
  }
}
/**
 * Test different word vector formats crafted by hand.
 *
 * <p>Every embedding file in the test resource directory must produce exactly the
 * vocabulary {snowball, christmas, tree} after initialization.
 */
@Test
public void testDifferentEmbeddings() {
  File embDir = new File("src/test/resources/embeddings/small");
  final File[] embeddings = embDir.listFiles();
  // listFiles() returns null when the directory does not exist or cannot be read;
  // fail clearly instead of throwing an NPE in the loop below.
  assertTrue(embeddings != null);

  Set<String> words = new HashSet<>();
  words.add("snowball");
  words.add("christmas");
  words.add("tree");

  for (File f : embeddings) {
    log.info("Testing embedding {}", f.getAbsolutePath());
    RnnTextEmbeddingInstanceIterator teii = new RnnTextEmbeddingInstanceIterator();
    teii.setWordVectorLocation(f);
    teii.initialize();
    // Vocabulary must equal the expected word set (mutual containment == set equality).
    final Collection ws = teii.getWordVectors().vocab().words();
    assertTrue(ws.containsAll(words) && words.containsAll(ws));
  }
} }
/**
 * Builds a test dataset whose string attribute holds sentences of ten random
 * vocabulary words drawn from the Google News vectors (seeded for reproducibility).
 *
 * @return dataset with randomly generated sentences in attribute 0
 * @throws Exception if the base dataset or the vectors cannot be created
 */
public Instances makeData() throws Exception {
  final Instances data =
      TestUtil.makeTestDataset(42, 100, 0, 0, 1, 0, 0, 1, Attribute.NUMERIC, 1, false);

  WordVectors wordVectors =
      WordVectorSerializer.loadStaticModel(DatasetLoader.loadGoogleNewsVectors());
  String[] vocabulary = (String[]) wordVectors.vocab().words().toArray(new String[0]);

  Random rng = new Random(42);
  for (Instance instance : data) {
    StringBuilder text = new StringBuilder();
    for (int wordCount = 0; wordCount < 10; wordCount++) {
      text.append(" ").append(vocabulary[rng.nextInt(vocabulary.length)]);
    }
    instance.setValue(0, text.toString());
  }
  return data;
}
/**
 * Creates a seeded test dataset where each instance's string attribute is filled with
 * a sentence of ten words sampled uniformly from the Google News vocabulary.
 *
 * @return the populated dataset
 * @throws Exception if dataset construction or vector loading fails
 */
public Instances makeData() throws Exception {
  final Instances data =
      TestUtil.makeTestDataset(42, 100, 0, 0, 1, 0, 0, 1, Attribute.NUMERIC, 1, false);

  WordVectors wordVectors =
      WordVectorSerializer.loadStaticModel(DatasetLoader.loadGoogleNewsVectors());
  String[] vocab = (String[]) wordVectors.vocab().words().toArray(new String[0]);

  Random random = new Random(42);
  for (Instance inst : data) {
    StringBuilder sb = new StringBuilder();
    int remaining = 10;
    while (remaining-- > 0) {
      final int pick = random.nextInt(vocab.length);
      sb.append(" ").append(vocab[pick]);
    }
    inst.setValue(0, sb.toString());
  }
  return data;
}
/**
 * Returns the {@code k} vocabulary words whose encoded vectors have the highest
 * cosine similarity to {@code v}.
 *
 * @param v query vector
 * @param k number of neighbours to keep
 * @return the top-k words, sorted by the counter's ordering
 */
public Collection<String> getNearestNeighbours(INDArray v, int k) {
  Counter<String> similarities = new Counter<>();
  // Score every vocabulary word by cosine similarity to the query vector.
  for (Object entry : vec.vocab().words()) {
    String token = (String) entry;
    INDArray encoded = encode(token);
    similarities.incrementCount(token, Transforms.cosineSim(v, encoded));
  }
  similarities.keepTopNElements(k);
  return similarities.keySetSorted();
}
/**
 * Loads the full dataset in batch mode: one instance per vocabulary word, with the
 * word's embedding dimensions as numeric attributes and the word itself stored in the
 * trailing "word_id" string attribute.
 *
 * @return the dataset built from the word vectors
 * @throws IOException if no source is set or incremental retrieval is requested
 */
@Override
public Instances getDataSet() throws IOException {
  if (m_sourceFile == null) {
    throw new IOException("No source has been specified");
  }
  if (getRetrieval() == INCREMENTAL) {
    throw new IOException("This loader cannot load instances incrementally.");
  }
  setRetrieval(BATCH);
  if (m_structure == null) {
    getStructure();
  }

  Instances result = new Instances(m_structure);
  for (String word : vec.getVocab().words()) {
    // Fetch the embedding once per word; the original looked it up again for every
    // dimension inside the copy loop.
    double[] wordVector = this.vec.getWordVector(word);
    double[] values = new double[result.numAttributes()];
    System.arraycopy(wordVector, 0, values, 0, wordVector.length);
    // Last attribute holds the word itself as a string value.
    values[result.numAttributes() - 1] = result.attribute("word_id").addStringValue(word);
    Instance inst = new DenseInstance(1, values);
    inst.setDataset(result);
    result.add(inst);
  }
  return result;
}
// Snapshot the vocabulary into an array and sort it so subsequent processing sees a
// deterministic alphabetical order.
// NOTE(review): getVocab().words() presumably yields Collection<String>; confirm the
// element type, since toArray(new String[0]) would otherwise fail at runtime.
String[] words = this.vec.getVocab().words().toArray(new String[0]); Arrays.sort(words);