// Sanity check: the in-memory vocabulary size must match the number of lines in the
// augmented CSV file, otherwise the model and the CSV are out of sync.
// NOTE(review): deliberately throws a raw Exception — per the trailing comment, the
// enclosing catch clause is expected to take over; confirm that catch handles this path.
if (wordVectors.vocab().words().size() != FileUtils.readLines(augmentedCSVFile).size()) { throw new Exception("Something went wrong"); // Continue in catch clause
/**
 * Writes the model to DATEXIS binary format.
 *
 * <p>For every non-null word in the vocabulary, writes the word as UTF followed by its
 * vector via {@code Nd4j.write}. The streams are closed by try-with-resources.
 *
 * @param vec word vectors to serialize
 * @param outputStream destination stream (wrapped in a buffer; closed on return)
 * @throws IOException if writing to the stream fails
 */
private static void writeBinaryModel(WordVectors vec, OutputStream outputStream) throws IOException {
  int words = 0;
  try (BufferedOutputStream buf = new BufferedOutputStream(outputStream);
       DataOutputStream writer = new DataOutputStream(buf)) {
    for (Object word : vec.vocab().words()) {
      if (word == null) continue;
      INDArray wordVector = vec.getWordVectorMatrix((String) word);
      // Parameterized logging: the message is only assembled when TRACE is enabled,
      // instead of concatenating strings for every word unconditionally.
      log.trace("Write: {} (size {})", word, wordVector.length());
      writer.writeUTF((String) word);
      Nd4j.write(wordVector, writer);
      words++;
    }
    writer.flush();
  }
  log.info("Wrote {} words with size {}", words, vec.lookupTable().layerSize());
}
/**
 * Verifies that a converted binary model round-trips: the unknown-word vector must be
 * stable and identical to the stored one, and every vocabulary word must vectorize to
 * the same floats as the original model.
 *
 * @param wv original word vectors
 * @param binaryTarget path of the converted binary model
 * @throws IOException if the binary model cannot be loaded
 */
@SuppressWarnings("unchecked")
public static void verify(WordVectors wv, Path binaryTarget) throws IOException {
  BinaryVectorizer binary = BinaryVectorizer.load(binaryTarget);
  if (binary.contains(UNK)) {
    System.out.printf("Unknown word is contained in vocabulary!%n");
  }

  int layerSize = wv.lookupTable().layerSize();
  float[] unkFirst = makeUnk(layerSize).data().asFloat();
  float[] unkSecond = makeUnk(layerSize).data().asFloat();
  float[] unkPersisted = binary.vectorize(UNK);

  // Generating UNK twice must yield the same vector, and it must match what was stored.
  if (!Arrays.equals(unkFirst, unkSecond)) {
    System.out.printf("Unstable generated unknown word%n");
  }
  if (!Arrays.equals(unkFirst, unkPersisted)) {
    System.out.printf("Vectors differ for unknown word%n");
  }

  // Compare every vocabulary word between the original and the converted model.
  for (String word : (Collection<String>) wv.vocab().words()) {
    float[] expected = ArrayUtil.toFloats(wv.getWordVector(word));
    float[] actual = binary.vectorize(word);
    if (!Arrays.equals(expected, actual)) {
      System.out.printf("Vectors differ for word [%s]%n", word);
    }
  }
}
/**
 * Test different word vector formats crafted by hand.
 *
 * <p>Every embedding file in the test resource directory must produce exactly the
 * vocabulary {snowball, christmas, tree} after initialization.
 */
@Test
public void testDifferentEmbeddings() {
  File embDir = new File("src/test/resources/embeddings/small");
  final File[] embeddings = embDir.listFiles();
  // listFiles() returns null when the directory does not exist or cannot be read;
  // fail clearly instead of throwing an NPE in the loop below.
  assertTrue(embeddings != null);

  Set<String> words = new HashSet<>();
  words.add("snowball");
  words.add("christmas");
  words.add("tree");

  for (File f : embeddings) {
    log.info("Testing embedding {}", f.getAbsolutePath());
    RnnTextEmbeddingInstanceIterator teii = new RnnTextEmbeddingInstanceIterator();
    teii.setWordVectorLocation(f);
    teii.initialize();
    // Vocabulary must equal the expected word set (mutual containment == set equality).
    final Collection ws = teii.getWordVectors().vocab().words();
    assertTrue(ws.containsAll(words) && words.containsAll(ws));
  }
} }
/**
 * Builds a test dataset whose string attribute holds sentences of ten random
 * vocabulary words drawn from the Google News vectors (seeded for reproducibility).
 *
 * @return dataset with randomly generated sentences in attribute 0
 * @throws Exception if the base dataset or the vectors cannot be created
 */
public Instances makeData() throws Exception {
  final Instances data =
      TestUtil.makeTestDataset(42, 100, 0, 0, 1, 0, 0, 1, Attribute.NUMERIC, 1, false);

  WordVectors wordVectors =
      WordVectorSerializer.loadStaticModel(DatasetLoader.loadGoogleNewsVectors());
  String[] vocabulary = (String[]) wordVectors.vocab().words().toArray(new String[0]);

  Random rng = new Random(42);
  for (Instance instance : data) {
    StringBuilder text = new StringBuilder();
    for (int wordCount = 0; wordCount < 10; wordCount++) {
      text.append(" ").append(vocabulary[rng.nextInt(vocabulary.length)]);
    }
    instance.setValue(0, text.toString());
  }
  return data;
}
/**
 * Creates a seeded test dataset where each instance's string attribute is filled with
 * a sentence of ten words sampled uniformly from the Google News vocabulary.
 *
 * @return the populated dataset
 * @throws Exception if dataset construction or vector loading fails
 */
public Instances makeData() throws Exception {
  final Instances data =
      TestUtil.makeTestDataset(42, 100, 0, 0, 1, 0, 0, 1, Attribute.NUMERIC, 1, false);

  WordVectors wordVectors =
      WordVectorSerializer.loadStaticModel(DatasetLoader.loadGoogleNewsVectors());
  String[] vocab = (String[]) wordVectors.vocab().words().toArray(new String[0]);

  Random random = new Random(42);
  for (Instance inst : data) {
    StringBuilder sb = new StringBuilder();
    int remaining = 10;
    while (remaining-- > 0) {
      final int pick = random.nextInt(vocab.length);
      sb.append(" ").append(vocab[pick]);
    }
    inst.setValue(0, sb.toString());
  }
  return data;
}
/**
 * Returns the {@code k} vocabulary words whose encoded vectors have the highest
 * cosine similarity to {@code v}.
 *
 * @param v query vector
 * @param k number of neighbours to keep
 * @return the top-k words, sorted by the counter's ordering
 */
public Collection<String> getNearestNeighbours(INDArray v, int k) {
  Counter<String> similarities = new Counter<>();
  // Score every vocabulary word by cosine similarity to the query vector.
  for (Object entry : vec.vocab().words()) {
    String token = (String) entry;
    INDArray encoded = encode(token);
    similarities.incrementCount(token, Transforms.cosineSim(v, encoded));
  }
  similarities.keepTopNElements(k);
  return similarities.keySetSorted();
}
/**
 * Loads the full dataset in batch mode: one instance per vocabulary word, with the
 * word's embedding dimensions as numeric attributes and the word itself stored in the
 * trailing "word_id" string attribute.
 *
 * @return the dataset built from the word vectors
 * @throws IOException if no source is set or incremental retrieval is requested
 */
@Override
public Instances getDataSet() throws IOException {
  if (m_sourceFile == null) {
    throw new IOException("No source has been specified");
  }
  if (getRetrieval() == INCREMENTAL) {
    throw new IOException("This loader cannot load instances incrementally.");
  }
  setRetrieval(BATCH);
  if (m_structure == null) {
    getStructure();
  }

  Instances result = new Instances(m_structure);
  for (String word : vec.getVocab().words()) {
    // Fetch the embedding once per word; the original looked it up again for every
    // dimension inside the copy loop.
    double[] wordVector = this.vec.getWordVector(word);
    double[] values = new double[result.numAttributes()];
    System.arraycopy(wordVector, 0, values, 0, wordVector.length);
    // Last attribute holds the word itself as a string value.
    values[result.numAttributes() - 1] = result.attribute("word_id").addStringValue(word);
    Instance inst = new DenseInstance(1, values);
    inst.setDataset(result);
    result.add(inst);
  }
  return result;
}
// Snapshot the vocabulary into an array and sort it so subsequent processing sees a
// deterministic alphabetical order.
// NOTE(review): getVocab().words() presumably yields Collection<String>; confirm the
// element type, since toArray(new String[0]) would otherwise fail at runtime.
String[] words = this.vec.getVocab().words().toArray(new String[0]); Arrays.sort(words);