/**
 * Creates an extractor backed by the LSM model files found under {@code lsaRoot}.
 *
 * @param numThreads forwarded unchanged to the superclass constructor
 * @param numPages   forwarded unchanged to the superclass constructor
 * @param lsaRoot    directory containing the model files {@code X-Ut}, {@code X-S},
 *                   {@code X-row}, {@code X-col} and {@code X-df}; a trailing
 *                   {@link File#separator} is appended when it is missing
 * @param dim        dimension handed to the {@code LSM} constructor
 * @param normalized flag handed to the {@code LSM} constructor
 * @throws IOException declared by this constructor (presumably raised while the model
 *                     files are read - confirm against the LSM constructor)
 */
public VectorExtractor(int numThreads, int numPages, String lsaRoot, int dim, boolean normalized)
        throws IOException {
    super(numThreads, numPages);

    // Guarantee that plain concatenation below yields paths inside the directory.
    final String root = lsaRoot.endsWith(File.separator) ? lsaRoot : lsaRoot + File.separator;
    logger.info("reading lsm model from " + root + " (" + dim + ")...");

    final File utFile = new File(root + "X-Ut");
    final File skFile = new File(root + "X-S");
    final File rowFile = new File(root + "X-row");
    final File colFile = new File(root + "X-col");
    final File dfFile = new File(root + "X-df");

    // IDF rescaling is always switched on for this extractor.
    final boolean rescaleIdf = true;
    lsm = new LSM(utFile, skFile, rowFile, colFile, dfFile, dim, rescaleIdf, normalized);
}
/** * */ public float compare(BOW bow1, BOW bow2) { Vector d1 = lsm.mapDocument(bow1); Vector d2 = lsm.mapDocument(bow2); Vector pd1 = lsm.mapPseudoDocument(d1); Vector pd2 = lsm.mapPseudoDocument(d2); //float cosVSM = d1.dotProduct(d2) / (float) Math.sqrt(d1.dotProduct(d2) * d2.dotProduct(d2)); //float dotLSM = pd1.dotProduct(pd2) / (float) Math.sqrt(d1.dotProduct(pd2) * d2.dotProduct(pd2)); float dotLSM = pd1.dotProduct(pd2) / (float) Math.sqrt(pd1.dotProduct(pd1) * pd2.dotProduct(pd2)); return dotLSM; } // end compare
// Excerpt from a method whose header is outside this view.
// Map the bag of words into the VSM and normalize the resulting vector.
Vector vec = lsm.mapDocument(bow);
vec.normalize();
// Project the normalized VSM vector into the LSM space and normalize that as well.
Vector pseudoVec = lsm.mapPseudoDocument(vec);
pseudoVec.normalize();
// Serialize as "<pseudo-document vector><TAB><VSM vector>" and collect the line in `set`.
sb = new StringBuilder();
sb.append(pseudoVec.toString());
sb.append(CharacterTable.HORIZONTAL_TABULATION);
// The VSM vector is rendered through the toString overload that takes the model dimension.
sb.append(vec.toString(lsm.getDimension()));
set.add(sb.toString());
/**
 * Command-line entry point: loads an {@code LSM} and runs its interactive mode.
 *
 * <p>Exactly five arguments are required, otherwise the help text is logged and the
 * JVM exits with status 1:
 * <ol>
 *   <li>common prefix of the model files ({@code -Ut}, {@code -S}, {@code -row},
 *       {@code -col}, {@code -df} are appended)</li>
 *   <li>threshold (parsed as a double; not used afterwards in this method)</li>
 *   <li>size (parsed as an int; not used afterwards in this method)</li>
 *   <li>dim (int, handed to the {@code LSM} constructor)</li>
 *   <li>rescaleIdf (boolean, handed to the {@code LSM} constructor)</li>
 * </ol>
 *
 * <p>The logging configuration file is taken from the {@code log-config} system
 * property and defaults to {@code log-config.txt}.
 *
 * @param args command-line arguments as described above
 * @throws Exception any failure raised while parsing arguments or loading the model
 */
public static void main(String[] args) throws Exception {
    final String logConfig = System.getProperty("log-config", "log-config.txt");

    final long begin = System.currentTimeMillis();
    PropertyConfigurator.configure(logConfig);

    if (args.length != 5) {
        logger.info(getHelp());
        System.exit(1);
    }

    final String prefix = args[0];
    final File utFile = new File(prefix + "-Ut");
    final File skFile = new File(prefix + "-S");
    final File rowFile = new File(prefix + "-row");
    final File colFile = new File(prefix + "-col");
    final File dfFile = new File(prefix + "-df");

    // The threshold and size values are not consumed below, but they are still
    // parsed so that malformed arguments keep being rejected.
    Double.parseDouble(args[1]);
    Integer.parseInt(args[2]);
    final int dim = Integer.parseInt(args[3]);
    final boolean rescaleIdf = Boolean.parseBoolean(args[4]);

    final LSM model = new LSM(utFile, skFile, rowFile, colFile, dfFile, dim, rescaleIdf);
    model.interactive();

    final long elapsed = System.currentTimeMillis() - begin;
    logger.info("term similarity calculated in " + elapsed + " ms");
} // end main
// Excerpt from a method whose header is outside this view; the statements below may not
// be contiguous in the original file.
// Build the model and the similarity helper that wraps it.
LSM lsm = new LSM(Ut, Sk, r, c, df, dim, rescaleIdf);
LSSimilarity lss = new LSSimilarity(lsm, size);
// Trace the pair being compared together with the second bag of words.
log.println(i + ":" + j + "(" + s1[0] + ":" + s2[0] + ") bow2:" + bow2);
// VSM vectors of both bags of words, each written to the log.
Vector d1 = lsm.mapDocument(bow1);
log.println("d1:" + d1);
Vector d2 = lsm.mapDocument(bow2);
log.println("d2:" + d2);
// Pseudo-document (LSM) vectors derived from the VSM vectors, each written to the log.
Vector pd1 = lsm.mapPseudoDocument(d1);
log.println("pd1:" + pd1);
Vector pd2 = lsm.mapPseudoDocument(d2);
log.println("pd2:" + pd2);
Vector d1 = lsm.mapDocument(bow1); Vector d2 = lsm.mapDocument(bow2); Vector pd1 = lsm.mapPseudoDocument(d1); Vector pd2 = lsm.mapPseudoDocument(d2); long begin = System.nanoTime(); ScoreTermMap map = new ScoreTermMap(query, size); Vector vec1 = lsm.mapTerm(query); Iterator<String> it = lsm.terms(); while (it.hasNext()) { term = it.next(); Vector vec2 = lsm.mapTerm(term); float cos = vec1.dotProduct(vec2) / (float) Math .sqrt(vec1.dotProduct(vec1) * vec2.dotProduct(vec2));
/** * Returns a document in the VSM. */ public Vector mapDocument(BOW bow, boolean b) { //logger.info("lsm.mapDocument " + b); SparseVector vector = new SparseVector(); Iterator<String> it = bow.termSet().iterator(); for (int i = 0; it.hasNext(); i++) { //logger.info(i + " " + t[i]); String term = it.next(); int index = termIndex.get(term); if (index != -1) { int tf = bow.getFrequency(term); float tfIdf = (float) (log2(tf)); if (b) { tfIdf *= Iidf[index]; } //logger.info(term + " ==> " + index + ", tf.idf = " + tf + "(" + (log2(tf)) + ") * " + Iidf[index] + " = " + tfIdf); vector.add(index, tfIdf); } } // end for return vector; } // end map
// Excerpt from a method whose header is outside this view; the statements below may not
// be contiguous in the original file.
// Build the model and the similarity helper that wraps it.
LSM lsm = new LSM(Ut, Sk, r, c, df, dim, rescaleIdf);
LSSimilarity lss = new LSSimilarity(lsm, size);
logger.info(bow1);
// First bag of words: VSM vector, normalized, then written to the log.
Vector d1 = lsm.mapDocument(bow1);
d1.normalize();
log.println("d1:" + d1);
// Pseudo-document derived from the already normalized VSM vector, normalized again.
Vector pd1 = lsm.mapPseudoDocument(d1);
pd1.normalize();
log.println("pd1:" + pd1);
// Trace the pair being compared together with the second bag of words.
log.println(i + ":" + j + "(" + s1[0] + ":" + s2[0] + ") bow2:" + bow2);
// Second bag of words: same sequence of steps as for the first one.
Vector d2 = lsm.mapDocument(bow2);
d2.normalize();
log.println("d2:" + d2);
Vector pd2 = lsm.mapPseudoDocument(d2);
pd2.normalize();
log.println("pd2:" + pd2);
Vector d1 = mapDocument(bow1); Vector d2 = mapDocument(bow2); Vector pd1 = mapPseudoDocument(d1); Vector pd2 = mapPseudoDocument(d2); long begin = System.nanoTime(); ScoreTermMap map = new ScoreTermMap(query, 20); Vector vec1 = mapTerm(query); Iterator<String> it = terms(); while (it.hasNext()) { term = it.next(); Vector vec2 = mapTerm(term); double cos = vec1.dotProduct(vec2) / Math.sqrt(vec1.dotProduct(vec1) * vec2.dotProduct(vec2)); map.put(cos, term);
/** * Returns a document in the VSM. */ public Vector mapDocument(MultiSet<String> bow) { //logger.info("lsm.mapDocument"); SparseVector vector = new SparseVector(); Iterator<String> it = bow.iterator(); String term = null; int index = 0; int tf = 0; float tfIdf; for (int i = 0; it.hasNext(); i++) { //logger.info(i + " " + t[i]); term = it.next(); index = termIndex.get(term); if (index != -1) { tf = bow.getFrequency(term); tfIdf = (float) (log2(tf)) * Iidf[index]; //logger.info(term + " ==> " + index + ", tf.idf = " + tf + "(" + (log2(tf)) + ") * " + Iidf[index] + " = " + tfIdf); vector.add(index, tfIdf); } } // end for return vector; } // end map
/** * dot[0] is cosine * dot[1] is lsa */ public float[] compare2(BOW bow1, BOW bow2) { Vector d1 = lsm.mapDocument(bow1); Vector d2 = lsm.mapDocument(bow2); Vector pd1 = lsm.mapPseudoDocument(d1); Vector pd2 = lsm.mapPseudoDocument(d2); float[] dot = new float[2]; dot[0] = d1.dotProduct(d2) / (float) Math.sqrt(d1.dotProduct(d1) * d2.dotProduct(d2)); //float dotLSM = pd1.dotProduct(pd2) / (float) Math.sqrt(d1.dotProduct(pd2) * d2.dotProduct(pd2)); dot[1] = pd1.dotProduct(pd2) / (float) Math.sqrt(pd1.dotProduct(pd1) * pd2.dotProduct(pd2)); return dot; } // end compare2
/**
 * Creates an extractor backed by the LSM model files found under {@code lsaRoot}.
 *
 * <p>The model is always loaded with dimension 100 and with IDF rescaling enabled.
 *
 * @param numThreads forwarded unchanged to the superclass constructor
 * @param numPages   forwarded unchanged to the superclass constructor
 * @param locale     forwarded unchanged to the superclass constructor
 * @param lsaRoot    directory containing the model files {@code X-Ut}, {@code X-S},
 *                   {@code X-row}, {@code X-col} and {@code X-df}; a trailing
 *                   {@link File#separator} is appended when it is missing
 * @throws IOException declared by this constructor (presumably raised while the model
 *                     files are read - confirm against the LSM constructor)
 */
public WikipediaVectorExtractor(int numThreads, int numPages, Locale locale, String lsaRoot)
        throws IOException {
    super(numThreads, numPages, locale);

    // Guarantee that plain concatenation below yields paths inside the directory.
    final String root = lsaRoot.endsWith(File.separator) ? lsaRoot : lsaRoot + File.separator;

    final File utFile = new File(root + "X-Ut");
    final File skFile = new File(root + "X-S");
    final File rowFile = new File(root + "X-row");
    final File colFile = new File(root + "X-col");
    final File dfFile = new File(root + "X-df");

    final int dim = 100;
    final boolean rescaleIdf = true;
    lsm = new LSM(utFile, skFile, rowFile, colFile, dfFile, dim, rescaleIdf);
}
// Excerpt from a method whose header is outside this view.
// Map the bag of words into the VSM, then project it into the LSM space.
Vector d = lsm.mapDocument(bow);
Vector pd = lsm.mapPseudoDocument(d);
// Write the current element of `s` followed by a single space.
pw.print(s[i]);
pw.print(" ");
public static void main(String[] args) throws Exception { String logConfig = System.getProperty("log-config"); if (logConfig == null) { logConfig = "log-config.txt"; } PropertyConfigurator.configure(logConfig); if (args.length != 6) { System.out.println( "Usage: java -mx512M eu.fbk.utils.lsa.util.NgramComparator input threshold size dim idf file"); System.exit(1); } File Ut = new File(args[0] + "-Ut"); File Sk = new File(args[0] + "-S"); File r = new File(args[0] + "-row"); File c = new File(args[0] + "-col"); File df = new File(args[0] + "-df"); double threshold = Double.parseDouble(args[1]); int size = Integer.parseInt(args[2]); int dim = Integer.parseInt(args[3]); boolean rescaleIdf = Boolean.parseBoolean(args[4]); LSM lsm = new LSM(Ut, Sk, r, c, df, dim, rescaleIdf); LSSimilarity lss = new LSSimilarity(lsm, size); new NgramComparator(args[5], lss); } // end main } // end NgramComparator
/**
 * Builds one normalized pseudo-document vector from the contexts of all given examples.
 *
 * <p>For every entry of {@code list} the left and right context fields (at
 * {@code OneExamplePerSenseExtractor.LEFT_CONTEXT_INDEX} and
 * {@code RIGHT_CONTEXT_INDEX}) are lower-cased, split with {@code spacePattern} and
 * added to a single bag of words. That bag is mapped with {@code lsm.mapDocument},
 * projected with {@code lsm.mapPseudoDocument}, and both vectors are normalized.
 *
 * @param list examples, each an array of fields
 * @return the normalized pseudo-document vector
 */
Vector createVector(List<String[]> list) {
    BOW bow = new BOW();
    for (String[] example : list) {
        String left = example[OneExamplePerSenseExtractor.LEFT_CONTEXT_INDEX].toLowerCase();
        String right = example[OneExamplePerSenseExtractor.RIGHT_CONTEXT_INDEX].toLowerCase();
        bow.addAll(spacePattern.split(left));
        bow.addAll(spacePattern.split(right));
    }

    Vector document = lsm.mapDocument(bow);
    Vector pseudoDocument = lsm.mapPseudoDocument(document);
    // Normalization happens only after the projection, exactly as in the other extractors.
    document.normalize();
    pseudoDocument.normalize();
    return pseudoDocument;
}
// Excerpt from a method whose header is outside this view.
// The fourth command-line argument is used as a file path.
File doc = new File(args[3]);
// Build the model and the similarity helper that wraps it.
LSM lsm = new LSM(Ut, Sk, r, c, df, dim, rescaleIdf);
LSSimilarity lss = new LSSimilarity(lsm, size);
// The fifth command-line argument is parsed as a double.
double qt = Double.parseDouble(args[4]);
@Override public void processLine(String line) { //To change body of implemented methods use File | Settings | File Templates. String[] tokens = spacePattern.split(line); if (tokens.length < 2) { return; } try { BOW bow = new BOW(); for (int i = 1; i < tokens.length; i++) { //logger.debug(i + "\t'" + tokens[i] + "'\t" + tokens[0]); bow.add(tokens[i].toLowerCase()); } Vector d = lsm.mapDocument(bow); Vector pd = lsm.mapPseudoDocument(d); d.normalize(); pd.normalize(); synchronized (this) { vectorWriter.print(tokens[0]); //vectorWriter.print(CharacterTable.HORIZONTAL_TABULATION); //vectorWriter.print(bow.toSingleLine()); vectorWriter.print(CharacterTable.HORIZONTAL_TABULATION); vectorWriter.print(pd.toString()); vectorWriter.print(CharacterTable.HORIZONTAL_TABULATION); vectorWriter.println(d.toString()); } } catch (Exception e) { logger.error("Error processing page " + tokens[0]); } }
// Excerpt from a method whose header is outside this view.
// The fifth command-line argument selects whether IDF rescaling is requested from the model.
boolean rescaleIdf = Boolean.parseBoolean(args[4]);
// Build the model and the similarity helper that wraps it.
LSM lsm = new LSM(Ut, Sk, r, c, df, dim, rescaleIdf);
LSSimilarity lss = new LSSimilarity(lsm, size);
@Override public void contentPage(String text, String title, int wikiID) { try { WikiMarkupParser wikiMarkupParser = WikiMarkupParser.getInstance(); //logger.debug(title + "\t" + wikiID); String[] prefixes = {filePrefix, imagePrefix}; ParsedPage parsedPage = wikiMarkupParser.parsePage(text, prefixes); String page = tokenizedText(parsedPage, title); BOW bow = new BOW(page.toLowerCase()); Vector d = lsm.mapDocument(bow); Vector pd = lsm.mapPseudoDocument(d); d.normalize(); pd.normalize(); synchronized (this) { vectorWriter.print(title); vectorWriter.print(CharacterTable.HORIZONTAL_TABULATION); vectorWriter.print(pd.toString()); vectorWriter.print(CharacterTable.HORIZONTAL_TABULATION); vectorWriter.println(d.toString()); } } catch (Exception e) { logger.error("Error processing page " + title + " (" + wikiID + ")"); } }