/**
 * Command-line entry point: reads a term file into a {@code TermSet} and logs its contents.
 *
 * <p>The log4j configuration file is taken from the {@code log-config} system property and
 * defaults to {@code log-config.txt} when the property is not set.
 *
 * @param args exactly four arguments:
 *             {@code args[0]} path of the file to read (decoded as UTF-8),
 *             {@code args[1]} maximum size passed to {@code setMaxSize} (integer),
 *             {@code args[2]} column passed to {@code setColumn} (integer),
 *             {@code args[3]} lowercase flag passed to {@code setLowercase} (boolean)
 * @throws Exception if the file cannot be opened or read, or if a numeric argument
 *                   is not a valid integer
 */
public static void main(String[] args) throws Exception {
    String logConfig = System.getProperty("log-config");
    if (logConfig == null) {
        logConfig = "log-config.txt";
    }
    PropertyConfigurator.configure(logConfig);

    if (args.length != 4) {
        System.out.println("Usage: java -mx1024M eu.fbk.utils.lsa.TermSet file size column lowercase");
        System.exit(1);
    }

    TermSet set = new TermSet();
    set.setMaxSize(Integer.parseInt(args[1]));
    set.setColumn(Integer.parseInt(args[2]));
    set.setLowercase(Boolean.parseBoolean(args[3]));

    Reader reader = new InputStreamReader(new FileInputStream(args[0]), "UTF-8");
    try {
        set.read(reader);
    } finally {
        // Release the underlying file handle even when read() fails.
        reader.close();
    }

    logger.info("set:\n" + set.toString());
    logger.info("size: " + set.size());
    logger.info("max size: " + set.getMaxSize());
    logger.info("column: " + set.getColumn());
    logger.info("lowercase: " + set.getLowercase());
} // end main
/** * Constructs a reader. */ public TermDocumentMatrixBuilder(String matrixName, File stopwordFile, File keywordFile) throws IOException { totalKW = 0; keywordSet = new TermSet(); keywordSet.read(new FileReader(keywordFile)); logger.info("keyword to be indexed: " + keywordSet.size()); stopwordSet = new TermSet(); stopwordSet.read(new FileReader(stopwordFile)); logger.info(stopwordFile + "(" + stopwordSet.size() + ")"); lengthFreq = new int[101]; columnCount = 0; matrixFile = new File(matrixName + "-matrix"); rowFile = new File(matrixName + "-row"); colFile = new File(matrixName + "-col"); dfFile = new File(matrixName + "-df"); termIndex = new Index(); documentIndex = new Index(); matrixWriter = new SparseBinaryMatrixFileWriter(matrixFile); corpusVocabulary = new Vocabulary(); } // end constructor
if (keywordSet.size() == 0) { if (stopwordSet.size() == 0) { logger.debug("1 adding " + token); documentVocabulary.add(token); } else if (!stopwordSet.contains(token)) { logger.debug("2 adding " + token); documentVocabulary.add(token); } else if (keywordSet.contains(token)) { logger.debug("3 adding " + token); documentVocabulary.add(token);
/**
 * Reads from the given reader after installing a stemmer on this instance.
 *
 * <p>The stemmer is stored in the {@code stemmer} field before delegating to
 * {@code read(Reader)}, and it stays set after this call returns.
 * NOTE(review): presumably {@code read(Reader)} applies the stemmer to each term it
 * loads; that method is not visible here - confirm.
 *
 * @param in      source to read from
 * @param stemmer stemmer to store on this instance before reading
 * @throws IOException if the delegated read fails
 */
public void read(Reader in, Stemmer stemmer) throws IOException {
    // The field must be assigned first so the delegated read sees the new stemmer.
    this.stemmer = stemmer;
    this.read(in);
}
try { totalKW = 0; keywordSet = new TermSet(); keywordSet.read(new BufferedReader(new InputStreamReader(new FileInputStream(keywordFile), "UTF-8"))); logger.info(keywordSet.size() + " keywords read from " + keywordFile); stopwordSet = new TermSet(); stopwordSet.read(new BufferedReader(new InputStreamReader(new FileInputStream(stopwordFile), "UTF-8")), stemmer); logger.info(stopwordSet.size() + " stopwords read from " + stopwordFile);