public void addAll(String[] words) { //logger.debug(Arrays.toString(words)); for (int i = 0; i < words.length; i++) { add(words[i]); } }
public void addAll(String[] words, int from, int to) { //logger.debug(Arrays.toString(words)); for (int i = from; i < to; i++) { add(words[i]); } }
/**
 * Creates a bag backed by a fresh, empty {@code HashMap} and fills it with
 * the non-empty strings of {@code text}, in array order.
 *
 * <p>Strings are passed to {@code add} exactly as given (no case folding
 * happens here); zero-length strings are skipped.
 *
 * @param text the strings to add; neither the array nor its elements are
 *             null-checked
 */
public BOW(String[] text) {
    map = new HashMap<String, Counter>();
    for (String word : text) {
        if (word.length() == 0) {
            // Empty strings carry no term; leave them out of the bag.
            continue;
        }
        add(word);
    }
}
/**
 * Builds a new {@code BOW} containing the lower-cased form of every token.
 *
 * <p>Lower-casing uses the no-argument {@code toLowerCase()}, i.e. the JVM
 * default locale.
 *
 * @param tokenArray the tokens whose forms are added, in array order
 * @return a newly created bag; never {@code null}
 */
private BOW createBow(Token[] tokenArray) {
    BOW bag = new BOW();
    for (Token token : tokenArray) {
        bag.add(token.getForm().toLowerCase());
    }
    return bag;
}
private void tokenize(String text) { BreakIterator boundary = BreakIterator.getWordInstance(Locale.US); boundary.setText(text); int start = boundary.first(); String token = null; for (int end = boundary.next(); end != BreakIterator.DONE; start = end, end = boundary.next()) { token = text.substring(start, end).toLowerCase(); if (!token.matches("\\s+")) { //logger.info("'" + token + "'\t" + start + ", " + end); add(token); } } // end for end }
@Override public void processLine(String line) { //To change body of implemented methods use File | Settings | File Templates. String[] tokens = spacePattern.split(line); if (tokens.length < 2) { return; } try { BOW bow = new BOW(); for (int i = 1; i < tokens.length; i++) { //logger.debug(i + "\t'" + tokens[i] + "'\t" + tokens[0]); bow.add(tokens[i].toLowerCase()); } Vector d = lsm.mapDocument(bow); Vector pd = lsm.mapPseudoDocument(d); d.normalize(); pd.normalize(); synchronized (this) { vectorWriter.print(tokens[0]); //vectorWriter.print(CharacterTable.HORIZONTAL_TABULATION); //vectorWriter.print(bow.toSingleLine()); vectorWriter.print(CharacterTable.HORIZONTAL_TABULATION); vectorWriter.print(pd.toString()); vectorWriter.print(CharacterTable.HORIZONTAL_TABULATION); vectorWriter.println(d.toString()); } } catch (Exception e) { logger.error("Error processing page " + tokens[0]); } }