/**
 * Returns the start offset reported by the wrapped {@code word}.
 *
 * @return the value of {@code word.getStartOffset()}
 */
@Override
public int getStartOffset() {
    final int startOffset = word.getStartOffset();
    return startOffset;
}
/**
 * Returns the text of the wrapped {@code word}.
 *
 * @return the value of {@code word.getString()}
 */
@Override
public String getText() {
    final String text = this.word.getString();
    return text;
}
/**
 * Returns the end offset reported by the wrapped {@code word}.
 *
 * @return the value of {@code word.getEndOffset()}
 */
@Override
public int getEndOffset() {
    final int endOffset = word.getEndOffset();
    return endOffset;
}
/**
 * Clears the current attributes, pulls the next word from {@code mmSeg} and, if there is one,
 * copies its characters, offsets and type into the term, offset and type attributes.
 *
 * @return {@code true} when a word was published through the attributes; {@code false} when
 *         {@code mmSeg.next()} returned {@code null}
 * @throws IOException declared by the signature; which call raises it is not visible here
 */
@Override
public final boolean incrementToken() throws IOException {
    clearAttributes();

    final Word token = mmSeg.next();
    if (token == null) {
        // Segmenter has nothing more to hand out.
        return false;
    }

    // The word is a window (offset + length) into its sentence buffer.
    termAtt.copyBuffer(token.getSen(), token.getWordOffset(), token.getLength());
    offsetAtt.setOffset(token.getStartOffset(), token.getEndOffset());
    typeAtt.setType(token.getType());
    return true;
}
}
Word word = chunk.words[i]; if(word.getLength() < 3) { cks.add(word); } else { char[] chs = word.getSen(); int offset = word.getWordOffset(), n = 0, wordEnd = word.getWordOffset()+word.getLength(); int senStartOffset = word.getStartOffset() - offset; //sen 在文件中的位置 int end = -1; //上一次找到的位置 for(; offset<wordEnd-1; offset++) { int idx = search(chs, offset, 1); if(idx > -1) { cks.add(new Word(chs, senStartOffset, offset, 2)); end = offset+2; n++; } else if(offset >= end) { //有单字 cks.add(new Word(chs, senStartOffset, offset, 1)); end = offset+1; cks.add(new Word(chs, senStartOffset, offset, 1));
/**
 * Builds a new {@code String} from the slice of the sentence buffer that this word covers.
 *
 * @return the characters of {@code getSen()} starting at {@code getWordOffset()} and spanning
 *         {@code getLength()} characters
 */
public String getString() {
    final char[] sentence = getSen();
    final int from = getWordOffset();
    final int charCount = getLength();
    return new String(sentence, from, charCount);
}
/**
 * Returns the end offset of this word, computed as {@code getStartOffset() + getLength()}.
 *
 * @return the start offset plus the word length
 */
public int getEndOffset() { return getStartOffset() + getLength(); } public int getDegree() {
/**
 * Returns the combined length of all non-null entries in {@code words}.
 *
 * <p>The value is computed the first time it is requested (while {@code len} is negative) and
 * then kept in {@code len}. The same pass also resets {@code count} and sets it to the number
 * of non-null words.
 *
 * @return the sum of {@code getLength()} over the non-null words
 */
public int getLen() {
    if (len >= 0) {
        // Already computed on an earlier call.
        return len;
    }

    len = 0;
    count = 0;
    for (Word current : words) {
        if (current == null) {
            continue;
        }
        len += current.getLength();
        count++;
    }
    return len;
}
/**
 * Creates a {@code Word} from the buffered sentence characters.
 *
 * @param bufSentence buffer whose content is converted with {@code toChars}
 * @param startIdx    start index passed through to the {@code Word} constructor
 * @param type        word type passed through to the {@code Word} constructor
 * @return the newly constructed word
 */
private Word createWord(StringBuilder bufSentence, int startIdx, String type) {
    final char[] sentenceChars = toChars(bufSentence);
    return new Word(sentenceChars, startIdx, type);
}
private Chunk createChunk(Sentence sen, char[] chs, int[] tailLen, int[] offsets, CharNode[] cns/*, char[][] cks*/) { Chunk ck = new Chunk(); for(int i=0; i<3; i++) { if(offsets[i] < chs.length) { ck.words[i] = new Word(chs, sen.getStartOffset(), offsets[i], tailLen[i]+1);//new Word(cks[i], sen.getStartOffset()+offsets[i]); if(tailLen[i] == 0) { //单字的要取得"字频计算出自由度" CharNode cn = cns[i]; //dic.head(chs[offsets[i]]); if(cn !=null) { ck.words[i].setDegree(cn.getFreq()); } } } } return ck; }
/**
 * Clears the current attributes, pulls the next word from {@code mmSeg} and, if there is one,
 * copies its characters, offsets and type into the term, offset and type attributes.
 *
 * @return {@code true} when a word was published through the attributes; {@code false} when
 *         {@code mmSeg.next()} returned {@code null}
 * @throws IOException declared by the signature; which call raises it is not visible here
 */
@Override
public final boolean incrementToken() throws IOException {
    clearAttributes();

    final Word token = mmSeg.next();
    if (token == null) {
        // Segmenter has nothing more to hand out.
        return false;
    }

    // The word is a window (offset + length) into its sentence buffer.
    termAtt.copyBuffer(token.getSen(), token.getWordOffset(), token.getLength());
    offsetAtt.setOffset(token.getStartOffset(), token.getEndOffset());
    typeAtt.setType(token.getType());
    return true;
}
}
Word word = chunk.words[i]; if(word.getLength() < 3) { cks.add(word); } else { char[] chs = word.getSen(); int offset = word.getWordOffset(), n = 0, wordEnd = word.getWordOffset()+word.getLength(); int senStartOffset = word.getStartOffset() - offset; //sen 在文件中的位置 int end = -1; //上一次找到的位置 for(; offset<wordEnd-1; offset++) { int idx = search(chs, offset, 1); if(idx > -1) { cks.add(new Word(chs, senStartOffset, offset, 2)); end = offset+2; n++; } else if(offset >= end) { //有单字 cks.add(new Word(chs, senStartOffset, offset, 1)); end = offset+1; cks.add(new Word(chs, senStartOffset, offset, 1));
/**
 * Builds a new {@code String} from the slice of the sentence buffer that this word covers.
 *
 * @return the characters of {@code getSen()} starting at {@code getWordOffset()} and spanning
 *         {@code getLength()} characters
 */
public String getString() {
    final char[] sentence = getSen();
    final int from = getWordOffset();
    final int charCount = getLength();
    return new String(sentence, from, charCount);
}
/**
 * Returns the end offset of this word, computed as {@code getStartOffset() + getLength()}.
 *
 * @return the start offset plus the word length
 */
public int getEndOffset() { return getStartOffset() + getLength(); } public int getDegree() {
/**
 * Returns the combined length of all non-null entries in {@code words}.
 *
 * <p>The value is computed the first time it is requested (while {@code len} is negative) and
 * then kept in {@code len}. The same pass also resets {@code count} and sets it to the number
 * of non-null words.
 *
 * @return the sum of {@code getLength()} over the non-null words
 */
public int getLen() {
    if (len >= 0) {
        // Already computed on an earlier call.
        return len;
    }

    len = 0;
    count = 0;
    for (Word current : words) {
        if (current == null) {
            continue;
        }
        len += current.getLength();
        count++;
    }
    return len;
}
/**
 * Creates a {@code Word} from the buffered sentence characters.
 *
 * @param bufSentence buffer whose content is converted with {@code toChars}
 * @param startIdx    start index passed through to the {@code Word} constructor
 * @param type        word type passed through to the {@code Word} constructor
 * @return the newly constructed word
 */
private Word createWord(StringBuilder bufSentence, int startIdx, String type) {
    final char[] sentenceChars = toChars(bufSentence);
    return new Word(sentenceChars, startIdx, type);
}
private Chunk createChunk(Sentence sen, char[] chs, int[] tailLen, int[] offsets, CharNode[] cns/*, char[][] cks*/) { Chunk ck = new Chunk(); for(int i=0; i<3; i++) { if(offsets[i] < chs.length) { ck.words[i] = new Word(chs, sen.getStartOffset(), offsets[i], tailLen[i]+1);//new Word(cks[i], sen.getStartOffset()+offsets[i]); if(tailLen[i] == 0) { //单字的要取得"字频计算出自由度" CharNode cn = cns[i]; //dic.head(chs[offsets[i]]); if(cn !=null) { ck.words[i].setDegree(cn.getFreq()); } } } } return ck; }
@Override public final boolean incrementToken() throws IOException { clearAttributes(); Word word = mmSeg.get().next(); if(word != null) { //lucene 3.0 //termAtt.setTermBuffer(word.getSen(), word.getWordOffset(), word.getLength()); //lucene 3.1 termAtt.copyBuffer(word.getSen(), word.getWordOffset(), word.getLength()); offsetAtt.setOffset(word.getStartOffset(), word.getEndOffset()); typeAtt.setType(word.getType()); return true; } else { end(); return false; } } }