Tabnine Logo
JapaneseTokenizer
Code IndexAdd Tabnine to your IDE (free)

How to use
JapaneseTokenizer
in
org.apache.lucene.analysis.ja

Best Java code snippets using org.apache.lucene.analysis.ja.JapaneseTokenizer (Showing top 20 results out of 315)

origin: org.elasticsearch.plugin/analysis-kuromoji

  @Override
  public Tokenizer create() {
    // Search-mode tokenizer with no user dictionary; punctuation is discarded.
    final Tokenizer tokenizer = new JapaneseTokenizer(null, true, Mode.SEARCH);
    return tokenizer;
  }
}));
origin: org.codelibs/elasticsearch-analysis-ja

@Override
public Tokenizer create() {
  // Build the tokenizer from this factory's configured dictionary,
  // punctuation policy, and segmentation mode.
  final JapaneseTokenizer tokenizer =
      new JapaneseTokenizer(userDictionary, discartPunctuation, mode);
  // Start from the configured n-best cost; example sentences may raise it.
  int effectiveCost = this.nBestCost;
  if (nBestExamples != null) {
    final int exampleCost = tokenizer.calcNBestCost(nBestExamples);
    effectiveCost = Math.max(effectiveCost, exampleCost);
  }
  tokenizer.setNBestCost(effectiveCost);
  return tokenizer;
}
origin: org.codelibs/elasticsearch-analysis-ja

@Override
public void close() throws IOException {
  // Delegate resource release to the wrapped tokenizer.
  tokenizer.close();
}
origin: org.apache.lucene/lucene-analyzers-kuromoji

setReader(new StringReader(inText));
reset();
try {
 setNBestCost(1);
 int prevRootBase = -1;
 while (incrementToken()) {
  if (lattice.rootBase != prevRootBase) {
   prevRootBase = lattice.rootBase;
 end();
 close();
 setNBestCost(saveNBestCost);
origin: omegat-org/omegat

@SuppressWarnings("resource")
@Override
protected TokenStream getTokenStream(String strOrig, boolean stemsAllowed, boolean stopWordsAllowed)
    throws IOException {
  if (!stemsAllowed) {
    // No stemming: tokenize in NORMAL mode, keep tags intact, and rejoin
    // tag fragments into whole tokens downstream.
    final JapaneseTokenizer tokenizer = new JapaneseTokenizer(null, false, Mode.NORMAL);
    tokenizer.setReader(new StringReader(strOrig));
    return new TagJoiningFilter(tokenizer);
  }
  // Blank out tags when stemming only
  final String cleaned = blankOutTags(strOrig);
  final CharArraySet stopWords =
      stopWordsAllowed ? JapaneseAnalyzer.getDefaultStopSet() : CharArraySet.EMPTY_SET;
  final Set<String> stopTags =
      stopWordsAllowed ? JapaneseAnalyzer.getDefaultStopTags() : Collections.emptySet();
  return new JapaneseAnalyzer(null, Mode.SEARCH, stopWords, stopTags).tokenStream("",
      new StringReader(cleaned));
}
origin: org.codelibs/elasticsearch-analysis-ja

  tokenizer.setReader(inputPending);
tokenizer.reset();
origin: org.apache.lucene/lucene-analyzers-kuromoji

 parse();
clearAttributes();
assert length > 0;
offsetAtt.setOffset(correctOffset(position), correctOffset(position+length));
basicFormAtt.setToken(token);
posAtt.setToken(token);
origin: com.google.code/lucene-gosen-ipadic

@Override
public boolean incrementToken() throws IOException {
  final Token token = tagger.next();
  if (token == null) {
    // Stream exhausted.
    return false;
  }
  clearAttributes();
  final Morpheme morpheme = token.getMorpheme();

  // note, unlike the previous implementation, we set the surface form
  termAtt.setEmpty().append(token.getSurface());

  final int cost = token.getCost();
  if (token.isSentenceStart()) {
    // Reset the running cost at each sentence boundary.
    accumulatedCost = 0;
    sentenceAtt.setSentenceStart(true);
  }
  // Record the incremental (per-token) cost, then advance the accumulator.
  costAtt.setCost(cost - accumulatedCost);
  accumulatedCost = cost;

  basicFormAtt.setMorpheme(morpheme);
  conjugationAtt.setMorpheme(morpheme);
  partOfSpeechAtt.setMorpheme(morpheme);
  pronunciationsAtt.setMorpheme(morpheme);
  readingsAtt.setMorpheme(morpheme);
  offsetAtt.setOffset(correctOffset(token.getStart()), correctOffset(token.end()));
  return true;
}
origin: org.apache.lucene/lucene-analyzers-kuromoji

@Override
public void end() throws IOException {
  super.end();
  // After the last token, start and end both point at the corrected
  // final position.
  final int offset = correctOffset(pos);
  offsetAtt.setOffset(offset, offset);
}
origin: org.codelibs/elasticsearch-analysis-ja

@Override
public void end() throws IOException {
  // Delegate end-of-stream handling to the wrapped tokenizer.
  tokenizer.end();
}
origin: org.codelibs/elasticsearch-analysis-ja

@Override
public boolean incrementToken() throws IOException {
  // Delegate token advancement to the wrapped tokenizer.
  return tokenizer.incrementToken();
}
origin: com.google.code/lucene-gosen-ipadic

 @Override
 public void end() throws IOException {
  // set final offset
  final int last = correctOffset(tagger.end());
  offsetAtt.setOffset(last, last);
 }
}
origin: com.google.code/lucene-gosen-ipadic

 public Tokenizer create(Reader reader) {
  // Wire the reader into a tokenizer configured with this factory's
  // composite token filter and dictionary directory.
  final Tokenizer tokenizer = new JapaneseTokenizer(reader, compositeTokenFilter, dictionaryDir);
  return tokenizer;
 }
}
origin: org.apache.lucene/lucene-analyzers-kuromoji

 @Override
 public JapaneseTokenizer create(AttributeFactory factory) {
  JapaneseTokenizer t = new JapaneseTokenizer(factory, userDictionary, discardPunctuation, mode);
  // Work on a local copy instead of writing back to the shared nbestCost
  // field: create() may be called concurrently, and mutating factory state
  // here made the factory stateful across calls. calcNBestCost is
  // recomputed per call, so the cost applied to the tokenizer is unchanged.
  int cost = nbestCost;
  if (nbestExamples != null) {
   cost = Math.max(cost, t.calcNBestCost(nbestExamples));
  }
  t.setNBestCost(cost);
  return t;
 }
}
origin: org.elasticsearch/elasticsearch-analysis-kuromoji

@Override
public Tokenizer create(Reader reader) {
  // Tokenizer built from this factory's dictionary, punctuation policy,
  // and segmentation mode.
  final Tokenizer tokenizer =
      new JapaneseTokenizer(reader, userDictionary, discartPunctuation, mode);
  return tokenizer;
}
origin: org.elasticsearch/elasticsearch-analysis-kuromoji

  @Override
  public Tokenizer create(Reader reader) {
    // Search-mode tokenizer with no user dictionary; punctuation is discarded.
    final Tokenizer tokenizer = new JapaneseTokenizer(reader, null, true, Mode.SEARCH);
    return tokenizer;
  }
}));
origin: shilad/wikibrain

@Override
public Tokenizer makeTokenizer(Reader r) {
  // Default-mode tokenizer, no user dictionary, punctuation kept.
  final org.apache.lucene.analysis.ja.JapaneseTokenizer tokenizer =
      new org.apache.lucene.analysis.ja.JapaneseTokenizer(
          r, null, false, org.apache.lucene.analysis.ja.JapaneseTokenizer.DEFAULT_MODE);
  return tokenizer;
}
origin: org.elasticsearch.plugin/analysis-kuromoji

@Override
public Tokenizer create() {
  // Tokenizer built from this factory's dictionary, punctuation policy,
  // and segmentation mode.
  final Tokenizer tokenizer = new JapaneseTokenizer(userDictionary, discartPunctuation, mode);
  return tokenizer;
}
origin: org.codelibs/elasticsearch-analysis-ja

// Builds a JapaneseTokenizer from the current dictionary configuration and,
// via reflection, copies the wrapped tokenizer's AttributeSource internals
// ("attributes", "attributeImpls", "currentState") onto this wrapper so that
// both objects share the same attribute instances.
// NOTE(review): depends on private AttributeSource field names — likely to
// break on Lucene upgrades; verify against the targeted Lucene version.
TokenizerWrapper() {
  super();
  // Record which dictionary snapshot this tokenizer instance was built from.
  tokenizerTimestamp = dictionaryTimestamp;
  tokenizer = new JapaneseTokenizer(userDictionary,
      discartPunctuation, mode);
  try {
    // Share the attribute map.
    final Field attributesField = getAccessibleField(AttributeSource.class, "attributes");
    final Object attributesObj = attributesField.get(tokenizer);
    attributesField.set(this, attributesObj);
    // Share the attribute implementation map.
    final Field attributeImplsField = getAccessibleField(AttributeSource.class, "attributeImpls");
    final Object attributeImplsObj = attributeImplsField.get(tokenizer);
    attributeImplsField.set(this, attributeImplsObj);
    // Share the cached capture state.
    final Field currentStateField = getAccessibleField(AttributeSource.class, "currentState");
    final Object currentStateObj = currentStateField.get(tokenizer);
    currentStateField.set(this, currentStateObj);
  } catch (final Exception e) {
    // Reflection failure leaves the wrapper unusable; fail fast.
    throw new IllegalStateException(
        "Failed to update the tokenizer.", e);
  }
}
origin: org.apache.lucene/lucene-analyzers-kuromoji

@Override
protected TokenStreamComponents createComponents(String fieldName) {
  // Kuromoji tokenizer (punctuation discarded) followed by the standard
  // Japanese analysis chain; filter order matches the original exactly.
  final Tokenizer source = new JapaneseTokenizer(userDict, true, mode);
  TokenStream chain = new JapaneseBaseFormFilter(source);
  chain = new JapanesePartOfSpeechStopFilter(chain, stoptags);
  chain = new CJKWidthFilter(chain);
  chain = new StopFilter(chain, stopwords);
  chain = new JapaneseKatakanaStemFilter(chain);
  chain = new LowerCaseFilter(chain);
  return new TokenStreamComponents(source, chain);
}
org.apache.lucene.analysis.jaJapaneseTokenizer

Javadoc

This is a Japanese tokenizer which uses "Sen" morphological analyzer.

It sets the surface form as the term text, but also sets these attributes:

  • BasicFormAttribute
  • ConjugationAttribute
  • PartOfSpeechAttribute
  • PronunciationsAttribute
  • ReadingsAttribute
  • CostAttribute
  • SentenceStartAttribute

Most used methods

  • <init>
    Create a new JapaneseTokenizer.
  • setReader
  • calcNBestCost
  • clearAttributes
  • close
  • correctOffset
  • end
  • incrementToken
  • reset
  • setNBestCost
  • add
  • backtrace
  • add,
  • backtrace,
  • backtraceNBest,
  • computePenalty,
  • computeSecondBestThreshold,
  • equals,
  • fixupPendingList,
  • getDict,
  • hashCode,
  • isPunctuation

Popular in Java

  • Reading from database using SQL prepared statement
  • setRequestProperty (URLConnection)
  • getApplicationContext (Context)
  • startActivity (Activity)
  • HttpServer (com.sun.net.httpserver)
    This class implements a simple HTTP server. A HttpServer is bound to an IP address and port number a
  • Component (java.awt)
    A component is an object having a graphical representation that can be displayed on the screen and t
  • SocketTimeoutException (java.net)
    This exception is thrown when a timeout expired on a socket read or accept operation.
  • Selector (java.nio.channels)
    A controller for the selection of SelectableChannel objects. Selectable channels can be registered w
  • Enumeration (java.util)
    A legacy iteration interface.New code should use Iterator instead. Iterator replaces the enumeration
  • TimeUnit (java.util.concurrent)
    A TimeUnit represents time durations at a given unit of granularity and provides utility methods to
  • From CI to AI: The AI layer in your organization
Tabnine Logo
  • Products

    Search for Java codeSearch for JavaScript code
  • IDE Plugins

    IntelliJ IDEAWebStormVisual StudioAndroid StudioEclipseVisual Studio CodePyCharmSublime TextPhpStormVimGoLandRubyMineEmacsJupyter NotebookJupyter LabRiderDataGripAppCode
  • Company

    About UsContact UsCareers
  • Resources

    FAQBlogTabnine AcademyTerms of usePrivacy policyJava Code IndexJavascript Code Index
Get Tabnine for your IDE now