/**
 * Creates the Lucene-backed stemmer adapter, falling back to a no-op
 * identity stemmer when the adapter cannot be constructed.
 */
public static IStemmer createStemmer() {
  try {
    return new LuceneStemmerAdapter();
  } catch (Exception e) {
    // Adapter unavailable in this deployment; degrade gracefully to
    // identity stemming rather than failing clustering outright.
    return IdentityStemmer.INSTANCE;
  }
}
}
/**
 * Returns a stemmer for the given language: a dedicated Arabic stemmer,
 * no stemming for simplified Chinese, and Snowball for everything else.
 */
@Override
public IStemmer getStemmer(LanguageCode language) {
  if (language == LanguageCode.ARABIC) {
    return ArabicStemmerFactory.createStemmer();
  }
  if (language == LanguageCode.CHINESE_SIMPLIFIED) {
    // Stemming is not applicable to Chinese; use the no-op stemmer.
    return IdentityStemmer.INSTANCE;
  }
  // For other languages, try to use snowball's stemming.
  return SnowballStemmerFactory.createStemmer(language);
}
/**
 * Creates the Chinese tokenizer, falling back to the extended-whitespace
 * tokenizer when it cannot be constructed. OutOfMemoryError is always
 * rethrown; any other Throwable (including linkage errors from a missing
 * optional dependency) triggers the fallback.
 */
static ITokenizer createTokenizer() {
  try {
    return new ChineseTokenizer();
  } catch (OutOfMemoryError e) {
    // Never mask an OOM behind a fallback tokenizer.
    throw e;
  } catch (Throwable e) {
    // Throwable (not Exception) on purpose: a missing optional jar surfaces
    // as an Error such as NoClassDefFoundError.
    return new ExtendedWhitespaceTokenizer();
  }
}
/**
 * Runs Carrot2 clustering over the given Solr documents and returns the
 * clusters converted to a Solr NamedList structure. Any failure is logged
 * and rethrown as a SERVER_ERROR {@link SolrException}.
 */
@Override
public Object cluster(Query query, SolrDocumentList solrDocList,
    Map<SolrDocument, Integer> docIds, SolrQueryRequest sreq) {
  try {
    // Prepare attributes for Carrot2 clustering call
    Map<String, Object> attributes = new HashMap<>();
    List<Document> documents = getDocuments(solrDocList, docIds, query, sreq);
    attributes.put(AttributeNames.DOCUMENTS, documents);
    attributes.put(AttributeNames.QUERY, query.toString());

    // Pass the fields on which clustering runs.
    attributes.put("solrFieldNames", getFieldsForClustering(sreq));

    // Pass extra overriding attributes from the request, if any
    extractCarrotAttributes(sreq.getParams(), attributes);

    // Perform clustering and convert to an output structure of clusters.
    //
    // Carrot2 uses current thread's context class loader to get
    // certain classes (e.g. custom tokenizer/stemmer) at runtime.
    // To make sure classes from contrib JARs are available,
    // we swap the context class loader for the time of clustering.
    return withContextClassLoader(core.getResourceLoader().getClassLoader(),
        () -> clustersToNamedList(
            controller.process(attributes, clusteringAlgorithmClass).getClusters(),
            sreq.getParams()));
  } catch (Exception e) {
    log.error("Carrot2 clustering failed", e);
    throw new SolrException(ErrorCode.SERVER_ERROR, "Carrot2 clustering failed", e);
  }
}
public Object cluster(Query query, DocList docList, SolrQueryRequest sreq) { try { // Prepare attributes for Carrot2 clustering call Map<String, Object> attributes = new HashMap<String, Object>(); List<Document> documents = getDocuments(docList, query, sreq); attributes.put(AttributeNames.DOCUMENTS, documents); attributes.put(AttributeNames.QUERY, query.toString()); // Pass extra overriding attributes from the request, if any extractCarrotAttributes(sreq.getParams(), attributes); // Perform clustering and convert to named list return clustersToNamedList(controller.process(attributes, clusteringAlgorithmClass).getClusters(), sreq.getParams()); } catch (Exception e) { log.error("Carrot2 clustering failed", e); throw new RuntimeException(e); } }
// NOTE(review): incomplete fragment — cut mid-expression/mid-try; left byte-identical.
// Appears to wire a SolrResourceLocator for this core, deserialize attribute-value
// sets from the first attribute XML resource, merge request-level Carrot2 attributes,
// and initialize the Carrot2 controller under the contrib class loader (Carrot2
// resolves custom classes via the thread context class loader) — TODO confirm against
// the full enclosing method.
new SolrResourceLocator(core, initParams), withContextClassLoader(core.getResourceLoader().getClassLoader(), () -> { try { AttributeValueSets avs = AttributeValueSets.deserialize(attributeXmls[0].open()); extractCarrotAttributes(initParams, initAttributes); withContextClassLoader(core.getResourceLoader().getClassLoader(), () -> this.controller.init(initAttributes));
/**
 * Converts Carrot2 clusters to a list of NamedList structures, honoring the
 * {@code OUTPUT_SUB_CLUSTERS} and {@code NUM_DESCRIPTIONS} request params.
 *
 * Fix: replaced raw {@code List}/{@code ArrayList} with the parameterized
 * {@code List<NamedList<Object>>} form used elsewhere in this code; the
 * change is erasure-compatible with existing raw-typed callers.
 */
private List<NamedList<Object>> clustersToNamedList(List<Cluster> carrotClusters,
    SolrParams solrParams) {
  List<NamedList<Object>> result = new ArrayList<>();
  clustersToNamedList(carrotClusters, result,
      solrParams.getBool(CarrotParams.OUTPUT_SUB_CLUSTERS, true),
      solrParams.getInt(CarrotParams.NUM_DESCRIPTIONS, Integer.MAX_VALUE));
  return result;
}
/**
 * Computes the set of stored fields to load for clustering: the clustering
 * fields themselves plus the unique-key field, the URL field, any custom
 * field mappings, and (when configured and non-blank) the language field.
 */
@Override
protected Set<String> getFieldsToLoad(SolrQueryRequest sreq) {
  SolrParams params = sreq.getParams();

  HashSet<String> result = new HashSet<>(getFieldsForClustering(sreq));
  result.add(idFieldName);
  result.add(params.get(CarrotParams.URL_FIELD_NAME, "url"));
  result.addAll(getCustomFieldsMap(params).keySet());

  String langField = params.get(CarrotParams.LANGUAGE_FIELD_NAME);
  if (StringUtils.isNotBlank(langField)) {
    result.add(langField);
  }
  return result;
}
@Override public String init(NamedList config, final SolrCore core) { String result = super.init(config, core); SolrParams initParams = SolrParams.toSolrParams(config); // Initialize Carrot2 controller. Pass initialization attributes, if any. HashMap<String, Object> initAttributes = new HashMap<String, Object>(); extractCarrotAttributes(initParams, initAttributes); this.controller.init(initAttributes); this.idFieldName = core.getSchema().getUniqueKeyField().getName(); // Make sure the requested Carrot2 clustering algorithm class is available String carrotAlgorithmClassName = initParams.get(CarrotParams.ALGORITHM); Class<?> algorithmClass = core.getResourceLoader().findClass(carrotAlgorithmClassName); if (!IClusteringAlgorithm.class.isAssignableFrom(algorithmClass)) { throw new IllegalArgumentException("Class provided as " + CarrotParams.ALGORITHM + " must implement " + IClusteringAlgorithm.class.getName()); } this.clusteringAlgorithmClass = (Class<? extends IClusteringAlgorithm>) algorithmClass; return result; }
// NOTE(review): incomplete fragment (part of a document-building loop); left byte-identical.
// Builds a Carrot2 Document from a SolrDocument: concatenated title/snippet field
// values plus the URL field (empty string when absent). `customFields` is presumably
// the Solr-field -> Carrot2-field mapping consumed just after this fragment — TODO
// confirm against the full method.
Map<String, String> customFields = getCustomFieldsMap(solrParams); snippet = getConcatenated(sdoc, snippetFieldSpec); Document carrotDocument = new Document(getConcatenated(sdoc, titleFieldSpec), snippet, ObjectUtils.toString(sdoc.getFieldValue(urlField), ""));
@Override public ITokenizer getTokenizer(LanguageCode language) { switch (language) { case CHINESE_SIMPLIFIED: return ChineseTokenizerFactory.createTokenizer(); /* * We use our own analyzer for Arabic. Lucene's version has special * support for Nonspacing-Mark characters (see * http://www.fileformat.info/info/unicode/category/Mn/index.htm), but we * have them included as letters in the parser. */ case ARABIC: // Intentional fall-through. default: return new ExtendedWhitespaceTokenizer(); } }
/**
 * Creates a resource locator backed by the core's resource loader.
 * The Carrot2 resources directory is taken from the RESOURCES_DIR init
 * param, defaulting to CarrotClusteringEngine.CARROT_RESOURCES_PREFIX
 * when unset.
 */
// NOTE(review): relies on firstNonNull() returning the first non-null argument;
// presumably CARROT_RESOURCES_PREFIX is a non-null constant — confirm at the
// helper's definition.
public SolrResourceLocator(SolrCore core, SolrParams initParams) { resourceLoader = core.getResourceLoader(); String resourcesDir = initParams.get(CarrotParams.RESOURCES_DIR); carrot2ResourcesDir = firstNonNull(resourcesDir, CarrotClusteringEngine.CARROT_RESOURCES_PREFIX); }
/**
 * Stems a single word: copies it into a reusable char buffer (grown on
 * demand, never shrunk), normalizes it, then applies the delegate stemmer.
 *
 * Returns the stemmed form wrapped over the shared buffer, or {@code null}
 * when the result is character-identical to the input ("unchanged" per the
 * stemmer contract).
 */
// NOTE(review): the returned CharBuffer wraps the instance buffer, so the
// result is only valid until the next stem() call; presumably callers copy
// it immediately — confirm at call sites. `equals` here is a sibling helper
// comparing buffer[0..newLen) against the CharSequence.
@Override public CharSequence stem(CharSequence word) { if (word.length() > buffer.length) { buffer = new char[word.length()]; } for (int i = 0; i < word.length(); i++) { buffer[i] = word.charAt(i); } int newLen = normalizer.normalize(buffer, word.length()); newLen = delegate.stem(buffer, newLen); if (newLen != word.length() || !equals(buffer, newLen, word)) { return CharBuffer.wrap(buffer, 0, newLen); } // Same-same. return null; }
/**
 * Create and return an {@link IStemmer} adapter for a
 * {@link SnowballProgram} for a given language code. An identity stemmer is
 * returned for unknown languages.
 */
public static IStemmer createStemmer(LanguageCode language) {
  final Class<? extends SnowballProgram> stemmerClass =
      snowballStemmerClasses.get(language);

  // No Snowball support for this language: degrade to identity stemming.
  if (stemmerClass == null) {
    log.warn("No Snowball stemmer class for: " + language.name()
        + ". Quality of clustering may be degraded.");
    return IdentityStemmer.INSTANCE;
  }

  try {
    return new SnowballStemmerAdapter(stemmerClass.newInstance());
  } catch (Exception e) {
    // Instantiation failed (e.g. missing jar); degrade rather than fail.
    log.warn("Could not instantiate snowball stemmer for language: "
        + language.name() + ". Quality of clustering may be degraded.", e);
    return IdentityStemmer.INSTANCE;
  }
}
}
// NOTE(review): incomplete fragment (interior of a per-document loop); left byte-identical.
// Loads the stored Lucene document, optionally runs the highlighter to produce a
// summary snippet, then builds a Carrot2 Document (title, snippet, url) tagged with
// the Solr unique key as "solrId". Presumably the highlighter result replaces
// `snippet` between these statements in the full method — TODO confirm.
org.apache.lucene.document.Document doc = searcher.doc(id, fieldSelector); String snippet = getValue(doc, snippetField); if (produceSummary == true) { docsHolder[0] = id.intValue(); highligher.doHighlighting(docAsList, theQuery, req, snippetFieldAry); Document carrotDocument = new Document(getValue(doc, titleField), snippet, doc.get(urlField)); carrotDocument.addField("solrId", doc.get(idFieldName));
/**
 * Converts Carrot2 clusters to a list of NamedList structures, honoring the
 * OUTPUT_SUB_CLUSTERS and NUM_DESCRIPTIONS request parameters.
 */
private List<NamedList<Object>> clustersToNamedList(List<Cluster> carrotClusters,
    SolrParams solrParams) {
  final boolean outputSubClusters =
      solrParams.getBool(CarrotParams.OUTPUT_SUB_CLUSTERS, true);
  final int maxLabels =
      solrParams.getInt(CarrotParams.NUM_DESCRIPTIONS, Integer.MAX_VALUE);

  List<NamedList<Object>> result = new ArrayList<>();
  clustersToNamedList(carrotClusters, result, outputSubClusters, maxLabels);
  return result;
}
/**
 * Recursively converts Carrot2 clusters into NamedList structures appended
 * to {@code parent}: per cluster, up to {@code maxLabels} label phrases,
 * the member documents' "solrId" values, and (when enabled) subclusters.
 *
 * Fix: raw {@code List}/{@code NamedList}/{@code ArrayList}/
 * {@code SimpleOrderedMap} locals replaced with the parameterized forms
 * used elsewhere in this code. The {@code parent} parameter stays raw for
 * signature compatibility with existing callers, hence the narrow
 * {@code @SuppressWarnings}.
 */
@SuppressWarnings("unchecked")
private void clustersToNamedList(List<Cluster> outputClusters, List parent,
    boolean outputSubClusters, int maxLabels) {
  for (Cluster outCluster : outputClusters) {
    NamedList<Object> cluster = new SimpleOrderedMap<>();
    parent.add(cluster);

    // Trim the label list to the requested maximum.
    List<String> labels = outCluster.getPhrases();
    if (labels.size() > maxLabels) {
      labels = labels.subList(0, maxLabels);
    }
    cluster.add("labels", labels);

    // With subclusters enabled, list only direct documents here; otherwise
    // flatten the whole subtree's documents into this cluster.
    List<Document> docs = outputSubClusters
        ? outCluster.getDocuments()
        : outCluster.getAllDocuments();
    List<Object> docList = new ArrayList<>();
    cluster.add("docs", docList);
    for (Document doc : docs) {
      docList.add(doc.getField("solrId"));
    }

    if (outputSubClusters) {
      List<NamedList<Object>> subclusters = new ArrayList<>();
      cluster.add("clusters", subclusters);
      clustersToNamedList(outCluster.getSubclusters(), subclusters,
          outputSubClusters, maxLabels);
    }
  }
}
// NOTE(review): incomplete fragment (interior of the cluster-conversion loop);
// left byte-identical. Attaches an empty "clusters" list to the current cluster
// entry and recurses into the Carrot2 subclusters to fill it.
List<NamedList<Object>> subclusters = new ArrayList<>(); cluster.add("clusters", subclusters); clustersToNamedList(outCluster.getSubclusters(), subclusters, outputSubClusters, maxLabels);