/**
 * Creates the Lucene-backed stemmer adapter, falling back to a no-op
 * identity stemmer when the adapter cannot be constructed.
 */
public static IStemmer createStemmer() {
  try {
    return new LuceneStemmerAdapter();
  } catch (Exception e) {
    // Adapter unavailable in this deployment; degrade gracefully to
    // identity stemming rather than failing clustering outright.
    return IdentityStemmer.INSTANCE;
  }
}
}
/**
 * Returns a stemmer for the given language: a dedicated Arabic stemmer,
 * no stemming for simplified Chinese, and Snowball for everything else.
 */
@Override
public IStemmer getStemmer(LanguageCode language) {
  if (language == LanguageCode.ARABIC) {
    return ArabicStemmerFactory.createStemmer();
  }
  if (language == LanguageCode.CHINESE_SIMPLIFIED) {
    // Stemming is not applicable to Chinese; use the no-op stemmer.
    return IdentityStemmer.INSTANCE;
  }
  // For other languages, try to use snowball's stemming.
  return SnowballStemmerFactory.createStemmer(language);
}
/**
 * Creates the Chinese tokenizer, falling back to the extended-whitespace
 * tokenizer when it cannot be constructed. OutOfMemoryError is always
 * rethrown; any other Throwable (including linkage errors from a missing
 * optional dependency) triggers the fallback.
 */
static ITokenizer createTokenizer() {
  try {
    return new ChineseTokenizer();
  } catch (OutOfMemoryError e) {
    // Never mask an OOM behind a fallback tokenizer.
    throw e;
  } catch (Throwable e) {
    // Throwable (not Exception) on purpose: a missing optional jar surfaces
    // as an Error such as NoClassDefFoundError.
    return new ExtendedWhitespaceTokenizer();
  }
}
/**
 * Runs Carrot2 clustering over the given Solr documents and returns the
 * clusters converted to a Solr NamedList structure. Any failure is logged
 * and rethrown as a SERVER_ERROR {@link SolrException}.
 */
@Override
public Object cluster(Query query, SolrDocumentList solrDocList,
    Map<SolrDocument, Integer> docIds, SolrQueryRequest sreq) {
  try {
    // Prepare attributes for Carrot2 clustering call
    Map<String, Object> attributes = new HashMap<>();
    List<Document> documents = getDocuments(solrDocList, docIds, query, sreq);
    attributes.put(AttributeNames.DOCUMENTS, documents);
    attributes.put(AttributeNames.QUERY, query.toString());

    // Pass the fields on which clustering runs.
    attributes.put("solrFieldNames", getFieldsForClustering(sreq));

    // Pass extra overriding attributes from the request, if any
    extractCarrotAttributes(sreq.getParams(), attributes);

    // Perform clustering and convert to an output structure of clusters.
    //
    // Carrot2 uses current thread's context class loader to get
    // certain classes (e.g. custom tokenizer/stemmer) at runtime.
    // To make sure classes from contrib JARs are available,
    // we swap the context class loader for the time of clustering.
    return withContextClassLoader(core.getResourceLoader().getClassLoader(),
        () -> clustersToNamedList(
            controller.process(attributes, clusteringAlgorithmClass).getClusters(),
            sreq.getParams()));
  } catch (Exception e) {
    log.error("Carrot2 clustering failed", e);
    throw new SolrException(ErrorCode.SERVER_ERROR, "Carrot2 clustering failed", e);
  }
}
public Object cluster(Query query, DocList docList, SolrQueryRequest sreq) { try { // Prepare attributes for Carrot2 clustering call Map<String, Object> attributes = new HashMap<String, Object>(); List<Document> documents = getDocuments(docList, query, sreq); attributes.put(AttributeNames.DOCUMENTS, documents); attributes.put(AttributeNames.QUERY, query.toString()); // Pass extra overriding attributes from the request, if any extractCarrotAttributes(sreq.getParams(), attributes); // Perform clustering and convert to named list return clustersToNamedList(controller.process(attributes, clusteringAlgorithmClass).getClusters(), sreq.getParams()); } catch (Exception e) { log.error("Carrot2 clustering failed", e); throw new RuntimeException(e); } }
// NOTE(review): incomplete fragment — cut mid-expression/mid-try; left byte-identical.
// Appears to wire a SolrResourceLocator for this core, deserialize attribute-value
// sets from the first attribute XML resource, merge request-level Carrot2 attributes,
// and initialize the Carrot2 controller under the contrib class loader (Carrot2
// resolves custom classes via the thread context class loader) — TODO confirm against
// the full enclosing method.
new SolrResourceLocator(core, initParams), withContextClassLoader(core.getResourceLoader().getClassLoader(), () -> { try { AttributeValueSets avs = AttributeValueSets.deserialize(attributeXmls[0].open()); extractCarrotAttributes(initParams, initAttributes); withContextClassLoader(core.getResourceLoader().getClassLoader(), () -> this.controller.init(initAttributes));
/**
 * Converts Carrot2 clusters to a list of NamedList structures, honoring the
 * {@code OUTPUT_SUB_CLUSTERS} and {@code NUM_DESCRIPTIONS} request params.
 *
 * Fix: replaced raw {@code List}/{@code ArrayList} with the parameterized
 * {@code List<NamedList<Object>>} form used elsewhere in this code; the
 * change is erasure-compatible with existing raw-typed callers.
 */
private List<NamedList<Object>> clustersToNamedList(List<Cluster> carrotClusters,
    SolrParams solrParams) {
  List<NamedList<Object>> result = new ArrayList<>();
  clustersToNamedList(carrotClusters, result,
      solrParams.getBool(CarrotParams.OUTPUT_SUB_CLUSTERS, true),
      solrParams.getInt(CarrotParams.NUM_DESCRIPTIONS, Integer.MAX_VALUE));
  return result;
}
/**
 * Computes the set of stored fields to load for clustering: the clustering
 * fields themselves plus the unique-key field, the URL field, any custom
 * field mappings, and (when configured and non-blank) the language field.
 */
@Override
protected Set<String> getFieldsToLoad(SolrQueryRequest sreq) {
  SolrParams params = sreq.getParams();

  HashSet<String> result = new HashSet<>(getFieldsForClustering(sreq));
  result.add(idFieldName);
  result.add(params.get(CarrotParams.URL_FIELD_NAME, "url"));
  result.addAll(getCustomFieldsMap(params).keySet());

  String langField = params.get(CarrotParams.LANGUAGE_FIELD_NAME);
  if (StringUtils.isNotBlank(langField)) {
    result.add(langField);
  }
  return result;
}
@Override public String init(NamedList config, final SolrCore core) { String result = super.init(config, core); SolrParams initParams = SolrParams.toSolrParams(config); // Initialize Carrot2 controller. Pass initialization attributes, if any. HashMap<String, Object> initAttributes = new HashMap<String, Object>(); extractCarrotAttributes(initParams, initAttributes); this.controller.init(initAttributes); this.idFieldName = core.getSchema().getUniqueKeyField().getName(); // Make sure the requested Carrot2 clustering algorithm class is available String carrotAlgorithmClassName = initParams.get(CarrotParams.ALGORITHM); Class<?> algorithmClass = core.getResourceLoader().findClass(carrotAlgorithmClassName); if (!IClusteringAlgorithm.class.isAssignableFrom(algorithmClass)) { throw new IllegalArgumentException("Class provided as " + CarrotParams.ALGORITHM + " must implement " + IClusteringAlgorithm.class.getName()); } this.clusteringAlgorithmClass = (Class<? extends IClusteringAlgorithm>) algorithmClass; return result; }
// NOTE(review): incomplete fragment (part of a document-building loop); left byte-identical.
// Builds a Carrot2 Document from a SolrDocument: concatenated title/snippet field
// values plus the URL field (empty string when absent). `customFields` is presumably
// the Solr-field -> Carrot2-field mapping consumed just after this fragment — TODO
// confirm against the full method.
Map<String, String> customFields = getCustomFieldsMap(solrParams); snippet = getConcatenated(sdoc, snippetFieldSpec); Document carrotDocument = new Document(getConcatenated(sdoc, titleFieldSpec), snippet, ObjectUtils.toString(sdoc.getFieldValue(urlField), ""));
@Override public ITokenizer getTokenizer(LanguageCode language) { switch (language) { case CHINESE_SIMPLIFIED: return ChineseTokenizerFactory.createTokenizer(); /* * We use our own analyzer for Arabic. Lucene's version has special * support for Nonspacing-Mark characters (see * http://www.fileformat.info/info/unicode/category/Mn/index.htm), but we * have them included as letters in the parser. */ case ARABIC: // Intentional fall-through. default: return new ExtendedWhitespaceTokenizer(); } }
/**
 * Creates a resource locator backed by the core's resource loader.
 * The Carrot2 resources directory is taken from the RESOURCES_DIR init
 * param, defaulting to CarrotClusteringEngine.CARROT_RESOURCES_PREFIX
 * when unset.
 */
// NOTE(review): relies on firstNonNull() returning the first non-null argument;
// presumably CARROT_RESOURCES_PREFIX is a non-null constant — confirm at the
// helper's definition.
public SolrResourceLocator(SolrCore core, SolrParams initParams) { resourceLoader = core.getResourceLoader(); String resourcesDir = initParams.get(CarrotParams.RESOURCES_DIR); carrot2ResourcesDir = firstNonNull(resourcesDir, CarrotClusteringEngine.CARROT_RESOURCES_PREFIX); }
/**
 * Stems a single word: copies it into a reusable char buffer (grown on
 * demand, never shrunk), normalizes it, then applies the delegate stemmer.
 *
 * Returns the stemmed form wrapped over the shared buffer, or {@code null}
 * when the result is character-identical to the input ("unchanged" per the
 * stemmer contract).
 */
// NOTE(review): the returned CharBuffer wraps the instance buffer, so the
// result is only valid until the next stem() call; presumably callers copy
// it immediately — confirm at call sites. `equals` here is a sibling helper
// comparing buffer[0..newLen) against the CharSequence.
@Override public CharSequence stem(CharSequence word) { if (word.length() > buffer.length) { buffer = new char[word.length()]; } for (int i = 0; i < word.length(); i++) { buffer[i] = word.charAt(i); } int newLen = normalizer.normalize(buffer, word.length()); newLen = delegate.stem(buffer, newLen); if (newLen != word.length() || !equals(buffer, newLen, word)) { return CharBuffer.wrap(buffer, 0, newLen); } // Same-same. return null; }
/**
 * Create and return an {@link IStemmer} adapter for a
 * {@link SnowballProgram} for a given language code. An identity stemmer is
 * returned for unknown languages.
 */
public static IStemmer createStemmer(LanguageCode language) {
  final Class<? extends SnowballProgram> stemmerClass =
      snowballStemmerClasses.get(language);

  // No Snowball support for this language: degrade to identity stemming.
  if (stemmerClass == null) {
    log.warn("No Snowball stemmer class for: " + language.name()
        + ". Quality of clustering may be degraded.");
    return IdentityStemmer.INSTANCE;
  }

  try {
    return new SnowballStemmerAdapter(stemmerClass.newInstance());
  } catch (Exception e) {
    // Instantiation failed (e.g. missing jar); degrade rather than fail.
    log.warn("Could not instantiate snowball stemmer for language: "
        + language.name() + ". Quality of clustering may be degraded.", e);
    return IdentityStemmer.INSTANCE;
  }
}
}
// NOTE(review): incomplete fragment (interior of a per-document loop); left byte-identical.
// Loads the stored Lucene document, optionally runs the highlighter to produce a
// summary snippet, then builds a Carrot2 Document (title, snippet, url) tagged with
// the Solr unique key as "solrId". Presumably the highlighter result replaces
// `snippet` between these statements in the full method — TODO confirm.
org.apache.lucene.document.Document doc = searcher.doc(id, fieldSelector); String snippet = getValue(doc, snippetField); if (produceSummary == true) { docsHolder[0] = id.intValue(); highligher.doHighlighting(docAsList, theQuery, req, snippetFieldAry); Document carrotDocument = new Document(getValue(doc, titleField), snippet, doc.get(urlField)); carrotDocument.addField("solrId", doc.get(idFieldName));
/**
 * Converts Carrot2 clusters to a list of NamedList structures, honoring the
 * OUTPUT_SUB_CLUSTERS and NUM_DESCRIPTIONS request parameters.
 */
private List<NamedList<Object>> clustersToNamedList(List<Cluster> carrotClusters,
    SolrParams solrParams) {
  final boolean outputSubClusters =
      solrParams.getBool(CarrotParams.OUTPUT_SUB_CLUSTERS, true);
  final int maxLabels =
      solrParams.getInt(CarrotParams.NUM_DESCRIPTIONS, Integer.MAX_VALUE);

  List<NamedList<Object>> result = new ArrayList<>();
  clustersToNamedList(carrotClusters, result, outputSubClusters, maxLabels);
  return result;
}
/**
 * Recursively converts Carrot2 clusters into NamedList structures appended
 * to {@code parent}: per cluster, up to {@code maxLabels} label phrases,
 * the member documents' "solrId" values, and (when enabled) subclusters.
 *
 * Fix: raw {@code List}/{@code NamedList}/{@code ArrayList}/
 * {@code SimpleOrderedMap} locals replaced with the parameterized forms
 * used elsewhere in this code. The {@code parent} parameter stays raw for
 * signature compatibility with existing callers, hence the narrow
 * {@code @SuppressWarnings}.
 */
@SuppressWarnings("unchecked")
private void clustersToNamedList(List<Cluster> outputClusters, List parent,
    boolean outputSubClusters, int maxLabels) {
  for (Cluster outCluster : outputClusters) {
    NamedList<Object> cluster = new SimpleOrderedMap<>();
    parent.add(cluster);

    // Trim the label list to the requested maximum.
    List<String> labels = outCluster.getPhrases();
    if (labels.size() > maxLabels) {
      labels = labels.subList(0, maxLabels);
    }
    cluster.add("labels", labels);

    // With subclusters enabled, list only direct documents here; otherwise
    // flatten the whole subtree's documents into this cluster.
    List<Document> docs = outputSubClusters
        ? outCluster.getDocuments()
        : outCluster.getAllDocuments();
    List<Object> docList = new ArrayList<>();
    cluster.add("docs", docList);
    for (Document doc : docs) {
      docList.add(doc.getField("solrId"));
    }

    if (outputSubClusters) {
      List<NamedList<Object>> subclusters = new ArrayList<>();
      cluster.add("clusters", subclusters);
      clustersToNamedList(outCluster.getSubclusters(), subclusters,
          outputSubClusters, maxLabels);
    }
  }
}
// NOTE(review): incomplete fragment (interior of the cluster-conversion loop);
// left byte-identical. Attaches an empty "clusters" list to the current cluster
// entry and recurses into the Carrot2 subclusters to fill it.
List<NamedList<Object>> subclusters = new ArrayList<>(); cluster.add("clusters", subclusters); clustersToNamedList(outCluster.getSubclusters(), subclusters, outputSubClusters, maxLabels);