protected void extractAndUpdate() { // update all docs if (updateSimpleText) { extractAndUpdateSimpleText(); } if (updateBinaryText) { extractAndUpdateBinaryText(); } // reset job id for (DocumentRef docRef : docsToUpdate) { session.setDocumentSystemProp(docRef, SYSPROP_FULLTEXT_JOBID, null); } }
/**
 * Converts a raw string property value to plain text by applying {@code removeHtml} followed by
 * {@code removeEntities}.
 *
 * @param string the raw string value
 * @return the result of {@code removeEntities(removeHtml(string))}
 */
protected String stringToText(String string) {
    // HTML is removed first, then entities are removed from the result
    return removeEntities(removeHtml(string));
}
@Override public void work() { openSystemSession(); // if the runtime has shut down (normally because tests are finished) // this can happen, see NXP-4009 if (session.getPrincipal() == null) { return; } DocumentRef docRef = new IdRef(docId); if (!session.exists(docRef)) { return; } document = session.getDocument(docRef); findDocsToUpdate(); if (docsToUpdate.isEmpty()) { return; } initFulltextConfiguration(); setStatus("Extracting"); setProgress(Progress.PROGRESS_0_PC); extractAndUpdate(); setStatus("Saving"); session.save(); setProgress(Progress.PROGRESS_100_PC); setStatus("Done"); }
protected void extractAndUpdateSimpleText() { if (fulltextConfiguration.fulltextSearchDisabled) { // if fulltext search is disabled, we don't extract simple text at all return; } for (String indexName : fulltextConfiguration.indexNames) { if (!fulltextConfiguration.indexesAllSimple.contains(indexName) && fulltextConfiguration.propPathsByIndexSimple.get(indexName) == null) { // nothing to do: index not configured for simple text continue; } Set<String> includedPaths = fulltextConfiguration.indexesAllSimple.contains(indexName) ? null : fulltextConfiguration.propPathsByIndexSimple.get(indexName); Set<String> excludedPaths = fulltextConfiguration.propPathsExcludedByIndexSimple.get(indexName); // get string properties List<String> strings = new StringsExtractor().findStrings(document, includedPaths, excludedPaths); // transform to text (remove HTML and entities) // we do this here rather than in the indexing backend (Elasticsearch) because it's more efficient here // add space at beginning and end for simulated phrase search using LIKE "% foo bar %" String text = strings.stream().map(this::stringToText).collect(Collectors.joining(" ", " ", " ")); // limit size text = limitStringSize(text, fulltextConfiguration.fulltextFieldSizeLimit); String property = getFulltextPropertyName(SYSPROP_FULLTEXT_SIMPLE, indexName); for (DocumentRef docRef : docsToUpdate) { session.setDocumentSystemProp(docRef, property, text); } } }
// Decide which kinds of fulltext must be refreshed for this id:
// simple text if the id is in dirtyStrings, binary text if it is in dirtyBinaries.
boolean updateSimpleText = dirtyStrings.contains(id);
boolean updateBinaryText = dirtyBinaries.contains(id);
// Create the extraction work for this document (repository name + string form of the id) and queue it in works.
// NOTE(review): the meaning of the trailing `true` constructor argument is not visible from here - confirm
// against the FulltextExtractorWork constructor.
Work work = new FulltextExtractorWork(repository.getName(), model.idToString(id), updateSimpleText, updateBinaryText, true);
works.add(work);
protected void extractAndUpdateBinaryText() { // we extract binary text even if fulltext search is disabled, // because it is still used to inject into external indexers like Elasticsearch BlobsExtractor blobsExtractor = new BlobsExtractor(); Map<Blob, String> blobsText = new IdentityHashMap<>(); for (String indexName : fulltextConfiguration.indexNames) { if (!fulltextConfiguration.indexesAllBinary.contains(indexName) && fulltextConfiguration.propPathsByIndexBinary.get(indexName) == null) { // nothing to do: index not configured for blob continue; } // get original text from all blobs blobsExtractor.setExtractorProperties(fulltextConfiguration.propPathsByIndexBinary.get(indexName), fulltextConfiguration.propPathsExcludedByIndexBinary.get(indexName), fulltextConfiguration.indexesAllBinary.contains(indexName)); List<String> strings = new ArrayList<>(); for (Blob blob : blobsExtractor.getBlobs(document)) { String string = blobsText.computeIfAbsent(blob, this::blobToText); strings.add(string); } // add space at beginning and end for simulated phrase search using LIKE "% foo bar %" String text = " " + String.join(" ", strings) + " "; text = limitStringSize(text, fulltextConfiguration.fulltextFieldSizeLimit); String property = getFulltextPropertyName(SYSPROP_FULLTEXT_BINARY, indexName); for (DocumentRef docRef : docsToUpdate) { session.setDocumentSystemProp(docRef, property, text); } } }