/**
 * Collects the blob properties of a document.
 * <p>
 * The candidate paths are those returned by {@link #getBlobPaths} for the document's type. Paths rejected
 * by {@code isInterestingPath} are skipped. Each remaining path is cut at its list wildcard separators
 * (slash, star, slash): the first piece is resolved directly on the document and the remaining pieces are
 * walked by {@code findBlobsProperties}, which only keeps properties whose value is not {@code null}.
 *
 * @param doc the document
 * @return the list of blob properties
 * @throws IllegalStateException if splitting a path yields no segment at all
 */
public List<Property> getBlobsProperties(DocumentModel doc) {
    List<Property> found = new ArrayList<>();
    for (String blobPath : getBlobPaths(doc.getDocumentType())) {
        if (isInterestingPath(blobPath)) {
            String[] segments = blobPath.split("/[*]/");
            if (segments.length == 0) {
                // happens when the path consists of nothing but the separator
                throw new IllegalStateException("Path detected not well-formed: " + blobPath);
            }
            Property root = doc.getProperty(segments[0]);
            List<String> remaining = Arrays.asList(segments).subList(1, segments.length);
            findBlobsProperties(root, remaining, found);
        }
    }
    return found;
}
/**
 * Returns the blobs held by the blob properties of a document.
 * <p>
 * Delegates to {@link #getBlobsProperties} and casts each property value to {@link Blob}, keeping the
 * order in which the properties were returned.
 *
 * @param doc the document
 * @return the list of blobs
 */
public List<Blob> getBlobs(DocumentModel doc) {
    List<Property> blobProperties = getBlobsProperties(doc);
    List<Blob> result = new ArrayList<>(blobProperties.size());
    blobProperties.forEach(blobProperty -> result.add((Blob) blobProperty.getValue()));
    return result;
}
/**
 * Configures which properties or values {@link #getBlobsProperties} and {@link #getBlobs} return.
 * <p>
 * The properties have to be defined without prefix if there is no prefix in the schema definition. For
 * blob properties, the path must include the {@code /data} part.
 * <p>
 * Both path sets are stored after going through {@code normalizePaths}. The extractor is flagged as being
 * in its default configuration only when both sets are passed as {@code null} and {@code allBlobs} is
 * {@code true}.
 *
 * @param includedPaths the paths to include, may be {@code null}
 * @param excludedPaths the paths to exclude, may be {@code null}
 * @param allBlobs the "all blobs" flag, stored as given
 */
public void setExtractorProperties(Set<String> includedPaths, Set<String> excludedPaths, boolean allBlobs) {
    this.includedPaths = normalizePaths(includedPaths);
    this.excludedPaths = normalizePaths(excludedPaths);
    this.allBlobs = allBlobs;
    // the raw arguments, not the normalized fields, decide whether this is the default setup
    boolean noPathFilter = includedPaths == null && excludedPaths == null;
    isDefaultConfiguration = noPathFilter && allBlobs;
}
protected void extractAndUpdateBinaryText() { // we extract binary text even if fulltext search is disabled, // because it is still used to inject into external indexers like Elasticsearch BlobsExtractor blobsExtractor = new BlobsExtractor(); Map<Blob, String> blobsText = new IdentityHashMap<>(); for (String indexName : fulltextConfiguration.indexNames) { if (!fulltextConfiguration.indexesAllBinary.contains(indexName) && fulltextConfiguration.propPathsByIndexBinary.get(indexName) == null) { // nothing to do: index not configured for blob continue; } // get original text from all blobs blobsExtractor.setExtractorProperties(fulltextConfiguration.propPathsByIndexBinary.get(indexName), fulltextConfiguration.propPathsExcludedByIndexBinary.get(indexName), fulltextConfiguration.indexesAllBinary.contains(indexName)); List<String> strings = new ArrayList<>(); for (Blob blob : blobsExtractor.getBlobs(document)) { String string = blobsText.computeIfAbsent(blob, this::blobToText); strings.add(string); } // add space at beginning and end for simulated phrase search using LIKE "% foo bar %" String text = " " + String.join(" ", strings) + " "; text = limitStringSize(text, fulltextConfiguration.fulltextFieldSizeLimit); String property = getFulltextPropertyName(SYSPROP_FULLTEXT_BINARY, indexName); for (DocumentRef docRef : docsToUpdate) { session.setDocumentSystemProp(docRef, property, text); } } }
// Collect the document's blob properties with a freshly constructed extractor, i.e. one on which
// setExtractorProperties has not been called.
// NOTE(review): which paths such an extractor treats as interesting is not visible here - confirm.
List<Property> properties = new BlobsExtractor().getBlobsProperties(doc);
/**
 * Gets the blob paths of the document type. Extractor properties are ignored.
 * <p>
 * The result is looked up in {@code docBlobPaths} by document type name. On a miss, the paths are
 * computed by running {@code findBlobPaths} over each schema of the type, and the resulting list is
 * stored under that name. The returned list is the stored instance itself, not a copy.
 *
 * @param documentType the document type
 * @return the list of blob paths
 *
 * @since 8.3
 */
public List<String> getBlobPaths(DocumentType documentType) {
    String typeName = documentType.getName();
    List<String> cached = docBlobPaths.get(typeName);
    if (cached != null) {
        return cached;
    }
    List<String> computed = new ArrayList<>();
    for (Schema schema : documentType.getSchemas()) {
        findBlobPaths(schema, null, schema, computed);
    }
    docBlobPaths.put(typeName, computed);
    return computed;
}
/**
 * Recursively walks a property along the remaining path segments and collects the properties reached at
 * the end of the path.
 * <p>
 * When no segment is left, {@code property} itself is added to {@code properties}, provided its value is
 * not {@code null}. Otherwise, for every child of {@code property}, the sub-property named by the first
 * segment is looked up and the walk continues on it with the remaining segments.
 *
 * @param property the property to inspect
 * @param split the path segments still to be followed; not modified by this method
 * @param properties the accumulator receiving the matching properties
 */
protected void findBlobsProperties(Property property, List<String> split, List<Property> properties) {
    if (split.isEmpty()) {
        // end of the path: keep the property only when it actually holds a value
        if (property.getValue() != null) {
            properties.add(property);
        }
        return;
    }
    // head and tail of the path are the same for every child
    String childName = split.get(0);
    List<String> rest = split.subList(1, split.size());
    for (Property child : property.getChildren()) {
        findBlobsProperties(child.get(childName), rest, properties);
    }
}
Type fieldType = ((ListType) type).getFieldType(); if (fieldType.isComplexType()) { findBlobPaths((ComplexType) fieldType, fieldPath + "/*", schema, paths); } else { continue; // not binary text findBlobPaths(ctype, fieldPath, schema, paths);