/**
 * Writes a debug log entry announcing the content extraction of a resource for a search index.<p>
 *
 * Nothing is logged unless the debug level is enabled for this class's log channel.<p>
 *
 * @param resource the resource whose root path is written to the log
 * @param index the search index whose name is written to the log
 */
protected void logContentExtraction(CmsResource resource, CmsSearchIndex index) {

    if (!LOG.isDebugEnabled()) {
        // skip building the localized message when it would be discarded anyway
        return;
    }
    LOG.debug(
        Messages.get().getBundle().key(
            Messages.LOG_EXTRACT_CONTENT_2,
            resource.getRootPath(),
            index.getName()));
}
/**
 * Builds the document keys for every combination of the given resource types and mime types.<p>
 *
 * If the resource type list contains the wildcard <code>"*"</code>, the names of all resource types
 * known to the resource manager are used instead. For an empty mime type list, one key with a
 * <code>null</code> mime type is generated per resource type.<p>
 *
 * @param resourceTypes the resource type names to generate keys for, or a list containing <code>"*"</code>
 * @param mimeTypes the mime types to generate keys for, may be empty
 *
 * @return the list of generated document keys
 *
 * @throws CmsException if anything goes wrong while resolving a resource type or creating a key
 *
 * @see org.opencms.search.documents.I_CmsDocumentFactory#getDocumentKeys(java.util.List, java.util.List)
 */
public List<String> getDocumentKeys(List<String> resourceTypes, List<String> mimeTypes) throws CmsException {

    List<String> result = new ArrayList<String>();

    List<String> requestedTypes = resourceTypes;
    if (resourceTypes.contains("*")) {
        // wildcard: expand to the names of all configured resource types
        requestedTypes = new ArrayList<String>();
        for (I_CmsResourceType configuredType : OpenCms.getResourceManager().getResourceTypes()) {
            requestedTypes.add(configuredType.getTypeName());
        }
    }

    try {
        for (String requestedType : requestedTypes) {
            // resolve through the resource manager so that unknown type names are rejected
            String typeName = OpenCms.getResourceManager().getResourceType(requestedType).getTypeName();
            if (mimeTypes.isEmpty()) {
                result.add(getDocumentKey(typeName, null));
            } else {
                for (String mimeType : mimeTypes) {
                    result.add(getDocumentKey(typeName, mimeType));
                }
            }
        }
    } catch (Exception cause) {
        throw new CmsException(Messages.get().container(Messages.ERR_CREATE_DOC_KEY_0), cause);
    }

    return result;
}
/** * Just returns an empty extraction result since the content can't be extracted form a generic resource.<p> * * @see org.opencms.search.documents.I_CmsSearchExtractor#extractContent(CmsObject, CmsResource, CmsSearchIndex) */ public I_CmsExtractionResult extractContent(CmsObject cms, CmsResource resource, CmsSearchIndex index) throws CmsIndexException { if (resource == null) { throw new CmsIndexException(Messages.get().container(Messages.ERR_NO_RAW_CONTENT_1, index.getLocale())); } // just return an empty result set return new CmsExtractionResult(""); }
/** * Just returns an empty extraction result since the content can't be extracted form a generic resource.<p> * * @see org.opencms.search.documents.I_CmsSearchExtractor#extractContent(CmsObject, CmsResource, CmsSearchIndex) */ public I_CmsExtractionResult extractContent(CmsObject cms, CmsResource resource, CmsSearchIndex index) throws CmsIndexException { if (resource == null) { throw new CmsIndexException(Messages.get().container(Messages.ERR_NO_RAW_CONTENT_1, index.getLocale())); } // just return an empty result set return new CmsExtractionResult(""); }
/** * Returns the raw text content of a given vfs resource containing MS Excel data.<p> * * @see org.opencms.search.documents.I_CmsSearchExtractor#extractContent(CmsObject, CmsResource, CmsSearchIndex) */ public I_CmsExtractionResult extractContent(CmsObject cms, CmsResource resource, CmsSearchIndex index) throws CmsIndexException, CmsException { CmsFile file = readFile(cms, resource); try { return CmsExtractorMsExcel.getExtractor().extractText(file.getContents()); } catch (Exception e) { if (e instanceof FileNotFoundException) { if ((e.getMessage() != null) && (e.getMessage().indexOf("Workbook") > 0)) { // special case: catch Excel95 format error throw new CmsIndexException(Messages.get().container( Messages.ERR_NO_EXCEL_FORMAT_1, resource.getRootPath()), e); } } throw new CmsIndexException( Messages.get().container(Messages.ERR_TEXT_EXTRACTION_1, resource.getRootPath()), e); } }
/**
 * Reads the given resource as a {@link CmsFile} and makes sure it actually has content.<p>
 *
 * @param cms the current users OpenCms context used to read the file
 * @param resource the resource to read
 *
 * @return the {@link CmsFile} read for the given resource
 *
 * @throws CmsException if the resource could not be read
 * @throws CmsIndexException if the length of the file read is zero or less
 */
protected CmsFile readFile(CmsObject cms, CmsResource resource) throws CmsException, CmsIndexException {

    CmsFile upgraded = cms.readFile(resource);
    if (upgraded.getLength() > 0) {
        return upgraded;
    }
    // an empty file can not be indexed
    throw new CmsIndexException(Messages.get().container(Messages.ERR_NO_CONTENT_1, resource.getRootPath()));
}
}
/**
 * Reads the given resource as a {@link CmsFile} and makes sure it actually has content.<p>
 *
 * @param cms the current users OpenCms context used to read the file
 * @param resource the resource to read
 *
 * @return the {@link CmsFile} read for the given resource
 *
 * @throws CmsException if the resource could not be read
 * @throws CmsIndexNoContentException if the length of the file read is zero or less
 */
protected CmsFile readFile(CmsObject cms, CmsResource resource)
throws CmsException, CmsIndexNoContentException {

    CmsFile upgraded = cms.readFile(resource);
    if (upgraded.getLength() > 0) {
        return upgraded;
    }
    // an empty file can not be indexed
    throw new CmsIndexNoContentException(
        Messages.get().container(Messages.ERR_NO_CONTENT_1, resource.getRootPath()));
}
}
/** * Generates a new lucene document instance from contents of the given resource for the provided index.<p> * * For container pages, we must not cache based on the container page content age, * since the content of the included elements may change any time. */ @Override public I_CmsSearchDocument createDocument(CmsObject cms, CmsResource resource, CmsSearchIndex index) throws CmsException { // extract the content from the resource I_CmsExtractionResult content = null; if (index.isExtractingContent()) { // do full text content extraction only if required try { content = extractContent(cms, resource, index); } catch (Exception e) { // text extraction failed for document - continue indexing meta information only LOG.error(Messages.get().getBundle().key(Messages.ERR_TEXT_EXTRACTION_1, resource.getRootPath()), e); } } // create the Lucene document according to the index field configuration return index.getFieldConfiguration().createDocument(cms, resource, index, content); }
/**
 * Returns the raw text content of a given vfs resource containing plain text data.<p>
 *
 * The bytes of the file are decoded using the value of the content encoding property read
 * for the resource, with the system default encoding passed as default value.<p>
 *
 * @param cms the current users OpenCms context
 * @param resource the resource to extract the content from
 * @param index the search index the content is extracted for
 *
 * @return an extraction result containing the decoded file content
 *
 * @throws CmsException if the resource could not be read, or the text extraction fails
 *
 * @see org.opencms.search.documents.I_CmsSearchExtractor#extractContent(CmsObject, CmsResource, CmsSearchIndex)
 */
public I_CmsExtractionResult extractContent(CmsObject cms, CmsResource resource, CmsSearchIndex index)
throws CmsException {

    CmsFile textFile = readFile(cms, resource);
    try {
        String charsetName = cms.readPropertyObject(
            resource,
            CmsPropertyDefinition.PROPERTY_CONTENT_ENCODING,
            true).getValue(OpenCms.getSystemInfo().getDefaultEncoding());
        String decoded = new String(textFile.getContents(), charsetName);
        return new CmsExtractionResult(decoded);
    } catch (Exception cause) {
        throw new CmsIndexException(
            Messages.get().container(Messages.ERR_TEXT_EXTRACTION_1, resource.getRootPath()),
            cause);
    }
}
/**
 * Returns the raw text content of a given vfs resource containing RTF data.<p>
 *
 * @param cms the current users OpenCms context
 * @param resource the resource to extract the content from
 * @param index the search index the content is extracted for
 *
 * @return the result produced by the RTF text extractor
 *
 * @throws CmsException if the resource could not be read, or the text extraction fails
 *
 * @see org.opencms.search.documents.I_CmsSearchExtractor#extractContent(CmsObject, CmsResource, CmsSearchIndex)
 */
public I_CmsExtractionResult extractContent(CmsObject cms, CmsResource resource, CmsSearchIndex index)
throws CmsException {

    CmsFile rtfFile = readFile(cms, resource);
    try {
        return CmsExtractorRtf.getExtractor().extractText(rtfFile.getContents());
    } catch (Exception cause) {
        // report every extractor problem as an index exception for this resource
        CmsIndexException failure = new CmsIndexException(
            Messages.get().container(Messages.ERR_TEXT_EXTRACTION_1, resource.getRootPath()),
            cause);
        throw failure;
    }
}
/**
 * Returns the raw text content of a given vfs resource containing MS Word data.<p>
 *
 * @param cms the current users OpenCms context
 * @param resource the resource to extract the content from
 * @param index the search index the content is extracted for
 *
 * @return the result produced by the MS Word text extractor
 *
 * @throws CmsIndexException if the text extraction fails
 * @throws CmsException if the resource could not be read
 *
 * @see org.opencms.search.documents.I_CmsSearchExtractor#extractContent(CmsObject, CmsResource, CmsSearchIndex)
 */
public I_CmsExtractionResult extractContent(CmsObject cms, CmsResource resource, CmsSearchIndex index)
throws CmsIndexException, CmsException {

    CmsFile wordFile = readFile(cms, resource);
    try {
        return CmsExtractorMsWord.getExtractor().extractText(wordFile.getContents());
    } catch (Exception cause) {
        // report every extractor problem as an index exception for this resource
        CmsIndexException failure = new CmsIndexException(
            Messages.get().container(Messages.ERR_TEXT_EXTRACTION_1, resource.getRootPath()),
            cause);
        throw failure;
    }
}
/**
 * Returns the raw text content of a given vfs resource containing plain text data.<p>
 *
 * The extraction is announced in the log first. The bytes of the file are then decoded using
 * the value of the content encoding property read for the resource, with the system default
 * encoding passed as default value.<p>
 *
 * @param cms the current users OpenCms context
 * @param resource the resource to extract the content from
 * @param index the search index the content is extracted for
 *
 * @return an extraction result containing the decoded file content
 *
 * @throws CmsException if the resource could not be read, or the text extraction fails
 *
 * @see org.opencms.search.documents.I_CmsSearchExtractor#extractContent(CmsObject, CmsResource, CmsSearchIndex)
 */
public I_CmsExtractionResult extractContent(CmsObject cms, CmsResource resource, CmsSearchIndex index)
throws CmsException {

    logContentExtraction(resource, index);
    CmsFile textFile = readFile(cms, resource);
    try {
        String charsetName = cms.readPropertyObject(
            resource,
            CmsPropertyDefinition.PROPERTY_CONTENT_ENCODING,
            true).getValue(OpenCms.getSystemInfo().getDefaultEncoding());
        String decoded = new String(textFile.getContents(), charsetName);
        return new CmsExtractionResult(decoded);
    } catch (Exception cause) {
        throw new CmsIndexException(
            Messages.get().container(Messages.ERR_TEXT_EXTRACTION_1, resource.getRootPath()),
            cause);
    }
}
/**
 * Returns the raw text content of a given vfs resource containing MS PowerPoint data.<p>
 *
 * @param cms the current users OpenCms context
 * @param resource the resource to extract the content from
 * @param index the search index the content is extracted for
 *
 * @return the result produced by the MS PowerPoint text extractor
 *
 * @throws CmsIndexException if the text extraction fails
 * @throws CmsException if the resource could not be read
 *
 * @see org.opencms.search.documents.I_CmsSearchExtractor#extractContent(CmsObject, CmsResource, CmsSearchIndex)
 */
public I_CmsExtractionResult extractContent(CmsObject cms, CmsResource resource, CmsSearchIndex index)
throws CmsIndexException, CmsException {

    CmsFile slidesFile = readFile(cms, resource);
    try {
        return CmsExtractorMsPowerPoint.getExtractor().extractText(slidesFile.getContents());
    } catch (Exception cause) {
        // report every extractor problem as an index exception for this resource
        CmsIndexException failure = new CmsIndexException(
            Messages.get().container(Messages.ERR_TEXT_EXTRACTION_1, resource.getRootPath()),
            cause);
        throw failure;
    }
}
/**
 * Returns the raw text content of a given vfs resource, extracted with the OpenOffice text extractor.<p>
 *
 * @param cms the current users OpenCms context
 * @param resource the resource to extract the content from
 * @param index the search index the content is extracted for
 *
 * @return the result produced by {@link CmsExtractorOpenOffice}
 *
 * @throws CmsIndexException if the text extraction fails
 * @throws CmsException if the resource could not be read
 *
 * @see org.opencms.search.documents.I_CmsSearchExtractor#extractContent(CmsObject, CmsResource, CmsSearchIndex)
 */
public I_CmsExtractionResult extractContent(CmsObject cms, CmsResource resource, CmsSearchIndex index)
throws CmsIndexException, CmsException {

    CmsFile officeFile = readFile(cms, resource);
    try {
        return CmsExtractorOpenOffice.getExtractor().extractText(officeFile.getContents());
    } catch (Exception cause) {
        // report every extractor problem as an index exception for this resource
        CmsIndexException failure = new CmsIndexException(
            Messages.get().container(Messages.ERR_TEXT_EXTRACTION_1, resource.getRootPath()),
            cause);
        throw failure;
    }
}
/**
 * Returns the raw text content of a given vfs resource, extracted with the MS Office OOXML text extractor.<p>
 *
 * The extraction is announced in the log before the file is read.<p>
 *
 * @param cms the current users OpenCms context
 * @param resource the resource to extract the content from
 * @param index the search index the content is extracted for
 *
 * @return the result produced by {@link CmsExtractorMsOfficeOOXML}
 *
 * @throws CmsIndexException if the text extraction fails
 * @throws CmsException if the resource could not be read
 *
 * @see org.opencms.search.documents.I_CmsSearchExtractor#extractContent(CmsObject, CmsResource, CmsSearchIndex)
 */
public I_CmsExtractionResult extractContent(CmsObject cms, CmsResource resource, CmsSearchIndex index)
throws CmsIndexException, CmsException {

    logContentExtraction(resource, index);
    CmsFile officeFile = readFile(cms, resource);
    try {
        return CmsExtractorMsOfficeOOXML.getExtractor().extractText(officeFile.getContents());
    } catch (Exception cause) {
        // report every extractor problem as an index exception for this resource
        CmsIndexException failure = new CmsIndexException(
            Messages.get().container(Messages.ERR_TEXT_EXTRACTION_1, resource.getRootPath()),
            cause);
        throw failure;
    }
}
/**
 * Returns the raw text content of a given vfs resource, extracted with the MS Office OLE2 text extractor.<p>
 *
 * The extraction is announced in the log before the file is read. Note that every
 * {@link Throwable} raised during the extraction, including errors, is converted
 * into a {@link CmsIndexException}.<p>
 *
 * @param cms the current users OpenCms context
 * @param resource the resource to extract the content from
 * @param index the search index the content is extracted for
 *
 * @return the result produced by {@link CmsExtractorMsOfficeOLE2}
 *
 * @throws CmsIndexException if the text extraction fails
 * @throws CmsException if the resource could not be read
 *
 * @see org.opencms.search.documents.I_CmsSearchExtractor#extractContent(CmsObject, CmsResource, CmsSearchIndex)
 */
public I_CmsExtractionResult extractContent(CmsObject cms, CmsResource resource, CmsSearchIndex index)
throws CmsIndexException, CmsException {

    logContentExtraction(resource, index);
    CmsFile officeFile = readFile(cms, resource);
    try {
        return CmsExtractorMsOfficeOLE2.getExtractor().extractText(officeFile.getContents());
    } catch (Throwable cause) {
        // deliberately broad: errors raised by the extractor are reported the same way as exceptions
        CmsIndexException failure = new CmsIndexException(
            Messages.get().container(Messages.ERR_TEXT_EXTRACTION_1, resource.getRootPath()),
            cause);
        throw failure;
    }
}
/**
 * Returns the raw text content of a given vfs resource, extracted with the OpenOffice text extractor.<p>
 *
 * The extraction is announced in the log before the file is read.<p>
 *
 * @param cms the current users OpenCms context
 * @param resource the resource to extract the content from
 * @param index the search index the content is extracted for
 *
 * @return the result produced by {@link CmsExtractorOpenOffice}
 *
 * @throws CmsIndexException if the text extraction fails
 * @throws CmsException if the resource could not be read
 *
 * @see org.opencms.search.documents.I_CmsSearchExtractor#extractContent(CmsObject, CmsResource, CmsSearchIndex)
 */
public I_CmsExtractionResult extractContent(CmsObject cms, CmsResource resource, CmsSearchIndex index)
throws CmsIndexException, CmsException {

    logContentExtraction(resource, index);
    CmsFile officeFile = readFile(cms, resource);
    try {
        return CmsExtractorOpenOffice.getExtractor().extractText(officeFile.getContents());
    } catch (Exception cause) {
        // report every extractor problem as an index exception for this resource
        CmsIndexException failure = new CmsIndexException(
            Messages.get().container(Messages.ERR_TEXT_EXTRACTION_1, resource.getRootPath()),
            cause);
        throw failure;
    }
}
/**
 * Returns the raw text content of a given vfs resource containing RTF data.<p>
 *
 * The extraction is announced in the log before the file is read.<p>
 *
 * @param cms the current users OpenCms context
 * @param resource the resource to extract the content from
 * @param index the search index the content is extracted for
 *
 * @return the result produced by the RTF text extractor
 *
 * @throws CmsException if the resource could not be read, or the text extraction fails
 *
 * @see org.opencms.search.documents.I_CmsSearchExtractor#extractContent(CmsObject, CmsResource, CmsSearchIndex)
 */
public I_CmsExtractionResult extractContent(CmsObject cms, CmsResource resource, CmsSearchIndex index)
throws CmsException {

    logContentExtraction(resource, index);
    CmsFile rtfFile = readFile(cms, resource);
    try {
        return CmsExtractorRtf.getExtractor().extractText(rtfFile.getContents());
    } catch (Exception cause) {
        // report every extractor problem as an index exception for this resource
        CmsIndexException failure = new CmsIndexException(
            Messages.get().container(Messages.ERR_TEXT_EXTRACTION_1, resource.getRootPath()),
            cause);
        throw failure;
    }
}
/**
 * Returns the raw text content of a given VFS resource containing HTML data.<p>
 *
 * The HTML extractor is called with the bytes of the file and the value of the content
 * encoding property read for the resource, with the system default encoding passed as
 * default value.<p>
 *
 * @param cms the current users OpenCms context
 * @param resource the resource to extract the content from
 * @param index the search index the content is extracted for
 *
 * @return the result produced by the HTML text extractor
 *
 * @throws CmsIndexException if the text extraction fails
 * @throws CmsException if the resource could not be read
 *
 * @see org.opencms.search.documents.I_CmsSearchExtractor#extractContent(CmsObject, CmsResource, CmsSearchIndex)
 */
public I_CmsExtractionResult extractContent(CmsObject cms, CmsResource resource, CmsSearchIndex index)
throws CmsIndexException, CmsException {

    CmsFile htmlFile = readFile(cms, resource);
    try {
        String charsetName = cms.readPropertyObject(
            resource,
            CmsPropertyDefinition.PROPERTY_CONTENT_ENCODING,
            true).getValue(OpenCms.getSystemInfo().getDefaultEncoding());
        return CmsExtractorHtml.getExtractor().extractText(htmlFile.getContents(), charsetName);
    } catch (Exception cause) {
        throw new CmsIndexException(
            Messages.get().container(Messages.ERR_TEXT_EXTRACTION_1, resource.getRootPath()),
            cause);
    }
}
/**
 * Returns the raw text content of a given VFS resource containing HTML data.<p>
 *
 * The extraction is announced in the log first. The HTML extractor is then called with the
 * bytes of the file and the value of the content encoding property read for the resource,
 * with the system default encoding passed as default value.<p>
 *
 * @param cms the current users OpenCms context
 * @param resource the resource to extract the content from
 * @param index the search index the content is extracted for
 *
 * @return the result produced by the HTML text extractor
 *
 * @throws CmsIndexException if the text extraction fails
 * @throws CmsException if the resource could not be read
 *
 * @see org.opencms.search.documents.I_CmsSearchExtractor#extractContent(CmsObject, CmsResource, CmsSearchIndex)
 */
public I_CmsExtractionResult extractContent(CmsObject cms, CmsResource resource, CmsSearchIndex index)
throws CmsIndexException, CmsException {

    logContentExtraction(resource, index);
    CmsFile htmlFile = readFile(cms, resource);
    try {
        String charsetName = cms.readPropertyObject(
            resource,
            CmsPropertyDefinition.PROPERTY_CONTENT_ENCODING,
            true).getValue(OpenCms.getSystemInfo().getDefaultEncoding());
        return CmsExtractorHtml.getExtractor().extractText(htmlFile.getContents(), charsetName);
    } catch (Exception cause) {
        throw new CmsIndexException(
            Messages.get().container(Messages.ERR_TEXT_EXTRACTION_1, resource.getRootPath()),
            cause);
    }
}