String result = removeControlChars(stripper.getText(pdfDocument)); StringBuffer content = new StringBuffer(result); if (CmsStringUtil.isNotEmpty(result)) { combineContentItem(info.getTitle(), I_CmsExtractionResult.ITEM_TITLE, content, contentItems); combineContentItem(info.getKeywords(), I_CmsExtractionResult.ITEM_KEYWORDS, content, contentItems); combineContentItem(info.getSubject(), I_CmsExtractionResult.ITEM_SUBJECT, content, contentItems); combineContentItem(info.getAuthor(), I_CmsExtractionResult.ITEM_AUTHOR, content, contentItems); combineContentItem(info.getCreator(), I_CmsExtractionResult.ITEM_CREATOR, content, contentItems); combineContentItem(info.getProducer(), I_CmsExtractionResult.ITEM_PRODUCER, content, contentItems);
/**
 * Extracts the text from the given input stream.<p>
 *
 * Creates a new {@link PDFParser} and delegates to the two-argument
 * <code>extractText</code> variant, returning whatever that call returns.<p>
 *
 * @see org.opencms.search.extractors.I_CmsTextExtractor#extractText(java.io.InputStream)
 */
@Override
public I_CmsExtractionResult extractText(InputStream in) throws Exception {

    PDFParser pdfParser = new PDFParser();
    return extractText(in, pdfParser);
}
}
textExtractor = CmsExtractorPdf.getExtractor(); } else if (path1.endsWith(".doc") && path2.endsWith(".doc")) { textExtractor = CmsExtractorMsOfficeOLE2.getExtractor();
textExtractor = CmsExtractorPdf.getExtractor(); } else if (path1.endsWith(".doc") && path2.endsWith(".doc")) { textExtractor = CmsExtractorMsOfficeOLE2.getExtractor();
textExtractor = CmsExtractorPdf.getExtractor(); } else if (path1.endsWith(".doc") && path2.endsWith(".doc")) { textExtractor = CmsExtractorMsWord.getExtractor();
/** * Returns the raw text content of a given vfs resource containing Adobe PDF data.<p> * * @see org.opencms.search.documents.I_CmsSearchExtractor#extractContent(CmsObject, CmsResource, CmsSearchIndex) */ public I_CmsExtractionResult extractContent(CmsObject cms, CmsResource resource, CmsSearchIndex index) throws CmsIndexException, CmsException { CmsFile file = readFile(cms, resource); try { return CmsExtractorPdf.getExtractor().extractText(file.getContents()); } catch (Exception e) { if (e instanceof CryptographyException) { throw new CmsIndexException(Messages.get().container( Messages.ERR_DECRYPTING_RESOURCE_1, resource.getRootPath()), e); } if (e instanceof InvalidPasswordException) { // default password "" was wrong. throw new CmsIndexException(Messages.get().container( Messages.ERR_PWD_PROTECTED_1, resource.getRootPath()), e); } throw new CmsIndexException( Messages.get().container(Messages.ERR_TEXT_EXTRACTION_1, resource.getRootPath()), e); } }
/** * Returns the raw text content of a given vfs resource containing Adobe PDF data.<p> * * @see org.opencms.search.documents.I_CmsSearchExtractor#extractContent(CmsObject, CmsResource, CmsSearchIndex) */ public I_CmsExtractionResult extractContent(CmsObject cms, CmsResource resource, CmsSearchIndex index) throws CmsIndexException, CmsException { logContentExtraction(resource, index); CmsFile file = readFile(cms, resource); try { return CmsExtractorPdf.getExtractor().extractText(file.getContents()); } catch (Exception e) { if (e instanceof CryptographyException) { throw new CmsIndexException( Messages.get().container(Messages.ERR_DECRYPTING_RESOURCE_1, resource.getRootPath()), e); } if (e instanceof InvalidPasswordException) { // default password "" was wrong. throw new CmsIndexException( Messages.get().container(Messages.ERR_PWD_PROTECTED_1, resource.getRootPath()), e); } throw new CmsIndexException( Messages.get().container(Messages.ERR_TEXT_EXTRACTION_1, resource.getRootPath()), e); } }