/** * @see org.opencms.search.extractors.I_CmsTextExtractor#extractText(java.io.InputStream, java.lang.String) */ @Override public I_CmsExtractionResult extractText(InputStream in, String encoding) throws Exception { String rawContent = ""; try { // first extract the table content rawContent = extractTableContent(getStreamCopy(in)); rawContent = removeControlChars(rawContent); // now extract the meta information using POI POIFSReader reader = new POIFSReader(); reader.registerListener(this); reader.read(getStreamCopy(in)); } catch (Exception e) { if (LOG.isErrorEnabled()) { LOG.error(Messages.get().container(Messages.LOG_EXTRACT_TEXT_ERROR_0), e); } } // combine the meta information with the content and create the result return createExtractionResult(rawContent); }
/** * Returns the raw text content of a given vfs resource containing MS Excel data.<p> * * @see org.opencms.search.documents.I_CmsSearchExtractor#extractContent(CmsObject, CmsResource, CmsSearchIndex) */ public I_CmsExtractionResult extractContent(CmsObject cms, CmsResource resource, CmsSearchIndex index) throws CmsIndexException, CmsException { CmsFile file = readFile(cms, resource); try { return CmsExtractorMsExcel.getExtractor().extractText(file.getContents()); } catch (Exception e) { if (e instanceof FileNotFoundException) { if ((e.getMessage() != null) && (e.getMessage().indexOf("Workbook") > 0)) { // special case: catch Excel95 format error throw new CmsIndexException(Messages.get().container( Messages.ERR_NO_EXCEL_FORMAT_1, resource.getRootPath()), e); } } throw new CmsIndexException( Messages.get().container(Messages.ERR_TEXT_EXTRACTION_1, resource.getRootPath()), e); } }
textExtractor = CmsExtractorMsWord.getExtractor(); } else if (path1.endsWith(".xls") && path2.endsWith(".xls")) { textExtractor = CmsExtractorMsExcel.getExtractor(); } else if (path1.endsWith(".rtf") && path2.endsWith(".rtf")) { textExtractor = CmsExtractorRtf.getExtractor();