/** * @see org.opencms.search.extractors.I_CmsTextExtractor#extractText(byte[]) */ public I_CmsExtractionResult extractText(byte[] content) throws Exception { // encoding is null return extractText(content, null); }
/** * Returns an instance of this text extractor.<p> * * @return an instance of this text extractor */ public static I_CmsTextExtractor getExtractor() { // since this extractor requires a member variable we have no static instance return new CmsExtractorMsPowerPoint(); }
/**
 * Extracts the text from the given input stream by delegating to the
 * parser based variant with a newly created {@link OOXMLParser}.<p>
 *
 * @see org.opencms.search.extractors.I_CmsTextExtractor#extractText(java.io.InputStream)
 */
@Override
public I_CmsExtractionResult extractText(InputStream in) throws Exception {

    // a new parser instance is created for every call
    OOXMLParser parser = new OOXMLParser();
    return extractText(in, parser);
}
}
/**
 * Extracts the text from the given input stream with the HTML extractor.<p>
 *
 * If no encoding is given, the default encoding from the OpenCms system info is used.
 * Errors during extraction are logged and not propagated: the result then contains
 * whatever text was obtained before the error occurred (possibly the empty String).<p>
 *
 * @see org.opencms.search.extractors.I_CmsTextExtractor#extractText(java.io.InputStream, java.lang.String)
 */
@Override
public I_CmsExtractionResult extractText(InputStream in, String encoding) throws Exception {

    String text = "";
    try {
        // fall back to the system default if the caller did not provide an encoding
        String charset = CmsStringUtil.isEmpty(encoding)
            ? OpenCms.getSystemInfo().getDefaultEncoding()
            : encoding;
        // two separate assignments: the extracted text is kept even if the cleanup step fails
        text = CmsHtmlExtractor.extractText(in, charset);
        text = removeControlChars(text);
    } catch (Exception e) {
        if (LOG.isErrorEnabled()) {
            LOG.error(Messages.get().container(Messages.LOG_EXTRACT_TEXT_ERROR_0), e);
        }
    }
    return new CmsExtractionResult(text);
}
}
/** * @see org.opencms.search.extractors.I_CmsTextExtractor#extractText(java.io.InputStream, java.lang.String) */ @Override public I_CmsExtractionResult extractText(InputStream in, String encoding) throws Exception { String rawContent = ""; try { // first extract the table content rawContent = extractTableContent(getStreamCopy(in)); rawContent = removeControlChars(rawContent); // now extract the meta information using POI POIFSReader reader = new POIFSReader(); reader.registerListener(this); reader.read(getStreamCopy(in)); } catch (Exception e) { if (LOG.isErrorEnabled()) { LOG.error(Messages.get().container(Messages.LOG_EXTRACT_TEXT_ERROR_0), e); } } // combine the meta information with the content and create the result return createExtractionResult(rawContent); }
/** * @see org.opencms.search.extractors.I_CmsTextExtractor#extractText(java.io.InputStream, java.lang.String) */ @Override public I_CmsExtractionResult extractText(InputStream in, String encoding) throws Exception { String rawContent = ""; try { POIFSReader reader = new POIFSReader(); reader.registerListener(this); reader.read(in); // extract all information rawContent = removeControlChars(m_buffer.toString()); // free buffer memory m_buffer = new StringBuffer(4096); } catch (Exception e) { if (LOG.isErrorEnabled()) { LOG.error(Messages.get().container(Messages.LOG_EXTRACT_TEXT_ERROR_0), e); } } // combine the meta information with the content and create the result return createExtractionResult(rawContent); }
/**
 * Writes the byte representation of the given extraction result to the disk cache.<p>
 *
 * Nothing is written if the extraction result delivers no bytes
 * (that is if {@link I_CmsExtractionResult#getBytes()} returns <code>null</code>).<p>
 *
 * @param rfsName the RFS name of the file to save the extraction result in
 * @param content the extraction result to serialize and save
 *
 * @throws IOException in case of disk access errors
 */
public void saveCacheObject(String rfsName, I_CmsExtractionResult content) throws IOException {

    final byte[] serialized = content.getBytes();
    if (serialized == null) {
        // nothing to store
        return;
    }
    CmsVfsDiskCache.saveFile(rfsName, serialized);
}
}
/**
 * Extracts the text from the given input stream by delegating to the
 * parser based variant with a newly created {@link OfficeParser}.<p>
 *
 * @see org.opencms.search.extractors.I_CmsTextExtractor#extractText(java.io.InputStream)
 */
@Override
public I_CmsExtractionResult extractText(InputStream in) throws Exception {

    // a new parser instance is created for every call
    OfficeParser parser = new OfficeParser();
    return extractText(in, parser);
}
}
/**
 * Extracts the text from the given input stream by delegating to the
 * parser based variant with a newly created {@link PDFParser}.<p>
 *
 * @see org.opencms.search.extractors.I_CmsTextExtractor#extractText(java.io.InputStream)
 */
@Override
public I_CmsExtractionResult extractText(InputStream in) throws Exception {

    // a new parser instance is created for every call
    PDFParser parser = new PDFParser();
    return extractText(in, parser);
}
}
/**
 * Extracts the text from the given input stream by delegating to the
 * parser based variant with a newly created {@link RTFParser}.<p>
 *
 * @see org.opencms.search.extractors.I_CmsTextExtractor#extractText(java.io.InputStream)
 */
@Override
public I_CmsExtractionResult extractText(InputStream in) throws Exception {

    // a new parser instance is created for every call
    RTFParser parser = new RTFParser();
    return extractText(in, parser);
}
}
/**
 * Extracts the text from the given input stream with the HTML extractor.<p>
 *
 * If no encoding is given, the default encoding from the OpenCms system info is used.
 * Errors during extraction are logged and not propagated: the result then contains
 * whatever text was obtained before the error occurred (possibly the empty String).<p>
 *
 * @see org.opencms.search.extractors.I_CmsTextExtractor#extractText(java.io.InputStream, java.lang.String)
 */
@Override
public I_CmsExtractionResult extractText(InputStream in, String encoding) throws Exception {

    String text = "";
    try {
        // fall back to the system default if the caller did not provide an encoding
        String charset = CmsStringUtil.isEmpty(encoding)
            ? OpenCms.getSystemInfo().getDefaultEncoding()
            : encoding;
        // two separate assignments: the extracted text is kept even if the cleanup step fails
        text = CmsHtmlExtractor.extractText(in, charset);
        text = removeControlChars(text);
    } catch (Exception e) {
        if (LOG.isErrorEnabled()) {
            LOG.error(Messages.get().container(Messages.LOG_EXTRACT_TEXT_ERROR_0), e);
        }
    }
    return new CmsExtractionResult(text);
}
}
/** * @see org.opencms.search.extractors.I_CmsTextExtractor#extractText(java.io.InputStream) */ public I_CmsExtractionResult extractText(InputStream in) throws Exception { // encoding is null // (using cast to disambiguate method) return extractText(in, (String)null); }
/**
 * Writes the byte representation of the given extraction result to the disk cache.<p>
 *
 * Nothing is written if the extraction result delivers no bytes
 * (that is if {@link I_CmsExtractionResult#getBytes()} returns <code>null</code>).<p>
 *
 * @param rfsName the RFS name of the file to save the extraction result in
 * @param content the extraction result to serialize and save
 *
 * @throws IOException in case of disk access errors
 */
public void saveCacheObject(String rfsName, I_CmsExtractionResult content) throws IOException {

    final byte[] serialized = content.getBytes();
    if (serialized == null) {
        // nothing to store
        return;
    }
    CmsVfsDiskCache.saveFile(rfsName, serialized);
}
}
/** * @see org.opencms.search.extractors.I_CmsTextExtractor#extractText(java.io.InputStream) */ public I_CmsExtractionResult extractText(InputStream in) throws Exception { // encoding is null return extractText(in, null); }
/**
 * Adds the content blob of the extraction result as content field to the given document.<p>
 *
 * The document is returned unchanged if there is no extraction result, or if the
 * extraction result delivers no bytes. This implementation does not read the
 * <code>cms</code>, <code>resource</code>, <code>properties</code> and
 * <code>propertiesSearched</code> parameters.<p>
 *
 * @param document the document to extend
 * @param cms the OpenCms context used for building the search index
 * @param resource the resource that is indexed
 * @param extractionResult the plain text extraction result from the resource
 * @param properties the list of all properties directly attached to the resource (not searched)
 * @param propertiesSearched the list of all searched properties of the resource
 *
 * @return the document extended by a field that contains the extracted content blob
 */
protected I_CmsSearchDocument appendContentBlob(
    I_CmsSearchDocument document,
    CmsObject cms,
    CmsResource resource,
    I_CmsExtractionResult extractionResult,
    List<CmsProperty> properties,
    List<CmsProperty> propertiesSearched) {

    if (extractionResult == null) {
        // no extraction result, nothing to append
        return document;
    }
    byte[] blob = extractionResult.getBytes();
    if (blob != null) {
        document.addContentField(blob);
    }
    return document;
}
/** * @see org.opencms.search.extractors.I_CmsTextExtractor#extractText(byte[], java.lang.String) */ public I_CmsExtractionResult extractText(byte[] content, String encoding) throws Exception { // call stream based method of extraction with encoding return extractText(new ByteArrayInputStream(content), encoding); }
/** * @see org.opencms.search.extractors.I_CmsTextExtractor#extractText(byte[], java.lang.String) */ public I_CmsExtractionResult extractText(byte[] content, String encoding) throws Exception { // call stream based method of extraction m_inputBuffer = content; return extractText(new ByteArrayInputStream(content), encoding); }
/** * @see org.opencms.search.extractors.I_CmsTextExtractor#extractText(byte[]) */ public I_CmsExtractionResult extractText(byte[] content) throws Exception { // call stream based method of extraction without encoding return extractText(new ByteArrayInputStream(content)); }
/** * @see org.opencms.search.extractors.I_CmsTextExtractor#extractText(java.io.InputStream, java.lang.String) */ public I_CmsExtractionResult extractText(InputStream in, String encoding) throws Exception { // read the byte content byte[] text = CmsFileUtil.readFully(in); // call byte array based method of extraction return extractText(text, encoding); }
/** * @see org.opencms.search.extractors.I_CmsTextExtractor#extractText(java.io.InputStream, java.lang.String) */ public I_CmsExtractionResult extractText(InputStream in, String encoding) throws Exception { // read the byte content byte[] text = CmsFileUtil.readFully(in); // call byte array based method of extraction return extractText(text, encoding); }