// Build an HWPFDocument from the stream `fis` and create a WordExtractor over that document.
WordExtractor we = new WordExtractor(new HWPFDocument(fis));
/** * {@inheritDoc} */ @Override protected String extractText(POIFSFileSystem poiFs, long filesize, ContentParserOptions options) throws Exception { // DocumentEntry documentEntry = (DocumentEntry) // poiFs.getRoot().getEntry(POIFS_WORD_DOC); // DocumentInputStream documentInputStream = // poiFs.createDocumentInputStream(POIFS_ENTRY); WordExtractor extractor = new WordExtractor(poiFs); return extractor.getText(); }
/**
 * {@inheritDoc}
 * Returns an empty reader if an error occurred extracting text from
 * the word document. The given stream is closed before this method
 * returns, whether or not extraction succeeded.
 */
public Reader extractText(InputStream stream, String type, String encoding) throws IOException {
    Reader result;
    try {
        String text = new WordExtractor(stream).getText();
        result = new StringReader(text);
    } catch (Exception e) {
        // Best effort: log the failure and fall back to empty content.
        logger.warn("Failed to extract Word text content", e);
        result = new StringReader("");
    } finally {
        stream.close();
    }
    return result;
}
/** * Command line extractor, so people will stop moaning that they can't just * run this. */ public static void main( String[] args ) throws IOException { if ( args.length == 0 ) { System.err.println( "Use:" ); System.err .println( " java org.apache.poi.hwpf.extractor.WordExtractor <filename>" ); System.exit( 1 ); } // Process the first argument as a file FileInputStream fin = new FileInputStream( args[0] ); WordExtractor extractor = new WordExtractor( fin ); System.out.println( extractor.getText() ); }
/**
 * Extracts the text of a Word file and collapses runs of consecutive line
 * breaks into a single one.
 *
 * <p>Only files whose lower-cased name ends with {@code FILE_DOC} are read.
 * Any exception is logged and results in {@code EMPTY} being returned.
 *
 * @param f the file to read
 * @return the trimmed text, or {@code EMPTY} if the file name does not match
 *         or extraction failed
 */
public static String docText(File f) {
    try {
        if (toLowerCase(f.getName()).endsWith(FILE_DOC)) {
            // try-with-resources closes the stream on the failure paths too;
            // previously it was only closed when extraction succeeded.
            try (FileInputStream fis = new FileInputStream(f)) {
                WordExtractor ex = new WordExtractor(fis);
                String text = ex.getText();
                text = text.replaceAll("(\\r\\n){2,}", "\r\n").replaceAll("(\\n){2,}", "\n");
                return trim(text);
            }
        }
    } catch (Exception e) {
        LOG.error(e.getLocalizedMessage(), e);
    }
    return EMPTY;
}
/** * Command line extractor, so people will stop moaning that they can't just * run this. */ public static void main( String[] args ) throws IOException { if ( args.length == 0 ) { System.err.println( "Use:" ); System.err .println( " java org.apache.poi.hwpf.extractor.WordExtractor <filename>" ); System.exit( 1 ); } // Process the first argument as a file InputStream fin = new FileInputStream( args[0] ); WordExtractor extractor = new WordExtractor( fin ); try { System.out.println( extractor.getText() ); } finally { extractor.close(); } }
/**
 * Extracts the text of a Word document read from the given stream.
 *
 * @param in stream to read the document from; must not be {@code null}
 * @param params not used by this implementation
 * @return the extracted text wrapped in an {@code ExtractData}
 * @throws RobotSystemException if {@code in} is {@code null}
 * @throws ExtractException if an {@code IOException} occurs during extraction
 */
@Override
public ExtractData getText(final InputStream in, final Map<String, String> params) {
    if (null == in) {
        throw new RobotSystemException("The inputstream is null.");
    }

    try {
        final String text = new org.apache.poi.hwpf.extractor.WordExtractor(in).getText();
        return new ExtractData(text);
    } catch (final IOException ioe) {
        throw new ExtractException(ioe);
    }
}
/**
 * Reads a Word document from the given stream and returns its text.
 *
 * @param in stream to read the document from; must not be {@code null}
 * @param params not used by this implementation
 * @return the extracted text wrapped in an {@code ExtractData}
 * @throws RobotSystemException if {@code in} is {@code null}
 * @throws ExtractException if an {@code IOException} occurs during extraction
 */
@Override
public ExtractData getText(final InputStream in, final Map<String, String> params) {
    final boolean streamMissing = in == null;
    if (streamMissing) {
        throw new RobotSystemException("The inputstream is null.");
    }

    try {
        final String documentText = new org.apache.poi.hwpf.extractor.WordExtractor(in).getText();
        final ExtractData extracted = new ExtractData(documentText);
        return extracted;
    } catch (final IOException cause) {
        throw new ExtractException(cause);
    }
}
/**
 * Returns the text content of the Word document supplied as a stream.
 *
 * @param in stream to read the document from; must not be {@code null}
 * @param params not used by this implementation
 * @return the extracted text wrapped in an {@code ExtractData}
 * @throws RobotSystemException if {@code in} is {@code null}
 * @throws ExtractException if an {@code IOException} occurs during extraction
 */
@Override
public ExtractData getText(final InputStream in, final Map<String, String> params) {
    if (in != null) {
        try {
            final String content = new org.apache.poi.hwpf.extractor.WordExtractor(in).getText();
            return new ExtractData(content);
        } catch (final IOException ex) {
            throw new ExtractException(ex);
        }
    }
    throw new RobotSystemException("The inputstream is null.");
}
/**
 * Extracts the text of the Word document held by {@code cc} and reports it
 * to {@code handler} as a single region named "document".
 *
 * @param cc content whose bytes are read as a Word document
 * @param handler receiver of the region start, text and region end callbacks
 * @throws OntopiaRuntimeException wrapping any exception raised while
 *         extracting or reporting the text
 */
public void readContent(ClassifiableContentIF cc, TextHandlerIF handler) {
    try {
        BufferedInputStream input =
                new BufferedInputStream(new ByteArrayInputStream(cc.getContent()));
        char[] chars = new WordExtractor(input).getText().toCharArray();

        handler.startRegion("document");
        handler.text(chars, 0, chars.length);
        handler.endRegion();
    } catch (Exception e) {
        throw new OntopiaRuntimeException(e);
    }
}
/**
 * Builds an index document from the Word content carried by {@code fileData}.
 *
 * @param fileData holder of the raw document bytes ({@code data}) and its {@code path}
 * @return an {@code IndexDocument} for the file's path containing the extracted text
 * @throws SolrException with {@code ErrorCode.SERVER_ERROR} if reading the
 *         document fails; the underlying {@code IOException} is attached as the cause
 */
public IndexDocument getIndexedDocument(File2Index fileData) throws SolrException {
    try {
        POIFSFileSystem fs = new POIFSFileSystem(new ByteArrayInputStream(fileData.data));
        WordExtractor extractor = new WordExtractor(fs);
        String wordText = extractor.getText();
        return new IndexDocument(fileData.path, wordText, null);
    } catch (IOException e) {
        String msg = "Failed to write to the index";
        log.error(msg, e);
        // Keep the original exception as the cause so callers can see what actually failed.
        throw new SolrException(ErrorCode.SERVER_ERROR, msg, e);
    }
}
/**
 * Extracts the text of the Word document held by {@code cc} and reports it
 * to {@code handler} as a single region named "document".
 *
 * @param cc content whose bytes are read as a Word document
 * @param handler receiver of the region start, text and region end callbacks
 * @throws OntopiaRuntimeException wrapping any exception raised while
 *         extracting or reporting the text
 */
@Override
public void readContent(ClassifiableContentIF cc, TextHandlerIF handler) {
    try {
        ByteArrayInputStream bytes = new ByteArrayInputStream(cc.getContent());
        WordExtractor wordExtractor = new WordExtractor(new BufferedInputStream(bytes));
        char[] textChars = wordExtractor.getText().toCharArray();

        handler.startRegion("document");
        handler.text(textChars, 0, textChars.length);
        handler.endRegion();
    } catch (Exception e) {
        throw new OntopiaRuntimeException(e);
    }
}
/** * Extrae el texto de un fichero word. * @param in * @return String. Devuelve el texto crudo * @throws Exception */ public static String extractText(InputStream in) throws Exception { String result = ""; HWPFDocument doc = new HWPFDocument(in); WordExtractor we = new WordExtractor(doc); result = we.getText(); // Eliminamos los caracteres que no nos sirven para indexar. result = ExtractorUtil.removeControlChars(result); return result; }
/** * Initializes the Word document state from an input stream: the stream is opened as a POIFS file system, from which {@code doc}, {@code we} and {@code range} are assigned in that order. * Nothing is rethrown: any {@code Throwable} raised along the way is stored in {@code error}, and assignments made before the failure are kept. * * @param is stream the Word document is read from */ public void init(InputStream is) { try { POIFSFileSystem fs = new POIFSFileSystem(is); doc = new HWPFDocument(fs); we = new WordExtractor(doc); range = doc.getRange(); } catch (Throwable th) { error = th; } } }
/**
 * Extracts the text of a Word document read from the given stream.
 *
 * @param in stream to read the document from; must not be {@code null}
 * @param params not used by this implementation
 * @return the extracted text wrapped in an {@code ExtractData}
 * @throws CrawlerSystemException if {@code in} is {@code null}
 * @throws ExtractException if an {@code IOException} occurs during extraction
 */
@Override
public ExtractData getText(final InputStream in, final Map<String, String> params) {
    if (null == in) {
        throw new CrawlerSystemException("The inputstream is null.");
    }

    try {
        // The extractor is intentionally not closed here (resource warning suppressed).
        @SuppressWarnings("resource")
        final org.apache.poi.hwpf.extractor.WordExtractor extractor =
                new org.apache.poi.hwpf.extractor.WordExtractor(in);
        final String text = extractor.getText();
        return new ExtractData(text);
    } catch (final IOException ioe) {
        throw new ExtractException(ioe);
    }
}
/**
 * Reads the .doc file behind {@code uri} and appends its text to
 * {@code document} as a single justified paragraph.
 *
 * <p>If the content resolver returns no stream, nothing is added. I/O and
 * document errors are caught and their stack trace is printed.
 *
 * @param uri      location of the .doc file, opened through the content resolver
 * @param document PDF document the paragraph is added to
 * @param myfont   font used for the paragraph
 */
private void readDocFile(Uri uri, Document document, Font myfont) {
    // try-with-resources closes the stream on every path, including when
    // parsing or document.add() throws; a null resource is simply skipped.
    try (InputStream inputStream = mContext.getContentResolver().openInputStream(uri)) {
        if (inputStream == null) {
            return;
        }

        HWPFDocument doc = new HWPFDocument(inputStream);
        WordExtractor extractor = new WordExtractor(doc);
        String fileData = extractor.getText();

        Paragraph documentParagraph = new Paragraph(fileData + "\n", myfont);
        documentParagraph.setAlignment(Element.ALIGN_JUSTIFIED);
        document.add(documentParagraph);
    } catch (IOException | DocumentException e) {
        e.printStackTrace();
    }
}
return new WordExtractor(docStream);
return new WordExtractor(poifsDir); } catch (OldWordFileFormatException e) { return new Word6Extractor(poifsDir);
/**
 * Extracts metadata and paragraph text from a Word document stream into the
 * given result builder.
 *
 * <p>When summary information is available, the MIME type, title, author,
 * subject, creation date, modification date and keywords are recorded as
 * metas. Every paragraph is then added to a new document under
 * {@code CONTENT}, followed by the result of language detection over that content.
 *
 * @param inputStream   stream the Word document is read from
 * @param resultBuilder builder receiving the metas and the new document
 * @throws IOException if reading the document or closing the extractor fails
 */
private void currentWordExtraction(final InputStream inputStream, final ParserResultBuilder resultBuilder)
        throws IOException {
    try (final WordExtractor extractor = new WordExtractor(inputStream)) {
        final SummaryInformation summary = extractor.getSummaryInformation();
        if (summary != null) {
            final ParserFieldsBuilder metaFields = resultBuilder.metas();
            metaFields.set(MIME_TYPE, DEFAULT_MIMETYPES[0]);
            metaFields.add(TITLE, summary.getTitle());
            metaFields.add(AUTHOR, summary.getAuthor());
            metaFields.add(SUBJECT, summary.getSubject());
            metaFields.add(CREATION_DATE, summary.getCreateDateTime());
            metaFields.add(MODIFICATION_DATE, summary.getLastSaveDateTime());
            metaFields.add(KEYWORDS, summary.getKeywords());
        }

        final ParserFieldsBuilder contentFields = resultBuilder.newDocument();
        final String[] paragraphs = extractor.getParagraphText();
        if (paragraphs != null) {
            for (int i = 0; i < paragraphs.length; i++) {
                contentFields.add(CONTENT, paragraphs[i]);
            }
        }
        contentFields.add(LANG_DETECTION, languageDetection(contentFields, CONTENT, 10000));
    }
}