org.apache.poi.hwpf.extractor.WordExtractor.<init> Java code examples

// Build the extractor on top of an HWPFDocument that is read from the input stream fis.
WordExtractor we = new WordExtractor(new HWPFDocument(fis));

/**
 * {@inheritDoc}
 *
 * <p>Builds a {@code WordExtractor} directly on the given POI file system and
 * returns the text it yields. The {@code filesize} and {@code options}
 * arguments are not consulted by this implementation.
 */
@Override
protected String extractText(POIFSFileSystem poiFs, long filesize, ContentParserOptions options)
    throws Exception {
  return new WordExtractor(poiFs).getText();
}

/**
 * {@inheritDoc}
 * Returns an empty reader if an error occurred while extracting text from
 * the Word document. The supplied stream is closed in every case before
 * this method returns.
 */
public Reader extractText(InputStream stream,
                          String type,
                          String encoding) throws IOException {
  Reader result;
  try {
    String text = new WordExtractor(stream).getText();
    result = new StringReader(text);
  } catch (Exception e) {
    // Best effort: log the failure and hand back an empty reader instead.
    logger.warn("Failed to extract Word text content", e);
    result = new StringReader("");
  } finally {
    stream.close();
  }
  return result;
}

/**
 * Command line extractor, so people will stop moaning that they can't just
 * run this.
 *
 * <p>Prints a usage message to standard error and exits with status 1 when no
 * argument is given. Otherwise the first argument is treated as the path of
 * the file to read, and the extracted text is printed to standard output.
 *
 * @param args command line arguments; only {@code args[0]} is used
 * @throws IOException if opening or reading the file fails
 */
public static void main( String[] args ) throws IOException
{
  if ( args.length == 0 )
  {
    System.err.println( "Use:" );
    System.err
        .println( "   java org.apache.poi.hwpf.extractor.WordExtractor <filename>" );
    System.exit( 1 );
  }
  // Process the first argument as a file. try-with-resources releases the
  // file handle even when extraction fails part-way through.
  try ( FileInputStream fin = new FileInputStream( args[0] ) )
  {
    WordExtractor extractor = new WordExtractor( fin );
    System.out.println( extractor.getText() );
  }
}

/**
 * Extracts the text of a Word file and collapses runs of consecutive line
 * breaks into a single one.
 *
 * <p>Only files whose lower-cased name ends with {@code FILE_DOC} are read.
 * Any exception raised while reading is logged and not propagated.
 *
 * @param f the file to read
 * @return the trimmed text, or {@code EMPTY} when the file name does not match
 *         or extraction failed
 */
public static String docText(File f) {
  try {
    if (toLowerCase(f.getName()).endsWith(FILE_DOC)) {
      // try-with-resources closes the file even when extraction throws;
      // previously close() was only reached on the success path.
      try (FileInputStream fis = new FileInputStream(f)) {
        String text = new WordExtractor(fis).getText();
        // Squash repeated CRLF / LF sequences down to one line break each.
        text = text.replaceAll("(\\r\\n){2,}", "\r\n").replaceAll("(\\n){2,}", "\n");
        return trim(text);
      }
    }
  } catch (Exception e) {
    LOG.error(e.getLocalizedMessage(), e);
  }
  return EMPTY;
}

/**
 * Command line extractor, so people will stop moaning that they can't just
 * run this.
 *
 * <p>Prints a usage message to standard error and exits with status 1 when no
 * argument is given. Otherwise the first argument is treated as the path of
 * the file to read, and the extracted text is printed to standard output.
 *
 * @param args command line arguments; only {@code args[0]} is used
 * @throws IOException if opening or reading the file fails
 */
public static void main( String[] args ) throws IOException {
  if ( args.length == 0 ) {
    System.err.println( "Use:" );
    System.err
        .println( "   java org.apache.poi.hwpf.extractor.WordExtractor <filename>" );
    System.exit( 1 );
  }
  // Process the first argument as a file. Both resources are declared in the
  // try header so the stream is also closed when the extractor's constructor
  // throws, which the former try/finally did not cover.
  try ( InputStream fin = new FileInputStream( args[0] );
        WordExtractor extractor = new WordExtractor( fin ) ) {
    System.out.println( extractor.getText() );
  }
}

/**
 * Extracts the text of a Word document read from the given stream.
 *
 * @param in     stream holding the Word document; must not be {@code null}
 * @param params extraction parameters; not consulted by this implementation
 * @return the extracted text wrapped in an {@code ExtractData}
 * @throws RobotSystemException if {@code in} is {@code null}
 * @throws ExtractException     if reading the document raises an {@link IOException}
 */
@Override
public ExtractData getText(final InputStream in,
    final Map<String, String> params) {
  if (in == null) {
    throw new RobotSystemException("The inputstream is null.");
  }
  try {
    final String text =
        new org.apache.poi.hwpf.extractor.WordExtractor(in).getText();
    return new ExtractData(text);
  } catch (final IOException e) {
    throw new ExtractException(e);
  }
}

/**
 * Extracts the text of a Word document read from the given stream.
 *
 * @param in     stream holding the Word document; must not be {@code null}
 * @param params extraction parameters; not consulted by this implementation
 * @return the extracted text wrapped in an {@code ExtractData}
 * @throws RobotSystemException if {@code in} is {@code null}
 * @throws ExtractException     if reading the document raises an {@link IOException}
 */
@Override
public ExtractData getText(final InputStream in,
    final Map<String, String> params) {
  if (in == null) {
    throw new RobotSystemException("The inputstream is null.");
  }
  try {
    final String text =
        new org.apache.poi.hwpf.extractor.WordExtractor(in).getText();
    return new ExtractData(text);
  } catch (final IOException e) {
    throw new ExtractException(e);
  }
}

/**
 * Extracts the text of a Word document read from the given stream.
 *
 * @param in     stream holding the Word document; must not be {@code null}
 * @param params extraction parameters; not consulted by this implementation
 * @return the extracted text wrapped in an {@code ExtractData}
 * @throws RobotSystemException if {@code in} is {@code null}
 * @throws ExtractException     if reading the document raises an {@link IOException}
 */
@Override
public ExtractData getText(final InputStream in,
    final Map<String, String> params) {
  if (in == null) {
    throw new RobotSystemException("The inputstream is null.");
  }
  try {
    final String text =
        new org.apache.poi.hwpf.extractor.WordExtractor(in).getText();
    return new ExtractData(text);
  } catch (final IOException e) {
    throw new ExtractException(e);
  }
}

/**
 * Extracts the text of the Word document held by {@code cc} and passes it
 * to {@code handler} as a single region named "document".
 *
 * @param cc      content whose bytes are parsed as a Word document
 * @param handler receiver of the region start, the text and the region end
 * @throws OntopiaRuntimeException wrapping any exception raised on the way
 */
public void readContent(ClassifiableContentIF cc, TextHandlerIF handler) {
 try {
  BufferedInputStream source = new BufferedInputStream(new ByteArrayInputStream(cc.getContent()));
  // Extract everything up front so the handler is only touched once the text is available.
  char[] chars = new WordExtractor(source).getText().toCharArray();
  handler.startRegion("document");
  handler.text(chars, 0, chars.length);
  handler.endRegion();
 } catch (Exception e) {
  throw new OntopiaRuntimeException(e);
 }
}

/**
 * Builds an index document from the raw bytes of a Word file.
 *
 * @param fileData carrier of the file bytes ({@code data}) and its path ({@code path})
 * @return an {@code IndexDocument} holding the path and the extracted text
 * @throws SolrException with {@code ErrorCode.SERVER_ERROR} when reading the
 *         document raises an {@link IOException}; the original exception is
 *         attached as the cause
 */
public IndexDocument getIndexedDocument(File2Index fileData)
    throws SolrException {
  try {
    POIFSFileSystem fs = new POIFSFileSystem(new ByteArrayInputStream(fileData.data));
    WordExtractor extractor = new WordExtractor(fs);
    String wordText = extractor.getText();
    return new IndexDocument(fileData.path, wordText, null);
  } catch (IOException e) {
    String msg = "Failed to write to the index";
    log.error(msg, e);
    // Chain the IOException so callers can still reach the root cause.
    throw new SolrException(ErrorCode.SERVER_ERROR, msg, e);
  }
}

/**
 * Extracts the text of the Word document held by {@code cc} and passes it
 * to {@code handler} as a single region named "document".
 *
 * @param cc      content whose bytes are parsed as a Word document
 * @param handler receiver of the region start, the text and the region end
 * @throws OntopiaRuntimeException wrapping any exception raised on the way
 */
@Override
public void readContent(ClassifiableContentIF cc, TextHandlerIF handler) {
 try {
  BufferedInputStream source = new BufferedInputStream(new ByteArrayInputStream(cc.getContent()));
  // Extract everything up front so the handler is only touched once the text is available.
  char[] chars = new WordExtractor(source).getText().toCharArray();
  handler.startRegion("document");
  handler.text(chars, 0, chars.length);
  handler.endRegion();
 } catch (Exception e) {
  throw new OntopiaRuntimeException(e);
 }
}

// Construct a WordExtractor, referenced by its fully qualified POI class name, from `document`.
new org.apache.poi.hwpf.extractor.WordExtractor(document);

/**
 * Extracts the text of a Word file.
 *
 * @param in stream from which the Word document is read
 * @return the raw text after {@code ExtractorUtil.removeControlChars} has been applied
 * @throws Exception if reading the document or extracting its text fails
 */
public static String extractText(InputStream in) throws Exception {
  WordExtractor extractor = new WordExtractor(new HWPFDocument(in));
  // Strip the characters that are of no use for indexing.
  return ExtractorUtil.removeControlChars(extractor.getText());
}

 /**
  * Initializes the Word document state from an input stream.
  *
  * <p>On success the {@code doc}, {@code we} and {@code range} fields are set.
  * Any {@link Throwable} raised along the way is not rethrown; it is stored
  * in the {@code error} field instead.
  *
  * @param is stream from which the POI file system is read
  */
 public void init(InputStream is) {
  try {
   doc = new HWPFDocument(new POIFSFileSystem(is));
   we = new WordExtractor(doc);
   range = doc.getRange();
  } catch (Throwable failure) {
   error = failure;
  }
 }
}

/**
 * Extracts the text of a Word document read from the given stream.
 *
 * @param in     stream holding the Word document; must not be {@code null}
 * @param params extraction parameters; not consulted by this implementation
 * @return the extracted text wrapped in an {@code ExtractData}
 * @throws CrawlerSystemException if {@code in} is {@code null}
 * @throws ExtractException       if reading the document raises an {@link IOException}
 */
@Override
public ExtractData getText(final InputStream in,
    final Map<String, String> params) {
  if (in == null) {
    throw new CrawlerSystemException("The inputstream is null.");
  }
  try {
    // The extractor is deliberately left unclosed here, hence the suppression.
    @SuppressWarnings("resource")
    final org.apache.poi.hwpf.extractor.WordExtractor extractor =
        new org.apache.poi.hwpf.extractor.WordExtractor(in);
    final String text = extractor.getText();
    return new ExtractData(text);
  } catch (final IOException e) {
    throw new ExtractException(e);
  }
}

/**
 * Reads the .doc file behind {@code uri} and appends its text to the document
 * as one justified paragraph.
 *
 * <p>Does nothing when the content resolver returns no stream for {@code uri}.
 * {@link IOException} and {@code DocumentException} are caught and their stack
 * traces printed; they are not propagated to the caller.
 *
 * @param uri      location of the .doc file to read
 * @param document PDF document that receives the text
 * @param myfont   font used for the added paragraph
 */
private void readDocFile(Uri uri, Document document, Font myfont) {
  // try-with-resources closes the stream on every path, including when
  // parsing or document.add() fails; a null resource is simply skipped.
  try (InputStream inputStream = mContext.getContentResolver().openInputStream(uri)) {
    if (inputStream == null) {
      return;
    }
    HWPFDocument doc = new HWPFDocument(inputStream);
    WordExtractor extractor = new WordExtractor(doc);
    String fileData = extractor.getText();
    Paragraph documentParagraph = new Paragraph(fileData + "\n", myfont);
    documentParagraph.setAlignment(Element.ALIGN_JUSTIFIED);
    document.add(documentParagraph);
  } catch (IOException | DocumentException e) {
    e.printStackTrace();
  }
}

return new WordExtractor(docStream);

  return new WordExtractor(poifsDir);
} catch (OldWordFileFormatException e) {
  return new Word6Extractor(poifsDir);

/**
 * Extracts metadata and paragraph content from a Word document.
 *
 * <p>When summary information is available, the MIME type, title, author,
 * subject, creation date, modification date and keywords are recorded as
 * metas. Each paragraph is then added to a new document under
 * {@code CONTENT}, followed by the result of the language detection.
 *
 * @param inputStream   stream from which the Word document is read
 * @param resultBuilder builder receiving the metas and the new document
 * @throws IOException if reading the document fails
 */
private void currentWordExtraction(final InputStream inputStream, final ParserResultBuilder resultBuilder)
    throws IOException {
  try (final WordExtractor extractor = new WordExtractor(inputStream)) {
    final SummaryInformation summary = extractor.getSummaryInformation();
    if (summary != null) {
      final ParserFieldsBuilder metas = resultBuilder.metas();
      metas.set(MIME_TYPE, DEFAULT_MIMETYPES[0]);
      metas.add(TITLE, summary.getTitle());
      metas.add(AUTHOR, summary.getAuthor());
      metas.add(SUBJECT, summary.getSubject());
      metas.add(CREATION_DATE, summary.getCreateDateTime());
      metas.add(MODIFICATION_DATE, summary.getLastSaveDateTime());
      metas.add(KEYWORDS, summary.getKeywords());
    }
    final ParserFieldsBuilder document = resultBuilder.newDocument();
    final String[] paragraphs = extractor.getParagraphText();
    if (paragraphs != null) {
      for (final String paragraph : paragraphs) {
        document.add(CONTENT, paragraph);
      }
    }
    document.add(LANG_DETECTION, languageDetection(document, CONTENT, 10000));
  }
}

Javadoc

Create a new Word Extractor

Popular methods of WordExtractor

getText
Grab the text, based on the WordToTextConverter. Shouldn't include any crud, but slower than getTextFromPieces().
getParagraphText
close
getFootnoteText
appendHeaderFooter
Add the header/footer text, if it's not empty
getCommentsText
getEndnoteText
getMainTextboxText
getSummaryInformation
getTextFromPieces
Grab the text out of the text pieces. Might also include various bits of crud, but will work in cases where the text piece -> paragraph mapping is broken. Fast too.
stripFields
Removes any fields (eg macros, page markers etc) from the string.
getDocSummaryInformation

Popular in Java

Reading from database using SQL prepared statement
getContentResolver (Context)
onRequestPermissionsResult (Fragment)
orElseThrow (Optional)
Return the contained value, if present, otherwise throw an exception to be created by the provided supplier.
InetAddress (java.net)
An Internet Protocol (IP) address. This can be either an IPv4 address or an IPv6 address, and in pra
Date (java.sql)
A class which can consume and produce dates in SQL Date format. Dates are represented in SQL as yyyy
SQLException (java.sql)
An exception that indicates a failed JDBC operation. It provides the following information about pro
LoggerFactory (org.slf4j)
The LoggerFactory is a utility class producing Loggers for various logging APIs, most notably for lo
JTextField (javax.swing)
Get (org.apache.hadoop.hbase.client)
Used to perform Get operations on a single row. To get everything for a row, instantiate a Get objec
Top PhpStorm plugins

How to use the org.apache.poi.hwpf.extractor.WordExtractor constructor

Best Java code snippets using org.apache.poi.hwpf.extractor.WordExtractor.<init> (Showing top 20 results out of 315)

How to use
org.apache.poi.hwpf.extractor.WordExtractor
constructor