/**
 * Produces the plain text of a page by running its text through the parser.
 *
 * @param page the page whose text is handed to the parser
 * @return the text of the parsed page, or an empty string when the parser returns {@code null}
 */
@Override
protected String getPlainDocumentText(Page page) {
    final ParsedPage parsed = parser.parse(page.getText());
    return (parsed == null) ? "" : parsed.getText();
}
/**
 * Parses the text of the given page and returns the resulting plain text.
 *
 * @param page the page whose text is parsed
 * @return the parsed page's text; an empty string if parsing yields {@code null}
 */
@Override
protected String getPlainDocumentText(Page page) {
    ParsedPage parsedPage = parser.parse(page.getText());
    // Guard first: a null parse result maps to the empty string.
    if (parsedPage == null) {
        return "";
    }
    return parsedPage.getText();
}
/**
 * Command-line entry point: reads a wiki file, parses it, and logs whether the
 * page with the given title is nominal.
 *
 * <p>Logging is configured from the file named by the {@code log-config} system
 * property, falling back to {@code configuration/log-config.txt}.
 *
 * @param args exactly two arguments: the wiki file to read and the page title;
 *             any other count logs the usage line and exits with status 1
 * @throws Exception if reading or parsing the file fails
 */
public static void main(String[] args) throws Exception {
    String logConfig = System.getProperty("log-config");
    if (logConfig == null) {
        logConfig = "configuration/log-config.txt";
    }
    PropertyConfigurator.configure(logConfig);

    if (args.length != 2) {
        logger.info("java -cp dist/thewikimachine.jar org.fbk.cit.hlt.thewikimachine.xmldump.util.PageTypeExtractor wiki-file title");
        System.exit(1);
    }

    File file = new File(args[0]);
    String text = FileUtils.read(file);
    WikiMarkupParser wikiMarkupParser = WikiMarkupParser.getInstance();
    ParsedPage parsedPage = wikiMarkupParser.parsePage(text);

    String title = args[1];
    ParsedPageTitle parsedPageTitle = new ParsedPageTitle(title);
    logger.debug(title + "\t" + parsedPageTitle);

    PageTypeExtractor pageTypeExtractor = new PageTypeExtractor(parsedPage.getText(), parsedPageTitle.getForm());
    logger.debug(parsedPage.getText());
    // Fixed message: the negative branch used to read " not nominal)" (double space, stray parenthesis).
    logger.info(title + " is " + (pageTypeExtractor.isNominal() ? "nominal" : "not nominal"));
}
} // closes the enclosing class, whose header lies outside this excerpt
/**
 * Renders the body of this RawPage as plain text.
 *
 * @param includeTemplates when {@code false}, the parser factory is configured with
 *                         {@code FlushTemplates} as its template parser class
 * @return the text of the parsed body, or an empty string when the body is empty
 */
public String getPlainText(boolean includeTemplates) {
    // Nothing to parse for an empty body.
    if (body.isEmpty()) {
        return "";
    }

    MediaWikiParserFactory parserFactory = new MediaWikiParserFactory();
    if (!includeTemplates) {
        parserFactory.setTemplateParserClass(FlushTemplates.class);
    }
    return parserFactory.createParser().parse(body).getText();
}
private String getText(Revision rev) { String text = rev.getRevisionText(); if (outputPlainText) { text = StringEscapeUtils.unescapeHtml4(text); ParsedPage pp = parser.parse(text); if (pp == null) { return ""; } text = pp.getText(); // text = WikiUtils.mediaWikiMarkup2PlainText(text); // replace multiple white space with single white space text = WikiUtils.cleanText(text); } return text; }
/**
 * Extracts plain text from a page: either the whole parsed text or, when
 * {@code onlyFirstParagraph} is set, only the text of paragraph 0.
 *
 * @param page the page whose text is parsed
 * @return the selected text, or an empty string when the parser returns {@code null}
 *         or (in first-paragraph mode) paragraph 0 is {@code null}
 */
private String getPlainDocumentText(Page page) {
    ParsedPage parsed = parser.parse(page.getText());
    if (parsed == null) {
        return "";
    }
    if (!onlyFirstParagraph) {
        return parsed.getText();
    }
    return (parsed.getParagraph(0) != null) ? parsed.getParagraph(0).getText() : "";
}
/** * Adds as example the text extracted from the page. * * @param parsedPage the text * @param parsedPageTitle the parsed page title * @throws IOException */ private void addTextExample(ParsedPage parsedPage, ParsedPageTitle parsedPageTitle) //throws IOException { try { if (parsedPageTitle.isCompliant()) { Example example = new Example(parsedPageTitle.getForm(), parsedPageTitle.getPage(), parsedPageTitle.getPage(), EMPTY_CONTEXT, parsedPage.getText(), Example.CONTENT_FROM_TEXT); exampleList.add(example); addNominalVariantExample(example); addPersonSurnameExample(example); addRedirectLinkExamples(example); //logger.warn(parsedPageTitle.getForm() + "\t" + parsedPageTitle.getPage() + "\tKEEP"); } else { //logger.warn(parsedPageTitle.getForm() + "\t" + parsedPageTitle.getPage() + "\tREMOVED"); } } catch (Exception ex) { logger.error("Exception adding text examples for page " + parsedPageTitle.getPage() + " (" + exampleCounter.intValue() + ")\n" + ex); } }
/**
 * Returns the plain text of a page, restricted to paragraph 0 when
 * {@code onlyFirstParagraph} is enabled.
 *
 * @param page the page whose text is parsed
 * @return the whole parsed text or the text of paragraph 0; an empty string when
 *         the parse result (or, in first-paragraph mode, paragraph 0) is {@code null}
 */
@Override
protected String getPlainDocumentText(Page page) {
    final ParsedPage parsedPage = parser.parse(page.getText());
    if (parsedPage == null) {
        return "";
    }

    if (onlyFirstParagraph) {
        boolean hasFirstParagraph = parsedPage.getParagraph(0) != null;
        return hasFirstParagraph ? parsedPage.getParagraph(0).getText() : "";
    }
    return parsedPage.getText();
}
// Parse the text of p and print the parsed page's text to standard output.
// NOTE(review): pp is dereferenced without a null check — confirm that parser.parse cannot return null here.
ParsedPage pp = parser.parse(p.getText()); System.out.println(pp.getText());