private void setParagraphs(Article article, ParsedPage page) { final List<String> paragraphs = new ArrayList<String>(page.nrOfParagraphs()); int paragraphId = 0; for (final Paragraph p : page.getParagraphs()) { String text = p.getText(); // text = removeTemplates(text); text = text.replace("\n", " ").trim(); if (!text.isEmpty()){ paragraphs.add(text); } paragraphId++; } article.setParagraphs(paragraphs); }
/**
 * Collects the text of all bold and italic spans of a parsed page and stores them on the
 * article.
 *
 * <p>Paragraphs are visited in order; within a paragraph all bold span texts are added
 * first, followed by all italic span texts.
 *
 * @param article the article that receives the collected list via {@code setHighlights}
 * @param page the parsed page whose paragraphs are scanned for formatted spans
 */
private void setHighlights(Article article, ParsedPage page) {
    final List<String> highlights = new ArrayList<String>(20);
    for (final Paragraph p : page.getParagraphs()) {
        // Fetch the paragraph text once; every span below is resolved against it.
        final String paragraphText = p.getText();
        for (final Span bold : p.getFormatSpans(Content.FormatType.BOLD)) {
            highlights.add(bold.getText(paragraphText));
        }
        for (final Span italic : p.getFormatSpans(Content.FormatType.ITALIC)) {
            highlights.add(italic.getText(paragraphText));
        }
    }
    article.setHighlights(highlights);
}
/**
 * Returns the text of the first paragraph of the page that corresponds to the given entity.
 *
 * @param entity the entity to look up
 * @return {@code null} if {@code containsEntity(entity)} is false; an empty string if the
 *         page has no parsed representation or no first paragraph; otherwise the text of
 *         the first paragraph
 * @throws LexicalSemanticResourceException declared by this method; presumably raised by
 *         the underlying entity lookups
 */
public String getGloss(Entity entity) throws LexicalSemanticResourceException {
    if (!this.containsEntity(entity)) {
        return null;
    }

    final ParsedPage parsed =
            WikipediaArticleUtils.entityToPage(wiki, entity, isCaseSensitive).getParsedPage();
    final Paragraph first = (parsed != null) ? parsed.getFirstParagraph() : null;

    // Both "no parsed page" and "no first paragraph" map to the empty gloss.
    return (first != null) ? first.getText() : "";
}
/**
 * Parses the text of the given page and returns either the text of the paragraph at
 * index 0 or the text of the whole parsed page, depending on {@code onlyFirstParagraph}.
 *
 * @param page the page whose text is handed to the parser
 * @return an empty string if parsing yields {@code null}, or if only the first paragraph
 *         is requested and there is none; otherwise the selected text
 */
private String getPlainDocumentText(Page page) {
    final ParsedPage parsed = parser.parse(page.getText());
    if (parsed == null) {
        return "";
    }
    if (!onlyFirstParagraph) {
        return parsed.getText();
    }
    return (parsed.getParagraph(0) == null) ? "" : parsed.getParagraph(0).getText();
}
// Wrap the text of `p` in a SpanManager and pass it the (initially empty) `delete` span list via manageList.
// NOTE(review): presumably manageList makes the manager keep the spans in `delete` aligned with later
// edits to the text, and `delete` collects spans to be removed — confirm against SpanManager and the code that follows.
SpanManager ptext = new SpanManager(p.getText()); List<Span> delete = new ArrayList<Span>(); ptext.manageList(delete);
/**
 * Parses the text of the given page and returns either the text of the paragraph at
 * index 0 (when {@code onlyFirstParagraph} is set) or the text of the whole parsed page.
 *
 * @param page the page whose text is handed to the parser
 * @return the selected text, or an empty string if the parser returns {@code null} or the
 *         requested first paragraph does not exist
 */
@Override
protected String getPlainDocumentText(Page page) {
    final ParsedPage parsed = parser.parse(page.getText());

    String result = "";
    if (parsed != null) {
        if (!onlyFirstParagraph) {
            result = parsed.getText();
        } else if (parsed.getParagraph(0) != null) {
            result = parsed.getParagraph(0).getText();
        }
    }
    return result;
}
// Take the text of the parsed page's first paragraph.
// NOTE(review): getFirstParagraph() appears to be able to return null (other callers guard against it);
// this line would then throw a NullPointerException — confirm whether a guard is needed here.
String text = ppage.getFirstParagraph().getText();