/**
 * Dispatches every section returned by {@code pp.getSections()} to the matching
 * {@code checkRange} overload.
 * <p>
 * A section whose runtime class is exactly {@code SectionContent} goes to the
 * {@code SectionContent} overload; any other section is cast to
 * {@code SectionContainer} and goes to that overload.
 *
 * @param pp the parsed page whose sections are checked
 */
public static void checkRange(ParsedPage pp) {
    for (Section section : pp.getSections()) {
        final boolean exactlyContent = section.getClass() == SectionContent.class;
        if (!exactlyContent) {
            checkRange((SectionContainer) section);
        } else {
            checkRange((SectionContent) section);
        }
    }
}
/**
 * Parses the raw page text and gathers the data of its sections.
 * <p>
 * Each section of the parsed page is handed to {@code appendSection} together with
 * the page title, a {@code null} third argument and the list that is returned.
 *
 * @param text  the raw page markup to parse
 * @param title the page title; also written to the trace log
 * @return the list that {@code appendSection} was given for every section
 * @throws IOException declared by this method's signature
 */
public ArrayList<String> sectionData(String text, String title) throws IOException {
    logger.trace("Processing page: " + title);

    WikiMarkupParser parser = WikiMarkupParser.getInstance();
    ParsedPage page = parser.parsePage(text);

    ArrayList<String> collected = new ArrayList<>();
    for (Section current : page.getSections()) {
        appendSection(title, current, null, collected);
    }
    return collected;
}
/**
 * Stores the section titles of a parsed page on an article.
 * <p>
 * {@code null} sections and sections whose title is {@code null} are left out.
 *
 * @param article the article that receives the list of section titles
 * @param page    the parsed page whose sections are read
 */
private void setSections(Article article, ParsedPage page) {
    final List<String> titles = new ArrayList<String>(10);
    for (final Section section : page.getSections()) {
        if ((section != null) && (section.getTitle() != null)) {
            titles.add(section.getTitle());
        }
    }
    article.setSections(titles);
}
/**
 * Lists the section titles of a page, one record per title.
 * <p>
 * Each record consists of the page title,
 * {@code CharacterTable.HORIZONTAL_TABULATION}, the section title and
 * {@code CharacterTable.LINE_FEED}. Sections whose title is {@code null} or blank
 * after trimming are skipped, as are titles in which
 * {@code sectionTitleSkipPattern} (when it is set) finds a match.
 *
 * @param text  the raw page markup to parse
 * @param title the page title; logged at debug level and used as the first field
 * @return the concatenated records, or an empty string if no title qualifies
 * @throws IOException declared by this method's signature
 */
public String sectionTitle(String text, String title) throws IOException {
    logger.debug(title);
    StringBuilder out = new StringBuilder();
    ParsedPage page = WikiMarkupParser.getInstance().parsePage(text);

    for (Section section : page.getSections()) {
        String heading = section.getTitle();
        if (heading == null || heading.trim().length() <= 0) {
            continue;
        }
        if (sectionTitleSkipPattern != null && sectionTitleSkipPattern.matcher(heading).find()) {
            continue;
        }
        out.append(title);
        out.append(CharacterTable.HORIZONTAL_TABULATION);
        out.append(heading);
        out.append(CharacterTable.LINE_FEED);
    }
    return out.toString();
}
/** * Returns the section titles. From this... */ public String pageAbstract(String text, String title) throws IOException { StringBuilder sb = new StringBuilder(); WikiMarkupParser wikiMarkupParser = WikiMarkupParser.getInstance(); String[] prefixes = {imagePrefix, filePrefix}; ParsedPage parsedPage = wikiMarkupParser.parsePage(text, prefixes); //logger.debug("+++"); //logger.debug(title); try { List<Section> sectionList = parsedPage.getSections(); if (sectionList.size() > 0) { Section section = sectionList.get(0); //logger.debug(section.getTitle()); //logger.debug(section.getText()); //logger.debug("---"); sb.append(title); sb.append(CharacterTable.HORIZONTAL_TABULATION); sb.append(section.getText().replace(CharacterTable.LINE_FEED, CharacterTable.SPACE).trim()); sb.append(CharacterTable.LINE_FEED); } } catch (Exception e) { logger.warn(e.getMessage()); } return sb.toString(); }
ParsedPage parsedPage = wikiMarkupParser.parsePage(wikiText); for (Section section : parsedPage.getSections()) { System.out.println(section.getText());
StringBuilder sb = new StringBuilder(); String sectionTitle; for (Section section : parsedPage.getSections()) { sectionTitle = section.getTitle();
/**
 * Builds a single line holding the tokenized content of a page.
 * <p>
 * The line starts with the title as given, then {@code CharacterTable.SPACE}, then
 * the tokenized title (with {@code CharacterTable.LOW_LINE} replaced by
 * {@code CharacterTable.SPACE} before tokenizing). The tokenized text of every
 * content element of every section follows, each preceded by
 * {@code CharacterTable.SPACE}; elements whose raw or tokenized text is empty are
 * skipped.
 *
 * @param parsedPage the parsed page whose sections are tokenized
 * @param title      the page title
 * @return the tokenized page as one string
 * @throws IOException declared by this method's signature
 */
private String tokenizedText(ParsedPage parsedPage, String title) throws IOException {
    StringBuilder line = new StringBuilder();
    line.append(title);
    line.append(CharacterTable.SPACE);

    Tokenizer tokenizer = HardTokenizer.getInstance();
    String spacedTitle = title.replace(CharacterTable.LOW_LINE, CharacterTable.SPACE);
    line.append(tokenizer.tokenizedString(spacedTitle));

    for (Section section : parsedPage.getSections()) {
        List<Content> contents = section.getContentList();
        for (Content content : contents) {
            String raw = content.getText();
            if (raw.length() <= 0) {
                continue;
            }
            String tokens = tokenizer.tokenizedString(raw);
            if (tokens.length() <= 0) {
                continue;
            }
            line.append(CharacterTable.SPACE);
            line.append(tokens);
        }
    }
    return line.toString();
}
/**
 * Produces the one-line tokenized form of a page.
 * <p>
 * Output layout: the unmodified title, {@code CharacterTable.SPACE}, the tokenized
 * title (after replacing {@code CharacterTable.LOW_LINE} with
 * {@code CharacterTable.SPACE}), and then, for each content element of each section
 * that has non-empty raw text and non-empty tokenized text,
 * {@code CharacterTable.SPACE} followed by that tokenized text.
 *
 * @param parsedPage the parsed page to walk
 * @param title      the page title
 * @return the assembled line
 * @throws IOException declared by this method's signature
 */
private String tokenizedText(ParsedPage parsedPage, String title) throws IOException {
    StringBuilder result = new StringBuilder();
    result.append(title);
    result.append(CharacterTable.SPACE);

    Tokenizer tokenizer = HardTokenizer.getInstance();
    result.append(tokenizer.tokenizedString(
            title.replace(CharacterTable.LOW_LINE, CharacterTable.SPACE)));

    for (Section section : parsedPage.getSections()) {
        List<Content> parts = section.getContentList();
        for (Content part : parts) {
            String rawText = part.getText();
            if (rawText.length() > 0) {
                String tokenized = tokenizer.tokenizedString(rawText);
                boolean hasTokens = tokenized.length() > 0;
                if (hasTokens) {
                    result.append(CharacterTable.SPACE);
                    result.append(tokenized);
                }
            }
        }
    }
    return result.toString();
}
String tokenizedContent; List<Content> list; for (Section section : parsedPage.getSections()) { list = section.getContentList(); for (int i = 0; i < list.size(); i++) {
String target; String redirect; for (Section section : pp.getSections()) { List<Link> internalLinks = section.getLinks(Link.type.INTERNAL); for (Link link : internalLinks) {
for( Section s: pp.getSections() ) { result.append( sectionToHtml( s ));
String redirect; if (pp != null) { List<Section> sections = pp.getSections(); if (sections != null) { for (Section section : sections) {
/** * @param args * @throws IOException */ public static void main(String[] args) throws IOException { // load a sample document (the contents are equal to "DarmstadtWikipediaArticle.txt") String documentText = TestFile.getFileText(); //get a ParsedPage object MediaWikiParserFactory pf = new MediaWikiParserFactory(); MediaWikiParser parser = pf.createParser(); ParsedPage pp = parser.parse(documentText); //get the sections for(Section section : pp.getSections()) { System.out.println("section : " + section.getTitle()); System.out.println(" nr of paragraphs : " + section.nrOfParagraphs()); System.out.println(" nr of tables : " + section.nrOfTables()); System.out.println(" nr of nested lists : " + section.nrOfNestedLists()); System.out.println(" nr of definition lists: " + section.nrOfDefinitionLists()); } } }
if( firstParagraphHandling != null ){ handleContent( pp.getFirstParagraph(), firstParagraphHandling, sb ); deleteParagraph( pp.getFirstParagraphNr(), pp.getSections() ); for( Section s: pp.getSections() ) handleSection( s, sb );
/** * Prints the targets of the internal links found in the page <i>Germany</i>. * @param args * @throws WikiApiException */ public static void main(String[] args) throws WikiApiException { // load a sample document (the contents are equal to "DarmstadtWikipediaArticle.txt") String documentText = TestFile.getFileText(); // get a ParsedPage object MediaWikiParserFactory pf = new MediaWikiParserFactory(); MediaWikiParser parser = pf.createParser(); ParsedPage pp = parser.parse(documentText); // only the links to other Wikipedia language editions for (Link language : pp.getLanguages()) { System.out.println(language.getTarget()); } //get the internal links of each section for (Section section : pp.getSections()){ System.out.println("Section: " + section.getTitle()); for (Link link : section.getLinks(Link.type.INTERNAL)) { System.out.println(" " + link.getTarget()); } } } }
String rightContext; Matcher matcher; for (Section section : parsedPage.getSections()) { try { sectionTitle = section.getTitle();
for (Section section : parsedPage.getSections()) { internalLinks = section.getLinks(Link.type.INTERNAL);
public static void main(String[] args) throws WikiApiException { //db connection settings DatabaseConfiguration dbConfig = new DatabaseConfiguration(); dbConfig.setDatabase("DATABASE"); dbConfig.setHost("HOST"); dbConfig.setUser("USER"); dbConfig.setPassword("PASSWORD"); dbConfig.setLanguage(Language.english); //initialize a wiki Wikipedia wiki = new Wikipedia(dbConfig); //get the page 'Dog' Page p = wiki.getPage("Dog"); //get a ParsedPage object MediaWikiParserFactory pf = new MediaWikiParserFactory(); MediaWikiParser parser = pf.createParser(); ParsedPage pp = parser.parse(p.getText()); //get the sections of the page List<Section> sections = pp.getSections(); for(Section section : sections) { System.out.println(section.getTitle()); } } }
ParsedPageTitle parsedLinkTitle = null; String sectionTitle; for (Section section : parsedPage.getSections()) { internalLinks = section.getLinks(Link.type.INTERNAL); sectionTitle = section.getTitle();