/**
 * Renders this link as a multi-line debug string: type, target, text and
 * position, followed by the parameter count and one line per parameter.
 *
 * @return the textual dump of this link
 */
@Override
public String toString() {
    // The fixed part is built with plain concatenation so every field is
    // rendered exactly as string conversion renders it.
    final StringBuilder dump = new StringBuilder(
            "LI_TYPE: " + t
            + "\nLI_TARGET: \"" + target + "\""
            + "\nLI_TEXT: \"" + getText() + "\""
            + "\nLI_POSITION: \"" + pos + "\""
            + "\nLI_PARAMETERS: " + parameters.size());

    for (String parameter : parameters) {
        dump.append("\nLI_PARAMETER: \"").append(parameter).append("\"");
    }
    return dump.toString();
}
} // closes the enclosing class, whose header lies outside this chunk
private static void handleLinks( List<Link> links, boolean linktext, StringBuilder sb ){ for( Link l: links ){ switch( l.getType() ){ case INTERNAL: String lText = l.getText(); String lTarget = l.getTarget(); if( linktext ) sb.append( lText+" " ); if( !lText.equals( lTarget )) sb.append( lTarget+" " ); break; case EXTERNAL: sb.append( l.getText()+" " ); break; case IMAGE: case AUDIO: case VIDEO: // do nothing ! break; } } }
public ParsedPageLink(Link link) { if (link.getTarget() != null) { page = normalizePageName(link.getTarget().trim()); } else { page = StringTable.EMPTY_STRING; } if (link.getText() != null) { form = removeSuffix(removeQuotes(link.getText().trim())); } String context = link.getHomeElement().getText(); if (context != null) { Span span = link.getPos(); leftContext = context.substring(0, span.getStart()); rightContext = context.substring(span.getEnd(), context.length()); } }
/**
 * Renders a {@link Link} as an HTML {@code <div class="Link">} fragment
 * showing its type, text and target, followed by one line per parameter.
 * Text, target and parameters are passed through {@code convertTags}.
 *
 * @param l the link to render, may be {@code null}
 * @return the HTML fragment, or the literal string {@code "null"} for a
 *         {@code null} link
 */
private static String linkToHtml( Link l ){
    if (l == null) {
        return "null";
    }

    final StringBuilder html = new StringBuilder(
            "<div class=\"Link\"><b class=\"Link\">Link:</b>" + l.getType()
            + ": \"" + convertTags(l.getText())
            + "\" -> \"" + convertTags(l.getTarget()) + "\"");

    // No emptiness check required: an empty parameter list simply yields no lines.
    for (String param : l.getParameters()) {
        html.append("<br>\nPARAMETER: \"" + convertTags(param) + "\"");
    }

    return html.append("</div>\n").toString();
}
public static void main(String[] args){ // load a sample document (the contents are equal to "DarmstadtWikipediaArticle.txt") String documentText = TestFile.getFileText(); // get a ParsedPage object MediaWikiParserFactory pf = new MediaWikiParserFactory(); MediaWikiParser parser = pf.createParser(); ParsedPage pp = parser.parse(documentText); // Link Context (return 1 token left, 2 token right of the link) for (Link link : pp.getLinks()) { System.out.println( link.getContext(1, 0) + "<" + link.getText().toString().toUpperCase() + ">" + link.getContext(0, 2) ); } } }
/**
 * Copies the category links of the parsed page into the article.
 * Each parser link is converted into a {@code Link} of type
 * {@code Link.Type.CATEGORY} carrying the same target, text and span bounds.
 *
 * @param article the article that receives the categories
 * @param page the parsed page to read the categories from
 */
private void setCategories(Article article, ParsedPage page) {
    final ArrayList<Link> converted = new ArrayList<Link>(10);
    for (final de.tudarmstadt.ukp.wikipedia.parser.Link parsed : page.getCategories()) {
        final String target = parsed.getTarget();
        final String text = parsed.getText();
        final int start = parsed.getPos().getStart();
        final int end = parsed.getPos().getEnd();
        converted.add(new Link(target, text, start, end, Link.Type.CATEGORY));
    }
    article.setCategories(converted);
}
protected void setEnWikiTitle(Article article, ParsedPage page) { if (article.isLang(Language.EN)) { return; } try { if (page.getLanguages() == null) { article.setEnWikiTitle(""); return; } } catch (final NullPointerException e) { // FIXME title is always null! logger.warn("no languages for page {} ", article.getTitle()); return; } for (final de.tudarmstadt.ukp.wikipedia.parser.Link l : page.getLanguages()) { if (l.getText().startsWith("en:")) { article.setEnWikiTitle(l.getTarget().substring(3)); break; } } }
/**
 * Collects the anchor texts that other pages use when linking to the given page.
 * <p>
 * Note that this method only returns the anchors that are not equal to the page's title.
 * Anchors might contain references to sections in an article in the form of "Page#Section".
 * If you need the plain title, e.g. for checking whether the page exists in Wikipedia,
 * the Title object can be used.
 * <p>
 * Inlinking pages that cannot be parsed are skipped, as are links without a
 * target or without a text.
 *
 * @param page the page whose inlink anchors are collected
 * @return A set of strings used as anchor texts in links pointing to that page.
 * @throws WikiTitleParsingException if the title of {@code page} cannot be parsed
 */
public Set<String> getInlinkAnchors(Page page) throws WikiTitleParsingException {
    Set<String> inAnchors = new HashSet<String>();
    for (Page p : page.getInlinks()) {
        ParsedPage pp = parser.parse(p.getText());
        if (pp == null) {
            // One unparsable inlinking page must not discard the anchors
            // of all remaining inlinking pages.
            continue;
        }
        for (Link l : pp.getLinks()) {
            String pageTitle = page.getTitle().getPlainTitle();
            String anchorText = l.getText();
            String target = l.getTarget();
            // Links lacking a target or a text cannot contribute an anchor.
            if (target == null || anchorText == null) {
                continue;
            }
            if (target.equals(pageTitle) && !anchorText.equals(pageTitle)) {
                inAnchors.add(anchorText);
            }
        }
    }
    return inAnchors;
}
begin = jcas.getDocumentText().indexOf(link.getText(), begin); if (begin == -1) { begin = jcas.getDocumentText().indexOf(link.getText()); end = begin + link.getText().length(); if (end >= jcas.getDocumentText().length()) { end = begin; wikipediaLink.setLinkType(link.getType().name()); wikipediaLink.setTarget(link.getTarget()); wikipediaLink.setAnchor(link.getText()); wikipediaLink.addToIndexes();
/**
 * Converts a parser link into a {@code Link} and stores it in the list
 * matching its type.
 * <ul>
 * <li>INTERNAL links are added to {@code links} with {@code jsonWikipediaType}.</li>
 * <li>EXTERNAL links are added to {@code externalLinks} with {@code jsonWikipediaType}.</li>
 * <li>IMAGE links are added to {@code links} with {@code Link.Type.IMAGE}.</li>
 * <li>Links of any other type are ignored.</li>
 * </ul>
 * The returned object is the very instance that was stored, so changes made
 * by the caller are visible through the list as well.
 *
 * @param links receives internal and image links
 * @param externalLinks receives external links
 * @param link the parser link to convert
 * @param jsonWikipediaType the type assigned to internal and external links
 * @return the stored link, or {@code null} if the target is missing or empty
 *         or the link type is not handled
 */
private Link addLink(final List<Link> links, final List<Link> externalLinks,
        final de.tudarmstadt.ukp.wikipedia.parser.Link link, final Link.Type jsonWikipediaType) {
    final String target = link.getTarget();
    if (target == null || target.isEmpty()) {
        return null;
    }

    // Decide where the link goes and which type it gets.
    final List<Link> destination;
    final Link.Type resultType;
    if (link.getType() == de.tudarmstadt.ukp.wikipedia.parser.Link.type.INTERNAL) {
        destination = links;
        resultType = jsonWikipediaType;
    } else if (link.getType() == de.tudarmstadt.ukp.wikipedia.parser.Link.type.EXTERNAL) {
        destination = externalLinks;
        resultType = jsonWikipediaType;
    } else if (link.getType() == de.tudarmstadt.ukp.wikipedia.parser.Link.type.IMAGE) {
        destination = links;
        resultType = Link.Type.IMAGE;
    } else {
        return null;
    }

    // Build the link once and store exactly the instance that is returned.
    final Link jsonWikipediaLink = new Link(target, link.getText(),
            link.getPos().getStart(), link.getPos().getEnd(), resultType);
    destination.add(jsonWikipediaLink);
    return jsonWikipediaLink;
}
String anchorText = l.getText(); if (!anchorText.equals(targetTitle)) { Set<String> anchors;
LOG.debug("Found link with empty target: \t" + xml + "\t text=" + curLink.getText()); continue; visitLink(location, destTitle, curLink.getText(), linkSubType); } catch (WikiBrainException e) { LOG.warn(String.format("Could not process link\t%s\t%s", xml, curLink.toString()), e); if (type == NameSpace.ARTICLE){ ParsedLocation location = new ParsedLocation(xml, secNum, paraNum, t.getSrcSpan().getStart()); visitLink(location, destTitle, templateLink.getText(), tempSubType); } else if (type == NameSpace.CATEGORY){ ParsedCategory pc = new ParsedCategory(); String linkText = cat.getText(); if (linkText.contains(Pattern.quote("|"))){ continue;