/** * Tests if the link is javascript * @return flag indicating if the link is a javascript code */ public boolean isJavascriptLink() { getLink (); // force an evaluation of the booleans return (javascriptLink); }
@Test public void testLinkExtraction() throws ParserException { Parser parser = new Parser("http://synyx.de"); ObjectFindingVisitor visitor = new ObjectFindingVisitor(LinkTag.class); parser.visitAllNodesWith(visitor); Node[] links = visitor.getTags(); // TODO this could use some more meaningful assertions assertTrue(links.length > 0); for (int i = 0; i < links.length; i++) { LinkTag linkTag = (LinkTag) links[i]; System.out.print("\"" + linkTag.getLinkText() + "\" => "); System.out.println(linkTag.getLink()); } } }
/** * Is this a mail address * @return boolean true/false */ public boolean isMailLink() { getLink (); // force an evaluation of the booleans return (mailLink); }
/** * Is this a mail address * @return boolean true/false */ public boolean isMailLink() { getLink (); // force an evaluation of the booleans return (mailLink); }
/** * Tests if the link is javascript * @return flag indicating if the link is a javascript code */ public boolean isJavascriptLink() { getLink (); // force an evaluation of the booleans return (javascriptLink); }
/**
 * Reports whether this link is an FTP link, i.e. whether the link text
 * begins with the literal prefix {@code ftp://} (case-sensitive).
 *
 * @return {@code true} if this link is an FTP link
 */
public boolean isFTPLink() {
    final String target = getLink();
    return target.startsWith("ftp://");
}
/**
 * Reports whether this link is an HTTPS link, i.e. whether the link text
 * begins with the literal prefix {@code https://} (case-sensitive).
 *
 * @return {@code true} if this link is an HTTPS link
 */
public boolean isHTTPSLink() {
    final String target = getLink();
    return target.startsWith("https://");
}
/**
 * Reports whether this link is an HTTPS link, i.e. whether the link text
 * begins with the literal prefix {@code https://} (case-sensitive).
 *
 * @return {@code true} if this link is an HTTPS link
 */
public boolean isHTTPSLink() {
    final String target = getLink();
    return target.startsWith("https://");
}
/**
 * Reports whether this link is an IRC link, i.e. whether the link text
 * begins with the literal prefix {@code irc://} (case-sensitive).
 *
 * @return {@code true} if this link is an IRC link
 */
public boolean isIRCLink() {
    final String target = getLink();
    return target.startsWith("irc://");
}
/**
 * Reports whether this link is an FTP link, i.e. whether the link text
 * begins with the literal prefix {@code ftp://} (case-sensitive).
 *
 * @return {@code true} if this link is an FTP link
 */
public boolean isFTPLink() {
    final String target = getLink();
    return target.startsWith("ftp://");
}
/**
 * Reports whether this link is an IRC link, i.e. whether the link text
 * begins with the literal prefix {@code irc://} (case-sensitive).
 *
 * @return {@code true} if this link is an IRC link
 */
public boolean isIRCLink() {
    final String target = getLink();
    return target.startsWith("irc://");
}
/**
 * Accepts a node only when it is a {@code LinkTag} whose link matches
 * {@code pattern} in its entirety.
 *
 * @param node the node to check
 * @return {@code true} if the node is a link and the whole link matches the pattern
 */
@Override
public boolean accept(Node node) {
    // instanceof short-circuits, so the cast is only reached for LinkTag nodes.
    return node instanceof LinkTag
            && pattern.matcher(((LinkTag) node).getLink()).matches();
}
} // closes the enclosing class (its header lies outside this excerpt)
/**
 * Accepts nodes that are a LinkTag (or a subclass of it) and whose URL
 * contains a match for the regex held in {@code mRegex}.
 * The check uses {@code find()}, so a match anywhere in the URL is enough.
 *
 * @param node The node to check.
 * @return <code>true</code> if the node is a link with the pattern.
 */
public boolean accept(Node node) {
    // Anything that is not a LinkTag is rejected outright.
    if (!LinkTag.class.isAssignableFrom(node.getClass())) {
        return false;
    }
    String url = ((LinkTag) node).getLink();
    return mRegex.matcher(url).find();
}
} // closes the enclosing class (its header lies outside this excerpt)
/**
 * Records information from a visited tag.
 * <ul>
 *   <li>{@code LinkTag}: the link is added to {@code linksToVisit} when it starts
 *       with {@code baseUrl} and {@code isProbablyHtml} accepts it; otherwise it
 *       is skipped. Either outcome is logged at debug level.</li>
 *   <li>{@code TitleTag}: its title is stored in {@code title}.</li>
 *   <li>{@code BodyTag}: its plain-text rendering is stored in {@code content}.</li>
 * </ul>
 * Any other tag is ignored.
 *
 * @param tag the tag being visited
 */
@Override
public void visitTag(Tag tag) {
    if (tag instanceof LinkTag) {
        // Read the link once and reuse it for the checks, the log and the queue.
        final String href = ((LinkTag) tag).getLink();
        final boolean follow = href.startsWith(baseUrl) && isProbablyHtml(href);
        if (follow) {
            logger.debug("Using link pointing to {}", href);
            linksToVisit.add(href);
        } else {
            logger.debug("Skipping link pointing to {}", href);
        }
        return;
    }
    if (tag instanceof TitleTag) {
        title = ((TitleTag) tag).getTitle();
        return;
    }
    if (tag instanceof BodyTag) {
        content = ((BodyTag) tag).toPlainTextString();
    }
}
/**
 * Looks for a link whose URI follows the given pattern and returns the
 * first one produced by the filter.
 *
 * @param pattern
 *            the pattern
 * @return the link content, if found. <code>null</code> otherwise
 */
public String findLink(final Pattern pattern) {
    final LinkPatternFilter uriFilter = new LinkPatternFilter(pattern);
    for (final LinkTag match : filter(LinkTag.class, uriFilter)) {
        // Only the first hit is of interest.
        return match.getLink();
    }
    return null;
}
/**
 * Parses the resource at the given URL and returns the link of every
 * {@code LinkTag} found, in the order the parser reports them.
 *
 * @param url location handed to the {@code Parser}
 * @return the collected links (empty when no {@code LinkTag} is found)
 * @throws ParserException if the parser reports a failure
 */
public static List<String> getLinks(String url) throws ParserException {
    Parser pageParser = new Parser(url);
    List<String> hrefs = new LinkedList<String>();

    NodeList anchors = pageParser.extractAllNodesThatMatch(new NodeClassFilter(LinkTag.class));
    for (int i = 0; i < anchors.size(); i++) {
        // The filter only lets LinkTag nodes through, so the cast is safe.
        hrefs.add(((LinkTag) anchors.elementAt(i)).getLink());
    }
    return hrefs;
}
public static List<String> getLinksOnPage(final String url) { final Parser htmlParser = new Parser(url); final List<String> result = new LinkedList<String>(); try { final NodeList tagNodeList = htmlParser.extractAllNodesThatMatch(new NodeClassFilter(LinkTag.class)); for (int j = 0; j < tagNodeList.size(); j++) { final LinkTag loopLink = (LinkTag) tagNodeList.elementAt(j); final String loopLinkStr = loopLink.getLink(); result.add(loopLinkStr); } } catch (ParserException e) { e.printStackTrace(); // TODO handle error } return result; }
public static List<String> getLinksOnPage(final String url) { final Parser htmlParser = new Parser(url); final List<String> result = new LinkedList<String>(); try { final NodeList tagNodeList = htmlParser.extractAllNodesThatMatch(new NodeClassFilter(LinkTag.class)); for (int j = 0; j < tagNodeList.size(); j++) { final LinkTag loopLink = (LinkTag) tagNodeList.elementAt(j); final String loopLinkStr = loopLink.getLink(); result.add(loopLinkStr); } } catch (ParserException e) { e.printStackTrace(); // TODO handle error } return result; }
public void visitTag(Tag tag) { if (tag instanceof LinkTag) ((LinkTag)tag).setLink(linkPrefix + ((LinkTag)tag).getLink()); else if (tag instanceof ImageTag) ((ImageTag)tag).setImageURL(linkPrefix + ((ImageTag)tag).getImageURL()); // process only those nodes that won't be processed by an end tag, // nodes without parents or parents without an end tag, since // the complete processing of all children should happen before // we turn this node back into html text if (null == tag.getParent () && (!(tag instanceof CompositeTag) || null == ((CompositeTag)tag).getEndTag ())) modifiedResult.append(tag.toHtml()); }
public void visitTag(Tag tag) { if (tag instanceof LinkTag) ((LinkTag)tag).setLink(linkPrefix + ((LinkTag)tag).getLink()); else if (tag instanceof ImageTag) ((ImageTag)tag).setImageURL(linkPrefix + ((ImageTag)tag).getImageURL()); // process only those nodes that won't be processed by an end tag, // nodes without parents or parents without an end tag, since // the complete processing of all children should happen before // we turn this node back into html text if (null == tag.getParent () && (!(tag instanceof CompositeTag) || null == ((CompositeTag)tag).getEndTag ())) modifiedResult.append(tag.toHtml()); }