/** * {@inheritDoc} */ @Override public Iterator<URL> getEmbeddedResourceURLs(String userAgent, byte[] html, URL baseUrl, URLCollection urls, String encoding) throws HTMLParseException { Document dom; try { dom = (Document) getDOM(html, encoding); } catch (SAXException se) { throw new HTMLParseException(se); } // Now parse the DOM tree scanNodes(dom, urls, baseUrl); return urls.iterator(); }
/**
 * Parses a markup document into a DOM node.
 * <p>
 * The bytes are wrapped in a {@link ByteArrayInputStream} and handed to
 * {@code parseDOM} on the parser returned by {@code getTidyParser(encoding)};
 * {@code null} is passed as the second {@code parseDOM} argument. Entry, exit
 * and (when debug logging is enabled) the resulting node are written to the
 * debug log.
 *
 * @param text
 *            the document to parse (as a byte array)
 * @param encoding
 *            forwarded unchanged to {@code getTidyParser}
 * @return the node produced by {@code parseDOM} for the whole document
 *
 * @throws SAXException
 *             indicates an error parsing the document
 */
private static Node getDOM(byte[] text, String encoding) throws SAXException {
    log.debug("Start : getDOM");

    final Node root = getTidyParser(encoding)
            .parseDOM(new ByteArrayInputStream(text), null);

    // Guarded so the string concatenation only happens when it will be logged.
    if (log.isDebugEnabled()) {
        log.debug("node : " + root);
    }

    log.debug("End : getDOM");
    return root;
}
}
scanNodes(((Document) node).getDocumentElement(), urls, baseUrl); break; String tmp = getValue(attrs, ATT_HREF); if (tmp != null) { try { urls.addURL(getValue(attrs, ATT_SRC), baseUrl); break; urls.addURL(getValue(attrs, "code"), baseUrl); break; String data = getValue(attrs, "codebase"); if(!StringUtils.isEmpty(data)) { urls.addURL(data, baseUrl); data = getValue(attrs, "data"); if(!StringUtils.isEmpty(data)) { urls.addURL(data, baseUrl); String src = getValue(attrs, ATT_SRC); String typ = getValue(attrs, ATT_TYPE); if ((src != null) && ATT_IS_IMAGE.equalsIgnoreCase(typ)) { urls.addURL(src, baseUrl); if (TAG_LINK.equalsIgnoreCase(name) && STYLESHEET.equalsIgnoreCase(getValue(attrs, ATT_REL))) { urls.addURL(getValue(attrs, ATT_HREF), baseUrl); break;