/**
 * Forwards the call to the delegated parser and post-processes the
 * extracted text into metadata entries.
 * <p>
 * The SAX events are passed both to the given handler and to an internal
 * {@link BodyContentHandler}. Once the superclass parse has returned, the
 * collected body text is stored in the metadata as follows:
 * <ul>
 *   <li>{@code fulltext} - the complete collected text</li>
 *   <li>{@code summary} - at most the first 500 UTF-16 chars of that text;
 *       the cut never separates a surrogate pair</li>
 *   <li>{@code outlinks} - one value per link returned by
 *       {@link RegexUtils#extractLinks(String)}</li>
 * </ul>
 *
 * @param stream   the document stream handed to the superclass parser
 * @param handler  the caller's handler; receives the same events as the
 *                 internal body collector
 * @param metadata document metadata; updated with the entries listed above
 * @param context  parse context handed to the superclass parser
 * @throws IOException   declared for the delegated parse call
 * @throws SAXException  declared for the delegated parse call
 * @throws TikaException declared for the delegated parse call
 */
public void parse(
        InputStream stream, ContentHandler handler,
        Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {
    final int maxSummaryLength = 500;

    ContentHandler body = new BodyContentHandler();
    ContentHandler tee = new TeeContentHandler(handler, body);
    super.parse(stream, tee, metadata, context);

    String content = body.toString();
    metadata.set("fulltext", content);

    int length = Math.min(content.length(), maxSummaryLength);
    // Do not cut between the two halves of a supplementary character:
    // a trailing lone high surrogate is not valid text and cannot be
    // encoded when the metadata is serialized.
    if (length > 0 && length < content.length()
            && Character.isHighSurrogate(content.charAt(length - 1))
            && Character.isLowSurrogate(content.charAt(length))) {
        length--;
    }
    metadata.set("summary", content.substring(0, length));

    for (String link : RegexUtils.extractLinks(content)) {
        metadata.add("outlinks", link);
    }
}
/**
 * Test {@link RegexUtils#extractLinks(String)} for ftp.
 * <p>
 * The input contains two {@code ftp://} URLs and one bare host name
 * ({@code www.google.com}); only the two URLs with a scheme are expected,
 * in the order they appear in the text.
 */
@Test
public void testExtractLinksFtp() {
    List<String> links = RegexUtils.extractLinks(
            "Test with ftp://www.nutch.org is it found? "
            + "What about www.google.com at ftp://www.google.de");

    // assertEquals reports the actual number of links on failure,
    // which a boolean assertTrue(size == 2) cannot do.
    assertEquals("Url not found!", 2, links.size());
    assertEquals("Wrong URL", "ftp://www.nutch.org", links.get(0));
    assertEquals("Wrong URL", "ftp://www.google.de", links.get(1));
}
}
/**
 * Test {@link RegexUtils#extractLinks(String)} with no links.
 * <p>
 * Checks three inputs in turn - {@code null}, the empty string, and text
 * that only mentions a bare host name - and expects a non-null, empty
 * list for each of them.
 */
@Test
public void testExtractLinksNone() {
    String[] inputs = {
        null,
        "",
        "Test with no links "
            + "What about www.google.com"
    };

    for (String input : inputs) {
        List<String> extracted = RegexUtils.extractLinks(input);
        assertNotNull(extracted);
        assertEquals(0, extracted.size());
    }
}
/**
 * Test {@link RegexUtils#extractLinks(String)} for http.
 * <p>
 * The input contains three {@code http://} URLs and one bare host name
 * ({@code www.google.com}); only the three URLs with a scheme are
 * expected, in the order they appear in the text.
 */
@Test
public void testExtractLinksHttp() {
    List<String> links = RegexUtils.extractLinks(
            "Test with http://www.nutch.org/index.html is it found? "
            + "What about www.google.com at http://www.google.de "
            + "A longer URL could be http://www.sybit.com/solutions/portals.html");

    // assertEquals reports the actual number of links on failure,
    // which a boolean assertTrue(size == 3) cannot do.
    assertEquals("Url not found!", 3, links.size());
    assertEquals("Wrong URL", "http://www.nutch.org/index.html", links.get(0));
    assertEquals("Wrong URL", "http://www.google.de", links.get(1));
    assertEquals("Wrong URL",
            "http://www.sybit.com/solutions/portals.html", links.get(2));
}
/**
 * Forwards the call to the delegated parser and post-processes the
 * extracted text into metadata entries.
 * <p>
 * The SAX events are passed both to the given handler and to an internal
 * {@link BodyContentHandler}. Once the superclass parse has returned, the
 * collected body text is stored in the metadata as follows:
 * <ul>
 *   <li>{@code fulltext} - the complete collected text</li>
 *   <li>{@code summary} - at most the first 500 UTF-16 chars of that text;
 *       the cut never separates a surrogate pair</li>
 *   <li>{@code outlinks} - one value per link returned by
 *       {@link RegexUtils#extractLinks(String)}</li>
 * </ul>
 *
 * @param stream   the document stream handed to the superclass parser
 * @param handler  the caller's handler; receives the same events as the
 *                 internal body collector
 * @param metadata document metadata; updated with the entries listed above
 * @param context  parse context handed to the superclass parser
 * @throws IOException   declared for the delegated parse call
 * @throws SAXException  declared for the delegated parse call
 * @throws TikaException declared for the delegated parse call
 */
public void parse(
        InputStream stream, ContentHandler handler,
        Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {
    final int maxSummaryLength = 500;

    ContentHandler body = new BodyContentHandler();
    ContentHandler tee = new TeeContentHandler(handler, body);
    super.parse(stream, tee, metadata, context);

    String content = body.toString();
    metadata.set("fulltext", content);

    int length = Math.min(content.length(), maxSummaryLength);
    // Do not cut between the two halves of a supplementary character:
    // a trailing lone high surrogate is not valid text and cannot be
    // encoded when the metadata is serialized.
    if (length > 0 && length < content.length()
            && Character.isHighSurrogate(content.charAt(length - 1))
            && Character.isLowSurrogate(content.charAt(length))) {
        length--;
    }
    metadata.set("summary", content.substring(0, length));

    for (String link : RegexUtils.extractLinks(content)) {
        metadata.add("outlinks", link);
    }
}
/**
 * Forwards the call to the delegated parser and post-processes the
 * extracted text into metadata entries.
 * <p>
 * The SAX events are passed both to the given handler and to an internal
 * {@link BodyContentHandler}. Once the superclass parse has returned, the
 * collected body text is stored in the metadata as follows:
 * <ul>
 *   <li>{@code fulltext} - the complete collected text</li>
 *   <li>{@code summary} - at most the first 500 UTF-16 chars of that text;
 *       the cut never separates a surrogate pair</li>
 *   <li>{@code outlinks} - one value per link returned by
 *       {@link RegexUtils#extractLinks(String)}</li>
 * </ul>
 *
 * @param stream   the document stream handed to the superclass parser
 * @param handler  the caller's handler; receives the same events as the
 *                 internal body collector
 * @param metadata document metadata; updated with the entries listed above
 * @param context  parse context handed to the superclass parser
 * @throws IOException   declared for the delegated parse call
 * @throws SAXException  declared for the delegated parse call
 * @throws TikaException declared for the delegated parse call
 */
public void parse(
        InputStream stream, ContentHandler handler,
        Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {
    final int maxSummaryLength = 500;

    ContentHandler body = new BodyContentHandler();
    ContentHandler tee = new TeeContentHandler(handler, body);
    super.parse(stream, tee, metadata, context);

    String content = body.toString();
    metadata.set("fulltext", content);

    int length = Math.min(content.length(), maxSummaryLength);
    // Do not cut between the two halves of a supplementary character:
    // a trailing lone high surrogate is not valid text and cannot be
    // encoded when the metadata is serialized.
    if (length > 0 && length < content.length()
            && Character.isHighSurrogate(content.charAt(length - 1))
            && Character.isLowSurrogate(content.charAt(length))) {
        length--;
    }
    metadata.set("summary", content.substring(0, length));

    for (String link : RegexUtils.extractLinks(content)) {
        metadata.add("outlinks", link);
    }
}