private static List<Parser> makeParsers(String[] charsets) { // One more TXTParser than we have charsets, for the real thing List<Parser> parsers = new ArrayList<>(charsets.length+1); for (int i=0; i<charsets.length+1; i++) { parsers.set(i, new TXTParser()); } return parsers; }
new CloseShieldInputStream(stream), metadata, getEncodingDetector(context))) {
new CloseShieldInputStream(stream), metadata, getEncodingDetector(context))) {
/**
 * Example of wiring a {@link CompositeParser} by hand.
 *
 * <p>An {@link HtmlParser} is registered for {@code text/html}, an
 * {@link XMLParser} for {@code application/xml}, and a {@link TXTParser} is
 * installed as the fallback. The composite parser is then invoked on an empty
 * in-memory stream whose metadata declares the content type
 * {@code text/html}.
 *
 * @throws Exception if building the parser or the parse call itself fails
 */
public static void useCompositeParser() throws Exception {
    // Per-media-type delegates.
    MediaType htmlType = MediaType.parse("text/html");
    MediaType xmlType = MediaType.parse("application/xml");

    Map<MediaType, Parser> delegates = new HashMap<MediaType, Parser>();
    delegates.put(htmlType, new HtmlParser());
    delegates.put(xmlType, new XMLParser());

    // Composite parser: explicit delegates plus a plain-text fallback.
    CompositeParser composite = new CompositeParser();
    composite.setParsers(delegates);
    composite.setFallback(new TXTParser());

    // Declare the content type up front via the metadata.
    Metadata meta = new Metadata();
    meta.set(Metadata.CONTENT_TYPE, "text/html");

    // Empty input and a no-op SAX handler are enough for this example.
    InputStream emptyInput = new ByteArrayInputStream(new byte[0]);
    ContentHandler sink = new DefaultHandler();
    ParseContext parseContext = new ParseContext();

    composite.parse(emptyInput, sink, meta, parseContext);
}
break; case "txt": parser = new TXTParser(); break; default:
public MetaContentExtractor() throws Exception{ _detector = DetectorFactory.getInstance().buildDetector(); _autoParser = new AutoDetectParser(_detector); _txtParser = new TXTParser(); // the config file and the url // TODO: should refactor here to take some sort of configuration object String jsonConfig = "{\"components\": [{"+ "\"name\": \"meaningfulweb\","+ "\"class\": \"org.meaningfulweb.cext.processors.MeaningfulwebCompositeProcessor\"}]}"; processorFactory = new HtmlContentProcessorFactory(jsonConfig); htmlExtractor = new HtmlExtractor(); htmlExtractor.setProcessorFactory(processorFactory); }