/**
 * Forwards a namespace prefix mapping, first to the superclass and then to
 * {@code delegate}.
 *
 * @param prefix the namespace prefix being declared
 * @param uri the namespace URI the prefix is mapped to
 * @throws SAXException if the superclass or the delegate rejects the event
 */
@Override
public void startPrefixMapping(String prefix, String uri) throws SAXException {
    super.startPrefixMapping(prefix, uri);
    delegate.startPrefixMapping(prefix, uri);
}
/**
 * Signals the start of a document to the superclass and to {@code delegate},
 * then resets this handler's per-document state: header mode is switched on,
 * footer mode is switched off and the header character counter is zeroed.
 * When {@code includeMarkup} is set, a fresh list of recorded elements is
 * allocated as well.
 *
 * @throws SAXException if the superclass or the delegate rejects the event
 */
@Override
public void startDocument() throws SAXException {
    super.startDocument();
    delegate.startDocument();

    // Reset the parsing state for the new document.
    headerCharOffset = 0;
    inFooter = false;
    inHeader = true;

    if (!includeMarkup) {
        return;
    }
    elements = new ArrayList<>();
}
@Override public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException { super.startElement(uri, localName, qName, atts); if (inHeader) { delegate.startElement(uri, localName, qName, atts); } else if (inFooter) { // Do nothing } else if (includeMarkup) { elements.add(new RecordedElement(uri, localName, qName, atts)); } else { // This happens for the <body> element, if we're not doing markup. delegate.startElement(uri, localName, qName, atts); } }
/**
 * Fetches the HTML document at {@code url}, builds a {@link TextDocument}
 * from it, runs {@code extractor} over that text document and finally hands
 * the processed document, together with a second input source obtained from
 * the fetched HTML, to {@code process(TextDocument, InputSource)}.
 *
 * @param url the location of the HTML document to fetch
 * @param extractor the extractor applied to the parsed text document
 * @return the result of {@code process(TextDocument, InputSource)}
 * @throws IOException declared by this method; presumably raised when fetching fails
 * @throws BoilerpipeProcessingException declared by this method for parsing/extraction failures
 * @throws SAXException declared by this method for SAX-level failures
 */
public String process(final URL url, final BoilerpipeExtractor extractor)
        throws IOException, BoilerpipeProcessingException, SAXException {
    final HTMLDocument fetched = HTMLFetcher.fetch(url);

    // First pass over the HTML: build the text document and run the extractor on it.
    final BoilerpipeSAXInput saxInput = new BoilerpipeSAXInput(fetched.toInputSource());
    final TextDocument textDoc = saxInput.getTextDocument();
    extractor.process(textDoc);

    // Second pass: a new input source is requested from the same fetched document.
    final InputSource secondPassSource = fetched.toInputSource();
    return process(textDoc, secondPassSource);
}
/**
 * Retrieves the {@link TextDocument} by delegating to
 * {@code getTextDocument(BoilerpipeHTMLParser)} with a newly constructed,
 * default-configured {@link BoilerpipeHTMLParser}.
 *
 * @return the text document produced with the default parser
 * @throws BoilerpipeProcessingException if the delegated call fails
 */
public TextDocument getTextDocument() throws BoilerpipeProcessingException {
    final BoilerpipeHTMLParser defaultParser = new BoilerpipeHTMLParser();
    return getTextDocument(defaultParser);
}
/**
 * Registers {@code action} for {@code tag} without discarding an action that
 * is already registered. If the tag has no action yet, {@code action} is set
 * directly; otherwise the existing action and {@code action} are wrapped in a
 * {@link CommonTagActions.Chained} (existing one first) and that is set.
 *
 * @param tag the tag name to register the action for
 * @param action the action to add
 */
protected void addTagAction(final String tag, final TagAction action) {
    final TagAction existing = get(tag);
    if (existing != null) {
        // Keep the earlier action and chain the new one behind it.
        setTagAction(tag, new CommonTagActions.Chained(existing, action));
        return;
    }
    setTagAction(tag, action);
}
}
/**
 * Start-of-element hook: asks {@code instance} to add whitespace if necessary
 * and then registers {@code action} on it as a label action.
 *
 * @param instance the content handler the calls are made on
 * @param localName the local name of the element (not used here)
 * @param qName the qualified name of the element (not used here)
 * @param atts the attributes of the element (not used here)
 * @return always {@code false}
 */
public boolean start(BoilerpipeHTMLContentHandler instance, String localName,
        String qName, Attributes atts) {
    instance.addWhitespaceIfNecessary();
    instance.addLabelAction(action);
    return false;
}
/**
 * Constructs a {@link BoilerpipeHTMLParser} using a default HTML content
 * handler: a newly created {@link BoilerpipeHTMLContentHandler} is passed to
 * the single-argument constructor.
 */
public BoilerpipeHTMLParser() {
    this(new BoilerpipeHTMLContentHandler());
}
/**
 * Start-of-element hook: asks {@code instance} to add whitespace if necessary
 * and does nothing else.
 *
 * @param instance the content handler the call is made on
 * @param localName the local name of the element (not used here)
 * @param qName the qualified name of the element (not used here)
 * @param atts the attributes of the element (not used here)
 * @return always {@code false}
 */
public boolean start(BoilerpipeHTMLContentHandler instance, String localName,
        String qName, Attributes atts) {
    instance.addWhitespaceIfNecessary();
    return false;
}
/**
 * Factory for an {@link HTMLHighlighter} that is set up to return the full
 * HTML text, with the extracted text portion <b>highlighted</b>.
 *
 * @return a new highlighter, constructed with the flag {@code false}
 */
public static HTMLHighlighter newHighlightingInstance() {
    final HTMLHighlighter highlighter = new HTMLHighlighter(false);
    return highlighter;
}
/**
 * Returns a {@link TextDocument} containing the extracted {@link TextBlock}s,
 * as produced by the underlying content handler.
 * NOTE: Only call this after {@link #parse(org.xml.sax.InputSource)}.
 *
 * @return The {@link TextDocument}
 */
public TextDocument toTextDocument() {
    final TextDocument extracted = contentHandler.toTextDocument();
    return extracted;
}
}
@Override public void characters(char[] chars, int offset, int length) throws SAXException { super.characters(chars, offset, length); if (inHeader) { delegate.characters(chars, offset, length); headerCharOffset++; } else if (inFooter) { // Do nothing } else if (includeMarkup) { RecordedElement element = elements.get(elements.size() - 1); char[] characters = new char[length]; System.arraycopy(chars, offset, characters, 0, length); element.getCharacters().add(characters); } }
@Override public void endElement(String uri, String localName, String qName) throws SAXException { super.endElement(uri, localName, qName); if (inHeader) { delegate.endElement(uri, localName, qName); inHeader = !localName.equals("head"); } else if (inFooter) { // Do nothing } else if (localName.equals("body")) { inFooter = true; } else if (includeMarkup) { // Add the end element, and the continuation from the previous element elements.add(new RecordedElement(uri, localName, qName)); elements.add(new RecordedElement()); } }
@Override public void endDocument() throws SAXException { super.endDocument();
/**
 * Retrieves the {@link TextDocument} by delegating to
 * {@code getTextDocument(BoilerpipeHTMLParser)} with a newly constructed,
 * default-configured {@link BoilerpipeHTMLParser}.
 *
 * @return the text document produced with the default parser
 * @throws BoilerpipeProcessingException if the delegated call fails
 */
public TextDocument getTextDocument() throws BoilerpipeProcessingException {
    final BoilerpipeHTMLParser defaultParser = new BoilerpipeHTMLParser();
    return getTextDocument(defaultParser);
}
/**
 * Constructs a {@link BoilerpipeHTMLParser} using a default HTML content
 * handler: a newly created {@link BoilerpipeHTMLContentHandler} is passed to
 * the single-argument constructor.
 */
public BoilerpipeHTMLParser() {
    this(new BoilerpipeHTMLContentHandler());
}
/**
 * Retrieves the {@link TextDocument} by delegating to
 * {@code getTextDocument(BoilerpipeHTMLParser)} with a newly constructed,
 * default-configured {@link BoilerpipeHTMLParser}.
 *
 * @return the text document produced with the default parser
 * @throws BoilerpipeProcessingException if the delegated call fails
 */
public TextDocument getTextDocument() throws BoilerpipeProcessingException {
    final BoilerpipeHTMLParser defaultParser = new BoilerpipeHTMLParser();
    return getTextDocument(defaultParser);
}
/**
 * Retrieves the {@link TextDocument} by delegating to
 * {@code getTextDocument(BoilerpipeHTMLParser)} with a newly constructed,
 * default-configured {@link BoilerpipeHTMLParser}.
 *
 * @return the text document produced with the default parser
 * @throws BoilerpipeProcessingException if the delegated call fails
 */
public TextDocument getTextDocument() throws BoilerpipeProcessingException {
    final BoilerpipeHTMLParser defaultParser = new BoilerpipeHTMLParser();
    return getTextDocument(defaultParser);
}