/**
 * Default constructor.
 *
 * <p>Hands a newly created {@link HTMLConfiguration} to the superclass
 * constructor; no other state is set up here.
 */
public SAXParser() {
    super(new HTMLConfiguration());
} // <init>()
protected HTMLConfiguration newConfiguration() { HTMLConfiguration config = new HTMLConfiguration(); // Maintain original case for elements and attributes config.setProperty("http://cyberneko.org/html/properties/names/elems", "match"); config.setProperty("http://cyberneko.org/html/properties/names/attrs", "no-change"); // Get notified of entity and character references config.setFeature("http://apache.org/xml/features/scanner/notify-char-refs", true); config.setFeature("http://cyberneko.org/html/features/scanner/notify-builtin-refs", true); config.setFeature("http://xml.org/sax/features/namespaces", true); return config; }
addComponent(fDocumentScanner); addComponent(fTagBalancer); addComponent(fNamespaceBinder); BALANCE_TAGS, }; addRecognizedFeatures(recognizedFeatures); setFeature(AUGMENTATIONS, false); setFeature(NAMESPACES, true); setFeature(VALIDATION, false); setFeature(REPORT_ERRORS, false); setFeature(SIMPLE_ERROR_FORMAT, false); setFeature(BALANCE_TAGS, true); "http://apache.org/xml/features/scanner/notify-builtin-refs", }; addRecognizedFeatures(recognizedFeatures); "http://apache.org/xml/features/scanner/notify-char-refs", }; addRecognizedFeatures(recognizedFeatures); ERROR_REPORTER, }; addRecognizedProperties(recognizedProperties); setProperty(NAMES_ELEMS, "upper"); setProperty(NAMES_ATTRS, "lower"); setProperty(ERROR_REPORTER, fErrorReporter);
/**
 * Builds a parser configuration for templates.
 *
 * <p>Registers an {@code AdditionalHandlerFilter} both as a component and as
 * the sole entry of the filter chain, applies the name-case, encoding and
 * tag-balancing settings below, then installs {@code scanner} as the document
 * scanner and resets it against this configuration.
 *
 * @param scanner                scanner stored in {@code fDocumentScanner} and reset with this configuration
 * @param templateDefaultCharset value set for {@code TemplateScanner.HTML_DEFAULT_ENCODING}
 * @param balanceTag             value set for the {@code BALANCE_TAGS} feature
 */
public TemplateParserConfiguration(HTMLScanner scanner, String templateDefaultCharset, boolean balanceTag) {
    AdditionalHandlerFilter starter = new AdditionalHandlerFilter();
    addComponent(starter);
    setProperty(TemplateScanner.HTML_NAMES_ELEMS, "match");
    setProperty(TemplateScanner.HTML_NAMES_ATTRS, "no-change");
    /* Default encoding used when the template itself does not specify one.
     * (The original note assumed UTF-8; the value actually comes from
     * templateDefaultCharset.) */
    setProperty(TemplateScanner.HTML_DEFAULT_ENCODING, templateDefaultCharset);
    setProperty(TemplateScanner.FILTERS, new XMLDocumentFilter[] { starter });
    /* Option that makes the output reproduce the original template content faithfully.
     * Note, however, that anything after </html> is ignored. When false (the default),
     * Neko forcibly moves whatever follows </body> and </html> in front of them. */
    setFeature(IGNORE_OUTSIDE_CONTENT, true);
    /* Option that leaves the document untouched even when <html> or <body> is missing.
     * Without it, those elements are added automatically. */
    setFeature(DOCUMENT_FRAGMENT, true);
    /* Option that automatically adds optional closing tags and the like.
     * Normally this should not be false, but in HTML5 the <a> tag may act as a block
     * element while NekoHTML treats it as inline, which causes unintended behaviour;
     * callers may therefore pass false and take responsibility for keeping the HTML
     * balanced themselves. */
    setFeature(BALANCE_TAGS, balanceTag);
    fDocumentScanner = scanner;
    fDocumentScanner.reset(this);
}
final HTMLConfiguration configuration = new HTMLConfiguration(); if (!HTMLParserFactory.getHTMLParserListeners().isEmpty() || HTMLParserFactory.isParserWarningsEnabled()) { configuration.setErrorHandler( new ErrorHandler( url ) ); configuration.setFeature( REPORT_ERRORS, true); configuration.setFeature( AUGMENTATIONS, true ); final ScriptFilter javaScriptFilter = new ScriptFilter( configuration ); configuration.setProperty( FILTERS, new XMLDocumentFilter[] { javaScriptFilter } ); if (HTMLParserFactory.isPreserveTagCase()) { configuration.setProperty( TAG_NAME_CASE, "match" ); configuration.setProperty( ATTRIBUTE_NAME_CASE, "no-change" ); } else { configuration.setProperty( TAG_NAME_CASE, "lower" ); configuration.setProperty( ATTRIBUTE_NAME_CASE, "lower" ); configuration.setProperty(TAG_NAME_CASE, "upper"); configuration.setProperty(ATTRIBUTE_NAME_CASE, "upper"); configuration.setProperty(TAG_NAME_CASE, "lower"); configuration.setProperty(ATTRIBUTE_NAME_CASE, "lower");
/**
 * Creates a parser for HTML documents.
 *
 * <p>A Xerces {@code DOMParser} is built on top of a NekoHTML configuration
 * whose element-name setting is {@code "match"} and whose attribute-name
 * setting is {@code "no-change"}, i.e. tag names keep their case.
 * This method is used by {@link #parseHTML}.
 *
 * <p>NOTE(review): the previous description also promised that namespace
 * processing is disabled, but no namespace feature is changed here — confirm
 * which of the two is intended.
 *
 * @return instance of <code>org.apache.xerces.parsers.DOMParser</code>
 *         with Neko configuration
 * @throws NestedApplicationException wrapping any exception raised while
 *         the configuration or parser is being created
 */
public static DOMParser getHTMLParser() {
    final String elementNameCase = "http://cyberneko.org/html/properties/names/elems";
    final String attributeNameCase = "http://cyberneko.org/html/properties/names/attrs";
    try {
        HTMLConfiguration nekoConfig = new HTMLConfiguration();
        nekoConfig.setProperty(elementNameCase, "match");
        nekoConfig.setProperty(attributeNameCase, "no-change");
        return new DOMParser(nekoConfig);
    } catch (Exception failure) {
        throw new NestedApplicationException(failure);
    }
}
@Override protected DocumentFragment parseFragmentImpl(String source) throws GadgetException { DocumentHandler handler; HTMLConfiguration config = newConfiguration(); // http://cyberneko.org/html/features/balance-tags/document-fragment // deprecated http://cyberneko.org/html/features/document-fragment config.setFeature("http://cyberneko.org/html/features/balance-tags/document-fragment", true); config.setProperty("http://cyberneko.org/html/properties/balance-tags/fragment-context-stack", new QName[]{new QName(null, "HTML", "HTML", null), new QName(null, "BODY", "BODY", null)}); try { handler = parseHtmlImpl(source, config, new NekoPatchTagBalancer()); } catch (IOException ioe) { return null; } return handler.getFragment(); }
/**
 * Creates a parser for converting a full HTML document to an XHTML document.
 *
 * <p>The underlying NekoHTML configuration is told to recognise the
 * {@code STYLECOLLECTOR_KEY} and {@code LINKFIXER_KEY} properties. The parser then
 * receives, in this order: a {@code ConfluenceHtmlToXmlFilter} filter chain, the
 * doctype override (XHTML 1.0 Transitional public/system ids), lower-case element
 * names, a UTF-8 default encoding, a new style collector buffer and the link fixer.
 * The order is kept as is because the configuration forwards later property
 * assignments to filters that registered themselves as components.
 *
 * @param linkFixer stored on the parser under {@code LINKFIXER_KEY}
 * @return a new {@code HtmlToDomParser} wrapping the configured DOM parser
 * @throws ImportExportException if there is a problem constructing the parser,
 *         i.e. a feature or property is not recognised or not supported
 */
public static HtmlToDomParser getHtmlToXhtmlParser(LinkFixer linkFixer) throws ImportExportException {
    HTMLConfiguration nekoConfig = new HTMLConfiguration();
    nekoConfig.addRecognizedProperties(new String[]{STYLECOLLECTOR_KEY, LINKFIXER_KEY});

    DOMParser domParser = new DOMParser(nekoConfig);
    StringBuffer collectedStyles = new StringBuffer();

    try {
        // Filter chain first: later properties are forwarded to component filters.
        domParser.setProperty(
            "http://cyberneko.org/html/properties/filters",
            new XMLDocumentFilter[]{new ConfluenceHtmlToXmlFilter()});

        // Replace whatever doctype the input declares with XHTML 1.0 Transitional.
        domParser.setFeature("http://cyberneko.org/html/features/override-doctype", true);
        domParser.setProperty(
            "http://cyberneko.org/html/properties/doctype/pubid",
            "-//W3C//DTD XHTML 1.0 Transitional//EN");
        domParser.setProperty(
            "http://cyberneko.org/html/properties/doctype/sysid",
            "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd");

        domParser.setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
        domParser.setProperty("http://cyberneko.org/html/properties/default-encoding", "UTF-8");

        // Custom properties registered on the configuration above.
        domParser.setProperty(STYLECOLLECTOR_KEY, collectedStyles);
        domParser.setProperty(LINKFIXER_KEY, linkFixer);
    } catch (SAXNotRecognizedException notRecognized) {
        throw new ImportExportException(notRecognized);
    } catch (SAXNotSupportedException notSupported) {
        throw new ImportExportException(notSupported);
    }

    return new HtmlToDomParser(domParser);
}
}
/**
 * Sets a property.
 *
 * <p>The value is first stored through the superclass. When the property is
 * {@code FILTERS}, every filter in the stored array that is also an
 * {@code HTMLComponent} is registered via {@code addComponent}. Finally the
 * property is pushed to each component held in {@code fHTMLComponents}.
 *
 * @param propertyId identifier of the property being set
 * @param value      the new property value
 * @throws XMLConfigurationException declared by this method; may be raised by
 *         the superclass or by a component while the property is applied
 */
public void setProperty(String propertyId, Object value)
    throws XMLConfigurationException {
    super.setProperty(propertyId, value);

    if (propertyId.equals(FILTERS)) {
        XMLDocumentFilter[] installedFilters = (XMLDocumentFilter[]) getProperty(FILTERS);
        if (installedFilters != null) {
            for (XMLDocumentFilter candidate : installedFilters) {
                if (candidate instanceof HTMLComponent) {
                    addComponent((HTMLComponent) candidate);
                }
            }
        }
    }

    // The count is captured once, before any component is notified.
    final int componentCount = fHTMLComponents.size();
    for (int index = 0; index < componentCount; index++) {
        HTMLComponent target = (HTMLComponent) fHTMLComponents.elementAt(index);
        target.setProperty(propertyId, value);
    }
} // setProperty(String,Object)
final HTMLConfiguration configuration = new HTMLConfiguration(); if (!HTMLParserFactory.getHTMLParserListeners().isEmpty() || HTMLParserFactory.isParserWarningsEnabled()) { configuration.setErrorHandler( new ErrorHandler( url ) ); configuration.setFeature( REPORT_ERRORS, true); configuration.setFeature( AUGMENTATIONS, true ); final ScriptFilter javaScriptFilter = new ScriptFilter( configuration ); configuration.setProperty( FILTERS, new XMLDocumentFilter[] { javaScriptFilter } ); if (HTMLParserFactory.isPreserveTagCase()) { configuration.setProperty( TAG_NAME_CASE, "match" ); configuration.setProperty( ATTRIBUTE_NAME_CASE, "no-change" ); } else { configuration.setProperty( TAG_NAME_CASE, "lower" ); configuration.setProperty( ATTRIBUTE_NAME_CASE, "lower" ); configuration.setProperty(TAG_NAME_CASE, "upper"); configuration.setProperty(ATTRIBUTE_NAME_CASE, "upper"); configuration.setProperty(TAG_NAME_CASE, "lower"); configuration.setProperty(ATTRIBUTE_NAME_CASE, "lower");
/**
 * Creates a parser for HTML documents.
 *
 * <p>A Xerces {@code DOMParser} is built on top of a NekoHTML configuration
 * whose element-name setting is {@code "match"} and whose attribute-name
 * setting is {@code "no-change"}, i.e. tag names keep their case.
 * This method is used by {@link #parseHTML}.
 *
 * <p>NOTE(review): the previous description also promised that namespace
 * processing is disabled, but no namespace feature is changed here — confirm
 * which of the two is intended.
 *
 * @return instance of <code>org.apache.xerces.parsers.DOMParser</code>
 *         with Neko configuration
 * @throws NestedApplicationException wrapping any exception raised while
 *         the configuration or parser is being created
 */
public static DOMParser getHTMLParser() {
    final String elementNameCase = "http://cyberneko.org/html/properties/names/elems";
    final String attributeNameCase = "http://cyberneko.org/html/properties/names/attrs";
    try {
        HTMLConfiguration nekoConfig = new HTMLConfiguration();
        nekoConfig.setProperty(elementNameCase, "match");
        nekoConfig.setProperty(attributeNameCase, "no-change");
        return new DOMParser(nekoConfig);
    } catch (Exception failure) {
        throw new NestedApplicationException(failure);
    }
}
@Override protected DocumentFragment parseFragmentImpl(String source) throws GadgetException { DocumentHandler handler; HTMLConfiguration config = newConfiguration(); // http://cyberneko.org/html/features/balance-tags/document-fragment // deprecated http://cyberneko.org/html/features/document-fragment config.setFeature("http://cyberneko.org/html/features/balance-tags/document-fragment", true); config.setProperty("http://cyberneko.org/html/properties/balance-tags/fragment-context-stack", new QName[]{new QName(null, "HTML", "HTML", null), new QName(null, "BODY", "BODY", null)}); try { handler = parseHtmlImpl(source, config, new NekoPatchTagBalancer()); } catch (IOException ioe) { return null; } return handler.getFragment(); }
/**
 * Sets a property.
 *
 * <p>The value is first stored through the superclass. When the property is
 * {@code FILTERS}, every filter in the stored array that is also an
 * {@code HTMLComponent} is registered via {@code addComponent}. Finally the
 * property is pushed to each component held in {@code fHTMLComponents}.
 *
 * @param propertyId identifier of the property being set
 * @param value      the new property value
 * @throws XMLConfigurationException declared by this method; may be raised by
 *         the superclass or by a component while the property is applied
 */
public void setProperty(String propertyId, Object value)
    throws XMLConfigurationException {
    super.setProperty(propertyId, value);

    if (propertyId.equals(FILTERS)) {
        XMLDocumentFilter[] installedFilters = (XMLDocumentFilter[]) getProperty(FILTERS);
        if (installedFilters != null) {
            for (XMLDocumentFilter candidate : installedFilters) {
                if (candidate instanceof HTMLComponent) {
                    addComponent((HTMLComponent) candidate);
                }
            }
        }
    }

    // The count is captured once, before any component is notified.
    final int componentCount = fHTMLComponents.size();
    for (int index = 0; index < componentCount; index++) {
        HTMLComponent target = (HTMLComponent) fHTMLComponents.elementAt(index);
        target.setProperty(propertyId, value);
    }
} // setProperty(String,Object)
package sample;

import org.apache.xerces.parsers.AbstractSAXParser;
import org.cyberneko.html.HTMLConfiguration;

/**
 * SAX parser that extends Xerces' {@link AbstractSAXParser} and is always
 * constructed with a NekoHTML {@link HTMLConfiguration}.
 */
public class HTMLSAXParser extends AbstractSAXParser {

    /**
     * Creates the parser, passing a newly created {@link HTMLConfiguration}
     * to the {@link AbstractSAXParser} constructor.
     */
    public HTMLSAXParser() {
        super(new HTMLConfiguration());
    }
}
protected HTMLConfiguration newConfiguration() { HTMLConfiguration config = new HTMLConfiguration(); // Maintain original case for elements and attributes config.setProperty("http://cyberneko.org/html/properties/names/elems", "match"); config.setProperty("http://cyberneko.org/html/properties/names/attrs", "no-change"); // Get notified of entity and character references config.setFeature("http://apache.org/xml/features/scanner/notify-char-refs", true); config.setFeature("http://cyberneko.org/html/features/scanner/notify-builtin-refs", true); config.setFeature("http://xml.org/sax/features/namespaces", true); return config; }
final HTMLConfiguration configuration = new HTMLConfiguration(); if (!HTMLParserFactory.getHTMLParserListeners().isEmpty() || HTMLParserFactory.isParserWarningsEnabled()) { configuration.setErrorHandler( new ErrorHandler( url ) ); configuration.setFeature( REPORT_ERRORS, true); configuration.setFeature( AUGMENTATIONS, true ); final ScriptFilter javaScriptFilter = new ScriptFilter( configuration ); configuration.setProperty( FILTERS, new XMLDocumentFilter[] { javaScriptFilter } ); if (HTMLParserFactory.isPreserveTagCase()) { configuration.setProperty( TAG_NAME_CASE, "match" ); configuration.setProperty( ATTRIBUTE_NAME_CASE, "no-change" ); } else { configuration.setProperty( TAG_NAME_CASE, "lower" ); configuration.setProperty( ATTRIBUTE_NAME_CASE, "lower" ); configuration.setProperty(TAG_NAME_CASE, "upper"); configuration.setProperty(ATTRIBUTE_NAME_CASE, "upper"); configuration.setProperty(TAG_NAME_CASE, "lower"); configuration.setProperty(ATTRIBUTE_NAME_CASE, "lower");
addComponent(fDocumentScanner); addComponent(fTagBalancer); addComponent(fNamespaceBinder); BALANCE_TAGS, }; addRecognizedFeatures(recognizedFeatures); setFeature(AUGMENTATIONS, false); setFeature(NAMESPACES, true); setFeature(VALIDATION, false); setFeature(REPORT_ERRORS, false); setFeature(SIMPLE_ERROR_FORMAT, false); setFeature(BALANCE_TAGS, true); "http://apache.org/xml/features/scanner/notify-builtin-refs", }; addRecognizedFeatures(recognizedFeatures); "http://apache.org/xml/features/scanner/notify-char-refs", }; addRecognizedFeatures(recognizedFeatures); ERROR_REPORTER, }; addRecognizedProperties(recognizedProperties); setProperty(NAMES_ELEMS, "upper"); setProperty(NAMES_ATTRS, "lower"); setProperty(ERROR_REPORTER, fErrorReporter);
@Override protected DocumentFragment parseFragmentImpl(String source) throws GadgetException { DocumentHandler handler; HTMLConfiguration config = newConfiguration(); // http://cyberneko.org/html/features/balance-tags/document-fragment // deprecated http://cyberneko.org/html/features/document-fragment config.setFeature("http://cyberneko.org/html/features/balance-tags/document-fragment", true); config.setProperty("http://cyberneko.org/html/properties/balance-tags/fragment-context-stack", new QName[]{new QName(null, "HTML", "HTML", null), new QName(null, "BODY", "BODY", null)}); try { handler = parseHtmlImpl(source, config, new NekoPatchTagBalancer()); } catch (IOException ioe) { return null; } return handler.getFragment(); }
/**
 * Default constructor.
 *
 * <p>Hands a newly created {@link HTMLConfiguration} to the superclass
 * constructor; no other state is set up here.
 */
public SAXParser() {
    super(new HTMLConfiguration());
} // <init>()
protected HTMLConfiguration newConfiguration() { HTMLConfiguration config = new HTMLConfiguration(); // Maintain original case for elements and attributes config.setProperty("http://cyberneko.org/html/properties/names/elems", "match"); config.setProperty("http://cyberneko.org/html/properties/names/attrs", "no-change"); // Get notified of entity and character references config.setFeature("http://apache.org/xml/features/scanner/notify-char-refs", true); config.setFeature("http://cyberneko.org/html/features/scanner/notify-builtin-refs", true); config.setFeature("http://xml.org/sax/features/namespaces", true); return config; }