/** * Initializes a decoder from external decl. */ protected Encoding encodingFromExternalDeclaration(String encoding) throws SAXException { if (encoding == null) { return null; } encoding = Encoding.toAsciiLowerCase(encoding); try { Encoding cs = Encoding.forName(encoding); if ("utf-16".equals(cs.getCanonName()) || "utf-32".equals(cs.getCanonName())) { swallowBom = false; } return whineAboutEncodingAndReturnActual(encoding, cs); } catch (UnsupportedCharsetException e) { tokenizer.err("Unsupported character encoding name: \u201C" + encoding + "\u201D. Will sniff."); swallowBom = true; } return null; // keep the compiler happy }
/**
 * Records the encoding for the given charset name in {@code returnValue}.
 *
 * <p>The name is resolved with {@code Encoding.forName}; when the resolved
 * encoding reports an actual HTML encoding, that one is stored instead.
 * An unsupported name stores {@code null}.
 *
 * @param charsetName the charset name to resolve
 */
public void Notify(String charsetName) {
    Encoding resolved;
    try {
        Encoding named = Encoding.forName(charsetName);
        Encoding htmlActual = named.getActualHtmlEncoding();
        resolved = (htmlActual == null) ? named : htmlActual;
    } catch (UnsupportedCharsetException e) {
        resolved = null;
    }
    returnValue = resolved;
}
}
/**
 * Prints one formatted row per entry of {@code encodingByCookedName}:
 * the map key, the canonical name and the obscure / registered /
 * ASCII-superset / likely-EBCDIC flags of the encoding.
 *
 * @param args unused
 */
public static void main(String[] args) {
    final String rowFormat = "%21s: canon %21s, obs %5s, reg %5s, asc %5s, ebc %5s\n";
    for (Map.Entry<String, Encoding> row : encodingByCookedName.entrySet()) {
        final String cookedName = row.getKey();
        final Encoding encoding = row.getValue();
        System.out.printf(
                rowFormat,
                cookedName,
                encoding.getCanonName(),
                encoding.isObscure(),
                encoding.isRegistered(),
                encoding.isAsciiSuperset(),
                encoding.isLikelyEbcdic());
    }
}
/**
 * Tries the declared character encoding name (visible excerpt only).
 *
 * <p>Visible behavior: the name is ASCII-lowercased and resolved with
 * {@code Encoding.forName}. {@code err(...)} is called when the encoding is
 * not an ASCII superset, in a branch for unregistered names starting with
 * "x-", and when the canonical name differs from the lowercased name.
 * {@code warn(...)} is called for encodings flagged by {@code isShouldNot()}
 * or {@code isObscure()}. The actual HTML encoding is then looked up.
 *
 * <p>NOTE(review): this excerpt has unbalanced braces and truncated
 * statements, so the exact nesting of the branches and the returned value
 * cannot be determined from here; confirm against the complete method.
 *
 * @param encoding the declared encoding name
 * @throws SAXException declared by the signature
 */
protected boolean tryCharset(String encoding) throws SAXException { encoding = Encoding.toAsciiLowerCase(encoding); try { return true; } else { Encoding cs = Encoding.forName(encoding); String canonName = cs.getCanonName(); if (!cs.isAsciiSuperset()) { err("The encoding \u201C" + encoding if (!cs.isRegistered()) { if (encoding.startsWith("x-")) { err("The encoding \u201C" } else if (!cs.getCanonName().equals(encoding)) { err("The encoding \u201C" + encoding + "\u201D is not the preferred name of the character encoding in use. The preferred name is \u201C" + canonName + "\u201D. (Charmod C024)"); if (cs.isShouldNot()) { warn("Authors should not use the character encoding \u201C" + encoding + "\u201D. It is recommended to use \u201CUTF-8\u201D."); } else if (cs.isObscure()) { warn("The character encoding \u201C" + encoding + "\u201D is not widely supported. Better interoperability may be achieved by using \u201CUTF-8\u201D."); Encoding actual = cs.getActualHtmlEncoding(); if (actual == null) {
// NOTE(review): excerpt from the interior of a method whose header is not
// visible; braces are unbalanced and some statements are truncated.
// Visible steps: store the ASCII-lowercased actualName in characterEncoding,
// lowercase and resolve `encoding` via Encoding.forName, call fatal(...) when
// requireAsciiSuperset is set and the encoding is not an ASCII superset,
// report unregistered names starting with "x-" via err(...), warn through
// `handler` when getActualHtmlEncoding() is non-null, and configure a new
// CharsetDecoder to REPORT malformed input and unmappable characters.
// The exact nesting of these branches cannot be confirmed from this excerpt.
characterEncoding = Encoding.toAsciiLowerCase(actualName); encoding = Encoding.toAsciiLowerCase(encoding); try { Encoding cs = Encoding.forName(encoding); String canonName = cs.getCanonName(); if (requireAsciiSuperset) { if (!cs.isAsciiSuperset()) { fatal("The encoding \u201C" + actualName if (!cs.isRegistered()) { if (encoding.startsWith("x-")) { err("The encoding \u201C" + "\u201D instead, which is an incompatibility risk."); Encoding htmlActual = cs.getActualHtmlEncoding(); if (htmlActual != null) { handler.warn("Documents encoded as \u201C" + htmlActual.getCanonName() + "\u201D are often mislabeled as \u201C" + actualName + "\u201D, which is the declared encoding of this document."); CharsetDecoder decoder = cs.newDecoder(); decoder.onMalformedInput(CodingErrorAction.REPORT); decoder.onUnmappableCharacter(CodingErrorAction.REPORT);
// NOTE(review): excerpt from the interior of a method whose header is not
// visible; braces are unbalanced and some statements are truncated.
// Visible steps: read the canonical name of `cs`; report through
// tokenizer.err(...) in the branch for unregistered names starting with "x-";
// warn through tokenizer.warn(...) when the encoding is flagged by
// isShouldNot(), isLikelyEbcdic() or isObscure(); then look up the actual
// HTML encoding. When there is none, `cs` is returned; otherwise a warning
// states that the actual encoding is used instead of the declared one.
String canonName = cs.getCanonName(); if (!cs.isRegistered()) { if (encoding.startsWith("x-")) { tokenizer.err("The encoding \u201C" + canonName + "\u201D. (Charmod C024)"); if (cs.isShouldNot()) { tokenizer.warn("Authors should not use the character encoding \u201C" + encoding + "\u201D. It is recommended to use \u201CUTF-8\u201D."); } else if (cs.isLikelyEbcdic()) { tokenizer.warn("Authors should not use EBCDIC-based encodings. It is recommended to use \u201CUTF-8\u201D."); } else if (cs.isObscure()) { tokenizer.warn("The character encoding \u201C" + encoding + "\u201D is not widely supported. Better interoperability may be achieved by using \u201CUTF-8\u201D."); Encoding actual = cs.getActualHtmlEncoding(); if (actual == null) { return cs; } else { tokenizer.warn("Using \u201C" + actual.getCanonName() + "\u201D instead of the declared encoding \u201C" + encoding + "\u201D.");
/**
 * Handles an encoding declared inside the document (visible excerpt only).
 *
 * <p>Visible behavior: the declared name is ASCII-lowercased. A branch
 * testing for "utf-16" replaces the name with "utf-8". Otherwise the name is
 * resolved with {@code Encoding.forName} and the encoding itself is used
 * when {@code getActualHtmlEncoding()} returns {@code null}. Problems are
 * reported through {@code tokenizer.errTreeBuilder(...)}, and
 * {@code whineAboutEncodingAndReturnActual} is called in a later branch.
 *
 * <p>NOTE(review): this excerpt has a truncated condition, unbalanced braces
 * and cut-off statements, so the full control flow and the meaning of the
 * boolean result cannot be determined from here; confirm against the
 * complete method.
 *
 * @param internalCharset the encoding name from the internal declaration
 * @throws SAXException declared by the signature
 */
public boolean internalEncodingDeclaration(String internalCharset) throws SAXException { try { internalCharset = Encoding.toAsciiLowerCase(internalCharset); Encoding cs; if ("utf-16".equals(internalCharset) internalCharset = "utf-8"; } else { cs = Encoding.forName(internalCharset); Encoding actual = cs.getActualHtmlEncoding(); if (actual == null) { actual = cs; if (!actual.isAsciiSuperset()) { tokenizer.errTreeBuilder("Internal encoding declaration specified \u201C" + internalCharset + internalCharset + "\u201D disagrees with the actual encoding of the document (\u201C" + characterEncoding.getCanonName() + "\u201D)."); } else { Encoding newEnc = whineAboutEncodingAndReturnActual(
/**
 * Attempts to detect the encoding of this object's content with a
 * {@code CharsetDetector}.
 *
 * <p>The detected charset name is resolved with {@code Encoding.forName}
 * and replaced by its actual HTML encoding when one exists. The result is
 * only accepted when it is an ASCII superset other than
 * {@code Encoding.WINDOWS1252}.
 *
 * @return the accepted encoding, or {@code null} when nothing was detected,
 *         the detected encoding is not accepted, or detection failed
 * @throws IOException declared by the signature; failures raised inside the
 *         detection attempt are caught and reported as {@code null}
 */
public Encoding sniff() throws IOException {
    try {
        CharsetDetector detector = new CharsetDetector();
        detector.setText(this);
        CharsetMatch match = detector.detect();
        if (match == null) {
            // The detector found no candidate; say so explicitly instead of
            // relying on a NullPointerException being swallowed below.
            return null;
        }
        Encoding enc = Encoding.forName(match.getName());
        Encoding actual = enc.getActualHtmlEncoding();
        if (actual != null) {
            enc = actual;
        }
        if (enc != Encoding.WINDOWS1252 && enc.isAsciiSuperset()) {
            return enc;
        }
        return null;
    } catch (Exception e) {
        // Sniffing is best-effort: any failure (for example an unsupported
        // charset name) means "no usable guess".
        return null;
    }
}
// NOTE(review): excerpt from the interior of a method whose header is not
// visible; it starts mid-way through an if/else chain and contains a
// dangling string continuation, so the surrounding conditions are unknown.
// Visible steps: when `encoding` is not Encoding.UTF8, err(...) reports a
// legacy encoding; another err(...) reports that the encoding was not
// declared; inside the `driver != null` branch a new decoder is created from
// `encoding`, `sniffing` is cleared and `position` is reset to 0.
} else if (encoding != Encoding.UTF8) { err("Legacy encoding \u201C" + encoding.getCanonName() + "\u201D used. Documents must use UTF-8."); err("The character encoding was not declared. Proceeding using \u201C" + encoding.getCanonName() + "\u201D."); + encoding.getCanonName() + "\u201D used. Documents must use UTF-8."); if (driver != null) { this.decoder = encoding.newDecoder(); sniffing = false; position = 0;
/**
 * Switches decoding to the given encoding: installs a fresh decoder
 * obtained from {@code newEnc} and then runs {@code initDecoder()}.
 *
 * @param newEnc the encoding to decode with from now on
 */
public void switchEncoding(Encoding newEnc) {
    decoder = newEnc.newDecoder();
    initDecoder();
}
}
/**
 * Returns the canonical name of the current character encoding.
 *
 * @return the value of {@code characterEncoding.getCanonName()}
 * @throws SAXException declared by the signature
 */
public String getCharacterEncoding() throws SAXException {
    final String canonicalName = characterEncoding.getCanonName();
    return canonicalName;
}
/**
 * Runs an {@code nsDetector} over {@code source} (first {@code length}
 * bytes) and returns the encoding left in {@code returnValue}.
 *
 * <p>The value is only returned when it is non-null, is not
 * {@code Encoding.WINDOWS1252} and is an ASCII superset.
 *
 * @return the accepted encoding, or {@code null} otherwise
 * @throws IOException declared by the signature
 */
public Encoding sniff() throws IOException {
    nsDetector detector = new nsDetector(nsPSMDetector.ALL);
    detector.Init(this);
    detector.DoIt(source, length, false);
    detector.DataEnd();

    if (returnValue == null) {
        return null;
    }
    if (returnValue == Encoding.WINDOWS1252) {
        return null;
    }
    if (!returnValue.isAsciiSuperset()) {
        return null;
    }
    return returnValue;
}
/**
 * Checks whether decoding {@code testBuf} with {@code cs} yields the
 * expected characters for the positions 0 through 0x7E.
 *
 * <p>The decoder is configured to report malformed input and unmappable
 * characters. For each position {@code i}, the decoded character must equal
 * {@code i} when {@code isAsciiSupersetnessSensitive(i)} holds, and 0x20
 * otherwise. Any exception or {@code CoderMalfunctionError} raised while
 * reading counts as a mismatch.
 *
 * @param testBuf the bytes to decode
 * @param cs the charset under test
 * @return {@code true} if every decoded character matched
 */
private static boolean asciiMapsToBasicLatin(byte[] testBuf, Charset cs) {
    CharsetDecoder strictDecoder = cs.newDecoder();
    strictDecoder.onMalformedInput(CodingErrorAction.REPORT);
    strictDecoder.onUnmappableCharacter(CodingErrorAction.REPORT);
    Reader decoded = new InputStreamReader(
            new ByteArrayInputStream(testBuf), strictDecoder);
    try {
        int position = 0;
        while (position < 0x7F) {
            int expected = isAsciiSupersetnessSensitive(position) ? position : 0x20;
            if (decoded.read() != expected) {
                return false;
            }
            position++;
        }
        return true;
    } catch (Exception e) {
        // Also covers IOException from the reader.
        return false;
    } catch (CoderMalfunctionError e) {
        return false;
    }
}
/**
 * Tries the declared character encoding name (visible excerpt only).
 *
 * <p>Visible behavior: the name is ASCII-lowercased and resolved with
 * {@code Encoding.forName}. {@code err(...)} is called when the encoding is
 * not an ASCII superset, in a branch for unregistered names starting with
 * "x-", and when the canonical name differs from the lowercased name.
 * {@code warn(...)} is called for encodings flagged by {@code isShouldNot()}
 * or {@code isObscure()}. The actual HTML encoding is then looked up.
 *
 * <p>NOTE(review): this excerpt has unbalanced braces and truncated
 * statements, so the exact nesting of the branches and the returned value
 * cannot be determined from here; confirm against the complete method.
 *
 * @param encoding the declared encoding name
 * @throws SAXException declared by the signature
 */
protected boolean tryCharset(String encoding) throws SAXException { encoding = Encoding.toAsciiLowerCase(encoding); try { return true; } else { Encoding cs = Encoding.forName(encoding); String canonName = cs.getCanonName(); if (!cs.isAsciiSuperset()) { err("The encoding \u201C" + encoding if (!cs.isRegistered()) { if (encoding.startsWith("x-")) { err("The encoding \u201C" } else if (!cs.getCanonName().equals(encoding)) { err("The encoding \u201C" + encoding + "\u201D is not the preferred name of the character encoding in use. The preferred name is \u201C" + canonName + "\u201D. (Charmod C024)"); if (cs.isShouldNot()) { warn("Authors should not use the character encoding \u201C" + encoding + "\u201D. It is recommended to use \u201CUTF-8\u201D."); } else if (cs.isObscure()) { warn("The character encoding \u201C" + encoding + "\u201D is not widely supported. Better interoperability may be achieved by using \u201CUTF-8\u201D."); Encoding actual = cs.getActualHtmlEncoding(); if (actual == null) {
// NOTE(review): excerpt from the interior of a method whose header is not
// visible; braces are unbalanced and some statements are truncated.
// Visible steps: read the canonical name of `cs`; report through
// tokenizer.err(...) in the branch for unregistered names starting with "x-";
// warn through tokenizer.warn(...) when the encoding is flagged by
// isShouldNot(), isLikelyEbcdic() or isObscure(); then look up the actual
// HTML encoding. When there is none, `cs` is returned; otherwise a warning
// states that the actual encoding is used instead of the declared one.
String canonName = cs.getCanonName(); if (!cs.isRegistered()) { if (encoding.startsWith("x-")) { tokenizer.err("The encoding \u201C" + canonName + "\u201D. (Charmod C024)"); if (cs.isShouldNot()) { tokenizer.warn("Authors should not use the character encoding \u201C" + encoding + "\u201D. It is recommended to use \u201CUTF-8\u201D."); } else if (cs.isLikelyEbcdic()) { tokenizer.warn("Authors should not use EBCDIC-based encodings. It is recommended to use \u201CUTF-8\u201D."); } else if (cs.isObscure()) { tokenizer.warn("The character encoding \u201C" + encoding + "\u201D is not widely supported. Better interoperability may be achieved by using \u201CUTF-8\u201D."); Encoding actual = cs.getActualHtmlEncoding(); if (actual == null) { return cs; } else { tokenizer.warn("Using \u201C" + actual.getCanonName() + "\u201D instead of the declared encoding \u201C" + encoding + "\u201D.");
/**
 * Handles an encoding declared inside the document (visible excerpt only).
 *
 * <p>Visible behavior: the declared name is ASCII-lowercased. A branch
 * testing for "utf-16" replaces the name with "utf-8". Otherwise the name is
 * resolved with {@code Encoding.forName} and the encoding itself is used
 * when {@code getActualHtmlEncoding()} returns {@code null}. Problems are
 * reported through {@code tokenizer.errTreeBuilder(...)}, and
 * {@code whineAboutEncodingAndReturnActual} is called in a later branch.
 *
 * <p>NOTE(review): this excerpt has a truncated condition, unbalanced braces
 * and cut-off statements, so the full control flow and the meaning of the
 * boolean result cannot be determined from here; confirm against the
 * complete method.
 *
 * @param internalCharset the encoding name from the internal declaration
 * @throws SAXException declared by the signature
 */
public boolean internalEncodingDeclaration(String internalCharset) throws SAXException { try { internalCharset = Encoding.toAsciiLowerCase(internalCharset); Encoding cs; if ("utf-16".equals(internalCharset) internalCharset = "utf-8"; } else { cs = Encoding.forName(internalCharset); Encoding actual = cs.getActualHtmlEncoding(); if (actual == null) { actual = cs; if (!actual.isAsciiSuperset()) { tokenizer.errTreeBuilder("Internal encoding declaration specified \u201C" + internalCharset + internalCharset + "\u201D disagrees with the actual encoding of the document (\u201C" + characterEncoding.getCanonName() + "\u201D)."); } else { Encoding newEnc = whineAboutEncodingAndReturnActual(
/**
 * Attempts to detect the encoding of this object's content with a
 * {@code CharsetDetector}.
 *
 * <p>The detected charset name is resolved with {@code Encoding.forName}
 * and replaced by its actual HTML encoding when one exists. The result is
 * only accepted when it is an ASCII superset other than
 * {@code Encoding.WINDOWS1252}.
 *
 * @return the accepted encoding, or {@code null} when nothing was detected,
 *         the detected encoding is not accepted, or detection failed
 * @throws IOException declared by the signature; failures raised inside the
 *         detection attempt are caught and reported as {@code null}
 */
public Encoding sniff() throws IOException {
    try {
        CharsetDetector detector = new CharsetDetector();
        detector.setText(this);
        CharsetMatch match = detector.detect();
        if (match == null) {
            // The detector found no candidate; say so explicitly instead of
            // relying on a NullPointerException being swallowed below.
            return null;
        }
        Encoding enc = Encoding.forName(match.getName());
        Encoding actual = enc.getActualHtmlEncoding();
        if (actual != null) {
            enc = actual;
        }
        if (enc != Encoding.WINDOWS1252 && enc.isAsciiSuperset()) {
            return enc;
        }
        return null;
    } catch (Exception e) {
        // Sniffing is best-effort: any failure (for example an unsupported
        // charset name) means "no usable guess".
        return null;
    }
}
// NOTE(review): excerpt from the interior of a method whose header is not
// visible; it starts mid-way through an if/else chain and contains a
// dangling string continuation, so the surrounding conditions are unknown.
// Visible steps: when `encoding` is not Encoding.UTF8, err(...) reports a
// legacy encoding; another err(...) reports that the encoding was not
// declared; inside the `driver != null` branch a new decoder is created from
// `encoding`, `sniffing` is cleared and `position` is reset to 0.
} else if (encoding != Encoding.UTF8) { err("Legacy encoding \u201C" + encoding.getCanonName() + "\u201D used. Documents must use UTF-8."); err("The character encoding was not declared. Proceeding using \u201C" + encoding.getCanonName() + "\u201D."); + encoding.getCanonName() + "\u201D used. Documents must use UTF-8."); if (driver != null) { this.decoder = encoding.newDecoder(); sniffing = false; position = 0;
/**
 * Switches decoding to the given encoding: installs a fresh decoder
 * obtained from {@code newEnc} and then runs {@code initDecoder()}.
 *
 * @param newEnc the encoding to decode with from now on
 */
public void switchEncoding(Encoding newEnc) {
    decoder = newEnc.newDecoder();
    initDecoder();
}
}
/**
 * Returns the canonical name of the current character encoding.
 *
 * @return the value of {@code characterEncoding.getCanonName()}
 * @throws SAXException declared by the signature
 */
public String getCharacterEncoding() throws SAXException {
    final String canonicalName = characterEncoding.getCanonName();
    return canonicalName;
}