nu.validator.htmlparser.io.Encoding java code examples

/**
 * Resolves an externally declared encoding label to an {@link Encoding}.
 *
 * <p>The label is ASCII-lowercased before it is looked up. When the canonical
 * name of the encoding found is {@code "utf-16"} or {@code "utf-32"},
 * {@code swallowBom} is cleared. A label that cannot be resolved is reported
 * through the tokenizer, {@code swallowBom} is set, and {@code null} is
 * returned.
 *
 * @param encoding the declared label; may be {@code null}
 * @return whatever {@code whineAboutEncodingAndReturnActual} yields for the
 *     label, or {@code null} when the label is {@code null} or unsupported
 * @throws SAXException declared by the signature; presumably raised by the
 *     reporting calls (confirm against their declarations)
 */
protected Encoding encodingFromExternalDeclaration(String encoding)
    throws SAXException {
  if (encoding == null) {
    return null;
  }
  final String label = Encoding.toAsciiLowerCase(encoding);
  try {
    final Encoding found = Encoding.forName(label);
    final String canonical = found.getCanonName();
    if ("utf-16".equals(canonical) || "utf-32".equals(canonical)) {
      swallowBom = false;
    }
    return whineAboutEncodingAndReturnActual(label, found);
  } catch (UnsupportedCharsetException unsupported) {
    tokenizer.err("Unsupported character encoding name: \u201C" + label
        + "\u201D. Will sniff.");
    swallowBom = true;
    return null;
  }
}

  /**
   * Records the encoding named by {@code charsetName} in {@code returnValue}.
   *
   * <p>If the named encoding reports a non-null actual HTML encoding, that one
   * is stored instead. An unsupported name stores {@code null}.
   *
   * @param charsetName the charset name to look up
   */
  public void Notify(String charsetName) {
    Encoding resolved;
    try {
      Encoding named = Encoding.forName(charsetName);
      Encoding htmlActual = named.getActualHtmlEncoding();
      resolved = (htmlActual != null) ? htmlActual : named;
    } catch (UnsupportedCharsetException unsupported) {
      resolved = null;
    }
    returnValue = resolved;
  }
}

/**
 * Prints one formatted row per entry of {@code encodingByCookedName}: the map
 * key, the canonical name, and the obscure / registered / ASCII-superset /
 * likely-EBCDIC flags of the mapped encoding.
 *
 * @param args ignored
 */
public static void main(String[] args) {
  final String rowFormat =
      "%21s: canon %21s, obs %5s, reg %5s, asc %5s, ebc %5s\n";
  for (Map.Entry<String, Encoding> row : encodingByCookedName.entrySet()) {
    Encoding value = row.getValue();
    System.out.printf(rowFormat, row.getKey(), value.getCanonName(),
        value.isObscure(), value.isRegistered(), value.isAsciiSuperset(),
        value.isLikelyEbcdic());
  }
}

protected boolean tryCharset(String encoding) throws SAXException {
  encoding = Encoding.toAsciiLowerCase(encoding);
  try {
      return true;
    } else {
      Encoding cs = Encoding.forName(encoding);
      String canonName = cs.getCanonName();
      if (!cs.isAsciiSuperset()) {
        err("The encoding \u201C"
              + encoding
      if (!cs.isRegistered()) {
        if (encoding.startsWith("x-")) {
          err("The encoding \u201C"
      } else if (!cs.getCanonName().equals(encoding)) {
        err("The encoding \u201C" + encoding
            + "\u201D is not the preferred name of the character encoding in use. The preferred name is \u201C"
            + canonName + "\u201D. (Charmod C024)");
      if (cs.isShouldNot()) {
        warn("Authors should not use the character encoding \u201C"
            + encoding
            + "\u201D. It is recommended to use \u201CUTF-8\u201D.");                
      } else if (cs.isObscure()) {
        warn("The character encoding \u201C" + encoding + "\u201D is not widely supported. Better interoperability may be achieved by using \u201CUTF-8\u201D.");
      Encoding actual = cs.getActualHtmlEncoding();
      if (actual == null) {

characterEncoding = Encoding.toAsciiLowerCase(actualName);
encoding = Encoding.toAsciiLowerCase(encoding);
try {
  Encoding cs = Encoding.forName(encoding);
  String canonName = cs.getCanonName();
  if (requireAsciiSuperset) {
    if (!cs.isAsciiSuperset()) {
      fatal("The encoding \u201C"
          + actualName
  if (!cs.isRegistered()) {
    if (encoding.startsWith("x-")) {
      err("The encoding \u201C"
        + "\u201D instead, which is an incompatibility risk.");
  Encoding htmlActual = cs.getActualHtmlEncoding();
  if (htmlActual != null) {
    handler.warn("Documents encoded as \u201C"
        + htmlActual.getCanonName()
        + "\u201D are often mislabeled as \u201C"
        + actualName
        + "\u201D, which is the declared encoding of this document.");
  CharsetDecoder decoder = cs.newDecoder();
  decoder.onMalformedInput(CodingErrorAction.REPORT);
  decoder.onUnmappableCharacter(CodingErrorAction.REPORT);

String canonName = cs.getCanonName();
if (!cs.isRegistered()) {
  if (encoding.startsWith("x-")) {
    tokenizer.err("The encoding \u201C"
      + canonName + "\u201D. (Charmod C024)");
if (cs.isShouldNot()) {
  tokenizer.warn("Authors should not use the character encoding \u201C"
      + encoding
      + "\u201D. It is recommended to use \u201CUTF-8\u201D.");
} else if (cs.isLikelyEbcdic()) {
  tokenizer.warn("Authors should not use EBCDIC-based encodings. It is recommended to use \u201CUTF-8\u201D.");
} else if (cs.isObscure()) {
  tokenizer.warn("The character encoding \u201C"
      + encoding
      + "\u201D is not widely supported. Better interoperability may be achieved by using \u201CUTF-8\u201D.");
Encoding actual = cs.getActualHtmlEncoding();
if (actual == null) {
  return cs;
} else {
  tokenizer.warn("Using \u201C" + actual.getCanonName()
      + "\u201D instead of the declared encoding \u201C"
      + encoding + "\u201D.");

public boolean internalEncodingDeclaration(String internalCharset)
    throws SAXException {
  try {
    internalCharset = Encoding.toAsciiLowerCase(internalCharset);
    Encoding cs;
    if ("utf-16".equals(internalCharset)
      internalCharset = "utf-8";
    } else {
      cs = Encoding.forName(internalCharset);
    Encoding actual = cs.getActualHtmlEncoding();
    if (actual == null) {
      actual = cs;
    if (!actual.isAsciiSuperset()) {
      tokenizer.errTreeBuilder("Internal encoding declaration specified \u201C"
          + internalCharset
          + internalCharset
          + "\u201D disagrees with the actual encoding of the document (\u201C"
          + characterEncoding.getCanonName() + "\u201D).");
    } else {
      Encoding newEnc = whineAboutEncodingAndReturnActual(

/**
 * Runs a {@code CharsetDetector} over this object and maps the detected
 * charset name to an {@link Encoding}, preferring its actual HTML encoding
 * when one is reported.
 *
 * @return the resolved encoding, or {@code null} when it is
 *     {@code Encoding.WINDOWS1252}, is not an ASCII superset, or any
 *     exception occurs during detection or lookup
 * @throws IOException declared by the signature; every {@code Exception}
 *     raised in the body is caught and turned into a {@code null} result
 */
public Encoding sniff() throws IOException {
  try {
    CharsetDetector charsetDetector = new CharsetDetector();
    charsetDetector.setText(this);
    CharsetMatch best = charsetDetector.detect();
    Encoding named = Encoding.forName(best.getName());
    Encoding htmlActual = named.getActualHtmlEncoding();
    Encoding resolved = (htmlActual != null) ? htmlActual : named;
    if (resolved == Encoding.WINDOWS1252 || !resolved.isAsciiSuperset()) {
      return null;
    }
    return resolved;
  } catch (Exception e) {
    // Best effort: any failure means "nothing sniffed".
    return null;
  }
}

  } else if (encoding != Encoding.UTF8) {
    err("Legacy encoding \u201C"
        + encoding.getCanonName()
        + "\u201D used. Documents must use UTF-8.");
    err("The character encoding was not declared. Proceeding using \u201C" + encoding.getCanonName() + "\u201D.");
        + encoding.getCanonName()
        + "\u201D used. Documents must use UTF-8.");
    if (driver != null) {
this.decoder = encoding.newDecoder();
sniffing = false;
position = 0;

  /**
   * Replaces the current decoder with a new one obtained from {@code newEnc}
   * and then runs {@code initDecoder()}.
   *
   * @param newEnc the encoding that supplies the new decoder
   */
  public void switchEncoding(Encoding newEnc) {
    decoder = newEnc.newDecoder();
    initDecoder();
  }
}

/**
 * Returns the canonical name of the {@code characterEncoding} field.
 *
 * @return the value of {@code characterEncoding.getCanonName()}
 * @throws SAXException declared by the signature; nothing in this body
 *     throws it directly
 */
public String getCharacterEncoding() throws SAXException {
  final String canonicalName = characterEncoding.getCanonName();
  return canonicalName;
}

/**
 * Feeds {@code source} to an {@code nsDetector} and returns the encoding left
 * in {@code returnValue} once detection has finished.
 *
 * @return {@code returnValue} when it is non-null, is not
 *     {@code Encoding.WINDOWS1252}, and is an ASCII superset; otherwise
 *     {@code null}
 * @throws IOException declared by the signature
 */
public Encoding sniff() throws IOException {
  nsDetector charsetDetector = new nsDetector(nsPSMDetector.ALL);
  charsetDetector.Init(this);
  charsetDetector.DoIt(source, length, false);
  charsetDetector.DataEnd();

  Encoding detected = returnValue;
  if (detected == null
      || detected == Encoding.WINDOWS1252
      || !detected.isAsciiSuperset()) {
    return null;
  }
  return detected;
}

/**
 * Decodes {@code testBuf} with {@code cs} and checks the first 0x7F decoded
 * characters, one per index 0 through 0x7E.
 *
 * <p>For an index that {@code isAsciiSupersetnessSensitive} accepts, the
 * decoded character must equal the index itself; for every other index it
 * must be 0x20. The decoder is configured to report malformed and unmappable
 * input rather than substitute for it.
 *
 * @param testBuf the bytes to decode
 * @param cs the charset under test
 * @return {@code true} if every position matched; {@code false} on the first
 *     mismatch or if decoding throws
 */
private static boolean asciiMapsToBasicLatin(byte[] testBuf, Charset cs) {
  CharsetDecoder strictDecoder = cs.newDecoder()
      .onMalformedInput(CodingErrorAction.REPORT)
      .onUnmappableCharacter(CodingErrorAction.REPORT);
  Reader decoded =
      new InputStreamReader(new ByteArrayInputStream(testBuf), strictDecoder);
  try {
    for (int index = 0; index < 0x7F; index++) {
      int expected = isAsciiSupersetnessSensitive(index) ? index : 0x20;
      if (decoded.read() != expected) {
        return false;
      }
    }
    return true;
  } catch (Exception e) {
    // Covers IOException from read() as well as any runtime failure.
    return false;
  } catch (CoderMalfunctionError e) {
    // An Error, not an Exception, so it needs its own handler.
    return false;
  }
}

protected boolean tryCharset(String encoding) throws SAXException {
  encoding = Encoding.toAsciiLowerCase(encoding);
  try {
      return true;
    } else {
      Encoding cs = Encoding.forName(encoding);
      String canonName = cs.getCanonName();
      if (!cs.isAsciiSuperset()) {
        err("The encoding \u201C"
              + encoding
      if (!cs.isRegistered()) {
        if (encoding.startsWith("x-")) {
          err("The encoding \u201C"
      } else if (!cs.getCanonName().equals(encoding)) {
        err("The encoding \u201C" + encoding
            + "\u201D is not the preferred name of the character encoding in use. The preferred name is \u201C"
            + canonName + "\u201D. (Charmod C024)");
      if (cs.isShouldNot()) {
        warn("Authors should not use the character encoding \u201C"
            + encoding
            + "\u201D. It is recommended to use \u201CUTF-8\u201D.");                
      } else if (cs.isObscure()) {
        warn("The character encoding \u201C" + encoding + "\u201D is not widely supported. Better interoperability may be achieved by using \u201CUTF-8\u201D.");
      Encoding actual = cs.getActualHtmlEncoding();
      if (actual == null) {

String canonName = cs.getCanonName();
if (!cs.isRegistered()) {
  if (encoding.startsWith("x-")) {
    tokenizer.err("The encoding \u201C"
      + canonName + "\u201D. (Charmod C024)");
if (cs.isShouldNot()) {
  tokenizer.warn("Authors should not use the character encoding \u201C"
      + encoding
      + "\u201D. It is recommended to use \u201CUTF-8\u201D.");
} else if (cs.isLikelyEbcdic()) {
  tokenizer.warn("Authors should not use EBCDIC-based encodings. It is recommended to use \u201CUTF-8\u201D.");
} else if (cs.isObscure()) {
  tokenizer.warn("The character encoding \u201C"
      + encoding
      + "\u201D is not widely supported. Better interoperability may be achieved by using \u201CUTF-8\u201D.");
Encoding actual = cs.getActualHtmlEncoding();
if (actual == null) {
  return cs;
} else {
  tokenizer.warn("Using \u201C" + actual.getCanonName()
      + "\u201D instead of the declared encoding \u201C"
      + encoding + "\u201D.");

public boolean internalEncodingDeclaration(String internalCharset)
    throws SAXException {
  try {
    internalCharset = Encoding.toAsciiLowerCase(internalCharset);
    Encoding cs;
    if ("utf-16".equals(internalCharset)
      internalCharset = "utf-8";
    } else {
      cs = Encoding.forName(internalCharset);
    Encoding actual = cs.getActualHtmlEncoding();
    if (actual == null) {
      actual = cs;
    if (!actual.isAsciiSuperset()) {
      tokenizer.errTreeBuilder("Internal encoding declaration specified \u201C"
          + internalCharset
          + internalCharset
          + "\u201D disagrees with the actual encoding of the document (\u201C"
          + characterEncoding.getCanonName() + "\u201D).");
    } else {
      Encoding newEnc = whineAboutEncodingAndReturnActual(

/**
 * Runs a {@code CharsetDetector} over this object and maps the detected
 * charset name to an {@link Encoding}, preferring its actual HTML encoding
 * when one is reported.
 *
 * @return the resolved encoding, or {@code null} when it is
 *     {@code Encoding.WINDOWS1252}, is not an ASCII superset, or any
 *     exception occurs during detection or lookup
 * @throws IOException declared by the signature; every {@code Exception}
 *     raised in the body is caught and turned into a {@code null} result
 */
public Encoding sniff() throws IOException {
  try {
    CharsetDetector charsetDetector = new CharsetDetector();
    charsetDetector.setText(this);
    CharsetMatch best = charsetDetector.detect();
    Encoding named = Encoding.forName(best.getName());
    Encoding htmlActual = named.getActualHtmlEncoding();
    Encoding resolved = (htmlActual != null) ? htmlActual : named;
    if (resolved == Encoding.WINDOWS1252 || !resolved.isAsciiSuperset()) {
      return null;
    }
    return resolved;
  } catch (Exception e) {
    // Best effort: any failure means "nothing sniffed".
    return null;
  }
}

  } else if (encoding != Encoding.UTF8) {
    err("Legacy encoding \u201C"
        + encoding.getCanonName()
        + "\u201D used. Documents must use UTF-8.");
    err("The character encoding was not declared. Proceeding using \u201C" + encoding.getCanonName() + "\u201D.");
        + encoding.getCanonName()
        + "\u201D used. Documents must use UTF-8.");
    if (driver != null) {
this.decoder = encoding.newDecoder();
sniffing = false;
position = 0;

  /**
   * Replaces the current decoder with a new one obtained from {@code newEnc}
   * and then runs {@code initDecoder()}.
   *
   * @param newEnc the encoding that supplies the new decoder
   */
  public void switchEncoding(Encoding newEnc) {
    decoder = newEnc.newDecoder();
    initDecoder();
  }
}

/**
 * Returns the canonical name of the {@code characterEncoding} field.
 *
 * @return the value of {@code characterEncoding.getCanonName()}
 * @throws SAXException declared by the signature; nothing in this body
 *     throws it directly
 */
public String getCharacterEncoding() throws SAXException {
  final String canonicalName = characterEncoding.getCanonName();
  return canonicalName;
}

Most used methods

forName
getActualHtmlEncoding
Returns the actualHtmlEncoding.
getCanonName
Returns the canonName.
isAsciiSuperset
Returns the asciiSuperset.
isRegistered
newDecoder
toAsciiLowerCase
isAsciiSupersetnessSensitive
isLikelyEbcdic
isObscure
isShouldNot
toNameKey

Popular in Java

Making http requests using okhttp
setScale (BigDecimal)
setContentView (Activity)
getOriginalFilename (MultipartFile)
Return the original filename in the client's filesystem. This may contain path information depending
PriorityQueue (java.util)
A PriorityQueue holds elements on a priority heap, which orders the elements according to their natu
Stream (java.util.stream)
A sequence of elements supporting sequential and parallel aggregate operations. The following exampl
IOUtils (org.apache.commons.io)
General IO stream manipulation utilities. This class provides static utility methods for input/outpu
Window (java.awt)
A Window object is a top-level window with no borders and no menubar. The default layout for a windo
Response (javax.ws.rs.core)
Defines the contract between a returned instance and the runtime when an application needs to provid
Runner (org.openjdk.jmh.runner)
From CI to AI: The AI layer in your organization

How to use Encoding in nu.validator.htmlparser.io

Best Java code snippets using nu.validator.htmlparser.io.Encoding (Showing top 20 results out of 315)

How to use
Encoding
in
nu.validator.htmlparser.io