/**
 * Checks the values returned by {@code CharsetUtils.clean} for a set of raw charset labels,
 * including empty and {@code null} input.
 */
@Test
public void testCleaningCharsetName() {
    // Each row is {expected result, raw input}; rows are verified in the listed order.
    String[][] expectations = {
        {"UTF-8", "utf-8"},
        {null, ""},
        {null, null},
        {"US-ASCII", " us-ascii "},
        {"UTF-8", "\"utf-8\""},
        {"ISO-8859-1", "ISO-8859-1, latin1"},
    };
    for (String[] row : expectations) {
        assertEquals(row[0], CharsetUtils.clean(row[1]));
    }
}
/**
 * Looks up a charset by name through {@code CharsetUtils.forName}.
 *
 * @param name charset name to resolve
 * @return the resolved charset, or {@code ASCII} if the lookup threw any exception
 */
private static Charset getCharset(String name) {
    Charset resolved;
    try {
        resolved = CharsetUtils.forName(name);
    } catch (Exception ignored) {
        // Any failure during lookup falls back to ASCII.
        resolved = ASCII;
    }
    return resolved;
}
private Charset findCharset(String s) { Matcher equiv = HTTP_META_PATTERN.matcher(s); Matcher charsetMatcher = FLEXIBLE_CHARSET_ATTR_PATTERN.matcher(""); //iterate through meta tags while (equiv.find()) { String attrs = equiv.group(1); charsetMatcher.reset(attrs); //iterate through charset= and return the first match //that is valid while (charsetMatcher.find()) { String candCharset = charsetMatcher.group(1); if (CHARSETS_UNSUPPORTED_BY_IANA.contains(candCharset.toLowerCase(Locale.US))) { continue; } if ("x-user-defined".equalsIgnoreCase(candCharset)) { candCharset = "windows-1252"; } if (CharsetUtils.isSupported(candCharset)) { try { return CharsetUtils.forName(candCharset); } catch (Exception e) { //ignore } } } } return null; }
/**
 * Checks that {@code CharsetUtils.isSupported} returns {@code false} for malformed, empty
 * and {@code null} names.
 */
@Test
public void testInvalidCharset() {
    // Inputs are checked in the listed order.
    String[] rejected = {" utf-8", "my charset name", "charset1; charset2", null, ""};
    for (String candidate : rejected) {
        assertFalse(CharsetUtils.isSupported(candidate));
    }
}
/**
 * Checks that {@code CharsetUtils.isSupported} accepts "UTF-8" and rejects "bogus".
 */
@Test
public void testValidCharset() {
    boolean utf8Supported = CharsetUtils.isSupported("UTF-8");
    assertTrue(utf8Supported);

    boolean bogusSupported = CharsetUtils.isSupported("bogus");
    assertFalse(bogusSupported);
}
/**
 * Cleans up a charset name by resolving it with {@code forName} and returning the name
 * reported by the resolved result.
 *
 * @param charsetName name of charset to process
 * @return the name of the resolved charset, or {@code null} if resolving it threw an exception
 */
public static String clean(String charsetName) {
    String cleaned = null;
    try {
        cleaned = forName(charsetName).name();
    } catch (Exception ignored) {
        // The name could not be resolved; the result stays null.
    }
    return cleaned;
}
private Charset findCharset(String s) { Matcher equiv = HTTP_META_PATTERN.matcher(s); Matcher charsetMatcher = FLEXIBLE_CHARSET_ATTR_PATTERN.matcher(""); //iterate through meta tags while (equiv.find()) { String attrs = equiv.group(1); charsetMatcher.reset(attrs); //iterate through charset= and return the first match //that is valid while (charsetMatcher.find()) { String candCharset = charsetMatcher.group(1); if (CharsetUtils.isSupported(candCharset)) { try { return CharsetUtils.forName(candCharset); } catch (Exception e) { //ignore } } } } return null; }
/**
 * Checks the values returned by {@code CharsetUtils.clean} for a set of non-standard or
 * misspelled charset labels.
 */
@Test
public void testFunkyNames() {
    // Each row is {expected result, raw input}; rows are verified in the listed order.
    String[][] expectations = {
        {null, "none"},
        {null, "no"},
        {"UTF-8", "utf-8>"},
        {"ISO-8859-1", "iso-8851-1"},
        {"ISO-8859-15", "8859-15"},
        {"windows-1251", "cp-1251"},
        {"windows-1251", "win1251"},
        {"windows-1251", "WIN-1251"},
        {"windows-1251", "win-1251"},
        {"windows-1252", "Windows"},
        {"KOI8-R", "koi8r"},
    };
    for (String[] row : expectations) {
        assertEquals(row[0], CharsetUtils.clean(row[1]));
    }
}
public void report(String name) { if (Constants.CHARSET_WINDOWS_1252.equals(name)) { if (hint != null) { // Use the encoding hint when available name = hint; } else if (statistics.count('\r') == 0) { // If there are no CR(LF)s, then the encoding is more // likely to be ISO-8859-1(5) than windows-1252 if (statistics.count(0xa4) > 0) { // currency/euro sign // The general currency sign is hardly ever used in // ISO-8859-1, so it's more likely that we're dealing // with ISO-8859-15, where the character is used for // the euro symbol, which is more commonly used. name = CHARSET_ISO_8859_15; } else { name = CHARSET_ISO_8859_1; } } } try { this.charset = CharsetUtils.forName(name); } catch (Exception e) { // ignore } }
private Charset findCharset(String s) { Matcher equiv = HTTP_META_PATTERN.matcher(s); Matcher charsetMatcher = FLEXIBLE_CHARSET_ATTR_PATTERN.matcher(""); //iterate through meta tags while (equiv.find()) { String attrs = equiv.group(1); charsetMatcher.reset(attrs); //iterate through charset= and return the first match //that is valid while (charsetMatcher.find()) { String candCharset = charsetMatcher.group(1); if (CharsetUtils.isSupported(candCharset)) { try { return CharsetUtils.forName(candCharset); } catch (Exception e) { //ignore } } } } return null; }
if (charset != null) { try { return CharsetUtils.forName(charset); } catch (Exception e) {
private Charset findCharset(String s) { Matcher equiv = HTTP_META_PATTERN.matcher(s); Matcher charsetMatcher = FLEXIBLE_CHARSET_ATTR_PATTERN.matcher(""); //iterate through meta tags while (equiv.find()) { String attrs = equiv.group(1); charsetMatcher.reset(attrs); //iterate through charset= and return the first match //that is valid while (charsetMatcher.find()) { String candCharset = charsetMatcher.group(1); if (CHARSETS_UNSUPPORTED_BY_IANA.contains(candCharset.toLowerCase(Locale.US))) { continue; } if ("x-user-defined".equalsIgnoreCase(candCharset)) { candCharset = "windows-1252"; } if (CharsetUtils.isSupported(candCharset)) { try { return CharsetUtils.forName(candCharset); } catch (Exception e) { //ignore } } } } return null; }
// Set outputCharset to the string form of UTF-8, then replace it with the result of CharsetUtils.clean.
outputCharset = StandardCharsets.UTF_8.toString(); outputCharset = CharsetUtils.clean(outputCharset);
/**
 * Looks up a charset by name through {@code CharsetUtils.forName}.
 *
 * @param name charset name to resolve
 * @return the resolved charset, or {@code ASCII} if the lookup threw any exception
 */
private static Charset getCharset(String name) {
    Charset resolved;
    try {
        resolved = CharsetUtils.forName(name);
    } catch (Exception ignored) {
        // Any failure during lookup falls back to ASCII.
        resolved = ASCII;
    }
    return resolved;
}