/**
 * Looks up a charset by name through {@code CharsetUtils.forName}.
 *
 * @param name charset name to resolve
 * @return the resolved charset, or {@code ASCII} when the lookup throws
 *         any exception
 */
private static Charset getCharset(String name) {
    Charset resolved;
    try {
        resolved = CharsetUtils.forName(name);
    } catch (Exception ignored) {
        // Best-effort lookup: any failure falls back to the ASCII constant.
        resolved = ASCII;
    }
    return resolved;
}
/**
 * Normalizes a charset name by resolving it with {@code forName} and
 * reporting the resolved charset's {@code name()}.
 *
 * @param charsetName name of charset to process
 * @return the name of the resolved charset, or {@code null} when
 *         resolution throws any exception
 */
public static String clean(String charsetName) {
    String normalized = null;
    try {
        normalized = forName(charsetName).name();
    } catch (Exception ignored) {
        // A name that cannot be resolved is reported to the caller as null.
    }
    return normalized;
}
private Charset findCharset(String s) { Matcher equiv = HTTP_META_PATTERN.matcher(s); Matcher charsetMatcher = FLEXIBLE_CHARSET_ATTR_PATTERN.matcher(""); //iterate through meta tags while (equiv.find()) { String attrs = equiv.group(1); charsetMatcher.reset(attrs); //iterate through charset= and return the first match //that is valid while (charsetMatcher.find()) { String candCharset = charsetMatcher.group(1); if (CHARSETS_UNSUPPORTED_BY_IANA.contains(candCharset.toLowerCase(Locale.US))) { continue; } if ("x-user-defined".equalsIgnoreCase(candCharset)) { candCharset = "windows-1252"; } if (CharsetUtils.isSupported(candCharset)) { try { return CharsetUtils.forName(candCharset); } catch (Exception e) { //ignore } } } } return null; }
public void report(String name) { if (Constants.CHARSET_WINDOWS_1252.equals(name)) { if (hint != null) { // Use the encoding hint when available name = hint; } else if (statistics.count('\r') == 0) { // If there are no CR(LF)s, then the encoding is more // likely to be ISO-8859-1(5) than windows-1252 if (statistics.count(0xa4) > 0) { // currency/euro sign // The general currency sign is hardly ever used in // ISO-8859-1, so it's more likely that we're dealing // with ISO-8859-15, where the character is used for // the euro symbol, which is more commonly used. name = CHARSET_ISO_8859_15; } else { name = CHARSET_ISO_8859_1; } } } try { this.charset = CharsetUtils.forName(name); } catch (Exception e) { // ignore } }
if (charset != null) { try { return CharsetUtils.forName(charset); } catch (Exception e) {
/**
 * Looks up a charset by name through {@code CharsetUtils.forName}.
 *
 * @param name charset name to resolve
 * @return the resolved charset, or {@code ASCII} when the lookup throws
 *         any exception
 */
private static Charset getCharset(String name) {
    Charset resolved;
    try {
        resolved = CharsetUtils.forName(name);
    } catch (Exception ignored) {
        // Best-effort lookup: any failure falls back to the ASCII constant.
        resolved = ASCII;
    }
    return resolved;
}
/**
 * Looks up a charset by name through {@code CharsetUtils.forName}.
 *
 * @param name charset name to resolve
 * @return the resolved charset, or {@code ASCII} when the lookup throws
 *         any exception
 */
private static Charset getCharset(String name) {
    Charset resolved;
    try {
        resolved = CharsetUtils.forName(name);
    } catch (Exception ignored) {
        // Best-effort lookup: any failure falls back to the ASCII constant.
        resolved = ASCII;
    }
    return resolved;
}
return CharsetUtils.forName(match.getName()); } catch (Exception e) {
/**
 * Normalizes a charset name by resolving it with {@code forName} and
 * reporting the resolved charset's {@code name()}.
 *
 * @param charsetName name of charset to process
 * @return the name of the resolved charset, or {@code null} when
 *         resolution throws any exception
 */
public static String clean(String charsetName) {
    String normalized = null;
    try {
        normalized = forName(charsetName).name();
    } catch (Exception ignored) {
        // A name that cannot be resolved is reported to the caller as null.
    }
    return normalized;
}
/**
 * Normalizes a charset name by resolving it with {@code forName} and
 * reporting the resolved charset's {@code name()}.
 *
 * @param charsetName name of charset to process
 * @return the name of the resolved charset, or {@code null} when
 *         resolution throws any exception
 */
public static String clean(String charsetName) {
    String normalized = null;
    try {
        normalized = forName(charsetName).name();
    } catch (Exception ignored) {
        // A name that cannot be resolved is reported to the caller as null.
    }
    return normalized;
}
private Charset findCharset(String s) { Matcher equiv = HTTP_META_PATTERN.matcher(s); Matcher charsetMatcher = FLEXIBLE_CHARSET_ATTR_PATTERN.matcher(""); //iterate through meta tags while (equiv.find()) { String attrs = equiv.group(1); charsetMatcher.reset(attrs); //iterate through charset= and return the first match //that is valid while (charsetMatcher.find()) { String candCharset = charsetMatcher.group(1); if (CharsetUtils.isSupported(candCharset)) { try { return CharsetUtils.forName(candCharset); } catch (Exception e) { //ignore } } } } return null; }
private Charset findCharset(String s) { Matcher equiv = HTTP_META_PATTERN.matcher(s); Matcher charsetMatcher = FLEXIBLE_CHARSET_ATTR_PATTERN.matcher(""); //iterate through meta tags while (equiv.find()) { String attrs = equiv.group(1); charsetMatcher.reset(attrs); //iterate through charset= and return the first match //that is valid while (charsetMatcher.find()) { String candCharset = charsetMatcher.group(1); if (CharsetUtils.isSupported(candCharset)) { try { return CharsetUtils.forName(candCharset); } catch (Exception e) { //ignore } } } } return null; }
private Charset findCharset(String s) { Matcher equiv = HTTP_META_PATTERN.matcher(s); Matcher charsetMatcher = FLEXIBLE_CHARSET_ATTR_PATTERN.matcher(""); //iterate through meta tags while (equiv.find()) { String attrs = equiv.group(1); charsetMatcher.reset(attrs); //iterate through charset= and return the first match //that is valid while (charsetMatcher.find()) { String candCharset = charsetMatcher.group(1); if (CHARSETS_UNSUPPORTED_BY_IANA.contains(candCharset.toLowerCase(Locale.US))) { continue; } if ("x-user-defined".equalsIgnoreCase(candCharset)) { candCharset = "windows-1252"; } if (CharsetUtils.isSupported(candCharset)) { try { return CharsetUtils.forName(candCharset); } catch (Exception e) { //ignore } } } } return null; }
@Override public Charset detect(final InputStream input, final Metadata metadata) throws IOException { // Check if there is user defined encoding in metadata if (metadata != null && metadata.get(Metadata.CONTENT_ENCODING) != null) { try { return CharsetUtils.forName(metadata.get(Metadata.CONTENT_ENCODING)); } catch (final Exception e) { // ignore any exception } } return null; } }
@Override public Charset detect(final InputStream input, final Metadata metadata) throws IOException { // Check if there is user defined encoding in metadata if (metadata != null && metadata.get(Metadata.CONTENT_ENCODING) != null) { try { return CharsetUtils.forName(metadata.get(Metadata.CONTENT_ENCODING)); } catch (final Exception e) { // ignore any exception } } return null; } }
public void report(String name) { if (Constants.CHARSET_WINDOWS_1252.equals(name)) { if (hint != null) { // Use the encoding hint when available name = hint; } else if (statistics.count('\r') == 0) { // If there are no CR(LF)s, then the encoding is more // likely to be ISO-8859-1(5) than windows-1252 if (statistics.count(0xa4) > 0) { // currency/euro sign // The general currency sign is hardly ever used in // ISO-8859-1, so it's more likely that we're dealing // with ISO-8859-15, where the character is used for // the euro symbol, which is more commonly used. name = CHARSET_ISO_8859_15; } else { name = CHARSET_ISO_8859_1; } } } try { this.charset = CharsetUtils.forName(name); } catch (Exception e) { // ignore } }
public void report(String name) { if (Constants.CHARSET_WINDOWS_1252.equals(name)) { if (hint != null) { // Use the encoding hint when available name = hint; } else if (statistics.count('\r') == 0) { // If there are no CR(LF)s, then the encoding is more // likely to be ISO-8859-1(5) than windows-1252 if (statistics.count(0xa4) > 0) { // currency/euro sign // The general currency sign is hardly ever used in // ISO-8859-1, so it's more likely that we're dealing // with ISO-8859-15, where the character is used for // the euro symbol, which is more commonly used. name = CHARSET_ISO_8859_15; } else { name = CHARSET_ISO_8859_1; } } } try { this.charset = CharsetUtils.forName(name); } catch (Exception e) { // ignore } }
public void report(String name) { if (Constants.CHARSET_WINDOWS_1252.equals(name)) { if (hint != null) { // Use the encoding hint when available name = hint; } else if (statistics.count('\r') == 0) { // If there are no CR(LF)s, then the encoding is more // likely to be ISO-8859-1(5) than windows-1252 if (statistics.count(0xa4) > 0) { // currency/euro sign // The general currency sign is hardly ever used in // ISO-8859-1, so it's more likely that we're dealing // with ISO-8859-15, where the character is used for // the euro symbol, which is more commonly used. name = CHARSET_ISO_8859_15; } else { name = CHARSET_ISO_8859_1; } } } try { this.charset = CharsetUtils.forName(name); } catch (Exception e) { // ignore } }
@Override public void report(String name) { if (Constants.CHARSET_WINDOWS_1252.equals(name)) { if (hint != null) { // Use the encoding hint when available name = hint; } else if (statistics.count('\r') == 0) { // If there are no CR(LF)s, then the encoding is more // likely to be ISO-8859-1(5) than windows-1252 if (statistics.count(0xa4) > 0) { // currency/euro sign // The general currency sign is hardly ever used in // ISO-8859-1, so it's more likely that we're dealing // with ISO-8859-15, where the character is used for // the euro symbol, which is more commonly used. name = CHARSET_ISO_8859_15; } else { name = CHARSET_ISO_8859_1; } } } try { this.charset = CharsetUtils.forName(name); } catch (Exception e) { // ignore } }
if (charset != null) { try { return CharsetUtils.forName(charset); } catch (Exception e) {