// Turn on the detector's input filter before running detection.
// NOTE(review): `detector` is presumably an ICU CharsetDetector, in which case
// the filter strips markup tags from the sampled input — confirm against the declaration.
detector.enableInputFilter(true);
/**
 * Guesses the character encoding of a stream using a {@link CharsetDetector}
 * with its input filter enabled.
 *
 * <p>The detector is given a stream that supports mark/reset: the supplied
 * stream is used directly when it already supports marking, otherwise it is
 * wrapped in a {@link BufferedInputStream}.
 *
 * @param is the stream whose encoding should be guessed
 * @return the name of the best matching charset, or {@code null} when the
 *         detector does not find any match
 * @throws IOException if the stream cannot be read
 */
public String guessEncoding(InputStream is) throws IOException {
    CharsetDetector charsetDetector = new CharsetDetector();
    // Ask the stream itself whether it can mark/reset instead of testing for
    // one concrete class, so already-markable streams are not wrapped needlessly.
    InputStream markable = is.markSupported() ? is : new BufferedInputStream(is);
    charsetDetector.setText(markable);
    charsetDetector.enableInputFilter(true);
    CharsetMatch cm = charsetDetector.detect();
    // detect() may yield no match at all; avoid a NullPointerException in that case.
    if (cm == null) {
        return null;
    }
    return cm.getName();
}
/**
 * Detects the character encoding of a string. When the character
 * encoding of what the input is supposed to be is known, specifying
 * it as a declared encoding will influence the detection result.
 * @param input the input to detect encoding on
 * @param declaredEncoding declared input encoding, if known
 * @return the character encoding official name or <code>null</code>
 *         if the input is null or blank, or if no encoding could be matched
 * @throws IOException if there is a problem finding the character encoding
 */
public static String detectCharset(
        String input, String declaredEncoding) throws IOException {
    if (StringUtils.isBlank(input)) {
        return null;
    }
    CharsetDetector cd = new CharsetDetector();
    if (StringUtils.isNotBlank(declaredEncoding)) {
        cd.setDeclaredEncoding(declaredEncoding);
    }
    cd.enableInputFilter(true);
    // The detector works on bytes, so the string is encoded as UTF-8 first.
    cd.setText(input.getBytes("UTF-8"));
    CharsetMatch match = cd.detect();
    // The detector may not find any match; report that as "unknown" (null)
    // rather than failing with a NullPointerException.
    if (match == null) {
        return null;
    }
    String charset = match.getName();
    if (LOG.isDebugEnabled()) {
        LOG.debug("Detected encoding: " + charset);
    }
    return charset;
}
// Enable the input filter, hand the input to the detector, then rewind the input.
// NOTE(review): `cd` is presumably an ICU CharsetDetector and `rewind` a helper
// defined elsewhere; the rewind presumably lets later reads start from the
// beginning again — confirm against the enclosing method.
cd.enableInputFilter(true); cd.setText(input); rewind(input);
@Override public List<ImporterDocument> parseDocument(ImporterDocument doc, Writer output) throws DocumentParserException { try { //TODO have a generic utility method for this? BufferedInputStream is = new BufferedInputStream(doc.getContent()); CharsetDetector detector = new CharsetDetector(); detector.enableInputFilter(true); detector.setText(is); CharsetMatch match = detector.detect(); String charset = StandardCharsets.UTF_8.toString(); if (match != null && Charset.isSupported(match.getName())) { charset = match.getName(); } BufferedReader reader = new BufferedReader( new InputStreamReader(is, charset)); parse(reader, output, doc.getMetadata()); } catch (IOException | ParserConfigurationException | SAXException e) { throw new DocumentParserException( "Could not parse " + doc.getReference(), e); } return null; }
// Turn on the detector's input filter before running detection.
// NOTE(review): `detector` is presumably an ICU CharsetDetector, in which case
// the filter strips markup tags from the sampled input — confirm against the declaration.
detector.enableInputFilter(true);