// Detect the charset of `input` with ICU and obtain a Reader that decodes it.
// The stream is buffered because CharsetDetector.setText(InputStream) needs mark/reset support.
BufferedInputStream bis = new BufferedInputStream(input);
CharsetDetector cd = new CharsetDetector();
cd.setText(bis);
// detect() yields the best match, or null when no charset matched at all.
CharsetMatch cm = cd.detect();
if (cm != null) {
    reader = cm.getReader();
    charset = cm.getName();
} else {
    // UnsupportedCharsetException has no no-arg constructor: a charset name is mandatory.
    throw new UnsupportedCharsetException("Unknown");
}
public static void XMLtoString(File file) { String encoding = ""; String str = ""; try { // detect the encoding of the file CharsetDetector cd = new CharsetDetector().setText(new BufferedInputStream(new FileInputStream(file))); encoding = cd.detect().getName(); // to avoid the BOM ("byte order mark") being added to the String, encoding is specified as a parameter str = FileUtils.readFileToString(file, encoding); } catch (IOException e) { System.err.println("Caught IOException: " + e.getMessage()); } }
public Charset detectEncoding(byte[] input) { // Fall back to the incredibly slow ICU. It might be better to just skip this entirely. CharsetDetector detector = new CharsetDetector(); detector.setText(input); CharsetMatch match = detector.detect(); return Charset.forName(match.getName().toUpperCase()); } }
public Charset detectEncoding(byte[] input) { // Fall back to the incredibly slow ICU. It might be better to just skip this entirely. CharsetDetector detector = new CharsetDetector(); detector.setText(input); CharsetMatch match = detector.detect(); return Charset.forName(match.getName().toUpperCase()); } }
public Charset detectEncoding(byte[] input) { // Fall back to the incredibly slow ICU. It might be better to just skip this entirely. CharsetDetector detector = new CharsetDetector(); detector.setText(input); CharsetMatch match = detector.detect(); return Charset.forName(match.getName().toUpperCase()); } }
// Minimal ICU charset detection over an in-memory byte array.
byte[] byteData = ...;

// Hand the raw bytes to a fresh detector, then ask for its single best guess.
CharsetDetector detector = new CharsetDetector();
detector.setText(byteData);
CharsetMatch match = detector.detect();
/**
 * Collects every charset ICU considers a possible match for the given stream.
 *
 * <p>ICU can report names for which this JVM has no {@link Charset} (for example the
 * directional variants such as {@code IBM420_rtl}); those candidates are skipped rather
 * than letting {@code Charset.forName} abort the whole detection with an unchecked exception.
 *
 * @param source the stream whose leading bytes are analysed
 * @return the JVM-supported charsets among ICU's matches; empty if there are none
 * @throws CharsetDetectorException if reading from {@code source} fails
 */
@Override
public Set<Charset> detect(InputStream source) throws CharsetDetectorException {
    Set<Charset> set = new HashSet<Charset>();
    com.ibm.icu.text.CharsetDetector charsetDetector = new com.ibm.icu.text.CharsetDetector();
    try {
        // setText(InputStream) relies on mark/reset, which the buffered wrapper provides.
        charsetDetector.setText(new BufferedInputStream(source));
        for (CharsetMatch match : charsetDetector.detectAll()) {
            String name = match.getName();
            if (Charset.isSupported(name)) {
                set.add(Charset.forName(name));
            }
        }
    } catch (IOException e) {
        throw new CharsetDetectorException(e.getMessage(), e);
    }
    return set;
}
protected String detectEncoding(InputStream in) throws IOException, ConversionException { if (!in.markSupported()) { // detector.setText requires mark in = new BufferedInputStream(in); } CharsetDetector detector = new CharsetDetector(); detector.setText(in); CharsetMatch charsetMatch = detector.detect(); if (charsetMatch == null) { throw new ConversionException("Cannot detect source charset."); } return charsetMatch.getName(); }
/** * Read a text file detecting encoding using http://userguide.icu-project.org/conversion/detection * Return the file contents as a String. */ public static String fileAnyEncodingToString(File f) throws IOException { byte[] byteData = IOUtils.toByteArray(new FileInputStream(f)); CharsetDetector detector = new CharsetDetector(); String unicodeData = detector.getString(byteData, null); // Add to newline at the end of the file otherwise the subtitle parser library can get confused by EOF unicodeData += System.getProperty("line.separator") + System.getProperty("line.separator"); CharsetMatch match = detector.detect(); if (match != null && match.getConfidence() > 60) { LOGGER.debug("{} has a detected encoding: {}", f.getName(), match.getName()); if (match.getLanguage() != null) { LOGGER.debug("{} has a detected language: {}", f.getName(), match.getLanguage()); } } return unicodeData; }
// Demo: run ICU detection over a short ISO-8859-1 sample and print every candidate
// charset as a CONFIDENCE / CHARSET / LANGUAGE table.
byte[] thisAppCanBreak = "this app can break".getBytes("ISO-8859-1");

CharsetDetector detector = new CharsetDetector();
detector.setText(thisAppCanBreak);

String tableTemplate = "%10s %10s %8s%n";
System.out.format(tableTemplate, "CONFIDENCE", "CHARSET", "LANGUAGE");

CharsetMatch[] candidates = detector.detectAll();
for (int i = 0; i < candidates.length; i++) {
    CharsetMatch candidate = candidates[i];
    System.out.format(
            tableTemplate,
            candidate.getConfidence(),
            candidate.getName(),
            candidate.getLanguage());
}
/**
 * Prints a CONFIDENCE / CHARSET / LANGUAGE table listing every charset ICU considers a
 * possible match for the file named by {@code args[0]}.
 *
 * @param args command-line arguments; the first one is the path of the file to analyse
 * @throws IOException if the file cannot be opened, read or closed
 */
public static void main(String[] args) throws IOException {
    final String rowFormat = "%10s %10s %8s%n";

    InputStream in = new FileInputStream(args[0]);
    try {
        // Re-point the variable at the buffered wrapper so the finally block closes the
        // outermost stream.
        in = new BufferedInputStream(in);

        CharsetDetector icu = new CharsetDetector();
        icu.setText(in);

        System.out.format(rowFormat, "CONFIDENCE", "CHARSET", "LANGUAGE");
        CharsetMatch[] candidates = icu.detectAll();
        for (int i = 0; i < candidates.length; i++) {
            CharsetMatch candidate = candidates[i];
            System.out.format(
                    rowFormat,
                    candidate.getConfidence(),
                    candidate.getName(),
                    candidate.getLanguage());
        }
    } finally {
        in.close();
    }
}
/**
 * Runs ICU charset detection over the text currently on the system clipboard.
 *
 * <p>The clipboard string is turned into bytes with the no-argument {@code getBytes()},
 * i.e. the platform default charset, before it is handed to the detector.
 *
 * @return the name of the best-matching charset
 * @throws UnsupportedCharsetException if the clipboard holds no string data or the
 *         detector reports no match
 * @throws UnsupportedFlavorException if the string flavor cannot be retrieved
 * @throws IOException if the clipboard data is no longer available
 */
public static String getClipboardCharset()
        throws UnsupportedCharsetException, UnsupportedFlavorException, IOException {
    final Transferable contents =
            Toolkit.getDefaultToolkit().getSystemClipboard().getContents(null);

    String clipText = null;
    if (contents != null && contents.isDataFlavorSupported(DataFlavor.stringFlavor)) {
        clipText = (String) contents.getTransferData(DataFlavor.stringFlavor);
    }

    // clipText is only ever assigned when contents is non-null, so one check suffices.
    if (clipText != null) {
        final CharsetDetector cd = new CharsetDetector();
        cd.setText(clipText.getBytes());
        final CharsetMatch cm = cd.detect();
        if (cm != null) {
            return cm.getName();
        }
    }

    throw new UnsupportedCharsetException("Unknown");
}
/**
 * Suggests an encoding for {@code bytes} using ICU charset detection and logs the
 * detector's confidence.
 *
 * @param bytes the raw data to analyse
 * @return the name of the best-matching charset
 * @throws IllegalArgumentException if the detector reports no match for the data
 */
protected String suggestEncoding(final byte[] bytes) {
    final CharsetDetector cd = new CharsetDetector();
    cd.setText(bytes);

    final CharsetMatch charsetMatch = cd.detect();
    if (charsetMatch == null) {
        // detect() returns null when nothing matched; fail clearly instead of with an NPE.
        throw new IllegalArgumentException("No charset could be detected for the given bytes");
    }

    final String charSet = charsetMatch.getName();
    final int confidence = charsetMatch.getConfidence();
    logger.info("CharsetMatch: {} ({}% confidence)", charSet, confidence);
    return charSet;
}
/**
 * Detects the encoding of {@code bytes} with ICU, logs the detector's confidence and
 * passes the detected name to {@code setSelectedItem}.
 *
 * @param bytes the raw data to analyse
 * @return the name of the best-matching charset
 * @throws IllegalArgumentException if the detector reports no match for the data; the
 *         current selection is left untouched in that case
 */
public String autoDetectEncoding(final byte[] bytes) {
    final CharsetDetector cd = new CharsetDetector();
    cd.setText(bytes);

    final CharsetMatch charsetMatch = cd.detect();
    if (charsetMatch == null) {
        // detect() returns null when nothing matched; fail clearly instead of with an NPE.
        throw new IllegalArgumentException("No charset could be detected for the given bytes");
    }

    final String charSet = charsetMatch.getName();
    final int confidence = charsetMatch.getConfidence();
    logger.info("CharsetMatch: {} ({}% confidence)", charSet, confidence);
    setSelectedItem(charSet);
    return charSet;
}
/**
 * Reads the next resource, decodes it to text and sets that text as the document text
 * of the given CAS.
 *
 * <p>When {@code sourceEncoding} equals {@code ENCODING_AUTO} the encoding is detected
 * with ICU; otherwise the stream is decoded with {@code sourceEncoding}.
 *
 * @param aJCas the CAS to initialise and fill
 * @throws IOException if the resource cannot be read or its encoding cannot be detected
 * @throws CollectionException declared by the overridden method
 */
@Override
public void getNext(CAS aJCas) throws IOException, CollectionException {
    Resource res = nextFile();
    initCas(aJCas, res);

    try (InputStream is = new BufferedInputStream(
            CompressionUtils.getInputStream(res.getLocation(), res.getInputStream()))) {
        String text;
        if (ENCODING_AUTO.equals(sourceEncoding)) {
            CharsetDetector detector = new CharsetDetector();
            // getReader() returns null when detection fails; passing that on to
            // IOUtils.toString would end in a NullPointerException.
            java.io.Reader reader = detector.getReader(is, null);
            if (reader == null) {
                throw new IOException(
                        "Unable to detect the encoding of " + res.getLocation());
            }
            text = IOUtils.toString(reader);
        } else {
            text = IOUtils.toString(is, sourceEncoding);
        }
        aJCas.setDocumentText(text);
    }
}
}
/**
 * Detects the encoding of {@code data} with ICU and prints the best match followed by
 * every candidate and its confidence to standard output.
 *
 * @param data the raw bytes to analyse
 * @return the name of the best-matching charset
 * @throws IllegalArgumentException if the detector reports no match for the data
 */
public static String getEncode(byte[] data) {
    CharsetDetector detector = new CharsetDetector();
    detector.setText(data);

    // detectAll() returns the candidates best-first, so its first element is what detect()
    // would return; running the detection once avoids doing the work twice.
    CharsetMatch[] matches = detector.detectAll();
    if (matches == null || matches.length == 0) {
        throw new IllegalArgumentException("No charset could be detected for the given data");
    }

    CharsetMatch match = matches[0];
    String encoding = match.getName();
    System.out.println("The Content in " + match.getName());

    System.out.println("All possibilities");
    for (CharsetMatch m : matches) {
        System.out.println("CharsetName:" + m.getName() + " Confidence:" + m.getConfidence());
    }
    return encoding;
}
public String detect(InputStream fin, byte[] fileContent) throws IOException { String charset = "ISO-8859-1"; fin.read(fileContent); byte[] data = fileContent; CharsetDetector detector = new CharsetDetector(); detector.setText(data); CharsetMatch cm = detector.detect(); if (cm != null) { int confidence = cm.getConfidence(); //System.out.println("Encoding: " + cm.getName() + " - Confidence: " + confidence + "%"); if (confidence > 50) { charset = cm.getName(); } } return charset; } }
/**
 * Detects the encoding of the given stream with ICU and prints the best match followed
 * by every candidate and its confidence to standard output.
 *
 * @param data the stream to analyse; wrapped in a buffer when it lacks mark support
 * @return the name of the best-matching charset
 * @throws IOException if reading from the stream fails
 * @throws IllegalArgumentException if the detector reports no match
 */
public static String getEncode(InputStream data) throws IOException {
    // CharsetDetector.setText(InputStream) needs mark/reset, which not every stream offers.
    InputStream markable = data.markSupported() ? data : new java.io.BufferedInputStream(data);

    CharsetDetector detector = new CharsetDetector();
    detector.setText(markable);

    // detectAll() returns the candidates best-first, so its first element is what detect()
    // would return; running the detection once avoids doing the work twice.
    CharsetMatch[] matches = detector.detectAll();
    if (matches == null || matches.length == 0) {
        throw new IllegalArgumentException("No charset could be detected for the given stream");
    }

    CharsetMatch match = matches[0];
    String encoding = match.getName();
    System.out.println("The Content in " + match.getName());

    System.out.println("All possibilities");
    for (CharsetMatch m : matches) {
        System.out.println("CharsetName:" + m.getName() + " Confidence:" + m.getConfidence());
    }
    return encoding;
}
}
/**
 * Best-effort encoding sniffing: feeds this object to ICU's detector and maps the best
 * match onto an {@code Encoding}.
 *
 * @return the detected encoding (replaced by its actual HTML encoding when one exists) if
 *         it is an ASCII superset other than windows-1252; {@code null} otherwise, and
 *         also {@code null} whenever anything goes wrong during detection
 * @throws IOException declared by the signature; every exception raised in the body is
 *         caught and turned into a {@code null} result
 */
public Encoding sniff() throws IOException {
    try {
        CharsetDetector icu = new CharsetDetector();
        icu.setText(this);

        Encoding candidate = Encoding.forName(icu.detect().getName());
        Encoding html = candidate.getActualHtmlEncoding();
        Encoding resolved = (html != null) ? html : candidate;

        boolean usable = resolved != Encoding.WINDOWS1252 && resolved.isAsciiSuperset();
        return usable ? resolved : null;
    } catch (Exception ignored) {
        // Any failure (including no match at all) means "could not sniff".
        return null;
    }
}
/**
 * Attempts to determine the encoding of this object's bytes with ICU charset detection.
 *
 * @return the detected encoding, swapped for its actual HTML encoding when one is
 *         defined, provided it is an ASCII superset and not windows-1252; {@code null}
 *         in every other case, including any failure during detection
 * @throws IOException declared by the signature; exceptions raised in the body are
 *         caught and reported as {@code null}
 */
public Encoding sniff() throws IOException {
    try {
        CharsetDetector detector = new CharsetDetector();
        detector.setText(this);
        CharsetMatch best = detector.detect();

        Encoding enc = Encoding.forName(best.getName());
        Encoding actual = enc.getActualHtmlEncoding();
        if (actual != null) {
            enc = actual;
        }

        // Guard clauses: windows-1252 and non-ASCII-superset encodings are rejected.
        if (enc == Encoding.WINDOWS1252) {
            return null;
        }
        if (!enc.isAsciiSuperset()) {
            return null;
        }
        return enc;
    } catch (Exception ignored) {
        // Sniffing is best-effort: any error simply yields no result.
        return null;
    }
}