com.ibm.icu.text.CharsetDetector.<init> java code examples

 // Wrap the raw stream in a BufferedInputStream before handing it to the detector.
// NOTE(review): `input`, `reader` and `charset` are declared outside this snippet.
BufferedInputStream bis = new BufferedInputStream(input);
CharsetDetector cd = new CharsetDetector();
cd.setText(bis);
CharsetMatch cm = cd.detect();

if (cm != null) {
  // Use the best match both to decode the stream and to report its name.
  reader = cm.getReader();
  charset = cm.getName();
} else {
  // UnsupportedCharsetException has no no-arg constructor; it needs a charset name.
  throw new UnsupportedCharsetException("unknown");
}

 /**
  * Detects the character encoding of {@code file} with ICU and then reads the
  * file into a String using that encoding.
  *
  * <p>An {@link IOException} is reported on {@code System.err} and not rethrown.
  * If no encoding can be detected, a message is printed and the file is not read.
  *
  * @param file the file to inspect and read
  */
 public static void XMLtoString(File file) {

  // try-with-resources closes the detection stream; the original never closed it.
  try (BufferedInputStream in = new BufferedInputStream(new FileInputStream(file))) {
    // detect the encoding of the file
    CharsetDetector cd = new CharsetDetector().setText(in);
    com.ibm.icu.text.CharsetMatch match = cd.detect();
    if (match == null) {
      // detect() returns null when nothing matches; avoid the NullPointerException.
      System.err.println("Could not detect the encoding of " + file);
      return;
    }
    String encoding = match.getName();

    // to avoid the BOM ("byte order mark") being added to the String, encoding is specified as a parameter
    // NOTE(review): the decoded text is not returned or stored anywhere - confirm intent.
    String str = FileUtils.readFileToString(file, encoding);
  }
  catch (IOException e) {
    System.err.println("Caught IOException: " + e.getMessage());
  }
}

 /**
  * Detects the charset of {@code input} with ICU's {@code CharsetDetector}.
  *
  * @param input the raw bytes to analyse
  * @return the JVM charset named by ICU's best match
  * @throws IllegalArgumentException if ICU finds no match, or if the JVM does
  *         not know the detected charset name
  */
 public Charset detectEncoding(byte[] input) {
  // Fall back to the incredibly slow ICU. It might be better to just skip this entirely.
  CharsetDetector detector = new CharsetDetector();
  detector.setText(input);
  CharsetMatch match = detector.detect();
  if (match == null) {
   // detect() returns null when nothing matches; fail with a clear message instead of an NPE.
   throw new IllegalArgumentException("ICU could not detect a charset for the given input");
  }
  // Charset.forName ignores case, so no upper-casing is needed. The former
  // default-locale toUpperCase() broke names containing 'i' under a Turkish locale.
  return Charset.forName(match.getName());
 }
}

 /**
  * Asks ICU for the most likely charset of {@code input} and maps it to a JVM charset.
  *
  * @param input the raw bytes to analyse
  * @return the JVM charset named by ICU's best match
  * @throws IllegalArgumentException if ICU finds no match, or if the JVM does
  *         not know the detected charset name
  */
 public Charset detectEncoding(byte[] input) {
  // Fall back to the incredibly slow ICU. It might be better to just skip this entirely.
  CharsetDetector detector = new CharsetDetector();
  detector.setText(input);
  CharsetMatch match = detector.detect();
  if (match == null) {
   // No candidate at all: report it explicitly rather than dereferencing null.
   throw new IllegalArgumentException("ICU could not detect a charset for the given input");
  }
  // Charset lookup is case-insensitive; the locale-sensitive toUpperCase() was
  // dropped because it can produce an illegal name (e.g. under a Turkish locale).
  return Charset.forName(match.getName());
 }
}

 /**
  * Returns the JVM charset that ICU considers the best match for {@code input}.
  *
  * @param input the raw bytes to analyse
  * @return the JVM charset named by ICU's best match
  * @throws IllegalArgumentException if ICU finds no match, or if the JVM does
  *         not know the detected charset name
  */
 public Charset detectEncoding(byte[] input) {
  // Fall back to the incredibly slow ICU. It might be better to just skip this entirely.
  CharsetDetector detector = new CharsetDetector();
  detector.setText(input);
  CharsetMatch match = detector.detect();
  if (match == null) {
   // Guard against the null that detect() returns when there is no match.
   throw new IllegalArgumentException("ICU could not detect a charset for the given input");
  }
  // The name is passed as-is: Charset.forName is case-insensitive, and upper-casing
  // with the default locale could corrupt the name (e.g. 'i' under a Turkish locale).
  return Charset.forName(match.getName());
 }
}

 // The bytes whose encoding should be guessed (placeholder in this example).
byte[] byteData = ...;

// Hand the bytes to a fresh detector and keep its best match.
CharsetDetector detector = new CharsetDetector();
detector.setText(byteData);
CharsetMatch match = detector.detect();

/**
 * Collects every charset ICU considers plausible for {@code source} and that
 * the running JVM can also resolve.
 *
 * @param source the stream to analyse; it is wrapped in a {@link BufferedInputStream}
 * @return the resolvable candidate charsets; empty if none were found
 * @throws CharsetDetectorException if reading the stream fails
 */
@Override
public Set<Charset> detect(InputStream source) throws CharsetDetectorException {
  Set<Charset> set = new HashSet<Charset>();
  com.ibm.icu.text.CharsetDetector charsetDetector = new com.ibm.icu.text.CharsetDetector();
  try {
    charsetDetector.setText(new BufferedInputStream(source));
    CharsetMatch[] charsetMatchs = charsetDetector.detectAll();
    for (CharsetMatch match : charsetMatchs) {
      try {
        set.add(Charset.forName(match.getName()));
      } catch (IllegalArgumentException ignored) {
        // ICU can report names the JVM does not know (illegal or unsupported).
        // Skip those so one unknown name does not discard the usable matches.
      }
    }
  } catch (IOException e) {
    throw new CharsetDetectorException(e.getMessage(), e);
  }
  return set;
}

/**
 * Determines the name of the charset ICU considers the best match for the stream.
 *
 * @param in the stream to analyse; wrapped in a {@link BufferedInputStream}
 *           when it does not support mark
 * @return the name of the best-matching charset
 * @throws IOException if reading the stream fails
 * @throws ConversionException if no charset could be detected
 */
protected String detectEncoding(InputStream in) throws IOException, ConversionException {
  // detector.setText requires mark
  InputStream markable = in.markSupported() ? in : new BufferedInputStream(in);

  CharsetDetector icu = new CharsetDetector();
  icu.setText(markable);

  CharsetMatch best = icu.detect();
  if (best != null) {
    return best.getName();
  }
  throw new ConversionException("Cannot detect source charset.");
}

/**
 * Read a text file detecting encoding using http://userguide.icu-project.org/conversion/detection
 * Return the file contents as a String, followed by two line separators.
 *
 * @param f the file to read
 * @return the decoded file contents with two trailing line separators appended
 * @throws IOException if the file cannot be read or its encoding cannot be detected
 */
public static String fileAnyEncodingToString(File f) throws IOException {
 byte[] byteData;
 // Close the stream explicitly; the original left the FileInputStream open.
 try (FileInputStream in = new FileInputStream(f)) {
  byteData = IOUtils.toByteArray(in);
 }

 // Detect once and reuse the match for both decoding and logging.
 CharsetDetector detector = new CharsetDetector();
 detector.setText(byteData);
 CharsetMatch match = detector.detect();
 if (match == null) {
  // Previously a failed detection produced the literal text "null" in the result.
  throw new IOException("Unable to detect the encoding of " + f);
 }
 String unicodeData = match.getString();
 // Add two newlines at the end of the file otherwise the subtitle parser library can get confused by EOF
 unicodeData += System.getProperty("line.separator") + System.getProperty("line.separator");
 if (match.getConfidence() > 60) {
  LOGGER.debug("{} has a detected encoding: {}", f.getName(), match.getName());
  if (match.getLanguage() != null) {
   LOGGER.debug("{} has a detected language: {}", f.getName(), match.getLanguage());
  }
 }
 return unicodeData;
}

// Encode a sample sentence as ISO-8859-1 and print every candidate ICU reports for it.
byte[] sampleBytes = "this app can break".getBytes("ISO-8859-1");

CharsetDetector icu = new CharsetDetector();
icu.setText(sampleBytes);

// One row per candidate: confidence, charset name, language.
String rowFormat = "%10s %10s %8s%n";
System.out.format(rowFormat, "CONFIDENCE", "CHARSET", "LANGUAGE");

CharsetMatch[] candidates = icu.detectAll();
for (int i = 0; i < candidates.length; i++) {
 CharsetMatch candidate = candidates[i];
 System.out.format(rowFormat, candidate.getConfidence(), candidate.getName(), candidate.getLanguage());
}

/**
 * Prints a table of every charset ICU considers plausible for a file.
 *
 * @param args {@code args[0]} is the path of the file to analyse
 * @throws IOException if the file cannot be opened or read
 */
public static void main(String[] args) throws IOException {
 if (args.length < 1) {
  // Fail with a usage hint instead of an ArrayIndexOutOfBoundsException.
  System.err.println("Usage: <program> <file>");
  System.exit(1);
  return;
 }
 // try-with-resources closes the stream on every path.
 try (InputStream file = new BufferedInputStream(new FileInputStream(args[0]))) {
  CharsetDetector detector = new CharsetDetector();
  detector.setText(file);
  String tableTemplate = "%10s %10s %8s%n";
  System.out.format(tableTemplate, "CONFIDENCE",
    "CHARSET", "LANGUAGE");
  for (CharsetMatch match : detector.detectAll()) {
   System.out.format(tableTemplate, match
     .getConfidence(), match.getName(), match
     .getLanguage());
  }
 }
}

 /**
  * Reads the system clipboard as text and asks ICU which charset best matches
  * the bytes obtained from {@code String.getBytes()}.
  *
  * @return the name of the best-matching charset
  * @throws UnsupportedCharsetException if the clipboard has no string content
  *         or no charset could be detected
  * @throws UnsupportedFlavorException if the string flavor cannot be retrieved
  * @throws IOException if the clipboard data cannot be read
  */
 public static String getClipboardCharset () throws UnsupportedCharsetException, UnsupportedFlavorException, IOException {
  final Transferable contents =
      Toolkit.getDefaultToolkit().getSystemClipboard().getContents(null);

  if (contents != null && contents.isDataFlavorSupported(DataFlavor.stringFlavor)) {
    final String clipText = (String) contents.getTransferData(DataFlavor.stringFlavor);
    if (clipText != null) {
      // NOTE(review): getBytes() encodes with the platform default charset - confirm this is intended.
      final CharsetDetector icu = new CharsetDetector();
      icu.setText(clipText.getBytes());
      final CharsetMatch best = icu.detect();
      if (best != null) {
        return best.getName();
      }
    }
  }

  // Reached when there is no usable text or no match.
  throw new UnsupportedCharsetException("Unknown");
}

/**
 * Suggests an encoding for {@code bytes} based on ICU's best match and logs
 * the match together with its confidence.
 *
 * @param bytes the raw bytes to analyse
 * @return the name of the best-matching charset, or {@code null} if ICU found no match
 */
protected String suggestEncoding(final byte[] bytes) {
  final CharsetDetector cd = new CharsetDetector();
  cd.setText(bytes);
  final CharsetMatch charsetMatch = cd.detect();
  if (charsetMatch == null) {
    // detect() returns null when nothing matches; avoid the NullPointerException.
    logger.info("CharsetMatch: no charset could be detected");
    return null;
  }
  final String charSet = charsetMatch.getName();
  final int confidence = charsetMatch.getConfidence();
  logger.info("CharsetMatch: {} ({}% confidence)", charSet, confidence);
  return charSet;
}

/**
 * Detects the encoding of {@code bytes} with ICU, logs the match and its
 * confidence, and passes the charset name to {@code setSelectedItem}.
 *
 * @param bytes the raw bytes to analyse
 * @return the name of the best-matching charset, or {@code null} if ICU found
 *         no match (the selection is left untouched in that case)
 */
public String autoDetectEncoding(final byte[] bytes) {
  final CharsetDetector cd = new CharsetDetector();
  cd.setText(bytes);
  final CharsetMatch charsetMatch = cd.detect();
  if (charsetMatch == null) {
    // detect() returns null when nothing matches; keep the current selection.
    logger.info("CharsetMatch: no charset could be detected");
    return null;
  }
  final String charSet = charsetMatch.getName();
  final int confidence = charsetMatch.getConfidence();
  logger.info("CharsetMatch: {} ({}% confidence)", charSet, confidence);
  setSelectedItem(charSet);
  return charSet;
}

  /**
   * Loads the next resource into the CAS as document text. The text is decoded
   * with {@code sourceEncoding}, or with an ICU-detected encoding when
   * {@code sourceEncoding} equals {@code ENCODING_AUTO}.
   *
   * @param aJCas the CAS to initialise and fill with the document text
   * @throws IOException if the resource cannot be read or, in auto mode, its
   *         encoding cannot be detected
   * @throws CollectionException declared by the overridden method
   */
  @Override
  public void getNext(CAS aJCas)
    throws IOException, CollectionException
  {
    Resource res = nextFile();
    initCas(aJCas, res);

    try (InputStream is = new BufferedInputStream(
        CompressionUtils.getInputStream(res.getLocation(), res.getInputStream()))) {
      String text;

      if (ENCODING_AUTO.equals(sourceEncoding)) {
        CharsetDetector detector = new CharsetDetector();
        // getReader returns null when detection fails; report that as an
        // IOException instead of an NPE inside IOUtils.toString.
        java.io.Reader detected = detector.getReader(is, null);
        if (detected == null) {
          throw new IOException(
              "Unable to detect the encoding of [" + res.getLocation() + "]");
        }
        text = IOUtils.toString(detected);
      }
      else {
        text = IOUtils.toString(is, sourceEncoding);
      }

      aJCas.setDocumentText(text);
    }
  }
}

/**
 * Detects the encoding of {@code data} with ICU and prints the best match
 * followed by every candidate and its confidence to {@code System.out}.
 *
 * @param data the raw bytes to analyse
 * @return the name of the best-matching charset, or {@code null} if ICU found no match
 */
public static String getEncode(byte[] data){
  CharsetDetector detector = new CharsetDetector();
  detector.setText(data);
  // Run detection once: detectAll() orders candidates best-first, so the first
  // element is what detect() would return.
  CharsetMatch[] matches = detector.detectAll();
  if (matches == null || matches.length == 0) {
    // Nothing matched; the original dereferenced a null match here.
    return null;
  }
  String encoding = matches[0].getName();
  System.out.println("The Content in " + encoding);
  System.out.println("All possibilities");
  for (CharsetMatch m : matches) {
    System.out.println("CharsetName:" + m.getName() + " Confidence:"
        + m.getConfidence());
  }
  return encoding;
}

  /**
   * Fills {@code fileContent} from {@code fin} and returns the name of the
   * charset ICU detects for the bytes that were read.
   *
   * @param fin the stream to read from
   * @param fileContent buffer that receives the bytes; its length caps how much is read
   * @return the detected charset name when ICU's confidence is above 50,
   *         otherwise {@code "ISO-8859-1"}
   * @throws IOException if reading from {@code fin} fails
   */
  public String detect(InputStream fin, byte[] fileContent) throws IOException
  {

    String charset = "ISO-8859-1";

    // A single read() may return fewer bytes than requested, so keep reading
    // until the buffer is full or the stream ends.
    int filled = 0;
    while (filled < fileContent.length) {
      int n = fin.read(fileContent, filled, fileContent.length - filled);
      if (n < 0) {
        break;
      }
      filled += n;
    }
    if (filled == 0) {
      // Nothing to analyse; keep the default.
      return charset;
    }

    // Give the detector only the bytes actually read, not the unread tail of the buffer.
    byte[] data = (filled == fileContent.length)
        ? fileContent
        : java.util.Arrays.copyOf(fileContent, filled);

    CharsetDetector detector = new CharsetDetector();
    detector.setText(data);
    CharsetMatch cm = detector.detect();

    if (cm != null) {
      int confidence = cm.getConfidence();
      //System.out.println("Encoding: " + cm.getName() + " - Confidence: " + confidence + "%");
      if (confidence > 50) {
        charset = cm.getName();
      }
    }
    return charset;
  }
}

/**
 * Detects the encoding of the stream with ICU and prints the best match
 * followed by every candidate and its confidence to {@code System.out}.
 *
 * @param data the stream to analyse; wrapped in a {@code BufferedInputStream}
 *             when it does not support mark
 * @return the name of the best-matching charset, or {@code null} if ICU found no match
 * @throws IOException if reading the stream fails
 */
public static String getEncode(InputStream data) throws IOException{
  // CharsetDetector.setText(InputStream) needs mark/reset support.
  InputStream markable = data.markSupported() ? data : new java.io.BufferedInputStream(data);

  CharsetDetector detector = new CharsetDetector();
  detector.setText(markable);
  // Run detection once: detectAll() orders candidates best-first, so the first
  // element is what detect() would return.
  CharsetMatch[] matches = detector.detectAll();
  if (matches == null || matches.length == 0) {
    // Nothing matched; the original dereferenced a null match here.
    return null;
  }
  String encoding = matches[0].getName();
  System.out.println("The Content in " + encoding);
  System.out.println("All possibilities");
  for (CharsetMatch m : matches) {
    System.out.println("CharsetName:" + m.getName() + " Confidence:"
        + m.getConfidence());
  }
  return encoding;
}
}

/**
 * Guesses the encoding of this stream with ICU.
 *
 * @return the detected encoding (replaced by its actual HTML encoding when one
 *         exists) if it is not {@code Encoding.WINDOWS1252} and is an ASCII
 *         superset; {@code null} otherwise, including when detection throws
 * @throws IOException declared for callers; every exception raised inside is
 *         caught and turned into a {@code null} result
 */
public Encoding sniff() throws IOException {
  Encoding result = null;
  try {
    CharsetDetector icu = new CharsetDetector();
    icu.setText(this);
    Encoding detected = Encoding.forName(icu.detect().getName());

    // Prefer the actual HTML encoding when one is defined.
    Encoding htmlEncoding = detected.getActualHtmlEncoding();
    Encoding candidate = (htmlEncoding != null) ? htmlEncoding : detected;

    if (candidate != Encoding.WINDOWS1252 && candidate.isAsciiSuperset()) {
      result = candidate;
    }
  } catch (Exception e) {
    // Any failure means "no guess"; result stays null.
  }
  return result;
}

/**
 * Runs ICU charset detection over this stream and maps the best match to an
 * {@code Encoding}.
 *
 * @return the matched encoding, or its actual HTML encoding when one exists;
 *         {@code null} if that encoding is {@code Encoding.WINDOWS1252}, is not
 *         an ASCII superset, or if anything throws during detection
 * @throws IOException declared for callers; exceptions raised inside are
 *         swallowed and reported as {@code null}
 */
public Encoding sniff() throws IOException {
  try {
    CharsetDetector sniffer = new CharsetDetector();
    sniffer.setText(this);
    CharsetMatch best = sniffer.detect();

    Encoding guess = Encoding.forName(best.getName());
    Encoding htmlActual = guess.getActualHtmlEncoding();
    if (htmlActual != null) {
      guess = htmlActual;
    }

    // Reject windows-1252 and anything that is not an ASCII superset.
    if (guess == Encoding.WINDOWS1252 || !guess.isAsciiSuperset()) {
      return null;
    }
    return guess;
  } catch (Exception e) {
    return null;
  }
}

Javadoc

Constructor

Popular methods of CharsetDetector

setText
Set the input text (byte) data whose charset is to be detected.
detect
Return the charset that best matches the supplied input data. Note though, that because the detectio
detectAll
Return an array of all charsets that appear to be plausible matches with the input data. The array i
getAllDetectableCharsets
Get the names of all charsets supported by CharsetDetector class. Note: Multiple different charset en
setDeclaredEncoding
Set the declared encoding for charset detection. The declared encoding of an input text is an encodi
enableInputFilter
Enable filtering of input text. If filtering is enabled, text within angle brackets ("<" and ">") wi
MungeInput
getReader
Autodetect the charset of an inputStream, and return a Java Reader to access the converted input dat
getString
Autodetect the charset of an inputStream, and return a String containing the converted input data. T

Popular in Java

Parsing JSON documents to java classes using gson
getSupportFragmentManager (FragmentActivity)
notifyDataSetChanged (ArrayAdapter)
requestLocationUpdates (LocationManager)
BigDecimal (java.math)
An immutable arbitrary-precision signed decimal. A value is represented by an arbitrary-precision "un
LinkedList (java.util)
Doubly-linked list implementation of the List and Deque interfaces. Implements all optional list oper
ConcurrentHashMap (java.util.concurrent)
A plug-in replacement for JDK1.5 java.util.concurrent.ConcurrentHashMap. This version is based on or
Semaphore (java.util.concurrent)
A counting semaphore. Conceptually, a semaphore maintains a set of permits. Each #acquire blocks if
DataSource (javax.sql)
An interface for the creation of Connection objects which represent a connection to a database. This
Base64 (org.apache.commons.codec.binary)
Provides Base64 encoding and decoding as defined by RFC 2045. This class implements section 6.8. Base
Top Sublime Text plugins

How to use com.ibm.icu.text.CharsetDetector constructor

Best Java code snippets using com.ibm.icu.text.CharsetDetector.<init> (Showing top 20 results out of 315)

How to use
com.ibm.icu.text.CharsetDetector
constructor