org.apache.tika.parser.txt.CharsetMatch Java code examples

charsetMatch = csr.match(this);
if (charsetMatch != null) {
  confidence = charsetMatch.getConfidence() & 0x000000ff;
  if (confidence > 0) {
    CharsetMatch m = new CharsetMatch(this, csr, confidence, charsetMatch.getName(), charsetMatch.getLanguage());
    matches.add(m);

StringBuilder sb = new StringBuilder();
char[] buffer = new char[1024];
Reader reader = getReader();
int max = maxLength < 0 ? Integer.MAX_VALUE : maxLength;
int bytesRead = 0;
String name = getName();

/**
 * Create a Java String from Unicode character data corresponding
 * to the original byte data supplied to the Charset detect operation.
 * <p>
 * This overload simply delegates to {@code getString(int)} with a length of -1.
 *
 * @return a String created from the converted input data.
 * @throws java.io.IOException propagated unchanged from {@code getString(int)}.
 * @stable ICU 3.4
 */
public String getString() throws java.io.IOException {
  // NOTE(review): a negative length presumably means "no limit" in getString(int) - confirm.
  final int maxLength = -1;
  return getString(maxLength);
}

/**
 * Create a java.io.Reader for reading the Unicode character data corresponding
 * to the original byte data supplied to the Charset detect operation.
 * <p>
 * The reader wraps the stored InputStream when there is one; otherwise it wraps
 * a ByteArrayInputStream over the first {@code fRawLength} bytes of the raw input.
 * The stream is {@code reset()} before the reader is created, and it is decoded
 * with the charset named by {@link #getName()}.
 * <p>
 * CAUTION:  if the source of the byte data was an InputStream, a Reader
 * can be created for only one matching char set using this method.  If more
 * than one charset needs to be tried, the caller will need to reset
 * the InputStream and create InputStreamReaders itself, based on the charset name.
 *
 * @return the Reader for the Unicode character data, or {@code null} if
 *         resetting the stream or creating the reader throws an IOException.
 * @stable ICU 3.4
 */
public Reader getReader() {
  final InputStream stored = fInputStream;
  final InputStream source = (stored != null)
      ? stored
      : new ByteArrayInputStream(fRawInput, 0, fRawLength);
  try {
    // Rewind first so the reader starts from the stream's mark position.
    source.reset();
    return new InputStreamReader(source, getName());
  } catch (IOException e) {
    // Failure is reported to the caller as null rather than as an exception.
    return null;
  }
}

detector.setText(text.getRawValue());
CharsetMatch match = detector.detect();
if (match != null && match.getConfidence() > 35 &&
    tryToSet7BitEncoding(msg, match.getName())) {
  return;

return confidence == 0 ? null : new CharsetMatch(det, this, confidence);

  /**
   * Builds a description of the form
   * {@code "Match of <charset>[ in <language>] with confidence <confidence>"}.
   * The {@code " in <language>"} part is left out when {@code getLanguage()} returns null.
   *
   * @return the human-readable description of this match.
   */
  public String toString() {
    StringBuilder text = new StringBuilder("Match of ").append(fCharsetName);
    String language = getLanguage();
    if (language != null) {
      text.append(" in ").append(language);
    }
    return text.append(" with confidence ").append(fConfidence).toString();
  }
}

  return match.getReader();
} catch (IOException e) {
  return null;

  return CharsetUtils.forName(match.getName());
} catch (Exception e) {

  encoding = match.getName();
int confidence = match.getConfidence();

  /**
   * Tests the detector's input against this recognizer.
   * <p>
   * The reported charset name is "windows-1255" when the detector's
   * {@code fC1Bytes} flag is set and "ISO-8859-8-I" otherwise; the language
   * passed to the resulting match is "he".
   *
   * @param det the detector holding the input to examine.
   * @return a CharsetMatch carrying the computed confidence, or {@code null}
   *         when that confidence is 0.
   */
  public CharsetMatch match(CharsetDetector det) {
    final String charsetName = det.fC1Bytes ? "windows-1255" : "ISO-8859-8-I";
    final int score = match(det, ngrams, byteMap);
    if (score == 0) {
      return null;
    }
    return new CharsetMatch(det, this, score, charsetName, "he");
  }
}

  /**
   * Builds a description of the form
   * {@code "Match of <charset>[ in <language>] with confidence <confidence>"}.
   * The {@code " in <language>"} part is left out when {@code getLanguage()} returns null.
   *
   * @return the human-readable description of this match.
   */
  public String toString() {
    StringBuilder text = new StringBuilder("Match of ").append(fCharsetName);
    String language = getLanguage();
    if (language != null) {
      text.append(" in ").append(language);
    }
    return text.append(" with confidence ").append(fConfidence).toString();
  }
}

  return match.getReader();
} catch (IOException e) {
  return null;

charsetMatch = csr.match(this);
if (charsetMatch != null) {
  confidence = charsetMatch.getConfidence() & 0x000000ff;
  if (confidence > 0) {
    CharsetMatch m = new CharsetMatch(this, csr, confidence, charsetMatch.getName(), charsetMatch.getLanguage());
    matches.add(m);

/**
 * Create a java.io.Reader for reading the Unicode character data corresponding
 * to the original byte data supplied to the Charset detect operation.
 * <p>
 * The reader wraps the stored InputStream when there is one; otherwise it wraps
 * a ByteArrayInputStream over the first {@code fRawLength} bytes of the raw input.
 * The stream is {@code reset()} before the reader is created, and it is decoded
 * with the charset named by {@link #getName()}.
 * <p>
 * CAUTION:  if the source of the byte data was an InputStream, a Reader
 * can be created for only one matching char set using this method.  If more
 * than one charset needs to be tried, the caller will need to reset
 * the InputStream and create InputStreamReaders itself, based on the charset name.
 *
 * @return the Reader for the Unicode character data, or {@code null} if
 *         resetting the stream or creating the reader throws an IOException.
 * @stable ICU 3.4
 */
public Reader getReader() {
  final InputStream stored = fInputStream;
  final InputStream source = (stored != null)
      ? stored
      : new ByteArrayInputStream(fRawInput, 0, fRawLength);
  try {
    // Rewind first so the reader starts from the stream's mark position.
    source.reset();
    return new InputStreamReader(source, getName());
  } catch (IOException e) {
    // Failure is reported to the caller as null rather than as an exception.
    return null;
  }
}

detector.setText(text.getRawValue());
CharsetMatch match = detector.detect();
if (match != null && match.getConfidence() > 35 &&
    tryToSet7BitEncoding(msg, match.getName())) {
  return;

StringBuilder sb = new StringBuilder();
char[] buffer = new char[1024];
Reader reader = getReader();
int max = maxLength < 0 ? Integer.MAX_VALUE : maxLength;
int bytesRead = 0;
String name = getName();

  /**
   * Tests the detector's input against this recognizer.
   * <p>
   * The reported charset name is "windows-1254" when the detector's
   * {@code fC1Bytes} flag is set and "ISO-8859-9" otherwise; the language
   * passed to the resulting match is "tr".
   *
   * @param det the detector holding the input to examine.
   * @return a CharsetMatch carrying the computed confidence, or {@code null}
   *         when that confidence is 0.
   */
  public CharsetMatch match(CharsetDetector det) {
    final String charsetName = det.fC1Bytes ? "windows-1254" : "ISO-8859-9";
    final int score = match(det, ngrams, byteMap);
    if (score == 0) {
      return null;
    }
    return new CharsetMatch(det, this, score, charsetName, "tr");
  }
}

/**
 * Autodetect the charset of a byte array, and return a String
 * containing the converted input data.
 * <p>
 * This is a convenience method that behaves like
 * <code>this.setDeclaredEncoding(declaredEncoding).setText(in).detect().getString();</code>
 * except that failures are reported by returning {@code null} instead of
 * raising an exception.
 *
 * @param in               The byte data in the unknown charset.
 * @param declaredEncoding A declared encoding for the data, if available,
 *                         or null or an empty string if none is available.
 * @return the converted input data, or {@code null} if no charset matched
 *         the input or an IOException occurred during conversion.
 * @stable ICU 3.4
 */
public String getString(byte[] in, String declaredEncoding) {
  fDeclaredEncoding = declaredEncoding;
  try {
    setText(in);
    CharsetMatch best = detect();
    // No candidate charset: signal "nothing detected" with null, not an exception.
    return best == null ? null : best.getString(-1);
  } catch (IOException e) {
    // Conversion failure is deliberately reported as null as well.
    return null;
  }
}

  /**
   * Builds a description of the form
   * {@code "Match of <charset>[ in <language>] with confidence <confidence>"}.
   * The {@code " in <language>"} part is left out when {@code getLanguage()} returns null.
   *
   * @return the human-readable description of this match.
   */
  public String toString() {
    StringBuilder text = new StringBuilder("Match of ").append(fCharsetName);
    String language = getLanguage();
    if (language != null) {
      text.append(" in ").append(language);
    }
    return text.append(" with confidence ").append(fConfidence).toString();
  }
}

Javadoc

This class represents a charset that has been identified by a CharsetDetector as a possible encoding for a set of input data. From an instance of this class, you can ask for a confidence level in the charset identification, or for Java Reader or String to access the original byte data in Unicode form.

Instances of this class are created only by CharsetDetectors.

Note: this class has a natural ordering that is inconsistent with equals. The natural ordering is based on the match confidence value.

Most used methods

getName
Get the name of the detected charset. The name will be one that can be used with other APIs on the platform that accept charset names.
getConfidence
Get an indication of the confidence in the charset detected. Confidence values range from 0-100, with larger numbers indicating a better match of the input data to the characteristics of the charset.
<init>
getLanguage
Get the ISO code for the language of the detected charset.
getReader
Create a java.io.Reader for reading the Unicode character data corresponding to the original byte data supplied to the Charset detect operation.
getString
Create a Java String from Unicode character data corresponding to the original byte data supplied to the Charset detect operation.

Popular in Java

Parsing JSON documents to java classes using gson
notifyDataSetChanged (ArrayAdapter)
findViewById (Activity)
requestLocationUpdates (LocationManager)
Pointer (com.sun.jna)
An abstraction for a native pointer data type. A Pointer instance represents, on the Java side, a na
EOFException (java.io)
Thrown when a program encounters the end of a file or stream during an input operation.
DateFormat (java.text)
Formats or parses dates and times. This class provides factories for obtaining instances configured f
NumberFormat (java.text)
The abstract base class for all number formats. This class provides the interface for formatting and
HashMap (java.util)
HashMap is an implementation of Map. All optional operations are supported. All elements are permitte
LinkedHashMap (java.util)
LinkedHashMap is an implementation of Map that guarantees iteration order. All optional operations a
Github Copilot alternatives

How to use CharsetMatch in org.apache.tika.parser.txt

Best Java code snippets using org.apache.tika.parser.txt.CharsetMatch (Showing top 20 results out of 315)

How to use
CharsetMatch
in
org.apache.tika.parser.txt