org.apache.lucene.util.UnicodeUtil java code examples

/**
 * Convenience overload of {@link #UTF8toUTF16(byte[], int, int, char[])} that
 * takes the UTF-8 input as a {@code BytesRef} instead of a raw array slice.
 *
 * @param bytesRef supplies the byte array, start offset and byte count that are
 *                 forwarded to the array-based overload
 * @param chars    destination array, passed through unchanged
 * @return whatever the array-based overload returns
 * @see #UTF8toUTF16(byte[], int, int, char[])
 */
public static int UTF8toUTF16(BytesRef bytesRef, char[] chars) {
 final byte[] utf8 = bytesRef.bytes;
 final int from = bytesRef.offset;
 final int count = bytesRef.length;
 return UTF8toUTF16(utf8, from, count, chars);
}

/**
 * Initialize the byte[] from the UTF8 bytes
 * for the provided String.  
 * 
 * @param text This must be well-formed
 * unicode text, with no unpaired surrogates.
 */
public BytesRef(CharSequence text) {
 this(new byte[UnicodeUtil.maxUTF8Length(text.length())]);
 length = UnicodeUtil.UTF16toUTF8(text, 0, text.length(), bytes);
}

 new LevenshteinAutomata(UnicodeUtil.newString(termText, realPrefixLength, termText.length - realPrefixLength), transpositions);
String prefix = UnicodeUtil.newString(termText, 0, realPrefixLength);
for (int i = 0; i <= maxEdits; i++) {
 Automaton a = builder.toAutomaton(i, prefix);

/**
 * Writes {@code string} as a VInt byte count followed by its UTF-8 bytes.
 * <p>
 * Strings whose worst-case encoded size exceeds
 * {@code MIN_UTF8_SIZE_TO_ENABLE_DOUBLE_PASS_ENCODING} are measured first and
 * then encoded straight into {@code bytes}; smaller strings are encoded once
 * into the reusable {@code scratchBytes} buffer.
 *
 * @param string the text to write
 * @throws IOException declared by the override; propagated from the write calls
 */
@Override
public void writeString(String string) throws IOException {
 final int charCount = string.length();
 final int worstCase = UnicodeUtil.maxUTF8Length(charCount);

 if (worstCase > MIN_UTF8_SIZE_TO_ENABLE_DOUBLE_PASS_ENCODING) {
  // Large string: two passes (measure, then encode in place) so that no big
  // intermediate buffer has to be allocated.
  final int encodedSize = UnicodeUtil.calcUTF16toUTF8Length(string, 0, charCount);
  writeVInt(encodedSize);
  // Grow only after the VInt has been written, so the current length is used.
  bytes = ArrayUtil.grow(bytes, length + encodedSize);
  length = UnicodeUtil.UTF16toUTF8(string, 0, charCount, bytes, length);
  return;
 }

 // Small string: single pass through the reusable scratch buffer.
 scratchBytes = (scratchBytes == null)
   ? new byte[ArrayUtil.oversize(worstCase, Character.BYTES)]
   : ArrayUtil.grow(scratchBytes, worstCase);
 final int written = UnicodeUtil.UTF16toUTF8(string, 0, charCount, scratchBytes);
 writeVInt(written);
 writeBytes(scratchBytes, written);
}

/**
 * Encodes {@code length} characters of {@code s}, beginning at {@code offset},
 * into {@code out} starting at index 0 of the destination.
 * <p>
 * The caller is responsible for making sure the destination array is large
 * enough.
 *
 * @param s      source characters
 * @param offset index of the first character to encode
 * @param length number of characters to encode
 * @param out    destination byte array
 * @return the value returned by the five-argument overload this delegates to
 */
public static int UTF16toUTF8(final CharSequence s, final int offset, final int length, byte[] out) {
 final int outOffset = 0;
 return UTF16toUTF8(s, offset, length, out, outOffset);
}

/**
 * Reads the next entry from {@code input} into this buffer.
 * <p>
 * Two VInts are read first: a start position and a count. The buffer is sized
 * to their sum and only the {@code count} new units are read from the input,
 * placed at {@code start}; whatever the buffer already held before that
 * position is kept. A final VInt is resolved to a field name through
 * {@code fieldInfos}.
 *
 * @param input      source to read from
 * @param fieldInfos used to map the trailing field number to its name
 * @throws IOException if reading from {@code input} fails
 */
public final void read(IndexInput input, FieldInfos fieldInfos)
 throws IOException {
 this.term = null; // invalidate cache
 final int prefix = input.readVInt();
 final int suffix = input.readVInt();
 final int total = prefix + suffix;

 if (preUTF8Strings) {
  text.setLength(total);
  input.readChars(text.result, prefix, suffix);
 } else {
  // Snapshot the flag so both halves of the dirty path act on the same decision.
  final boolean reencodeAll = dirty;
  if (reencodeAll) {
   // bytes is dirty: rebuild it from the current chars before appending.
   UnicodeUtil.UTF16toUTF8(text.result, 0, text.length, bytes);
  }
  bytes.setLength(total);
  input.readBytes(bytes.result, prefix, suffix);
  if (reencodeAll) {
   // Fully convert all bytes back to chars.
   UnicodeUtil.UTF8toUTF16(bytes.result, 0, total, text);
   dirty = false;
  } else {
   // Incrementally convert only the UTF8 bytes that are new.
   UnicodeUtil.UTF8toUTF16(bytes.result, prefix, suffix, text);
  }
 }
 this.field = fieldInfos.fieldName(input.readVInt());
}

 /**
  * Writes {@code string} as a VInt byte count followed by its UTF-8 bytes.
  * <p>
  * Small strings are encoded once into the reusable {@code scratchBytes}
  * buffer. Strings whose worst-case encoded size exceeds
  * {@code MIN_UTF8_SIZE_TO_ENABLE_DOUBLE_PASS_ENCODING} are measured first and
  * then encoded directly into {@code bytes}.
  *
  * @param string the text to write
  * @throws IOException declared by the override; propagated from the write calls
  */
 @Override
 public void writeString(String string) throws IOException {
  // Compute the worst-case size as a long: the int product
  // string.length() * MAX_UTF8_BYTES_PER_CHAR wraps around for very long
  // strings, which would make the bound negative, wrongly select the
  // small-string path and leave scratchBytes too small for the encoder.
  final long maxLen = (long) string.length() * UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR;
  if (maxLen <= MIN_UTF8_SIZE_TO_ENABLE_DOUBLE_PASS_ENCODING)  {
   // string is small enough that we don't need to save memory by falling back to double-pass approach
   // this is just an optimized writeString() that re-uses scratchBytes.
   // The narrowing cast is safe here as long as the threshold itself fits in an int.
   scratchBytes = ArrayUtil.grow(scratchBytes, (int) maxLen);
   int len = UnicodeUtil.UTF16toUTF8(string, 0, string.length(), scratchBytes);
   writeVInt(len);
   writeBytes(scratchBytes, len);
  } else  {
   // use a double pass approach to avoid allocating a large intermediate buffer for string encoding
   int numBytes = UnicodeUtil.calcUTF16toUTF8Length(string, 0, string.length());
   writeVInt(numBytes);
   // Grow only after the VInt has been written, so the current length is used.
   bytes = ArrayUtil.grow(bytes, length + numBytes);
   length = UnicodeUtil.UTF16toUTF8(string, 0, string.length(), bytes, length);
  }
 }
}

/**
 * Replaces this builder's content with the code points decoded from the given
 * UTF-8 bytes, as if the bytes had first been converted from UTF-8 to UTF-32
 * and then copied in.
 *
 * @param bytes the UTF-8 input; its {@code length} is used as the capacity
 *              requested from {@code grow} before decoding
 */
public void copyUTF8Bytes(BytesRef bytes) {
 final int capacityNeeded = bytes.length;
 grow(capacityNeeded);
 final int decoded = UnicodeUtil.UTF8toUTF32(bytes, ref.ints);
 ref.length = decoded;
}

 boostAtt.setBoost(1.0F);
} else {
 final int codePointCount = UnicodeUtil.codePointCount(term);
 int minTermLength = Math.min(codePointCount, termLength);

/**
 * Writes {@code v} as a VInt byte count followed by its UTF-8 bytes.
 * <p>
 * Strings of at most 1024 chars are encoded in one go through a
 * {@code BytesRef}. Longer strings have their encoded size computed first and
 * are then encoded through a fixed-size buffer, with a callback that writes
 * the first {@code len} bytes of that buffer each time it is invoked.
 *
 * @param v the text to write
 * @throws UncheckedIOException wrapping any {@link IOException} raised while writing
 */
@Override
public void writeString(String v) {
 final int windowChars = 1024;
 final int charCount = v.length();
 try {
  if (charCount > windowChars) {
   // Long string: emit the exact size up front, then stream the encoded bytes.
   writeVInt(UnicodeUtil.calcUTF16toUTF8Length(v, 0, charCount));
   final byte[] window = new byte[UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR * windowChars];
   UTF16toUTF8(v, 0, charCount, window, (len) -> {
    writeBytes(window, 0, len);
   });
  } else {
   // Short string: encode fully, then write size and payload.
   final BytesRef encoded = new BytesRef(v);
   writeVInt(encoded.length);
   writeBytes(encoded.bytes, encoded.offset, encoded.length);
  }
 } catch (IOException e) {
  throw new UncheckedIOException(e);
 }
}

/**
 * Encodes {@code length} characters of {@code s}, beginning at {@code offset},
 * into {@code out} starting at index 0 of the destination.
 * <p>
 * The caller is responsible for making sure the destination array is large
 * enough.
 *
 * @param s      source characters
 * @param offset index of the first character to encode
 * @param length number of characters to encode
 * @param out    destination byte array
 * @return the value returned by the five-argument overload this delegates to
 */
public static int UTF16toUTF8(final CharSequence s, final int offset, final int length, byte[] out) {
 final int outOffset = 0;
 return UTF16toUTF8(s, offset, length, out, outOffset);
}

/**
 * Reads the next entry from {@code input} into this buffer.
 * <p>
 * Two VInts are read first: a start position and a count. The buffer is sized
 * to their sum and only the {@code count} new units are read from the input,
 * placed at {@code start}; whatever the buffer already held before that
 * position is kept. A final VInt is resolved to a field name through
 * {@code fieldInfos}.
 *
 * @param input      source to read from
 * @param fieldInfos used to map the trailing field number to its name
 * @throws IOException if reading from {@code input} fails
 */
public final void read(IndexInput input, FieldInfos fieldInfos)
 throws IOException {
 this.term = null; // invalidate cache
 final int prefix = input.readVInt();
 final int suffix = input.readVInt();
 final int total = prefix + suffix;

 if (preUTF8Strings) {
  text.setLength(total);
  input.readChars(text.result, prefix, suffix);
 } else {
  // Snapshot the flag so both halves of the dirty path act on the same decision.
  final boolean reencodeAll = dirty;
  if (reencodeAll) {
   // bytes is dirty: rebuild it from the current chars before appending.
   UnicodeUtil.UTF16toUTF8(text.result, 0, text.length, bytes);
  }
  bytes.setLength(total);
  input.readBytes(bytes.result, prefix, suffix);
  if (reencodeAll) {
   // Fully convert all bytes back to chars.
   UnicodeUtil.UTF8toUTF16(bytes.result, 0, total, text);
   dirty = false;
  } else {
   // Incrementally convert only the UTF8 bytes that are new.
   UnicodeUtil.UTF8toUTF16(bytes.result, prefix, suffix, text);
  }
 }
 this.field = fieldInfos.fieldName(input.readVInt());
}

/**
 * Writes {@code string} as a VInt byte count followed by its UTF-8 bytes.
 * <p>
 * Strings whose worst-case encoded size exceeds
 * {@code MIN_UTF8_SIZE_TO_ENABLE_DOUBLE_PASS_ENCODING} are measured first and
 * then encoded straight into {@code bytes}; smaller strings are encoded once
 * into the reusable {@code scratchBytes} buffer.
 *
 * @param string the text to write
 * @throws IOException declared by the override; propagated from the write calls
 */
@Override
public void writeString(String string) throws IOException {
 final int charCount = string.length();
 final int worstCase = UnicodeUtil.maxUTF8Length(charCount);

 if (worstCase > MIN_UTF8_SIZE_TO_ENABLE_DOUBLE_PASS_ENCODING) {
  // Large string: two passes (measure, then encode in place) so that no big
  // intermediate buffer has to be allocated.
  final int encodedSize = UnicodeUtil.calcUTF16toUTF8Length(string, 0, charCount);
  writeVInt(encodedSize);
  // Grow only after the VInt has been written, so the current length is used.
  bytes = ArrayUtil.grow(bytes, length + encodedSize);
  length = UnicodeUtil.UTF16toUTF8(string, 0, charCount, bytes, length);
  return;
 }

 // Small string: single pass through the reusable scratch buffer.
 scratchBytes = (scratchBytes == null)
   ? new byte[ArrayUtil.oversize(worstCase, Character.BYTES)]
   : ArrayUtil.grow(scratchBytes, worstCase);
 final int written = UnicodeUtil.UTF16toUTF8(string, 0, charCount, scratchBytes);
 writeVInt(written);
 writeBytes(scratchBytes, written);
}

 /**
  * Writes {@code string} as a VInt byte count followed by its UTF-8 bytes.
  * <p>
  * Small strings are encoded once into the reusable {@code scratchBytes}
  * buffer. Strings whose worst-case encoded size exceeds
  * {@code MIN_UTF8_SIZE_TO_ENABLE_DOUBLE_PASS_ENCODING} are measured first and
  * then encoded directly into {@code bytes}.
  *
  * @param string the text to write
  * @throws IOException declared by the override; propagated from the write calls
  */
 @Override
 public void writeString(String string) throws IOException {
  // Compute the worst-case size as a long: the int product
  // string.length() * MAX_UTF8_BYTES_PER_CHAR wraps around for very long
  // strings, which would make the bound negative, wrongly select the
  // small-string path and leave scratchBytes too small for the encoder.
  final long maxLen = (long) string.length() * UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR;
  if (maxLen <= MIN_UTF8_SIZE_TO_ENABLE_DOUBLE_PASS_ENCODING)  {
   // string is small enough that we don't need to save memory by falling back to double-pass approach
   // this is just an optimized writeString() that re-uses scratchBytes.
   // The narrowing cast is safe here as long as the threshold itself fits in an int.
   scratchBytes = ArrayUtil.grow(scratchBytes, (int) maxLen);
   int len = UnicodeUtil.UTF16toUTF8(string, 0, string.length(), scratchBytes);
   writeVInt(len);
   writeBytes(scratchBytes, len);
  } else  {
   // use a double pass approach to avoid allocating a large intermediate buffer for string encoding
   int numBytes = UnicodeUtil.calcUTF16toUTF8Length(string, 0, string.length());
   writeVInt(numBytes);
   // Grow only after the VInt has been written, so the current length is used.
   bytes = ArrayUtil.grow(bytes, length + numBytes);
   length = UnicodeUtil.UTF16toUTF8(string, 0, string.length(), bytes, length);
  }
 }
}

/**
 * Replaces this builder's content with the code points decoded from the given
 * UTF-8 bytes, as if the bytes had first been converted from UTF-8 to UTF-32
 * and then copied in.
 *
 * @param bytes the UTF-8 input; its {@code length} is used as the capacity
 *              requested from {@code grow} before decoding
 */
public void copyUTF8Bytes(BytesRef bytes) {
 final int capacityNeeded = bytes.length;
 grow(capacityNeeded);
 final int decoded = UnicodeUtil.UTF8toUTF32(bytes, ref.ints);
 ref.length = decoded;
}

// Count the code points in `term`, then derive a similarity score:
// 1 minus `ed` divided by the smaller of that count and `termLength`.
// NOTE(review): `ed` is presumably the edit distance - confirm against the
// enclosing method. If the smaller length is 0 the float division does not
// throw; it yields Infinity or NaN.
final int codePointCount = UnicodeUtil.codePointCount(term);
final float similarity = 1.0f - ((float) ed / (float) 
  (Math.min(codePointCount, termLength)));

/**
 * Writes {@code v} as a VInt byte count followed by its UTF-8 bytes.
 * <p>
 * Strings of at most 1024 chars are encoded in one go through a
 * {@code BytesRef}. Longer strings have their encoded size computed first and
 * are then encoded through a fixed-size buffer, with a callback that writes
 * the first {@code len} bytes of that buffer each time it is invoked.
 *
 * @param v the text to write
 * @throws UncheckedIOException wrapping any {@link IOException} raised while writing
 */
@Override
public void writeString(String v) {
 final int windowChars = 1024;
 final int charCount = v.length();
 try {
  if (charCount > windowChars) {
   // Long string: emit the exact size up front, then stream the encoded bytes.
   writeVInt(UnicodeUtil.calcUTF16toUTF8Length(v, 0, charCount));
   final byte[] window = new byte[UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR * windowChars];
   UTF16toUTF8(v, 0, charCount, window, (len) -> {
    writeBytes(window, 0, len);
   });
  } else {
   // Short string: encode fully, then write size and payload.
   final BytesRef encoded = new BytesRef(v);
   writeVInt(encoded.length);
   writeBytes(encoded.bytes, encoded.offset, encoded.length);
  }
 } catch (IOException e) {
  throw new UncheckedIOException(e);
 }
}

/**
 * Decodes the stored bytes as UTF-8 and returns the resulting string.
 * <p>
 * A temporary char array of {@code length} elements is used as the decode
 * target; only the number of chars reported by the decoder ends up in the
 * returned string.
 *
 * @return the decoded string
 */
public String utf8ToString() {
 final char[] decoded = new char[length];
 final int charCount = UnicodeUtil.UTF8toUTF16(bytes, offset, length, decoded);
 return new String(decoded, 0, charCount);
}

/**
 * Replaces the content of this buffer with the UTF-8 encoding of a slice of
 * the provided text.
 * <p>
 * Capacity is requested from {@code grow} for the worst case
 * ({@code UnicodeUtil.maxUTF8Length(len)}) before encoding.
 *
 * @param text source characters
 * @param off  index of the first character to encode
 * @param len  number of characters to encode
 */
public void copyChars(CharSequence text, int off, int len) {
 final int worstCase = UnicodeUtil.maxUTF8Length(len);
 grow(worstCase);
 final int written = UnicodeUtil.UTF16toUTF8(text, off, len, ref.bytes);
 ref.length = written;
}

/**
 * Encodes {@code length} characters of {@code s}, beginning at {@code offset},
 * into {@code out} starting at index 0 of the destination.
 * <p>
 * The caller is responsible for making sure the destination array is large
 * enough.
 *
 * @param s      source characters
 * @param offset index of the first character to encode
 * @param length number of characters to encode
 * @param out    destination byte array
 * @return the value returned by the five-argument overload this delegates to
 */
public static int UTF16toUTF8(final CharSequence s, final int offset, final int length, byte[] out) {
 final int outOffset = 0;
 return UTF16toUTF8(s, offset, length, out, outOffset);
}

Javadoc

Class to encode java's UTF16 char[] into UTF8 byte[] without always allocating a new byte[] as String.getBytes("UTF-8") does.

Most used methods

UTF8toUTF16
Interprets the given byte array as UTF-8 and converts to UTF-16. It is the responsibility of the caller to make sure that the destination array is large enough.
UTF16toUTF8
Encode characters from a char[] source, starting at offset and stopping when the character 0xffff is
newString
Cover JDK 1.5 API. Create a String from an array of codePoints.
maxUTF8Length
Returns the maximum number of utf8 bytes required to encode a utf16 (e.g., java char[], String)
UTF8toUTF32
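This method assumes valid UTF8 input. This method does not perform full UTF8 validation, it will check only the first byte of each codepoint (for multi-byte sequences any bytes after the head are skipped).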
calcUTF16toUTF8Length
Calculates the number of UTF8 bytes necessary to write a UTF16 string.
codePointCount
Returns the number of code points in this UTF8 sequence. This method assumes valid UTF8 input. This m
UTF16toUTF8WithHash

Popular in Java

Parsing JSON documents to java classes using gson
getOriginalFilename (MultipartFile)
Return the original filename in the client's filesystem. This may contain path information depending
getResourceAsStream (ClassLoader)
getExternalFilesDir (Context)
IOException (java.io)
Signals a general, I/O-related error. Error details may be specified when calling the constructor, a
MessageFormat (java.text)
Produces concatenated messages in language-neutral way. New code should probably use java.util.Forma
ArrayList (java.util)
ArrayList is an implementation of List, backed by an array. All optional operations including adding
Scanner (java.util)
A parser that parses a text string of primitive types and strings with the help of regular expressio
ReentrantLock (java.util.concurrent.locks)
A reentrant mutual exclusion Lock with the same basic behavior and semantics as the implicit monitor
JCheckBox (javax.swing)
From CI to AI: The AI layer in your organization

How to use UnicodeUtil in org.apache.lucene.util

Best Java code snippets using org.apache.lucene.util.UnicodeUtil (Showing top 20 results out of 315)

How to use
UnicodeUtil
in
org.apache.lucene.util