/**
 * Convenience overload that converts the UTF-8 slice described by a {@link BytesRef}.
 *
 * <p>Simply forwards the ref's {@code bytes}, {@code offset} and {@code length} to
 * {@link #UTF8toUTF16(byte[], int, int, char[])}.
 *
 * @param bytesRef holder of the UTF-8 bytes to convert
 * @param chars destination array passed through to the delegate
 * @return the value returned by the delegate
 * @see #UTF8toUTF16(byte[], int, int, char[])
 */
public static int UTF8toUTF16(BytesRef bytesRef, char[] chars) {
  final byte[] utf8 = bytesRef.bytes;
  final int start = bytesRef.offset;
  final int count = bytesRef.length;
  return UTF8toUTF16(utf8, start, count, chars);
}
/** * Initialize the byte[] from the UTF8 bytes * for the provided String. * * @param text This must be well-formed * unicode text, with no unpaired surrogates. */ public BytesRef(CharSequence text) { this(new byte[UnicodeUtil.maxUTF8Length(text.length())]); length = UnicodeUtil.UTF16toUTF8(text, 0, text.length(), bytes); }
@Override public void writeString(String string) throws IOException { int maxLen = UnicodeUtil.maxUTF8Length(string.length()); if (maxLen <= MIN_UTF8_SIZE_TO_ENABLE_DOUBLE_PASS_ENCODING) { // string is small enough that we don't need to save memory by falling back to double-pass approach // this is just an optimized writeString() that re-uses scratchBytes. if (scratchBytes == null) { scratchBytes = new byte[ArrayUtil.oversize(maxLen, Character.BYTES)]; } else { scratchBytes = ArrayUtil.grow(scratchBytes, maxLen); } int len = UnicodeUtil.UTF16toUTF8(string, 0, string.length(), scratchBytes); writeVInt(len); writeBytes(scratchBytes, len); } else { // use a double pass approach to avoid allocating a large intermediate buffer for string encoding int numBytes = UnicodeUtil.calcUTF16toUTF8Length(string, 0, string.length()); writeVInt(numBytes); bytes = ArrayUtil.grow(bytes, length + numBytes); length = UnicodeUtil.UTF16toUTF8(string, 0, string.length(), bytes, length); } }
/**
 * Encodes {@code length} characters of {@code s}, beginning at {@code offset}, into
 * {@code out}.
 *
 * <p>Forwards to the five-argument overload with {@code 0} as the last argument. It is the
 * responsibility of the caller to make sure that the destination array is large enough.
 *
 * @param s the characters to encode
 * @param offset index of the first character to encode
 * @param length number of characters to encode
 * @param out destination array
 * @return the value returned by the five-argument overload
 */
public static int UTF16toUTF8(
    final CharSequence s, final int offset, final int length, byte[] out) {
  final int outOffset = 0;
  return UTF16toUTF8(s, offset, length, out, outOffset);
}
public final void read(IndexInput input, FieldInfos fieldInfos) throws IOException { this.term = null; // invalidate cache int start = input.readVInt(); int length = input.readVInt(); int totalLength = start + length; if (preUTF8Strings) { text.setLength(totalLength); input.readChars(text.result, start, length); } else { if (dirty) { // Fully convert all bytes since bytes is dirty UnicodeUtil.UTF16toUTF8(text.result, 0, text.length, bytes); bytes.setLength(totalLength); input.readBytes(bytes.result, start, length); UnicodeUtil.UTF8toUTF16(bytes.result, 0, totalLength, text); dirty = false; } else { // Incrementally convert only the UTF8 bytes that are new: bytes.setLength(totalLength); input.readBytes(bytes.result, start, length); UnicodeUtil.UTF8toUTF16(bytes.result, start, length, text); } } this.field = fieldInfos.fieldName(input.readVInt()); }
@Override public void writeString(String string) throws IOException { int maxLen = string.length() * UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR; if (maxLen <= MIN_UTF8_SIZE_TO_ENABLE_DOUBLE_PASS_ENCODING) { // string is small enough that we don't need to save memory by falling back to double-pass approach // this is just an optimized writeString() that re-uses scratchBytes. scratchBytes = ArrayUtil.grow(scratchBytes, maxLen); int len = UnicodeUtil.UTF16toUTF8(string, 0, string.length(), scratchBytes); writeVInt(len); writeBytes(scratchBytes, len); } else { // use a double pass approach to avoid allocating a large intermediate buffer for string encoding int numBytes = UnicodeUtil.calcUTF16toUTF8Length(string, 0, string.length()); writeVInt(numBytes); bytes = ArrayUtil.grow(bytes, length + numBytes); length = UnicodeUtil.UTF16toUTF8(string, 0, string.length(), bytes, length); } } }
/**
 * Replaces this builder's content with the code points decoded from the given UTF-8
 * bytes. Works as if the bytes were first converted from UTF-8 to UTF-32 and then copied
 * into this builder.
 *
 * @param bytes the UTF-8 encoded input
 */
public void copyUTF8Bytes(BytesRef bytes) {
  // Capacity is requested as one int per input byte before decoding.
  final int capacity = bytes.length;
  grow(capacity);
  ref.length = UnicodeUtil.UTF8toUTF32(bytes, ref.ints);
}
boostAtt.setBoost(1.0F); } else { final int codePointCount = UnicodeUtil.codePointCount(term); int minTermLength = Math.min(codePointCount, termLength);
/**
 * Writes {@code v} as a VInt byte count followed by its UTF-8 bytes.
 *
 * <p>Strings of at most 1024 chars are encoded in one go through a temporary
 * {@link BytesRef}. Longer strings have their encoded size computed up front and are then
 * encoded through a fixed-size buffer that is handed to {@code writeBytes} each time the
 * encoder invokes the callback.
 *
 * @param v the text to write
 * @throws UncheckedIOException wrapping any {@link IOException} raised while writing
 */
@Override
public void writeString(String v) {
  final int maxCharsPerWindow = 1024;
  final int charCount = v.length();
  try {
    if (charCount > maxCharsPerWindow) {
      // Long string: emit the exact byte count first, then stream the encoding through
      // a bounded buffer.
      writeVInt(UnicodeUtil.calcUTF16toUTF8Length(v, 0, charCount));
      final byte[] window = new byte[UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR * maxCharsPerWindow];
      UTF16toUTF8(v, 0, charCount, window, (filled) -> {
        writeBytes(window, 0, filled);
      });
      return;
    }

    // Short string: encode everything at once.
    final BytesRef encoded = new BytesRef(v);
    writeVInt(encoded.length);
    writeBytes(encoded.bytes, encoded.offset, encoded.length);
  } catch (IOException e) {
    throw new UncheckedIOException(e);
  }
}
/**
 * Encodes {@code length} characters of {@code s}, beginning at {@code offset}, into
 * {@code out}.
 *
 * <p>Forwards to the five-argument overload with {@code 0} as the last argument. It is the
 * responsibility of the caller to make sure that the destination array is large enough.
 *
 * @param s the characters to encode
 * @param offset index of the first character to encode
 * @param length number of characters to encode
 * @param out destination array
 * @return the value returned by the five-argument overload
 */
public static int UTF16toUTF8(
    final CharSequence s, final int offset, final int length, byte[] out) {
  final int outOffset = 0;
  return UTF16toUTF8(s, offset, length, out, outOffset);
}
public final void read(IndexInput input, FieldInfos fieldInfos) throws IOException { this.term = null; // invalidate cache int start = input.readVInt(); int length = input.readVInt(); int totalLength = start + length; if (preUTF8Strings) { text.setLength(totalLength); input.readChars(text.result, start, length); } else { if (dirty) { // Fully convert all bytes since bytes is dirty UnicodeUtil.UTF16toUTF8(text.result, 0, text.length, bytes); bytes.setLength(totalLength); input.readBytes(bytes.result, start, length); UnicodeUtil.UTF8toUTF16(bytes.result, 0, totalLength, text); dirty = false; } else { // Incrementally convert only the UTF8 bytes that are new: bytes.setLength(totalLength); input.readBytes(bytes.result, start, length); UnicodeUtil.UTF8toUTF16(bytes.result, start, length, text); } } this.field = fieldInfos.fieldName(input.readVInt()); }
@Override public void writeString(String string) throws IOException { int maxLen = UnicodeUtil.maxUTF8Length(string.length()); if (maxLen <= MIN_UTF8_SIZE_TO_ENABLE_DOUBLE_PASS_ENCODING) { // string is small enough that we don't need to save memory by falling back to double-pass approach // this is just an optimized writeString() that re-uses scratchBytes. if (scratchBytes == null) { scratchBytes = new byte[ArrayUtil.oversize(maxLen, Character.BYTES)]; } else { scratchBytes = ArrayUtil.grow(scratchBytes, maxLen); } int len = UnicodeUtil.UTF16toUTF8(string, 0, string.length(), scratchBytes); writeVInt(len); writeBytes(scratchBytes, len); } else { // use a double pass approach to avoid allocating a large intermediate buffer for string encoding int numBytes = UnicodeUtil.calcUTF16toUTF8Length(string, 0, string.length()); writeVInt(numBytes); bytes = ArrayUtil.grow(bytes, length + numBytes); length = UnicodeUtil.UTF16toUTF8(string, 0, string.length(), bytes, length); } }
@Override public void writeString(String string) throws IOException { int maxLen = string.length() * UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR; if (maxLen <= MIN_UTF8_SIZE_TO_ENABLE_DOUBLE_PASS_ENCODING) { // string is small enough that we don't need to save memory by falling back to double-pass approach // this is just an optimized writeString() that re-uses scratchBytes. scratchBytes = ArrayUtil.grow(scratchBytes, maxLen); int len = UnicodeUtil.UTF16toUTF8(string, 0, string.length(), scratchBytes); writeVInt(len); writeBytes(scratchBytes, len); } else { // use a double pass approach to avoid allocating a large intermediate buffer for string encoding int numBytes = UnicodeUtil.calcUTF16toUTF8Length(string, 0, string.length()); writeVInt(numBytes); bytes = ArrayUtil.grow(bytes, length + numBytes); length = UnicodeUtil.UTF16toUTF8(string, 0, string.length(), bytes, length); } } }
/**
 * Replaces this builder's content with the code points decoded from the given UTF-8
 * bytes. Works as if the bytes were first converted from UTF-8 to UTF-32 and then copied
 * into this builder.
 *
 * @param bytes the UTF-8 encoded input
 */
public void copyUTF8Bytes(BytesRef bytes) {
  // Capacity is requested as one int per input byte before decoding.
  final int capacity = bytes.length;
  grow(capacity);
  ref.length = UnicodeUtil.UTF8toUTF32(bytes, ref.ints);
}
// similarity = 1 - ed / min(codePointCount, termLength), evaluated in float arithmetic.
// NOTE(review): if that minimum were 0 the float division would yield a non-finite value
// rather than throw - confirm the surrounding code rules this out.
final int codePointCount = UnicodeUtil.codePointCount(term); final float similarity = 1.0f - ((float) ed / (float) (Math.min(codePointCount, termLength)));
/**
 * Writes {@code v} as a VInt byte count followed by its UTF-8 bytes.
 *
 * <p>Strings of at most 1024 chars are encoded in one go through a temporary
 * {@link BytesRef}. Longer strings have their encoded size computed up front and are then
 * encoded through a fixed-size buffer that is handed to {@code writeBytes} each time the
 * encoder invokes the callback.
 *
 * @param v the text to write
 * @throws UncheckedIOException wrapping any {@link IOException} raised while writing
 */
@Override
public void writeString(String v) {
  final int maxCharsPerWindow = 1024;
  final int charCount = v.length();
  try {
    if (charCount > maxCharsPerWindow) {
      // Long string: emit the exact byte count first, then stream the encoding through
      // a bounded buffer.
      writeVInt(UnicodeUtil.calcUTF16toUTF8Length(v, 0, charCount));
      final byte[] window = new byte[UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR * maxCharsPerWindow];
      UTF16toUTF8(v, 0, charCount, window, (filled) -> {
        writeBytes(window, 0, filled);
      });
      return;
    }

    // Short string: encode everything at once.
    final BytesRef encoded = new BytesRef(v);
    writeVInt(encoded.length);
    writeBytes(encoded.bytes, encoded.offset, encoded.length);
  } catch (IOException e) {
    throw new UncheckedIOException(e);
  }
}
/**
 * Interprets the stored bytes as UTF-8 and returns the resulting string.
 *
 * <p>A temporary char array of {@code length} elements is filled by
 * {@code UnicodeUtil.UTF8toUTF16}; the string is built from the first chars of that array,
 * as many as the conversion reports.
 *
 * @return the decoded text
 */
public String utf8ToString() {
  final char[] decoded = new char[length];
  final int charCount = UnicodeUtil.UTF8toUTF16(bytes, offset, length, decoded);
  return String.valueOf(decoded, 0, charCount);
}
/**
 * Replaces the content of this buffer with the UTF-8 encoding of a slice of {@code text}.
 *
 * <p>The buffer is first grown to {@code UnicodeUtil.maxUTF8Length(len)} bytes, then the
 * slice is encoded into it and the buffer length is set to the encoder's return value.
 *
 * @param text source characters
 * @param off index of the first character to encode
 * @param len number of characters to encode
 */
public void copyChars(CharSequence text, int off, int len) {
  final int worstCase = UnicodeUtil.maxUTF8Length(len);
  grow(worstCase);
  ref.length = UnicodeUtil.UTF16toUTF8(text, off, len, ref.bytes);
}
/**
 * Encodes {@code length} characters of {@code s}, beginning at {@code offset}, into
 * {@code out}.
 *
 * <p>Forwards to the five-argument overload with {@code 0} as the last argument. It is the
 * responsibility of the caller to make sure that the destination array is large enough.
 *
 * @param s the characters to encode
 * @param offset index of the first character to encode
 * @param length number of characters to encode
 * @param out destination array
 * @return the value returned by the five-argument overload
 */
public static int UTF16toUTF8(
    final CharSequence s, final int offset, final int length, byte[] out) {
  final int outOffset = 0;
  return UTF16toUTF8(s, offset, length, out, outOffset);
}