/** * Returns true if the body in question probably contains human readable text. Uses a small sample * of code points to detect unicode control characters commonly used in binary file signatures. */ static boolean isPlaintext(Buffer buffer) { try { Buffer prefix = new Buffer(); long byteCount = buffer.size() < 64 ? buffer.size() : 64; buffer.copyTo(prefix, 0, byteCount); for (int i = 0; i < 16; i++) { if (prefix.exhausted()) { break; } int codePoint = prefix.readUtf8CodePoint(); if (Character.isISOControl(codePoint) && !Character.isWhitespace(codePoint)) { return false; } } return true; } catch (EOFException e) { return false; // Truncated UTF-8 sequence. } }
/** Decoding a code point from a buffer that holds no bytes must fail with EOFException. */
@Test
public void readEmptyBufferThrowsEofException() throws Exception {
  Buffer empty = new Buffer();
  try {
    empty.readUtf8CodePoint();
    fail(); // Reaching this line means no exception was raised.
  } catch (EOFException expected) {
    // This is the outcome under test.
  }
}
/** * Returns true if the body in question probably contains human readable text. Uses a small sample * of code points to detect unicode control characters commonly used in binary file signatures. */ private boolean isPlaintext(Buffer buffer) { try { Buffer prefix = new Buffer(); long byteCount = buffer.size() < 64 ? buffer.size() : 64; buffer.copyTo(prefix, 0, byteCount); for (int i = 0; i < 16; i++) { if (prefix.exhausted()) { break; } int codePoint = prefix.readUtf8CodePoint(); if (Character.isISOControl(codePoint) && !Character.isWhitespace(codePoint)) { return false; } } return true; } catch (EOFException e) { return false; // Truncated UTF-8 sequence. } }
/** * Returns true if the body in question probably contains human readable text. Uses a small sample * of code points to detect unicode control characters commonly used in binary file signatures. */ static boolean isPlaintext(Buffer buffer) { try { Buffer prefix = new Buffer(); long byteCount = buffer.size() < 64 ? buffer.size() : 64; buffer.copyTo(prefix, 0, byteCount); for (int i = 0; i < 16; i++) { if (prefix.exhausted()) { break; } int codePoint = prefix.readUtf8CodePoint(); if (Character.isISOControl(codePoint) && !Character.isWhitespace(codePoint)) { return false; } } return true; } catch (EOFException e) { return false; // Truncated UTF-8 sequence. } }
/** * Returns true if the body in question probably contains human readable text. Uses a small sample * of code points to detect unicode control characters commonly used in binary file signatures. */ static boolean isPlaintext(Buffer buffer) { try { Buffer prefix = new Buffer(); long byteCount = buffer.size() < 64 ? buffer.size() : 64; buffer.copyTo(prefix, 0, byteCount); for (int i = 0; i < 16; i++) { if (prefix.exhausted()) { break; } int codePoint = prefix.readUtf8CodePoint(); if (Character.isISOControl(codePoint) && !Character.isWhitespace(codePoint)) { return false; } } return true; } catch (EOFException e) { return false; // Truncated UTF-8 sequence. } }
@Test public void readTooLargeCodepointReturnsReplacementCharacter() throws Exception { // 5-byte and 6-byte code points are not supported. Buffer buffer = new Buffer(); buffer.write(ByteString.decodeHex("f888808080")); assertEquals(REPLACEMENT_CODE_POINT, buffer.readUtf8CodePoint()); assertEquals(REPLACEMENT_CODE_POINT, buffer.readUtf8CodePoint()); assertEquals(REPLACEMENT_CODE_POINT, buffer.readUtf8CodePoint()); assertEquals(REPLACEMENT_CODE_POINT, buffer.readUtf8CodePoint()); assertEquals(REPLACEMENT_CODE_POINT, buffer.readUtf8CodePoint()); assertTrue(buffer.exhausted()); }
@Test public void readNonContinuationBytesReturnsReplacementCharacter() throws Exception { // Use a non-continuation byte where a continuation byte is expected. Buffer buffer = new Buffer(); buffer.write(ByteString.decodeHex("df20")); assertEquals(REPLACEMENT_CODE_POINT, buffer.readUtf8CodePoint()); assertEquals(0x20, buffer.readUtf8CodePoint()); // Non-continuation character not consumed. assertTrue(buffer.exhausted()); }
/** A lone continuation byte (0xbf) at the start should decode to the replacement code point. */
@Test
public void readLeadingContinuationByteReturnsReplacementCharacter() throws Exception {
  Buffer source = new Buffer();
  source.writeByte(0xbf);

  int decoded = source.readUtf8CodePoint();
  assertEquals(REPLACEMENT_CODE_POINT, decoded);
  assertTrue(source.exhausted());
}
/**
 * Decodes the bytes given as {@code hex} and checks that they yield exactly {@code codePoints},
 * in order, with nothing left over in the buffer.
 *
 * @param hex hexadecimal encoding of the bytes to decode
 * @param codePoints the code points expected, in reading order
 */
private void assertCodePointDecoded(String hex, int... codePoints) throws Exception {
  Buffer source = new Buffer();
  source.write(ByteString.decodeHex(hex));

  for (int i = 0; i < codePoints.length; i++) {
    int expected = codePoints[i];
    assertEquals(expected, source.readUtf8CodePoint());
  }
  assertTrue(source.exhausted());
}
@Test public void readMissingContinuationBytesThrowsEofException() throws Exception { Buffer buffer = new Buffer(); buffer.writeByte(0xdf); try { buffer.readUtf8CodePoint(); fail(); } catch (EOFException expected) { } assertFalse(buffer.exhausted()); // Prefix byte wasn't consumed. }
@Test public void readCodePointBeyondUnicodeMaximum() throws Exception { // A 4-byte encoding with data above the U+10ffff Unicode maximum. Buffer buffer = new Buffer(); buffer.write(ByteString.decodeHex("f4908080")); assertEquals(REPLACEMENT_CODE_POINT, buffer.readUtf8CodePoint()); assertTrue(buffer.exhausted()); }
@Test public void readOverlongCodePoint() throws Exception { // Use 2 bytes to encode data that only needs 1 byte. Buffer buffer = new Buffer(); buffer.write(ByteString.decodeHex("c080")); assertEquals(REPLACEMENT_CODE_POINT, buffer.readUtf8CodePoint()); assertTrue(buffer.exhausted()); }
/**
 * Feeds two 3-byte sequences ("eda080", then "edbfbf") through the same buffer, one after the
 * other. Each should decode to one replacement code point and leave the buffer empty.
 */
@Test
public void readSurrogateCodePoint() throws Exception {
  Buffer source = new Buffer();

  source.write(ByteString.decodeHex("eda080"));
  int first = source.readUtf8CodePoint();
  assertEquals(REPLACEMENT_CODE_POINT, first);
  assertTrue(source.exhausted());

  source.write(ByteString.decodeHex("edbfbf"));
  int second = source.readUtf8CodePoint();
  assertEquals(REPLACEMENT_CODE_POINT, second);
  assertTrue(source.exhausted());
}
static boolean isPlaintext(Buffer buffer) { try { Buffer prefix = new Buffer(); long byteCount = buffer.size() < 64 ? buffer.size() : 64; buffer.copyTo(prefix, 0, byteCount); for (int i = 0; i < 16; i++) { if (prefix.exhausted()) { break; } int codePoint = prefix.readUtf8CodePoint(); if (Character.isISOControl(codePoint) && !Character.isWhitespace(codePoint)) { return false; } } return true; } catch (EOFException e) { return false; // Truncated UTF-8 sequence. } }
static boolean isPlaintext(Buffer buffer) { try { Buffer prefix = new Buffer(); long byteCount = buffer.size() < 64 ? buffer.size() : 64; buffer.copyTo(prefix, 0, byteCount); for (int i = 0; i < 16; i++) { if (prefix.exhausted()) { break; } int codePoint = prefix.readUtf8CodePoint(); if (Character.isISOControl(codePoint) && !Character.isWhitespace(codePoint)) { return false; } } return true; } catch (EOFException e) { return false; // Truncated UTF-8 sequence. } } }
/** * Returns true if the body in question probably contains human readable text. Uses a small sample * of code points to detect unicode control characters commonly used in binary file signatures. */ static boolean isPlaintext(Buffer buffer) { try { Buffer prefix = new Buffer(); long byteCount = buffer.size() < 64 ? buffer.size() : 64; buffer.copyTo(prefix, 0, byteCount); for (int i = 0; i < 16; i++) { if (prefix.exhausted()) { break; } int codePoint = prefix.readUtf8CodePoint(); if (Character.isISOControl(codePoint) && !Character.isWhitespace(codePoint)) { return false; } } return true; } catch (EOFException e) { return false; // Truncated UTF-8 sequence. } }
private static boolean isPlaintext(Buffer buffer) { try { Buffer prefix = new Buffer(); long byteCount = buffer.size() < 64 ? buffer.size() : 64; buffer.copyTo(prefix, 0, byteCount); for (int i = 0; i < 16; i++) { if (prefix.exhausted()) { break; } int codePoint = prefix.readUtf8CodePoint(); if (Character.isISOControl(codePoint) && !Character.isWhitespace(codePoint)) { return false; } } return true; } catch (EOFException e) { return false; // Truncated UTF-8 sequence. } }
static boolean isPlaintext(Buffer buffer) { try { Buffer prefix = new Buffer(); long byteCount = buffer.size() < 64 ? buffer.size() : 64; buffer.copyTo(prefix, 0, byteCount); for (int i = 0; i < 16; i++) { if (prefix.exhausted()) { break; } int codePoint = prefix.readUtf8CodePoint(); if (Character.isISOControl(codePoint) && !Character.isWhitespace(codePoint)) { return false; } } return true; } catch (EOFException e) { return false; // Truncated UTF-8 sequence. } }
/**
 * Samples the front of {@code buffer} and reports whether it looks like text.
 *
 * <p>At most 64 bytes are copied and at most 16 code points decoded from the copy. An ISO
 * control character that is not whitespace produces {@code false}.
 *
 * <p>NOTE(review): the signature declares {@link EOFException}, yet the body catches that
 * exception and returns {@code false}. The clause is kept so existing callers compile as-is.
 *
 * @param buffer the content to sample; it is copied from, not consumed
 * @return {@code true} if the sample looks like text; {@code false} if it does not or if
 *     decoding raised {@link EOFException}
 */
private static boolean isPlaintext(Buffer buffer) throws EOFException {
  try {
    Buffer sample = new Buffer();
    long sampleLength = Math.min(buffer.size(), 64L);
    buffer.copyTo(sample, 0, sampleLength);

    for (int left = 16; left > 0 && !sample.exhausted(); left--) {
      int cp = sample.readUtf8CodePoint();
      if (Character.isISOControl(cp) && !Character.isWhitespace(cp)) {
        return false;
      }
    }
    return true;
  } catch (EOFException e) {
    return false;
  }
}
}
/**
 * Reads one UTF-8 code point, first making sure enough bytes are buffered for it.
 *
 * <p>The first byte is peeked to see whether it announces a 2-, 3- or 4-byte sequence. That many
 * bytes are then required before decoding is delegated to the underlying buffer. Any other first
 * byte needs only the single byte already required.
 *
 * @return the value returned by the buffer's own {@code readUtf8CodePoint()}
 * @throws IOException as declared; presumably raised by {@code require} when the bytes cannot
 *     be supplied (TODO confirm against {@code require})
 */
@Override
public int readUtf8CodePoint() throws IOException {
  require(1);

  byte lead = buffer.getByte(0);
  int sequenceLength = 1;
  if ((lead & 0xe0) == 0xc0) {
    sequenceLength = 2;
  } else if ((lead & 0xf0) == 0xe0) {
    sequenceLength = 3;
  } else if ((lead & 0xf8) == 0xf0) {
    sequenceLength = 4;
  }

  // The single-byte case is already covered by the require(1) above.
  if (sequenceLength > 1) {
    require(sequenceLength);
  }

  return buffer.readUtf8CodePoint();
}