/**
 * Decodes the UTF-16BE character that starts at {@code offset} in {@code XMLDoc}
 * and advances {@code offset} past it.
 *
 * <p>A code unit outside 0xd800-0xdfff is returned unchanged and two bytes are
 * consumed. A high surrogate (0xd800-0xdbff) must be followed by a low surrogate
 * (0xdc00-0xdfff); the pair is combined into a single code point above 0xffff
 * and four bytes are consumed.
 *
 * <p>Every index is checked against {@code endOffset} before it is read, so a
 * document that stops in the middle of a code unit or of a surrogate pair raises
 * the same end-of-input exception as an exhausted document instead of reading
 * bytes at or beyond {@code endOffset}.
 *
 * @return the decoded code point
 * @throws EOFException presumably the type of the shared exception {@code e}
 *         thrown when the input ends early (declared outside this method; verify)
 * @throws EncodingException if the unit at {@code offset} is a low surrogate, or
 *         a high surrogate is not followed by a low surrogate
 * @throws ParseException declared for callers; not thrown directly by this body
 */
final public int getChar() throws EOFException, ParseException, EncodingException {
    // One code unit is two bytes wide: offset and offset + 1 must both lie before endOffset.
    if (offset + 1 >= endOffset)
        throw e;
    int temp = (XMLDoc[offset] & 0xff) << 8 | (XMLDoc[offset + 1] & 0xff);
    if ((temp < 0xd800) || (temp > 0xdfff)) {
        // not a surrogate: the code unit is the code point
        offset += 2;
        return temp;
    }
    // temp is in 0xd800-0xdfff here; only 0xd800-0xdbff may open a pair
    if (temp > 0xdbff)
        throw new EncodingException("UTF 16 BE encoding error: should never happen");
    // the second half of the pair occupies offset + 2 and offset + 3
    if (offset + 3 >= endOffset)
        throw e;
    int val = temp;
    temp = (XMLDoc[offset + 2] & 0xff) << 8 | (XMLDoc[offset + 3] & 0xff);
    if (temp < 0xdc00 || temp > 0xdfff) {
        // has to be a low surrogate here
        throw new EncodingException("UTF 16 BE encoding error: should never happen");
    }
    val = ((val - 0xd800) << 10) + (temp - 0xdc00) + 0x10000;
    offset += 4;
    return val;
}
final public boolean skipChar(int ch)
public void parse(byte[] data, byte[] identifier, JCas jcas) { try { VTDGen vg = new VTDGen(); // needed for extraction of mixed-content-XML // when there is a whitespace only between two // tags, e.g. ...</s> <s id=".">... vg.enableIgnoredWhiteSpace(true); vg.setDoc(data); vg.parse(true); VTDNav vn = vg.getNav(); buildTypes(identifier, jcas, vn); } catch (EncodingException e) { e.printStackTrace(); } catch (EOFException e) { e.printStackTrace(); } catch (EntityException e) { LOG.error(String.format("Document %s could not be parsed due to an EntityError. Document text is:\n%s", new String(identifier), new String(data)), e); } catch (CollectionException e) { e.printStackTrace(); } catch (ParseException e) { LOG.error(String.format("Document %s could not be parsed due to a general parsing error. Document text is:\n%s", new String(identifier), new String(data)), e); } }
/**
 * Reads the whole stream via {@code readStream}, parses it namespace-aware with
 * {@code VTDGen} and returns the navigator over the parsed document.
 *
 * <p>A navigator is only returned after a successful parse. Parse failures are
 * propagated to the caller instead of being printed and followed by a navigator
 * over an unparsed document. The encoding, end-of-file and entity exceptions of
 * VTD-XML are subclasses of {@code ParseException} and therefore reach the
 * caller through the single handler below.
 *
 * @param is         the stream holding the XML document
 * @param bufferSize passed through to {@code readStream}
 * @return the navigator for the parsed document
 * @throws FileTooBigException if thrown while reading or parsing, or if the
 *         parser reports a message containing "file size too big"
 * @throws ParseException for every other parse failure
 * @throws IllegalStateException if reading the stream fails with an
 *         {@code IOException}
 */
public static VTDNav getVTDNav(InputStream is, int bufferSize) throws ParseException, FileTooBigException {
    try {
        byte[] data = readStream(is, bufferSize);
        VTDGen vg = new VTDGen();
        vg.setDoc(data);
        vg.parse(true);
        return vg.getNav();
    } catch (FileTooBigException e) {
        throw e;
    } catch (IOException e) {
        // Nothing was read, so there is no generator to take a navigator from.
        throw new IllegalStateException("The XML document could not be read from the input stream", e);
    } catch (ParseException e) {
        String message = e.getMessage();
        // The message may be null; only a size complaint is translated.
        if (message != null && message.contains("file size too big"))
            throw new FileTooBigException(message);
        throw e;
    }
}
/**
 * Decodes the UTF-16BE character that starts at {@code offset} in {@code XMLDoc}
 * and advances {@code offset} past it.
 *
 * <p>A code unit outside 0xd800-0xdfff is returned unchanged and two bytes are
 * consumed. A high surrogate (0xd800-0xdbff) must be followed by a low surrogate
 * (0xdc00-0xdfff); the pair is combined into a single code point above 0xffff
 * and four bytes are consumed.
 *
 * <p>Every index is checked against {@code endOffset} before it is read, so a
 * document that stops in the middle of a code unit or of a surrogate pair raises
 * the same end-of-input exception as an exhausted document instead of reading
 * bytes at or beyond {@code endOffset}.
 *
 * @return the decoded code point
 * @throws EOFException presumably the type of the shared exception {@code e}
 *         thrown when the input ends early (declared outside this method; verify)
 * @throws EncodingException if the unit at {@code offset} is a low surrogate, or
 *         a high surrogate is not followed by a low surrogate
 * @throws ParseException declared for callers; not thrown directly by this body
 */
final public int getChar() throws EOFException, ParseException, EncodingException {
    // One code unit is two bytes wide: offset and offset + 1 must both lie before endOffset.
    if (offset + 1 >= endOffset)
        throw e;
    int temp = (XMLDoc[offset] & 0xff) << 8 | (XMLDoc[offset + 1] & 0xff);
    if ((temp < 0xd800) || (temp > 0xdfff)) {
        // not a surrogate: the code unit is the code point
        offset += 2;
        return temp;
    }
    // temp is in 0xd800-0xdfff here; only 0xd800-0xdbff may open a pair
    if (temp > 0xdbff)
        throw new EncodingException("UTF 16 BE encoding error: should never happen");
    // the second half of the pair occupies offset + 2 and offset + 3
    if (offset + 3 >= endOffset)
        throw e;
    int val = temp;
    temp = (XMLDoc[offset + 2] & 0xff) << 8 | (XMLDoc[offset + 3] & 0xff);
    if (temp < 0xdc00 || temp > 0xdfff) {
        // has to be a low surrogate here
        throw new EncodingException("UTF 16 BE encoding error: should never happen");
    }
    val = ((val - 0xd800) << 10) + (temp - 0xdc00) + 0x10000;
    offset += 4;
    return val;
}
final public boolean skipChar(int ch)
/**
 * Decodes the UTF-16LE character that starts at {@code offset} in {@code XMLDoc}
 * and advances {@code offset} past it.
 *
 * <p>A code unit outside 0xd800-0xdfff is returned unchanged and two bytes are
 * consumed. A high surrogate (0xd800-0xdbff) must be followed by a low surrogate
 * (0xdc00-0xdfff); the pair is combined into a single code point above 0xffff
 * and four bytes are consumed.
 *
 * <p>Every index is checked against {@code endOffset} before it is read, so a
 * document that stops in the middle of a code unit or of a surrogate pair raises
 * the same end-of-input exception as an exhausted document instead of reading
 * bytes at or beyond {@code endOffset}.
 *
 * @return the decoded code point
 * @throws EOFException presumably the type of the shared exception {@code e}
 *         thrown when the input ends early (declared outside this method; verify)
 * @throws EncodingException if the unit at {@code offset} is a low surrogate, or
 *         a high surrogate is not followed by a low surrogate
 * @throws ParseException declared for callers; not thrown directly by this body
 */
final public int getChar() throws EOFException, ParseException, EncodingException {
    // One code unit is two bytes wide: offset and offset + 1 must both lie before endOffset.
    if (offset + 1 >= endOffset)
        throw e;
    // little-endian: the byte at offset + 1 is the most significant one
    int temp = (XMLDoc[offset + 1] & 0xff) << 8 | (XMLDoc[offset] & 0xff);
    if (temp < 0xd800 || temp > 0xdfff) {
        // not a surrogate: the code unit is the code point
        offset += 2;
        return temp;
    }
    // temp is in 0xd800-0xdfff here; only 0xd800-0xdbff may open a pair
    if (temp > 0xdbff)
        throw new EncodingException("UTF 16 LE encoding error: should never happen");
    // the second half of the pair occupies offset + 2 and offset + 3
    if (offset + 3 >= endOffset)
        throw e;
    int val = temp;
    temp = (XMLDoc[offset + 3] & 0xff) << 8 | (XMLDoc[offset + 2] & 0xff);
    if (temp < 0xdc00 || temp > 0xdfff) {
        // has to be a low surrogate here
        throw new EncodingException("UTF 16 LE encoding error: should never happen");
    }
    val = ((val - 0xd800) << 10) + (temp - 0xdc00) + 0x10000;
    offset += 4;
    return val;
}
final public boolean skipChar(int ch)
/**
 * Decodes the UTF-16BE character that starts at {@code offset} in {@code XMLDoc}
 * and advances {@code offset} past it.
 *
 * <p>A code unit outside 0xd800-0xdfff is returned unchanged and two bytes are
 * consumed. A high surrogate (0xd800-0xdbff) must be followed by a low surrogate
 * (0xdc00-0xdfff); the pair is combined into a single code point above 0xffff
 * and four bytes are consumed.
 *
 * <p>Every index is checked against {@code endOffset} before it is read, so a
 * document that stops in the middle of a code unit or of a surrogate pair raises
 * the same end-of-input exception as an exhausted document instead of reading
 * bytes at or beyond {@code endOffset}.
 *
 * @return the decoded code point
 * @throws EOFException presumably the type of the shared exception {@code e}
 *         thrown when the input ends early (declared outside this method; verify)
 * @throws EncodingException if the unit at {@code offset} is a low surrogate, or
 *         a high surrogate is not followed by a low surrogate
 * @throws ParseException declared for callers; not thrown directly by this body
 */
final public int getChar() throws EOFException, ParseException, EncodingException {
    // One code unit is two bytes wide: offset and offset + 1 must both lie before endOffset.
    if (offset + 1 >= endOffset)
        throw e;
    int temp = (XMLDoc[offset] & 0xff) << 8 | (XMLDoc[offset + 1] & 0xff);
    if ((temp < 0xd800) || (temp > 0xdfff)) {
        // not a surrogate: the code unit is the code point
        offset += 2;
        return temp;
    }
    // temp is in 0xd800-0xdfff here; only 0xd800-0xdbff may open a pair
    if (temp > 0xdbff)
        throw new EncodingException("UTF 16 BE encoding error: should never happen");
    // the second half of the pair occupies offset + 2 and offset + 3
    if (offset + 3 >= endOffset)
        throw e;
    int val = temp;
    temp = (XMLDoc[offset + 2] & 0xff) << 8 | (XMLDoc[offset + 3] & 0xff);
    if (temp < 0xdc00 || temp > 0xdfff) {
        // has to be a low surrogate here
        throw new EncodingException("UTF 16 BE encoding error: should never happen");
    }
    val = ((val - 0xd800) << 10) + (temp - 0xdc00) + 0x10000;
    offset += 4;
    return val;
}
final public boolean skipChar(int ch)
/**
 * Decodes the UTF-16LE character that starts at {@code offset} in {@code XMLDoc}
 * and advances {@code offset} past it.
 *
 * <p>A code unit outside 0xd800-0xdfff is returned unchanged and two bytes are
 * consumed. A high surrogate (0xd800-0xdbff) must be followed by a low surrogate
 * (0xdc00-0xdfff); the pair is combined into a single code point above 0xffff
 * and four bytes are consumed.
 *
 * <p>Every index is checked against {@code endOffset} before it is read, so a
 * document that stops in the middle of a code unit or of a surrogate pair raises
 * the same end-of-input exception as an exhausted document instead of reading
 * bytes at or beyond {@code endOffset}.
 *
 * @return the decoded code point
 * @throws EOFException presumably the type of the shared exception {@code e}
 *         thrown when the input ends early (declared outside this method; verify)
 * @throws EncodingException if the unit at {@code offset} is a low surrogate, or
 *         a high surrogate is not followed by a low surrogate
 * @throws ParseException declared for callers; not thrown directly by this body
 */
final public int getChar() throws EOFException, ParseException, EncodingException {
    // One code unit is two bytes wide: offset and offset + 1 must both lie before endOffset.
    if (offset + 1 >= endOffset)
        throw e;
    // little-endian: the byte at offset + 1 is the most significant one
    int temp = (XMLDoc[offset + 1] & 0xff) << 8 | (XMLDoc[offset] & 0xff);
    if (temp < 0xd800 || temp > 0xdfff) {
        // not a surrogate: the code unit is the code point
        offset += 2;
        return temp;
    }
    // temp is in 0xd800-0xdfff here; only 0xd800-0xdbff may open a pair
    if (temp > 0xdbff)
        throw new EncodingException("UTF 16 LE encoding error: should never happen");
    // the second half of the pair occupies offset + 2 and offset + 3
    if (offset + 3 >= endOffset)
        throw e;
    int val = temp;
    temp = (XMLDoc[offset + 3] & 0xff) << 8 | (XMLDoc[offset + 2] & 0xff);
    if (temp < 0xdc00 || temp > 0xdfff) {
        // has to be a low surrogate here
        throw new EncodingException("UTF 16 LE encoding error: should never happen");
    }
    val = ((val - 0xd800) << 10) + (temp - 0xdc00) + 0x10000;
    offset += 4;
    return val;
}
final public boolean skipChar(int ch)
/**
 * Decodes the UTF-16LE character that starts at {@code offset} in {@code XMLDoc}
 * and advances {@code offset} past it.
 *
 * <p>A code unit outside 0xd800-0xdfff is returned unchanged and two bytes are
 * consumed. A high surrogate (0xd800-0xdbff) must be followed by a low surrogate
 * (0xdc00-0xdfff); the pair is combined into a single code point above 0xffff
 * and four bytes are consumed.
 *
 * <p>Every index is checked against {@code endOffset} before it is read, so a
 * document that stops in the middle of a code unit or of a surrogate pair raises
 * the same end-of-input exception as an exhausted document instead of reading
 * bytes at or beyond {@code endOffset}.
 *
 * @return the decoded code point
 * @throws EOFException presumably the type of the shared exception {@code e}
 *         thrown when the input ends early (declared outside this method; verify)
 * @throws EncodingException if the unit at {@code offset} is a low surrogate, or
 *         a high surrogate is not followed by a low surrogate
 * @throws ParseException declared for callers; not thrown directly by this body
 */
final public int getChar() throws EOFException, ParseException, EncodingException {
    // One code unit is two bytes wide: offset and offset + 1 must both lie before endOffset.
    if (offset + 1 >= endOffset)
        throw e;
    // little-endian: the byte at offset + 1 is the most significant one
    int temp = (XMLDoc[offset + 1] & 0xff) << 8 | (XMLDoc[offset] & 0xff);
    if (temp < 0xd800 || temp > 0xdfff) {
        // not a surrogate: the code unit is the code point
        offset += 2;
        return temp;
    }
    // temp is in 0xd800-0xdfff here; only 0xd800-0xdbff may open a pair
    if (temp > 0xdbff)
        throw new EncodingException("UTF 16 LE encoding error: should never happen");
    // the second half of the pair occupies offset + 2 and offset + 3
    if (offset + 3 >= endOffset)
        throw e;
    int val = temp;
    temp = (XMLDoc[offset + 3] & 0xff) << 8 | (XMLDoc[offset + 2] & 0xff);
    if (temp < 0xdc00 || temp > 0xdfff) {
        // has to be a low surrogate here
        throw new EncodingException("UTF 16 LE encoding error: should never happen");
    }
    val = ((val - 0xd800) << 10) + (temp - 0xdc00) + 0x10000;
    offset += 4;
    return val;
}
final public boolean skipChar(int ch)
/**
 * Consumes the UTF-16LE character at {@code offset} only when it equals
 * {@code ch}.
 *
 * <p>On a match {@code offset} moves forward by two bytes for a single code
 * unit or by four bytes for a surrogate pair; on a mismatch {@code offset} is
 * left where it was.
 *
 * @param ch the code point the character at {@code offset} is compared with
 * @return {@code true} if the character matched and was skipped
 * @throws EncodingException if the unit at {@code offset} is a low surrogate, or
 *         a high surrogate is not followed by a low surrogate
 * @throws EOFException declared for callers; not thrown directly by this body
 * @throws ParseException declared for callers; not thrown directly by this body
 */
final public boolean skipChar(int ch) throws EOFException, EncodingException, ParseException {
    // little-endian: the byte at offset + 1 is the most significant one
    final int upperByte = XMLDoc[offset + 1] & 0xff;
    final int lowerByte = XMLDoc[offset] & 0xff;
    final int unit = upperByte << 8 | lowerByte;

    final boolean isSurrogate = unit >= 0xd800 && unit <= 0xdfff;
    if (!isSurrogate) {
        if (unit != ch)
            return false;
        offset += 2;
        return true;
    }

    // a pair has to open with a high surrogate (0xd800-0xdbff)
    if (unit > 0xdbff)
        throw new EncodingException("UTF 16 LE encoding error: should never happen");

    final int nextUpperByte = XMLDoc[offset + 3] & 0xff;
    final int nextLowerByte = XMLDoc[offset + 2] & 0xff;
    final int trailing = nextUpperByte << 8 | nextLowerByte;
    // ... and close with a low surrogate (0xdc00-0xdfff)
    if (trailing < 0xdc00 || trailing > 0xdfff)
        throw new EncodingException("UTF 16 LE encoding error: should never happen");

    final int codePoint = ((unit - 0xd800) << 10) + (trailing - 0xdc00) + 0x10000;
    if (codePoint != ch)
        return false;
    offset += 4;
    return true;
}
final public char decode(int offset){
/**
 * Consumes the UTF-16LE character at {@code offset} only when it equals
 * {@code ch}.
 *
 * <p>On a match {@code offset} moves forward by two bytes for a single code
 * unit or by four bytes for a surrogate pair; on a mismatch {@code offset} is
 * left where it was.
 *
 * @param ch the code point the character at {@code offset} is compared with
 * @return {@code true} if the character matched and was skipped
 * @throws EncodingException if the unit at {@code offset} is a low surrogate, or
 *         a high surrogate is not followed by a low surrogate
 * @throws EOFException declared for callers; not thrown directly by this body
 * @throws ParseException declared for callers; not thrown directly by this body
 */
final public boolean skipChar(int ch) throws EOFException, EncodingException, ParseException {
    // little-endian: the byte at offset + 1 is the most significant one
    final int upperByte = XMLDoc[offset + 1] & 0xff;
    final int lowerByte = XMLDoc[offset] & 0xff;
    final int unit = upperByte << 8 | lowerByte;

    final boolean isSurrogate = unit >= 0xd800 && unit <= 0xdfff;
    if (!isSurrogate) {
        if (unit != ch)
            return false;
        offset += 2;
        return true;
    }

    // a pair has to open with a high surrogate (0xd800-0xdbff)
    if (unit > 0xdbff)
        throw new EncodingException("UTF 16 LE encoding error: should never happen");

    final int nextUpperByte = XMLDoc[offset + 3] & 0xff;
    final int nextLowerByte = XMLDoc[offset + 2] & 0xff;
    final int trailing = nextUpperByte << 8 | nextLowerByte;
    // ... and close with a low surrogate (0xdc00-0xdfff)
    if (trailing < 0xdc00 || trailing > 0xdfff)
        throw new EncodingException("UTF 16 LE encoding error: should never happen");

    final int codePoint = ((unit - 0xd800) << 10) + (trailing - 0xdc00) + 0x10000;
    if (codePoint != ch)
        return false;
    offset += 4;
    return true;
}
final public char decode(int offset){
/**
 * Consumes the UTF-16BE character at {@code offset} only when it equals
 * {@code ch}.
 *
 * <p>On a match {@code offset} moves forward by two bytes for a single code
 * unit or by four bytes for a surrogate pair; on a mismatch {@code offset} is
 * left where it was.
 *
 * @param ch the code point the character at {@code offset} is compared with
 * @return {@code true} if the character matched and was skipped
 * @throws EncodingException if the unit at {@code offset} is a low surrogate, or
 *         a high surrogate is not followed by a low surrogate
 * @throws EOFException declared for callers; not thrown directly by this body
 * @throws ParseException declared for callers; not thrown directly by this body
 */
final public boolean skipChar(int ch) throws EOFException, ParseException, EncodingException {
    // big-endian: the byte at offset is the most significant one
    final int upperByte = XMLDoc[offset] & 0xff;
    final int lowerByte = XMLDoc[offset + 1] & 0xff;
    final int unit = upperByte << 8 | lowerByte;

    final boolean isSurrogate = unit >= 0xd800 && unit <= 0xdfff;
    if (!isSurrogate) {
        if (unit != ch)
            return false;
        offset += 2;
        return true;
    }

    // a pair has to open with a high surrogate (0xd800-0xdbff)
    if (unit > 0xdbff)
        throw new EncodingException("UTF 16 BE encoding error: should never happen");

    final int nextUpperByte = XMLDoc[offset + 2] & 0xff;
    final int nextLowerByte = XMLDoc[offset + 3] & 0xff;
    final int trailing = nextUpperByte << 8 | nextLowerByte;
    // ... and close with a low surrogate (0xdc00-0xdfff)
    if (trailing < 0xdc00 || trailing > 0xdfff)
        throw new EncodingException("UTF 16 BE encoding error: should never happen");

    final int codePoint = ((unit - 0xd800) << 10) + (trailing - 0xdc00) + 0x10000;
    if (codePoint != ch)
        return false;
    offset += 4;
    return true;
}
final public char decode(int offset){
/**
 * Consumes the UTF-16BE character at {@code offset} only when it equals
 * {@code ch}.
 *
 * <p>On a match {@code offset} moves forward by two bytes for a single code
 * unit or by four bytes for a surrogate pair; on a mismatch {@code offset} is
 * left where it was.
 *
 * @param ch the code point the character at {@code offset} is compared with
 * @return {@code true} if the character matched and was skipped
 * @throws EncodingException if the unit at {@code offset} is a low surrogate, or
 *         a high surrogate is not followed by a low surrogate
 * @throws EOFException declared for callers; not thrown directly by this body
 * @throws ParseException declared for callers; not thrown directly by this body
 */
final public boolean skipChar(int ch) throws EOFException, ParseException, EncodingException {
    // big-endian: the byte at offset is the most significant one
    final int upperByte = XMLDoc[offset] & 0xff;
    final int lowerByte = XMLDoc[offset + 1] & 0xff;
    final int unit = upperByte << 8 | lowerByte;

    final boolean isSurrogate = unit >= 0xd800 && unit <= 0xdfff;
    if (!isSurrogate) {
        if (unit != ch)
            return false;
        offset += 2;
        return true;
    }

    // a pair has to open with a high surrogate (0xd800-0xdbff)
    if (unit > 0xdbff)
        throw new EncodingException("UTF 16 BE encoding error: should never happen");

    final int nextUpperByte = XMLDoc[offset + 2] & 0xff;
    final int nextLowerByte = XMLDoc[offset + 3] & 0xff;
    final int trailing = nextUpperByte << 8 | nextLowerByte;
    // ... and close with a low surrogate (0xdc00-0xdfff)
    if (trailing < 0xdc00 || trailing > 0xdfff)
        throw new EncodingException("UTF 16 BE encoding error: should never happen");

    final int codePoint = ((unit - 0xd800) << 10) + (trailing - 0xdc00) + 0x10000;
    if (codePoint != ch)
        return false;
    offset += 4;
    return true;
}
final public char decode(int offset){
/**
 * Consumes the UTF-16LE character at {@code offset} only when it equals
 * {@code ch}.
 *
 * <p>On a match {@code offset} moves forward by two bytes for a single code
 * unit or by four bytes for a surrogate pair; on a mismatch {@code offset} is
 * left where it was.
 *
 * @param ch the code point the character at {@code offset} is compared with
 * @return {@code true} if the character matched and was skipped
 * @throws EncodingException if the unit at {@code offset} is a low surrogate, or
 *         a high surrogate is not followed by a low surrogate
 * @throws EOFException declared for callers; not thrown directly by this body
 * @throws ParseException declared for callers; not thrown directly by this body
 */
final public boolean skipChar(int ch) throws EOFException, EncodingException, ParseException {
    // little-endian: the byte at offset + 1 is the most significant one
    final int upperByte = XMLDoc[offset + 1] & 0xff;
    final int lowerByte = XMLDoc[offset] & 0xff;
    final int unit = upperByte << 8 | lowerByte;

    final boolean isSurrogate = unit >= 0xd800 && unit <= 0xdfff;
    if (!isSurrogate) {
        if (unit != ch)
            return false;
        offset += 2;
        return true;
    }

    // a pair has to open with a high surrogate (0xd800-0xdbff)
    if (unit > 0xdbff)
        throw new EncodingException("UTF 16 LE encoding error: should never happen");

    final int nextUpperByte = XMLDoc[offset + 3] & 0xff;
    final int nextLowerByte = XMLDoc[offset + 2] & 0xff;
    final int trailing = nextUpperByte << 8 | nextLowerByte;
    // ... and close with a low surrogate (0xdc00-0xdfff)
    if (trailing < 0xdc00 || trailing > 0xdfff)
        throw new EncodingException("UTF 16 LE encoding error: should never happen");

    final int codePoint = ((unit - 0xd800) << 10) + (trailing - 0xdc00) + 0x10000;
    if (codePoint != ch)
        return false;
    offset += 4;
    return true;
}
final public char decode(int offset){
/**
 * Consumes the UTF-16BE character at {@code offset} only when it equals
 * {@code ch}.
 *
 * <p>On a match {@code offset} moves forward by two bytes for a single code
 * unit or by four bytes for a surrogate pair; on a mismatch {@code offset} is
 * left where it was.
 *
 * @param ch the code point the character at {@code offset} is compared with
 * @return {@code true} if the character matched and was skipped
 * @throws EncodingException if the unit at {@code offset} is a low surrogate, or
 *         a high surrogate is not followed by a low surrogate
 * @throws EOFException declared for callers; not thrown directly by this body
 * @throws ParseException declared for callers; not thrown directly by this body
 */
final public boolean skipChar(int ch) throws EOFException, ParseException, EncodingException {
    // big-endian: the byte at offset is the most significant one
    final int upperByte = XMLDoc[offset] & 0xff;
    final int lowerByte = XMLDoc[offset + 1] & 0xff;
    final int unit = upperByte << 8 | lowerByte;

    final boolean isSurrogate = unit >= 0xd800 && unit <= 0xdfff;
    if (!isSurrogate) {
        if (unit != ch)
            return false;
        offset += 2;
        return true;
    }

    // a pair has to open with a high surrogate (0xd800-0xdbff)
    if (unit > 0xdbff)
        throw new EncodingException("UTF 16 BE encoding error: should never happen");

    final int nextUpperByte = XMLDoc[offset + 2] & 0xff;
    final int nextLowerByte = XMLDoc[offset + 3] & 0xff;
    final int trailing = nextUpperByte << 8 | nextLowerByte;
    // ... and close with a low surrogate (0xdc00-0xdfff)
    if (trailing < 0xdc00 || trailing > 0xdfff)
        throw new EncodingException("UTF 16 BE encoding error: should never happen");

    final int codePoint = ((unit - 0xd800) << 10) + (trailing - 0xdc00) + 0x10000;
    if (codePoint != ch)
        return false;
    offset += 4;
    return true;
}
final public char decode(int offset){
throw new EncodingException("Document is zero sized "); if (XMLDoc[offset] == -2) { increment = 2; r = new UTF16BEReader(); } else throw new EncodingException("Unknown Character encoding: should be 0xff 0xfe"); } else if (XMLDoc[offset] == -1) { increment = 2; r = new UTF16LEReader(); } else throw new EncodingException("Unknown Character encoding: not UTF-16LE"); } else if (XMLDoc[offset] == -17){ if (XMLDoc[offset+1] == -69 && XMLDoc[offset+2]==-65){ throw new EncodingException("Unknown Character encoding: not UTF-8"); throw new EncodingException("Unknown Character encoding: not UTF-16BE");
throw new EncodingException("Document is zero sized "); if (XMLDoc[offset] == -2) { increment = 2; r = new UTF16BEReader(); } else throw new EncodingException("Unknown Character encoding: should be 0xff 0xfe"); } else if (XMLDoc[offset] == -1) { increment = 2; r = new UTF16LEReader(); } else throw new EncodingException("Unknown Character encoding: not UTF-16LE"); } else if (XMLDoc[offset] == -17){ if (XMLDoc[offset+1] == -69 && XMLDoc[offset+2]==-65){ throw new EncodingException("Unknown Character encoding: not UTF-8"); throw new EncodingException("Unknown Character encoding: not UTF-16BE");
throw new EncodingException("Document is zero sized "); if (XMLDoc[offset] == -2) { increment = 2; r = new UTF16BEReader(); } else throw new EncodingException("Unknown Character encoding: should be 0xff 0xfe"); } else if (XMLDoc[offset] == -1) { increment = 2; r = new UTF16LEReader(); } else throw new EncodingException("Unknown Character encoding: not UTF-16LE"); } else if (XMLDoc[offset] == -17){ if (XMLDoc[offset+1] == -69 && XMLDoc[offset+2]==-65){ throw new EncodingException("Unknown Character encoding: not UTF-8"); throw new EncodingException("Unknown Character encoding: not UTF-16BE");
if (singleByteEncoding) { if (must_utf_8) throw new EncodingException( "Can't switch from UTF-8" + formatLineNumber()); if (!singleByteEncoding) { if (!BOM_detected) throw new EncodingException( "BOM not detected for UTF-16" + formatLineNumber());
if (singleByteEncoding) { if (must_utf_8) throw new EncodingException( "Can't switch from UTF-8" + formatLineNumber()); if (!singleByteEncoding) { if (!BOM_detected) throw new EncodingException( "BOM not detected for UTF-16" + formatLineNumber());
if (singleByteEncoding) { if (must_utf_8) throw new EncodingException( "Can't switch from UTF-8" + formatLineNumber()); if (!singleByteEncoding) { if (!BOM_detected) throw new EncodingException( "BOM not detected for UTF-16" + formatLineNumber());