/**
 * Decodes the given bytes into a string.
 *
 * @param bs
 *            the raw bytes to decode
 * @param encoding
 *            the charset to decode with; if null, the charset is guessed from the bytes, and UTF-8 is used
 *            when the guess yields null as well
 * @return the decoded string
 * @throws UnsupportedEncodingException
 *             declared because guess( byte[] ) declares it
 */
private static String getString( byte[] bs, Charset encoding ) throws UnsupportedEncodingException {
    // only consult the guesser when the caller did not specify a charset
    Charset charset = encoding != null ? encoding : guess( bs );
    if ( charset == null ) {
        // neither given nor guessable: fall back to UTF-8
        charset = Charset.forName( "UTF-8" );
    }
    return new String( bs, charset );
}
/**
 * Guesses the encoding of the given bytes. Counts how often each byte value of 128 or above occurs and hands
 * that histogram to the map based guess overload.
 *
 * @param bs
 *            the bytes to analyse
 * @return the guessed encoding, or null
 * @throws UnsupportedEncodingException
 *             declared for callers; presumably raised when a required charset is not available in the JRE
 *             (cannot be confirmed from this method alone)
 */
public static Charset guess( byte[] bs ) throws UnsupportedEncodingException {
    TreeMap<Integer, Integer> map = new TreeMap<Integer, Integer>();
    for ( byte b : bs ) {
        // Java bytes are signed: mask to obtain the unsigned value 0..255, so the histogram keys are the
        // same ones guess( InputStream ) produces (InputStream.read() already returns unsigned values).
        // A plain (int) cast would sign-extend and store high bytes under negative keys.
        int value = b & 0xff;
        if ( value < 128 ) {
            continue;
        }
        Integer count = map.get( value );
        map.put( value, count == null ? 1 : count + 1 );
    }
    return guess( map );
}
/**
 * Guesses the encoding of the data in the given stream. The stream is read until its end, one byte at a time;
 * the occurrences of every byte value of 128 or above are counted and the resulting histogram is handed to
 * the map based guess overload. The stream is not closed by this method.
 *
 * @param in
 *            the stream to analyse
 * @return the guessed encoding, or null, if none was determined
 * @throws IOException
 *             if reading from the stream fails
 */
public static Charset guess( InputStream in ) throws IOException {
    TreeMap<Integer, Integer> histogram = new TreeMap<Integer, Integer>();
    for ( int value = in.read(); value != -1; value = in.read() ) {
        // values below 128 are not counted
        if ( value >= 128 ) {
            Integer seen = histogram.get( value );
            histogram.put( value, seen == null ? 1 : seen + 1 );
        }
    }
    return guess( histogram );
}
/**
 * Logs the guessed encoding of the given file.
 *
 * For files whose name ends with ".dbf" (case insensitive), the first 32 bytes are skipped and the stream is
 * then advanced past the first byte with value 13 before guessing (presumably to skip the dBASE header, which
 * ends with a 0x0D terminator - confirm against the dbf specification).
 *
 * @param s
 *            the name of the file to analyse
 * @throws IOException
 *             if the file cannot be opened or read
 */
private static void printSummary( String s ) throws IOException {
    boolean dbf = s.toLowerCase().endsWith( ".dbf" );
    BufferedInputStream in = new BufferedInputStream( new FileInputStream( s ) );
    try {
        if ( dbf ) {
            if ( in.skip( 32 ) != 32 ) {
                LOG.warn( "Could not skip 32 bytes, is the dbf broken?" );
            }
            // consume everything up to and including the first byte with value 13
            int b;
            while ( ( b = in.read() ) != -1 ) {
                if ( b == 13 ) {
                    break;
                }
            }
        }
        LOG.info( "Encoding for '" + s + "': " + guess( in ) );
    } finally {
        // always release the file handle, also when reading or guessing fails
        in.close();
    }
}