/**
 * Creates an {@link ArgumentEncoder} whose value encoder is a {@link LuceneTextValueEncoder}
 * backed by the given analyzer class.
 *
 * <p>The analyzer is instantiated reflectively via its single-argument
 * {@code (Version)} constructor, invoked with {@code Version.LUCENE_31}.</p>
 *
 * @param position      ordinal position of the argument being encoded
 * @param name          name of the argument / encoder field
 * @param analyzerClass fully qualified class name of the Lucene {@link Analyzer} to use
 * @return a configured encoder for the argument
 * @throws SchemaParseException if the analyzer class cannot be found, accessed, or constructed
 */
public static ArgumentEncoder newTextEncoder(int position, String name, String analyzerClass) throws SchemaParseException {
  ArgumentEncoder argEncoder = new ArgumentEncoder(position, name);
  LuceneTextValueEncoder textEncoder = new LuceneTextValueEncoder(name);
  Analyzer analyzer;
  try {
    analyzer = (Analyzer) Class.forName(analyzerClass)
        .getConstructor(Version.class)
        .newInstance(Version.LUCENE_31);
  } catch (ClassNotFoundException e) {
    throw new SchemaParseException("Can't find analyzer class " + analyzerClass, e);
  } catch (NoSuchMethodException e) {
    throw new SchemaParseException("Can't find constructor for analyzer class " + analyzerClass, e);
  } catch (InstantiationException e) {
    throw new SchemaParseException("Can't construct analyzer for class " + analyzerClass, e);
  } catch (IllegalAccessException e) {
    throw new SchemaParseException("Can't access analyzer class " + analyzerClass, e);
  } catch (InvocationTargetException e) {
    // The analyzer's constructor itself threw.
    throw new SchemaParseException("Can't construct analyzer object " + analyzerClass, e);
  }
  textEncoder.setAnalyzer(analyzer);
  argEncoder.encoder = textEncoder;
  return argEncoder;
}
// Verifies LuceneTextValueEncoder behavior with a whitespace analyzer.
// NOTE(review): this method appears truncated/garbled in this view — `builder` and `i`
// are used below without any visible declaration, and no closing brace for the method
// is visible. Compare against the upstream test source before editing further.
@Test public void testLuceneEncoding() throws Exception {
  LuceneTextValueEncoder enc = new LuceneTextValueEncoder("text");
  enc.setAnalyzer(new WhitespaceAnalyzer(Version.LUCENE_46));
  Vector v1 = new DenseVector(200);
  // Encode a multi-token string, then an empty string.
  enc.addToVector("test1 and more", v1);
  enc.flush(1, v1);
  enc.addToVector("", v1);
  enc.flush(1, v1);
  // NOTE(review): asserting a zero norm/max here after non-empty input looks suspect —
  // presumably the original test used a fresh vector for the empty-input case; confirm
  // against the complete source.
  assertEquals(0.0, v1.norm(1), 0);
  assertEquals(0.0, v1.maxValue(), 0);
  // NOTE(review): `builder` and `i` are not declared in this visible span — the loop
  // that builds a long token string has likely been cut off by extraction.
  builder.append("token_").append(i).append(' ');
  enc.addToVector(builder.toString(), v1);
  enc.flush(1, v1);
@Override protected void setup(Context context) throws IOException, InterruptedException { Configuration conf = context.getConfiguration(); sequentialVectors = conf.getBoolean(USE_SEQUENTIAL, false); namedVectors = conf.getBoolean(USE_NAMED_VECTORS, false); String analyzerName = conf.get(ANALYZER_NAME, StandardAnalyzer.class.getName()); Analyzer analyzer; try { analyzer = AnalyzerUtils.createAnalyzer(analyzerName); } catch (ClassNotFoundException e) { //TODO: hmmm, don't like this approach throw new IOException("Unable to create Analyzer for name: " + analyzerName, e); } String encoderName = conf.get(ENCODER_FIELD_NAME, "text"); cardinality = conf.getInt(CARDINALITY, 5000); String encClass = conf.get(ENCODER_CLASS); encoder = ClassUtils.instantiateAs(encClass, FeatureVectorEncoder.class, new Class[]{String.class}, new Object[]{encoderName}); if (encoder instanceof LuceneTextValueEncoder) { ((LuceneTextValueEncoder) encoder).setAnalyzer(analyzer); } }
/**
 * Tokenizes a string using the simplest method. This should be over-ridden for more subtle
 * tokenization.
 */
@Override
protected Iterable<String> tokenize(CharSequence originalForm) {
  CharSequenceReader reader = new CharSequenceReader(originalForm);
  try {
    TokenStream stream = analyzer.tokenStream(getName(), reader);
    // Ensure the term attribute is registered before iteration begins.
    stream.addAttribute(CharTermAttribute.class);
    return new LuceneTokenIterable(stream, false);
  } catch (IOException ioe) {
    // Tokenization failures are not recoverable here; surface as unchecked.
    throw new IllegalStateException(ioe);
  }
}
@Override protected void setup(Context context) throws IOException, InterruptedException { Configuration conf = context.getConfiguration(); sequentialVectors = conf.getBoolean(USE_SEQUENTIAL, false); namedVectors = conf.getBoolean(USE_NAMED_VECTORS, false); String analyzerName = conf.get(ANALYZER_NAME, StandardAnalyzer.class.getName()); Analyzer analyzer; try { analyzer = AnalyzerUtils.createAnalyzer(analyzerName); } catch (ClassNotFoundException e) { //TODO: hmmm, don't like this approach throw new IOException("Unable to create Analyzer for name: " + analyzerName, e); } String encoderName = conf.get(ENCODER_FIELD_NAME, "text"); cardinality = conf.getInt(CARDINALITY, 5000); String encClass = conf.get(ENCODER_CLASS); encoder = ClassUtils.instantiateAs(encClass, FeatureVectorEncoder.class, new Class[]{String.class}, new Object[]{encoderName}); if (encoder instanceof LuceneTextValueEncoder) { ((LuceneTextValueEncoder) encoder).setAnalyzer(analyzer); } }
/**
 * Tokenizes a string using the simplest method. This should be over-ridden for more subtle
 * tokenization.
 */
@Override
protected Iterable<String> tokenize(CharSequence originalForm) {
  try {
    // Register the term attribute up front so the iterable can read tokens.
    TokenStream tokens = analyzer.tokenStream(getName(), new CharSequenceReader(originalForm));
    tokens.addAttribute(CharTermAttribute.class);
    return new LuceneTokenIterable(tokens, false);
  } catch (IOException e) {
    // Wrap the checked exception; callers cannot recover from a tokenizer failure.
    throw new IllegalStateException(e);
  }
}
@Override protected void setup(Context context) throws IOException, InterruptedException { Configuration conf = context.getConfiguration(); sequentialVectors = conf.getBoolean(USE_SEQUENTIAL, false); namedVectors = conf.getBoolean(USE_NAMED_VECTORS, false); String analyzerName = conf.get(ANALYZER_NAME, StandardAnalyzer.class.getName()); Analyzer analyzer; try { analyzer = AnalyzerUtils.createAnalyzer(analyzerName); } catch (ClassNotFoundException e) { //TODO: hmmm, don't like this approach throw new IOException("Unable to create Analyzer for name: " + analyzerName, e); } String encoderName = conf.get(ENCODER_FIELD_NAME, "text"); cardinality = conf.getInt(CARDINALITY, 5000); String encClass = conf.get(ENCODER_CLASS); encoder = ClassUtils.instantiateAs(encClass, FeatureVectorEncoder.class, new Class[]{String.class}, new Object[]{encoderName}); if (encoder instanceof LuceneTextValueEncoder) { ((LuceneTextValueEncoder) encoder).setAnalyzer(analyzer); } }
/**
 * Tokenizes a string using the simplest method. This should be over-ridden for more subtle
 * tokenization.
 */
// NOTE(review): unlike the other tokenize() implementations in this source, this one does
// not catch IOException from tokenStream() — presumably the Lucene version compiled against
// here no longer declares it; confirm before unifying the implementations.
@Override
protected Iterable<String> tokenize(CharSequence originalForm) {
  CharSequenceReader reader = new CharSequenceReader(originalForm);
  TokenStream stream = analyzer.tokenStream(getName(), reader);
  // Register the term attribute before handing the stream to the iterable.
  stream.addAttribute(CharTermAttribute.class);
  return new LuceneTokenIterable(stream, false);
}