/**
 * Captures the settings for one corpus.
 *
 * @param language value assigned to the {@code language} field
 * @param indexField value assigned to the {@code indexedField} field
 * @param storeField value assigned to the {@code storedField} field
 * @param fieldType field type whose {@code getAnalyzer()} result becomes the
 *        {@code analyzer} field; must not be {@code null}
 * @param fst value assigned to the {@code fst} field
 */
protected CorpusCreationInfo(String language, String indexField, String storeField, FieldType fieldType, File fst) {
    this.analyzer = fieldType.getAnalyzer();
    this.fst = fst;
    this.storedField = storeField;
    this.indexedField = indexField;
    this.language = language;
}
/**
 * Builds a corpus description from the given language, field names, field type and FST file.
 *
 * @param language stored as-is in {@code this.language}
 * @param indexField stored as-is in {@code this.indexedField}
 * @param storeField stored as-is in {@code this.storedField}
 * @param fieldType source of the analyzer: {@code fieldType.getAnalyzer()} is stored in
 *        {@code this.analyzer}; a {@code null} value causes a {@link NullPointerException}
 * @param fst stored as-is in {@code this.fst}
 */
protected CorpusCreationInfo(String language, String indexField, String storeField, FieldType fieldType, File fst) {
    // Resolve the analyzer first; nothing else in this constructor can fail.
    this.analyzer = fieldType.getAnalyzer();

    this.language = language;
    this.fst = fst;
    this.indexedField = indexField;
    this.storedField = storeField;
}
/**
 * Returns the analyzer to use for the given field.
 *
 * @param fieldName name of the field to look up
 * @return the entry registered in {@code analyzers} for {@code fieldName}, or, when there is
 *         none, the analyzer of the type returned by {@code getDynamicFieldType(fieldName)}
 */
protected Analyzer getAnalyzer(String fieldName) {
    final Analyzer registered = analyzers.get(fieldName);
    if (registered == null) {
        // No explicit entry: fall back to the dynamic field type.
        return getDynamicFieldType(fieldName).getAnalyzer();
    }
    return registered;
}
/**
 * Creates the corpus information from the given settings.
 * <p>
 * Besides copying the arguments into the corresponding fields, this constructor
 * derives {@code analyzer} from {@code fieldType.getAnalyzer()},
 * {@code taggingAnalyzer} from {@code fieldType.getQueryAnalyzer()} and
 * {@code fstDate} from the last-modified time of {@code fst} ({@code null} when
 * {@code fst.isFile()} is {@code false}).
 *
 * @param language value assigned to the {@code language} field
 * @param indexField value assigned to the {@code indexedField} field
 * @param storeField value assigned to the {@code storedField} field
 * @param fieldType field type providing the index and query analyzers; must not be {@code null}
 * @param fst the FST file; must not be {@code null} because {@code fst.isFile()} is called
 * @param allowCreation value assigned to the {@code allowCreation} field
 */
protected CorpusInfo(String language, String indexField, String storeField, FieldType fieldType, File fst, boolean allowCreation){ this.language = language; this.indexedField = indexField; this.storedField = storeField; this.fst = fst; this.allowCreation = allowCreation; this.analyzer = fieldType.getAnalyzer(); this.taggingAnalyzer = fieldType.getQueryAnalyzer(); this.fstDate = fst.isFile() ? new Date(fst.lastModified()) : null; } /**
/**
 * Builds a lookup table from field name to analyzer.
 *
 * @return a new map holding, for every field returned by {@code getFields()}, the analyzer of
 *         that field's type keyed by the field's name
 */
protected HashMap<String,Analyzer> analyzerCache() {
    final HashMap<String,Analyzer> analyzersByField = new HashMap<String,Analyzer>();
    for (SchemaField schemaField : getFields().values()) {
        analyzersByField.put(schemaField.getName(), schemaField.getType().getAnalyzer());
    }
    return analyzersByField;
}
protected void checkAllowLeadingWildcards() { boolean allow = false; for (Entry<String, FieldType> e : schema.getFieldTypes().entrySet()) { Analyzer a = e.getValue().getAnalyzer(); if (a instanceof TokenizerChain) { // examine the indexing analysis chain if it supports leading wildcards TokenizerChain tc = (TokenizerChain)a; TokenFilterFactory[] factories = tc.getTokenFilterFactories(); for (TokenFilterFactory factory : factories) { if (factory instanceof ReversedWildcardFilterFactory) { allow = true; leadingWildcards.put(e.getKey(), (ReversedWildcardFilterFactory)factory); } } } } // XXX should be enabled on a per-field basis if (allow) { setAllowLeadingWildcard(true); } }
/**
 * Adds the configured XPath fields to the index configuration, using information about each
 * field drawn from the schema.
 * <p>
 * For every entry of {@code xpathFieldConfig} (key = field name, value = passed through to
 * {@code XPathField} as its second argument) the schema field is looked up and an
 * {@code XPathField} carrying the field type's analyzer and the field's stored flag is
 * registered with {@code indexConfig}.
 *
 * @throws SolrException with {@code ErrorCode.SERVER_ERROR} if the schema yields no field, or
 *         no field type, for a configured name
 */
private void addXPathFields() {
    for (Entry<String,String> f : xpathFieldConfig) {
        final String fieldName = f.getKey();
        SchemaField field = schema.getField(fieldName);
        // Guard the lookup result itself: dereferencing a missing field would fail with a
        // NullPointerException before the descriptive error below could be raised.
        FieldType fieldType = (field == null) ? null : field.getType();
        if (fieldType == null) {
            throw new SolrException(ErrorCode.SERVER_ERROR,
                    "Field " + fieldName + " declared in lux config, but not defined in schema");
        }
        XPathField xpathField = new XPathField(fieldName, f.getValue(), fieldType.getAnalyzer(),
                field.stored() ? Store.YES : Store.NO, field);
        indexConfig.addField(xpathField);
    }
}
// Fetch the analyzer via getAnalyzer(); when it is a SolrAnalyzer, set its position
// increment gap to positionInc parsed as an int (Integer.parseInt throws
// NumberFormatException for a non-numeric value).
// NOTE(review): fragment - the enclosing method and the closing brace of the if-block are
// outside this view.
Analyzer analyzer = getAnalyzer(); if (analyzer instanceof SolrAnalyzer) { ((SolrAnalyzer)analyzer).setPositionIncrementGap(Integer.parseInt(positionInc));
// Open an IndexWriter on ramDir that analyzes with the field type's analyzer and has no
// limit on field length, then set its merge factor to 300.
// NOTE(review): the boolean `true` is presumably the "create new index" flag - confirm
// against the IndexWriter constructor of the Lucene version in use.
IndexWriter writer = new IndexWriter(ramDir, fieldType.getAnalyzer(), true, IndexWriter.MaxFieldLength.UNLIMITED); writer.setMergeFactor(300);
// For every field name of doc: resolve the field type from the schema, take that type's
// analyzer, fetch all values of the field, and start iterating over them.
// NOTE(review): fragment - the bodies and closing braces of both loops are outside this view.
for (String name : doc.getFieldNames()) { FieldType ft = schema.getFieldType(name); Analyzer analyzer = ft.getAnalyzer(); Collection<Object> vals = doc.getFieldValues(name); for (Object val : vals) {
public TokenizeText(CommandBuilder builder, Config config, Command parent, Command child, MorphlineContext context) { super(builder, config, parent, child, context); this.inputFieldName = getConfigs().getString(config, "inputField"); this.outputFieldName = getConfigs().getString(config, "outputField"); String solrFieldType = getConfigs().getString(config, "solrFieldType"); Config solrLocatorConfig = getConfigs().getConfig(config, "solrLocator"); SolrLocator locator = new SolrLocator(solrLocatorConfig, context); LOG.debug("solrLocator: {}", locator); IndexSchema schema = locator.getIndexSchema(); FieldType fieldType = schema.getFieldTypeByName(solrFieldType); if (fieldType == null) { throw new MorphlineCompilationException("Missing Solr field type in schema.xml for name: " + solrFieldType, config); } this.analyzer = fieldType.getAnalyzer(); Preconditions.checkNotNull(analyzer); try { // register CharTermAttribute for later (implicit) reuse this.token = analyzer.tokenStream("content", reader).addAttribute(CharTermAttribute.class); } catch (IOException e) { throw new MorphlineCompilationException("Cannot create token stream", config, e); } Preconditions.checkNotNull(token); validateArguments(); }
// Flag the field as the unique key; additionally report the analyzer's position increment
// gap for this field name, but only when it is non-zero.
// NOTE(review): fragment - the closing brace of the if-block and any condition guarding the
// "uniqueKey" entry are outside this view.
field.add("uniqueKey", true); if (ft.getAnalyzer().getPositionIncrementGap(f.getName()) != 0) { field.add("positionIncrementGap", ft.getAnalyzer().getPositionIncrementGap(f.getName()));
// Describe the field type ft: whether it is tokenized, its implementing class name, and the
// getAnalyzerInfo() summaries of its index analyzer (getAnalyzer) and query analyzer
// (getQueryAnalyzer). The finished entry is registered in types under the type's name.
field.add("tokenized", ft.isTokenized() ); field.add("className", ft.getClass().getName()); field.add("indexAnalyzer", getAnalyzerInfo(ft.getAnalyzer())); field.add("queryAnalyzer", getAnalyzerInfo(ft.getQueryAnalyzer())); types.add( ft.getTypeName(), field );
// Run the request's field value through the field type's analyzer (getAnalyzer) and store
// the resulting token breakdown in analyzeResults under the key "index". termsToMatch is
// handed to the AnalysisContext unchanged.
AnalysisContext context = new AnalysisContext(fieldName, fieldType, fieldType.getAnalyzer(), termsToMatch); NamedList analyzedTokens = analyzeValue(analysisRequest.getFieldValue(), context); analyzeResults.add("index", analyzedTokens);
// When schemaField is present, take the index analyzer (xmlAnalyzer) and the query analyzer
// (xmlQueryAnalyzer) from its type; if an index analyzer exists, look at the analyzer of the
// destination field's type and branch on it being absent.
// NOTE(review): fragment - the declaration of xmlAnalyzer, the body of the innermost if and
// all closing braces are outside this view.
Analyzer xmlQueryAnalyzer = null; if (schemaField != null) { xmlAnalyzer = schemaField.getType().getAnalyzer(); xmlQueryAnalyzer = schemaField.getType().getQueryAnalyzer(); if (xmlAnalyzer != null) { Analyzer analyzer = destination.getType().getAnalyzer(); if (analyzer == null) {
@Test public void testSirenFieldAnalyzer() throws Exception { final IndexSchema schema = h.getCore().getLatestSchema(); final SchemaField ntriple = schema.getField(JSON_FIELD); final FieldType tmp = ntriple.getType(); assertTrue(tmp.getAnalyzer() instanceof TokenizerChain); final TokenizerChain ts = (TokenizerChain) tmp.getAnalyzer(); assertNotNull(ts.getTokenizerFactory()); assertTrue(ts.getTokenizerFactory() instanceof ExtendedJsonTokenizerFactory); // 3 filters for index analyzer assertNotNull(ts.getTokenFilterFactories()); assertEquals(3, ts.getTokenFilterFactories().length); assertTrue(ts.getTokenFilterFactories()[0] instanceof DatatypeAnalyzerFilterFactory); assertTrue(ts.getTokenFilterFactories()[1] instanceof PositionAttributeFilterFactory); assertTrue(ts.getTokenFilterFactories()[2] instanceof SirenPayloadFilterFactory); }
/**
 * Checks the datatype analyzers registered on the first filter factory of the
 * {@code JSON_FIELD} index analyzer (latest schema): nine entries in total, with the expected
 * tokenizers for the JSON field and xsd:string datatypes and the expected numeric settings
 * for xsd:int.
 */
@Test
public void testSirenFieldDatatypeAnalyzer() throws Exception {
    final String jsonFieldDatatype = "http://json.org/field";
    final String xsdString = "http://www.w3.org/2001/XMLSchema#string";
    final String xsdInt = "http://www.w3.org/2001/XMLSchema#int";

    final FieldType jsonType = h.getCore().getLatestSchema().getField(JSON_FIELD).getType();
    TokenizerChain chain = (TokenizerChain) jsonType.getAnalyzer();

    assertTrue(chain.getTokenFilterFactories()[0] instanceof DatatypeAnalyzerFilterFactory);
    final DatatypeAnalyzerFilterFactory datatypeFilter =
            (DatatypeAnalyzerFilterFactory) chain.getTokenFilterFactories()[0];
    assertNotNull(datatypeFilter.getDatatypeAnalyzers());
    assertEquals(9, datatypeFilter.getDatatypeAnalyzers().size());

    // JSON field names
    assertNotNull(datatypeFilter.getDatatypeAnalyzers().get(jsonFieldDatatype));
    chain = (TokenizerChain) datatypeFilter.getDatatypeAnalyzers().get(jsonFieldDatatype);
    assertNotNull(chain.getTokenizerFactory());
    assertTrue(chain.getTokenizerFactory() instanceof WhitespaceTokenizerFactory);

    // xsd:string
    assertNotNull(datatypeFilter.getDatatypeAnalyzers().get(xsdString));
    chain = (TokenizerChain) datatypeFilter.getDatatypeAnalyzers().get(xsdString);
    assertNotNull(chain.getTokenizerFactory());
    assertTrue(chain.getTokenizerFactory() instanceof UAX29URLEmailTokenizerFactory);

    // xsd:int
    assertNotNull(datatypeFilter.getDatatypeAnalyzers().get(xsdInt));
    assertTrue(datatypeFilter.getDatatypeAnalyzers().get(xsdInt) instanceof IntNumericAnalyzer);
    final IntNumericAnalyzer intAnalyzer =
            (IntNumericAnalyzer) datatypeFilter.getDatatypeAnalyzers().get(xsdInt);
    assertEquals(8, intAnalyzer.getPrecisionStep());
    assertEquals(32, intAnalyzer.getNumericParser().getValueSize());
    assertEquals(NumericType.INT, intAnalyzer.getNumericParser().getNumericType());
}
/**
 * Checks the datatype analyzers registered on the first filter factory of the
 * {@code JSON_FIELD} index analyzer: nine entries in total, with the expected tokenizers for
 * the JSON field and xsd:string datatypes and the expected numeric settings for xsd:int.
 */
@Test
public void testSirenFieldDatatypeAnalyzer() throws Exception {
    final String jsonFieldUri = "http://json.org/field";
    final String stringUri = "http://www.w3.org/2001/XMLSchema#string";
    final String intUri = "http://www.w3.org/2001/XMLSchema#int";

    final FieldType jsonFieldType = h.getCore().getSchema().getField(JSON_FIELD).getType();
    TokenizerChain tokenizerChain = (TokenizerChain) jsonFieldType.getAnalyzer();

    assertTrue(tokenizerChain.getTokenFilterFactories()[0] instanceof DatatypeAnalyzerFilterFactory);
    final DatatypeAnalyzerFilterFactory factory =
            (DatatypeAnalyzerFilterFactory) tokenizerChain.getTokenFilterFactories()[0];
    assertNotNull(factory.getDatatypeAnalyzers());
    assertEquals(9, factory.getDatatypeAnalyzers().size());

    // datatype of JSON field names
    assertNotNull(factory.getDatatypeAnalyzers().get(jsonFieldUri));
    tokenizerChain = (TokenizerChain) factory.getDatatypeAnalyzers().get(jsonFieldUri);
    assertNotNull(tokenizerChain.getTokenizerFactory());
    assertTrue(tokenizerChain.getTokenizerFactory() instanceof WhitespaceTokenizerFactory);

    // string datatype
    assertNotNull(factory.getDatatypeAnalyzers().get(stringUri));
    tokenizerChain = (TokenizerChain) factory.getDatatypeAnalyzers().get(stringUri);
    assertNotNull(tokenizerChain.getTokenizerFactory());
    assertTrue(tokenizerChain.getTokenizerFactory() instanceof UAX29URLEmailTokenizerFactory);

    // int datatype
    assertNotNull(factory.getDatatypeAnalyzers().get(intUri));
    assertTrue(factory.getDatatypeAnalyzers().get(intUri) instanceof IntNumericAnalyzer);
    final IntNumericAnalyzer numericAnalyzer =
            (IntNumericAnalyzer) factory.getDatatypeAnalyzers().get(intUri);
    assertEquals(8, numericAnalyzer.getPrecisionStep());
    assertEquals(32, numericAnalyzer.getNumericParser().getValueSize());
    assertEquals(NumericType.INT, numericAnalyzer.getNumericParser().getNumericType());
}
@Test public void testConciseSirenFieldAnalyzer() throws Exception { final IndexSchema schema = h.getCore().getLatestSchema(); final SchemaField json = schema.getField("concise"); final FieldType tmp = json.getType(); assertTrue(tmp.getAnalyzer() instanceof TokenizerChain); final TokenizerChain ts = (TokenizerChain) tmp.getAnalyzer(); assertNotNull(ts.getTokenizerFactory()); assertTrue(ts.getTokenizerFactory() instanceof ConciseJsonTokenizerFactory); // 4 filters for index analyzer assertNotNull(ts.getTokenFilterFactories()); assertEquals(4, ts.getTokenFilterFactories().length); assertTrue(ts.getTokenFilterFactories()[0] instanceof DatatypeAnalyzerFilterFactory); assertTrue(ts.getTokenFilterFactories()[1] instanceof PathEncodingFilterFactory); assertTrue(ts.getTokenFilterFactories()[2] instanceof PositionAttributeFilterFactory); assertTrue(ts.getTokenFilterFactories()[3] instanceof SirenPayloadFilterFactory); }
@Test public void testSirenFieldAnalyzer() throws Exception { final IndexSchema schema = h.getCore().getSchema(); final SchemaField ntriple = schema.getField(JSON_FIELD); final FieldType tmp = ntriple.getType(); assertTrue(tmp.getAnalyzer() instanceof TokenizerChain); final TokenizerChain ts = (TokenizerChain) tmp.getAnalyzer(); assertNotNull(ts.getTokenizerFactory()); assertTrue(ts.getTokenizerFactory() instanceof JsonTokenizerFactory); // 3 filters for index analyzer assertNotNull(ts.getTokenFilterFactories()); assertEquals(3, ts.getTokenFilterFactories().length); assertTrue(ts.getTokenFilterFactories()[0] instanceof DatatypeAnalyzerFilterFactory); assertTrue(ts.getTokenFilterFactories()[1] instanceof PositionAttributeFilterFactory); assertTrue(ts.getTokenFilterFactories()[2] instanceof SirenPayloadFilterFactory); // no query analyzer assertNull(tmp.getQueryAnalyzer()); }