/**
 * Adds a value to the wrapped Solr record when the tag names a recognised WCT field.
 *
 * The recognised names are the six {@code WctFields} constants listed in the body;
 * a tag that equals none of them leaves the record untouched.
 *
 * @param tag   name of the field being set; compared with {@code equals}
 * @param value value added to the matching field
 */
public void setTag( String tag, String value ) {
    final String[] recognisedFields = {
        WctFields.WCT_INSTANCE_ID,
        WctFields.WCT_TARGET_ID,
        WctFields.WCT_HARVEST_DATE,
        WctFields.WCT_AGENCY,
        WctFields.WCT_COLLECTIONS,
        WctFields.WCT_SUBJECTS
    };
    // First match wins, mirroring an if / else-if chain over the same names.
    for( String field : recognisedFields ) {
        if( tag.equals( field ) ) {
            this.solr.addField( field, value );
            return;
        }
    }
}
/**
 * Stores {@code value} on the wrapped Solr record under {@code tag}, provided the tag
 * is one of the WCT field names checked below. Any other tag is ignored.
 *
 * @param tag   field name to test and, if recognised, to add the value under
 * @param value value to add
 */
public void setTag( String tag, String value ) {
    // Short-circuit evaluation keeps the comparisons in their original order.
    final boolean recognised =
            tag.equals( WctFields.WCT_INSTANCE_ID )
         || tag.equals( WctFields.WCT_TARGET_ID )
         || tag.equals( WctFields.WCT_HARVEST_DATE )
         || tag.equals( WctFields.WCT_AGENCY )
         || tag.equals( WctFields.WCT_COLLECTIONS )
         || tag.equals( WctFields.WCT_SUBJECTS );

    if( recognised ) {
        // tag is equal to the matched constant, so it serves as the field name.
        this.solr.addField( tag, value );
    }
}
/**
 * Records a failure in the {@code SolrFields.PARSE_ERROR} field, with a caller-supplied hint.
 *
 * The stored text is the exception's class name, a space, the hint, then
 * {@code ": "} followed by the exception message.
 *
 * @param hint text placed between the exception class name and its message
 * @param e    the failure to record
 */
public void addParseException(String hint, Throwable e) {
    final String description =
            String.format("%s %s: %s", e.getClass().getName(), hint, e.getMessage());
    addField(SolrFields.PARSE_ERROR, description);
}
/**
 * Records a failure in the {@code SolrFields.PARSE_ERROR} field.
 *
 * The stored text is the exception's class name, then {@code ": "} and the
 * exception message.
 *
 * @param e the failure to record
 */
public void addParseException(Throwable e) {
    final String exceptionType = e.getClass().getName();
    final String detail = e.getMessage();
    addField(SolrFields.PARSE_ERROR, exceptionType + ": " + detail);
}
/**
 * Adds a {@code SolrFields.PARSE_ERROR} entry describing the given failure,
 * formatted as {@code <exception class name>: <exception message>}.
 *
 * @param e the failure to describe
 */
public void addParseException(Throwable e) {
    addField(
            SolrFields.PARSE_ERROR,
            String.format("%s: %s", e.getClass().getName(), e.getMessage()));
}
/**
 * Adds a {@code SolrFields.PARSE_ERROR} entry describing the given failure,
 * formatted as {@code <exception class name> <hint>: <exception message>}.
 *
 * @param hint text inserted after the exception class name
 * @param e    the failure to describe
 */
public void addParseException(String hint, Throwable e) {
    final StringBuilder entry = new StringBuilder();
    entry.append(e.getClass().getName())
         .append(" ")
         .append(hint)
         .append(": ")
         .append(e.getMessage());
    addField(SolrFields.PARSE_ERROR, entry.toString());
}
/**
 * Copies the WCT metadata fields held by the wrapped record onto another record.
 *
 * The fields are copied in the order target id, title, harvest date, collections,
 * agency, subjects, each converted with {@code toString()}.
 *
 * A field whose lookup yields {@code null} is skipped instead of being dereferenced,
 * so a partially populated source record no longer aborts the copy with a
 * NullPointerException.
 * NOTE(review): assumes {@code getFieldValue} returns null for an absent field - confirm.
 *
 * @param in record that receives the copied values
 */
public void addWctMetadata( SolrRecord in ) {
    final String[] wctFields = {
        WctFields.WCT_TARGET_ID,
        WctFields.WCT_TITLE,
        WctFields.WCT_HARVEST_DATE,
        WctFields.WCT_COLLECTIONS,
        WctFields.WCT_AGENCY,
        WctFields.WCT_SUBJECTS
    };
    for( String field : wctFields ) {
        final Object value = this.solr.getFieldValue( field );
        if( value != null ) {
            in.addField( field, value.toString() );
        }
    }
}
/**
 * Copies the WCT metadata fields held by the wrapped record onto another record.
 *
 * The fields are copied in the order target id, title, harvest date, collections,
 * agency, subjects, each converted with {@code toString()}.
 *
 * A field whose lookup yields {@code null} is skipped instead of being dereferenced,
 * so a partially populated source record no longer aborts the copy with a
 * NullPointerException.
 * NOTE(review): assumes {@code getFieldValue} returns null for an absent field - confirm.
 *
 * @param in record that receives the copied values
 */
public void addWctMetadata( SolrRecord in ) {
    final String[] wctFields = {
        WctFields.WCT_TARGET_ID,
        WctFields.WCT_TITLE,
        WctFields.WCT_HARVEST_DATE,
        WctFields.WCT_COLLECTIONS,
        WctFields.WCT_AGENCY,
        WctFields.WCT_SUBJECTS
    };
    for( String field : wctFields ) {
        final Object value = this.solr.getFieldValue( field );
        if( value != null ) {
            in.addField( field, value.toString() );
        }
    }
}
@Override public void analyse(String text, SolrRecord solr) { final long start = System.nanoTime(); // Postcode Extractor (based on text extracted by Tika) Matcher pcm = postcodePattern.matcher( text ); Set<String> pcs = new HashSet<String>(); while( pcm.find() ) pcs.add( pcm.group() ); for( String pc : pcs ) { solr.addField( SolrFields.POSTCODE, pc ); String pcd = pc.substring( 0, pc.lastIndexOf( " " ) ); solr.addField( SolrFields.POSTCODE_DISTRICT, pcd ); String location = pcg.getLatLogForPostcodeDistrict( pcd ); if( location != null ) solr.addField( SolrFields.LOCATIONS, location ); } Instrument.timeRel("TextAnalyzers#total", "PostcodeAnalyzer", start); }
@Override public void analyse(String text, SolrRecord solr) { final long start = System.nanoTime(); // Postcode Extractor (based on text extracted by Tika) Matcher pcm = postcodePattern.matcher( text ); Set<String> pcs = new HashSet<String>(); while( pcm.find() ) pcs.add( pcm.group() ); for( String pc : pcs ) { solr.addField( SolrFields.POSTCODE, pc ); String pcd = pc.substring( 0, pc.lastIndexOf( " " ) ); solr.addField( SolrFields.POSTCODE_DISTRICT, pcd ); String location = pcg.getLatLogForPostcodeDistrict( pcd ); if( location != null ) solr.addField( SolrFields.LOCATIONS, location ); } Instrument.timeRel("TextAnalyzers#total", "PostcodeAnalyzer", start); }
/** * Apply the rule to the given name. If it matches, add the content from the templates to solr. * @param name ARC path * @param solr destination for template values. * @return true if the rule was applies, else false. */ public boolean apply(String name, SolrRecord solr) { Matcher matcher = pattern.matcher(name); if (!matcher.matches()) { return false; } // Got a match. Apply all templates for (FieldTemplate ft: templates) { try { solr.addField(ft.field, matcher.replaceAll(ft.template)); } catch (Exception e) { log.warn(String.format( "Unable to apply replaceAll to '%s' with matching pattern '%s' and template '%s:%s': %s", name, pattern.pattern(), ft.field, ft.template, e.getMessage())); } } return true; } }
/** * Apply the rule to the given name. If it matches, add the content from the templates to solr. * @param name ARC path * @param solr destination for template values. * @return true if the rule was applies, else false. */ public boolean apply(String name, SolrRecord solr) { Matcher matcher = pattern.matcher(name); if (!matcher.matches()) { return false; } // Got a match. Apply all templates for (FieldTemplate ft: templates) { try { solr.addField(ft.field, matcher.replaceAll(ft.template)); } catch (Exception e) { log.warn(String.format( "Unable to apply replaceAll to '%s' with matching pattern '%s' and template '%s:%s': %s", name, pattern.pattern(), ft.field, ft.template, e.getMessage())); } } return true; } }
/**
 * Detects the language of the text and records it on the Solr record.
 *
 * A non-null detection result has its language added to
 * {@code SolrFields.CONTENT_LANGUAGE}. An {@code IllegalArgumentException} from
 * the detector is logged and stored on the record as a parse exception.
 *
 * @param text text whose language is to be detected
 * @param solr record that receives the language field or the parse error
 */
@Override
public void analyse(String text, SolrRecord solr) {
    final long startNanos = System.nanoTime();
    try {
        final LanguageResult detected = ld.detect(text);
        if (null != detected) {
            solr.addField(SolrFields.CONTENT_LANGUAGE, detected.getLanguage());
        }
    } catch (IllegalArgumentException failure) {
        final String message =
                "Exception when determining language of this item: " + failure.getMessage();
        log.error(message, failure);
        solr.addParseException(failure);
    }
    Instrument.timeRel("TextAnalyzers#total", "LanguageAnalyzer#total", startNanos);
}
// Add the value of `sentiment` to the record under the SolrFields.SENTIMENT field.
solr.addField( SolrFields.SENTIMENT, sentiment );
@Override public void analyse(String text, SolrRecord solr) { final long start = System.nanoTime(); // Canonicalize the text - strip newlines etc. Pattern whitespace = Pattern.compile( "\\s+" ); Matcher matcher = whitespace.matcher( text ); text = matcher.replaceAll( " " ).toLowerCase().trim(); /* ---------------------------------------------------------- */ // Add SSDeep hash for the text, to spot similar texts. SSDeep ssd = new SSDeep(); FuzzyHash tfh; try { tfh = ssd.fuzzy_hash_buf( text.getBytes( "UTF-8" ) ); solr.addField( SolrFields.SSDEEP_PREFIX + tfh.getBlocksize(), tfh.getHash() ); solr.addField( SolrFields.SSDEEP_PREFIX + ( tfh.getBlocksize() * 2 ), tfh.getHash2() ); // solr.addField( SolrFields.SSDEEP_NGRAM_PREFIX + // tfh.getBlocksize(), tfh.getHash() ); // solr.addField( SolrFields.SSDEEP_NGRAM_PREFIX + ( // tfh.getBlocksize() * 2 ), tfh.getHash2() ); } catch (UnsupportedEncodingException e) { e.printStackTrace(); } Instrument.timeRel("TextAnalyzers#total", "FuzzyHashAnalyzer", start); }
@Override public void analyse(String text, SolrRecord solr) { final long start = System.nanoTime(); // Canonicalize the text - strip newlines etc. Pattern whitespace = Pattern.compile( "\\s+" ); Matcher matcher = whitespace.matcher( text ); text = matcher.replaceAll( " " ).toLowerCase().trim(); /* ---------------------------------------------------------- */ // Add SSDeep hash for the text, to spot similar texts. SSDeep ssd = new SSDeep(); FuzzyHash tfh; try { tfh = ssd.fuzzy_hash_buf( text.getBytes( "UTF-8" ) ); solr.addField( SolrFields.SSDEEP_PREFIX + tfh.getBlocksize(), tfh.getHash() ); solr.addField( SolrFields.SSDEEP_PREFIX + ( tfh.getBlocksize() * 2 ), tfh.getHash2() ); // solr.addField( SolrFields.SSDEEP_NGRAM_PREFIX + // tfh.getBlocksize(), tfh.getHash() ); // solr.addField( SolrFields.SSDEEP_NGRAM_PREFIX + ( // tfh.getBlocksize() * 2 ), tfh.getHash2() ); } catch (UnsupportedEncodingException e) { e.printStackTrace(); } Instrument.timeRel("TextAnalyzers#total", "FuzzyHashAnalyzer", start); }
/**
 * Detects the language of the text and records it on the Solr record, unless
 * this analyser is disabled.
 *
 * When {@code enabled} is false the method returns immediately, before any
 * timing is taken. Otherwise a non-null detection result has its language added
 * to {@code SolrFields.CONTENT_LANGUAGE}; an {@code IllegalArgumentException}
 * from the detector is logged and stored on the record as a parse exception.
 *
 * @param text text whose language is to be detected
 * @param solr record that receives the language field or the parse error
 */
@Override
public void analyse(String text, SolrRecord solr) {
    if (!enabled) {
        return;
    }

    final long startNanos = System.nanoTime();
    try {
        final LanguageResult detected = ld.detect(text);
        if (null != detected) {
            solr.addField(SolrFields.CONTENT_LANGUAGE, detected.getLanguage());
        }
    } catch (IllegalArgumentException failure) {
        final String message =
                "Exception when determining language of this item: " + failure.getMessage();
        log.error(message, failure);
        solr.addParseException(failure);
    }
    Instrument.timeRel("TextAnalyzers#total", "LanguageAnalyzer#total", startNanos);
}
@Override public void analyse(ArchiveRecordHeader header, InputStream tikainput, SolrRecord solr) { final long start = System.nanoTime(); Metadata metadata = new Metadata(); // Also attempt to grab the XML Root NS: if( this.extractXMLRootNamespace ) { ParseRunner parser = new ParseRunner( xrns, tikainput, metadata, solr ); try { TimeLimiter.run(parser, 30000L, false); } catch( Exception e ) { log.error( "WritableSolrRecord.extract(): " + e.getMessage() ); solr.addParseException("when parsing for XML Root Namespace", e); } solr.addField( SolrFields.XML_ROOT_NS, metadata.get(XMLRootNamespaceParser.XML_ROOT_NS)); } Instrument.timeRel("WARCPayloadAnalyzers.analyze#total","XMLAnalyzer.analyze", start); }
@Override public void analyse(String text, SolrRecord solr) { // Find the entities: List<ResolvedNamedEntity> entities = oscar .findAndResolveNamedEntities(text); // Record them: Set<String> uniqueEntities = new HashSet<String>(); for (ResolvedNamedEntity ne : entities) { // e.g. 'acetone' uniqueEntities.add("OSCAR4:MATCH:" + ne.getSurface()); ChemicalStructure stdInchi = ne .getFirstChemicalStructure(FormatType.STD_INCHI); if (stdInchi != null) { // e.g. [Structure:STD_INCHI:InChI=1S/C3H6O/c1-3(2)4/h1-2H3] uniqueEntities.add("OSCAR4:" + stdInchi.getType() + ":" + stdInchi.getValue()); } } // Store in Solr records: for (String ent : uniqueEntities) { solr.addField(SolrFields.SOLR_TIKA_METADATA_LIST, ent); } }
@Override public void analyse(String source, ArchiveRecordHeader header, InputStream tikainput, SolrRecord solr) { final long start = System.nanoTime(); Metadata metadata = new Metadata(); // Also attempt to grab the XML Root NS: if( this.extractXMLRootNamespace ) { ParseRunner parser = new ParseRunner( xrns, tikainput, metadata, solr ); try { TimeLimiter.run(parser, 30000L, false); } catch( Exception e ) { log.error( "WritableSolrRecord.extract(): " + e.getMessage() ); solr.addParseException("when parsing for XML Root Namespace", e); } solr.addField( SolrFields.XML_ROOT_NS, metadata.get(XMLRootNamespaceParser.XML_ROOT_NS)); } Instrument.timeRel("WARCPayloadAnalyzers.analyze#total","XMLAnalyzer.analyze", start); }