/**
 * Runs language detection over the supplied text and records the outcome.
 *
 * When the detector returns a result, its language is added to the
 * {@code SolrFields.CONTENT_LANGUAGE} field. An
 * {@link IllegalArgumentException} raised during detection is logged and
 * attached to the record via {@code addParseException}; it is not rethrown.
 * The elapsed time is reported through {@code Instrument.timeRel} once the
 * try/catch has completed.
 *
 * @param text the text to run detection on
 * @param solr the record that receives the language field or the parse exception
 */
@Override
public void analyse(String text, SolrRecord solr) {
    final long start = System.nanoTime();
    try {
        final LanguageResult detected = ld.detect(text);
        if (detected != null) {
            solr.addField(SolrFields.CONTENT_LANGUAGE, detected.getLanguage());
        }
    } catch (IllegalArgumentException e) {
        final String message = "Exception when determining language of this item: " + e.getMessage();
        log.error(message, e);
        solr.addParseException(e);
    }
    Instrument.timeRel("TextAnalyzers#total", "LanguageAnalyzer#total", start);
}
/**
 * Writes the wrapped Solr document to the given output as a length-prefixed
 * byte array: first the number of bytes as an int, then the bytes produced
 * by {@code SerializationUtils.serialize}.
 *
 * @param output destination for the serialised document
 * @throws IOException if writing to {@code output} fails
 */
@Override
public void write( DataOutput output ) throws IOException {
    final byte[] payload = SerializationUtils.serialize( this.sr.getSolrDocument() );
    final int payloadLength = payload.length;
    output.writeInt( payloadLength );
    output.write( payload );
}
/**
 * Builds a fresh {@code SolrRecord} from this object's {@code defaultMax}
 * and {@code maxLengths} settings.
 *
 * @return a newly constructed record
 */
public SolrRecord createRecord() {
    final SolrRecord record = new SolrRecord(defaultMax, maxLengths);
    return record;
}
/**
 * Copies the WCT metadata held in this object's own record
 * ({@code this.solr}) onto the supplied record.
 *
 * The fields copied, in order, are target id, title, harvest date,
 * collections, agency and subjects. A field for which this object holds no
 * value is skipped rather than dereferenced, so one missing WCT field no
 * longer aborts the whole copy with a NullPointerException.
 *
 * @param in the record to receive the WCT fields
 */
public void addWctMetadata( SolrRecord in ) {
    copyWctField( in, WctFields.WCT_TARGET_ID );
    copyWctField( in, WctFields.WCT_TITLE );
    copyWctField( in, WctFields.WCT_HARVEST_DATE );
    copyWctField( in, WctFields.WCT_COLLECTIONS );
    copyWctField( in, WctFields.WCT_AGENCY );
    copyWctField( in, WctFields.WCT_SUBJECTS );
}

/**
 * Adds the string form of one field of {@code this.solr} to the target
 * record, doing nothing when that field has no value here.
 */
private void copyWctField( SolrRecord in, String field ) {
    final Object value = this.solr.getFieldValue( field );
    if( value != null ) {
        in.addField( field, value.toString() );
    }
}
/** * Get the list of faces and the item identifier: */ public List<String> getFaces() { SolrInputField faces = getField(SolrFields.IMAGE_FACES); if (faces == null || faces.getValueCount() == 0) return null; // Otherwise, list 'em: List<String> hl = new ArrayList<String>(); this.gatherMatches(faces.getValues(), "cat", hl); this.gatherMatches(faces.getValues(), "human", hl); return hl; }
/** * Get the host->host links: */ public List<String> getHostLinks() { SolrInputField links = getField(SolrFields.SOLR_LINKS_HOSTS); if (links == null || links.getValueCount() == 0) return null; // Otherwise, build a list: List<String> hl = new ArrayList<String>(); for (Object v : links.getValues()) { hl.add(getHost() + "\t" + (String) v); } return hl; } }
String contentType = ( String ) solr.getFieldValue( SolrFields.SOLR_CONTENT_TYPE ); solr.setField( SolrFields.CONTENT_TYPE_TIKA, contentType ); if( solr.getField( SolrFields.CONTENT_TYPE_DROID ) != null ) { MediaType mt_droid = MediaType.parse( ( String ) solr.getField( SolrFields.CONTENT_TYPE_DROID ).getFirstValue() ); if( mt_tika == null || mt_tika.equals( MediaType.OCTET_STREAM ) ) { contentType = mt_droid.toString(); solr.addField( SolrFields.CONTENT_VERSION, mt_droid.getParameters().get( "version" ) ); solr.setField( SolrFields.FULL_CONTENT_TYPE, contentType ); solr.setField( SolrFields.SOLR_CONTENT_TYPE, contentType.replaceAll( ";.*$", "" ) ); solr.setField( SolrFields.SOLR_NORMALISED_CONTENT_TYPE, "image" ); solr.setField(SolrFields.SOLR_TYPE, "Image"); } else if (contentType.matches("^audio/.*$") || contentType.matches("^application/vnd.rn-realaudio$")) { solr.setField( SolrFields.SOLR_NORMALISED_CONTENT_TYPE, "audio" ); solr.setField(SolrFields.SOLR_TYPE, "Audio"); } else if (contentType.matches("^video/.*$") || contentType.matches("^application/mp4$") || contentType.matches("^application/vnd.rn-realmedia$")) { solr.setField( SolrFields.SOLR_NORMALISED_CONTENT_TYPE, "video" ); solr.setField(SolrFields.SOLR_TYPE, "Video"); } else if (contentType.matches("^text/htm.*$") || contentType.matches("^application/xhtml.*$")) { solr.setField( SolrFields.SOLR_NORMALISED_CONTENT_TYPE, "html" );
} catch (Exception e) { log.error("WritableSolrRecord.extract(): " + e.getMessage()); solr.addParseException("when scanning for faces", e); solr.addField(SolrFields.IMAGE_HEIGHT, metadata.get(FaceDetectionParser.IMAGE_HEIGHT)); solr.addField(SolrFields.IMAGE_WIDTH, metadata.get(FaceDetectionParser.IMAGE_WIDTH)); solr.addField(SolrFields.IMAGE_SIZE, metadata.get(FaceDetectionParser.IMAGE_SIZE)); if (this.extractFaces) { .getValues(FaceDetectionParser.FACE_FRAGMENT_ID)) { log.debug("Found a face!"); solr.addField(SolrFields.IMAGE_FACES, face); solr.setField(SolrFields.IMAGE_FACES_COUNT, "" + faces); solr.addField(SolrFields.IMAGE_DOMINANT_COLOUR, metadata.get(FaceDetectionParser.DOM_COL)); solr.addField(SolrFields.IMAGE_HEIGHT, ""+height); solr.addField(SolrFields.IMAGE_WIDTH,""+width); solr.addField(SolrFields.IMAGE_SIZE,""+(height*width));
/**
 * Stores a tag/value pair on this object's record when the tag is one of
 * the recognised WCT field names (instance id, target id, harvest date,
 * agency, collections, subjects). Any other tag is ignored.
 *
 * @param tag   the tag name to test against the recognised WCT fields
 * @param value the value to add under that field
 */
public void setTag( String tag, String value ) {
    final boolean recognised = tag.equals( WctFields.WCT_INSTANCE_ID )
            || tag.equals( WctFields.WCT_TARGET_ID )
            || tag.equals( WctFields.WCT_HARVEST_DATE )
            || tag.equals( WctFields.WCT_AGENCY )
            || tag.equals( WctFields.WCT_COLLECTIONS )
            || tag.equals( WctFields.WCT_SUBJECTS );
    if( recognised ) {
        // The tag equals the matched field name, so it is used as the key.
        this.solr.addField( tag, value );
    }
}
&& solr.getField(SolrFields.CONTENT_TYPE_SERVED) == null) { String servedType = h.getValue(); if (servedType.length() > 200) servedType = servedType.substring(0, 200); solr.addField(SolrFields.CONTENT_TYPE_SERVED, servedType); solr.addField( SolrFields.SERVER, h.getValue() ); if (h.getName().equalsIgnoreCase(HttpHeaders.SERVER)) solr.addField( SolrFields.SERVER, h.getValue() ); if (h.getName().equalsIgnoreCase(HttpHeaders.LOCATION)){ solr.setField(SolrFields.REDIRECT_TO_NORM, Normalisation.resolveRelative(targetUrl, location)); solr.addParseException("when parsing statusCode", e); } catch( Exception e ) { log.error( "Exception when parsing headers: " + e ); solr.addParseException("when parsing headers", e);
} catch (Exception e) { log.warn("Exception on record " + url + " from " + inFile.getName(), e); doc.addParseException(e); continue; } catch (OutOfMemoryError e) { log.warn("OutOfMemoryError on record " + url + " from " + inFile.getName(), e); doc.addParseException(e); File fileOutput = new File(outputWarcDir + "//" + "FILE_" + recordCount + ".xml"); if (!slashPages || (doc.getFieldValue(SolrFields.SOLR_URL_TYPE) != null && doc.getFieldValue(SolrFields.SOLR_URL_TYPE).equals(SolrFields.SOLR_URL_TYPE_SLASHPAGE))) { doc.writeXml(zipOut); } else if (solrUrl == null) { writeXMLToFile(doc.toXml(), fileOutput); } else { docs.add(doc.getSolrDocument()); checkSubmission(solrWeb, docs, batchSize, false);
UsableURI url = UsableURIFactory.getInstance(fullUrl); solr.setField(SolrFields.SOLR_URL_PATH, url.getPath()); solr.setField(SolrFields.SOLR_URL_TYPE, SolrFields.SOLR_URL_TYPE_SLASHPAGE); solr.setField(SolrFields.SOLR_URL_TYPE, SolrFields.SOLR_URL_TYPE_ROBOTS_TXT); } else { solr.setField(SolrFields.SOLR_URL_TYPE, SolrFields.SOLR_URL_TYPE_NORMAL); if (CANONICALISE_HOST) host = Normalisation.canonicaliseHost(host); solr.setField(SolrFields.SOLR_HOST, host); solr.removeField(SolrFields.SOLR_HOST_SURT); ImmutableList<String> levels = LinkExtractor.allLevels(host); if (levels != null) { for (String level : levels) { solr.addField(SolrFields.SOLR_HOST_SURT, SURT.toSURT(level)); solr.setField(SolrFields.DOMAIN, domain); solr.setField(SolrFields.PUBLIC_SUFFIX, LinkExtractor.extractPublicSuffixFromHost(host));
String wbd = s.getWaybackDate(); String year = wbd.substring(0, 4); new Text(s.getHost())); new Text(s.getFormatResults())); List<String> faces = s.getFaces(); if (faces != null) { for (String face : faces) { List<String> hl = s.getHostLinks(); if( hl != null ) {
@Override public void run() { try { input.reset(); parser.parse( input, null, metadata, null ); } catch( Exception e ) { log.error( parser.getClass().getName()+".parse(): " + e.getMessage() ); // Also record as a Solr PARSE_ERROR solr.addParseException("when parsing with " + parser.getClass().getName(), e); } } }
if (solr.containsKey(WctFields.WCT_INSTANCE_ID)) { wct = new WctEnricher(key.toString()); wct.addWctMetadata(solr); docs.add(solr.getSolrDocument()); + solr.getField("id").getFirstValue()); && solr.getSolrDocument().getFieldValue( SolrFields.SOLR_URL_TYPE) != null && solr.getSolrDocument() .getFieldValue(SolrFields.SOLR_URL_TYPE) .equals( new Text(""), new Text(MetadataBuilder.SolrDocumentToElement(solr .getSolrDocument())));
if( read >= 4 ) { String hexBytes = Hex.encodeHexString( ffb ); solr.addField( SolrFields.CONTENT_FFB, hexBytes.substring( 0, 2 * 4 ) ); StringBuilder separatedHexBytes = new StringBuilder(); for( String hexByte : Splitter.fixedLength( 2 ).split( hexBytes ) ) { solr.addField( SolrFields.CONTENT_FIRST_BYTES, separatedHexBytes.toString().trim() ); solr.addField( SolrFields.CONTENT_TYPE_DROID, mt.toString() ); Instrument.timeRel("WARCPayloadAnalyzers.analyze#droid", "WARCPayloadAnalyzers.analyze#droid_type=" + mt.toString(), String mime = ( String ) solr.getField( SolrFields.SOLR_CONTENT_TYPE ).getValue(); if( mime.startsWith( "text" ) || mime.startsWith("application/xhtml+xml") ) { html.analyse(header, tikainput, solr);
ArchiveRecordHeader header = value.getRecord().getHeader(); ArchiveRecord rec = value.getRecord(); SolrRecord solr = new SolrRecord(key.toString(), rec.getHeader()); LOG.error(e.getClass().getName() + ": " + e.getMessage() + "; " + header.getUrl() + "; " + header.getOffset()); reporter.incrCounter(MyCounters.NUM_ERRORS, 1); solr.addParseException(e); } catch (OutOfMemoryError e) { LOG.error("OOME " + e.getClass().getName() + ": " + e.getMessage() + "; " + header.getUrl() + "; " + header.getOffset()); reporter.incrCounter(MyCounters.NUM_ERRORS, 1); solr.addParseException(e);
/**
 * Returns the first value of the {@code SolrFields.WAYBACK_DATE} field.
 *
 * NOTE(review): unlike getFaces/getHostLinks there is no null check on the
 * looked-up field here, so this presumably fails when the field is absent
 * - confirm that callers only use it on records that carry the field.
 *
 * @return the first wayback date value, cast to a String
 */
public String getWaybackDate() {
    final SolrInputField dateField = getField(SolrFields.WAYBACK_DATE);
    final Object firstValue = dateField.getFirstValue();
    return (String) firstValue;
}
throws JSONException { MDX m = new MDX(); m.setHash(stringValueOrUnset(solr.getFieldValue(SolrFields.HASH))); m.setUrl(stringValueOrUnset(solr.getFieldValue(SolrFields.SOLR_URL))); m.setTs(stringValueOrUnset( solr.getFieldValue(SolrFields.WAYBACK_DATE))); m.setRecordType(stringValueOrUnset( solr.getFieldValue(SolrFields.SOLR_RECORD_TYPE))); for (String f : solr.getSolrDocument().getFieldNames()) { SolrInputField v = solr.getSolrDocument().get(f); if (v.getValueCount() > 1) { Iterator<Object> i = v.getValues().iterator();
private void gatherMatches(Collection<Object> strings, String prefix, List<String> hl) { StringBuilder sb = new StringBuilder(); sb.append(getUrl()); sb.append("\t"); sb.append(getWaybackDate()); sb.append("\t"); // Order: List<String> list = new ArrayList<String>(); for (Object v : strings) { String vs = (String) v; list.add(vs); } Collections.sort(list); // Go through: int i = 0; for (String vs : list) { if (i > 0) sb.append(" "); if (vs.startsWith(prefix)) { sb.append(vs); i++; } } if (i > 0) { hl.add(sb.toString()); } }