public EmptyParseImpl(ParseStatus status, Configuration conf) { data = new ParseData(status, "", new Outlink[0], new Metadata(), new Metadata()); }
@Override public void processData(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) { // Get metadata Metadata metadata = parse.getData().getParseMeta(); try { // Initialize the writers // Only on the first execution initWriters(); for(Entry entry : entryList) { CSVPrint csvPrint = nameCsvPrintMap.get(entry.getParameterMap().get(NAME)); String[] fieldValues = new String[entry.getFieldList().size()]; List<Field> fieldList = entry.getFieldList(); for (int i = 0; i < fieldList.size(); i++) { fieldValues[i] = FilterUtils.getNullSafe(metadata.get(fieldList.get(i).getName()), ""); } // Write field values to CSV file csvPrint.println(fieldValues); } } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } }
@Override public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) throws IndexingException { Metadata metadata = parse.getData().getParseMeta();
Metadata metadata = parse.getData().getParseMeta();
StringUtil.toHexString(MD5Hash.digest(txtContent[0].getBytes()).getDigest())); ParseData data = new ParseData(new ParseStatus(ParseStatus.SUCCESS), txtContent[1], outlinks, contentMeta, new Metadata());
@SuppressWarnings("rawtypes") @Override public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) { Metadata metadata = parseResult.get(content.getUrl()).getData().getParseMeta(); byte[] rawContent = content.getContent();