/**
 * Creates an instance without supplying a source record.
 * <p>
 * The only work done here is the call to {@code createPatternSet()}.
 */
public WarcHTMLResponseRecord() {
  this.createPatternSet();
}
public static void main(String[] args) throws IOException { // use a callback class for handling WARC record data: IProcessWarcRecord processor = new SampleProcessWarcRecord(); String inputWarcFile="CC-MAIN-20140305125104-00002-ip-10-183-142-35.ec2.internal.warc.gz"; GZIPInputStream gzInputStream=new GZIPInputStream(new FileInputStream(inputWarcFile)); DataInputStream inStream=new DataInputStream(gzInputStream); WarcRecord thisWarcRecord; while ((thisWarcRecord=WarcRecord.readNextWarcRecord(inStream))!=null) { System.out.println("%% thisWarcRecord.getHeaderRecordType() = " + thisWarcRecord.getHeaderRecordType()); if (thisWarcRecord.getHeaderRecordType().equals("response")) { WarcHTMLResponseRecord htmlRecord=new WarcHTMLResponseRecord(thisWarcRecord); String thisTargetURI=htmlRecord.getTargetURI(); String thisContentUtf8 = htmlRecord.getRawRecord().getContentUTF8(); // handle WARC record content: processor.process(thisTargetURI, thisContentUtf8); } } inStream.close(); // done processing all WARC records: processor.done(); } }
Vector<String> retVec = new Vector<String>(); String baseURL = getTargetURI(); if ((baseURL == null) || (baseURL.length() == 0)) { return retVec; HashSet<String> retSet=getMatchesOutputSet(htmlTags, baseURL);
private HashSet<String> getMatchesOutputSet(Vector<String> tagSet, String baseURL) { HashSet<String> retSet=new HashSet<String>(); Iterator<String> vIter=tagSet.iterator(); while (vIter.hasNext()) { String thisCheckPiece=vIter.next(); Iterator<Pattern> pIter=patternSet.iterator(); boolean hasAdded=false; while (!hasAdded && pIter.hasNext()) { Pattern thisPattern=pIter.next(); Matcher matcher=thisPattern.matcher(thisCheckPiece); if (matcher.find() && (matcher.groupCount() > 0)) { String thisMatch=getNormalizedContentURL(baseURL, matcher.group(1)); if (HTTP_START_PATTERN.matcher(thisMatch).matches()) { if (!retSet.contains(thisMatch) && !baseURL.equals(thisMatch)) { retSet.add(thisMatch); hasAdded=true; } // end if (!retSet.contains(thisMatch)) } // end if (HTTP_START_PATTERN.matcher(thisMatch).matches()) } // end if (matcher.find() && (matcher.groupCount() > 0)) matcher.reset(); } // end while (!hasAdded && pIter.hasNext()) } // end while (vIter.hasNext()) return retSet; }
WarcHTMLResponseRecord htmlRecord = new WarcHTMLResponseRecord(thisWarcRecord); String thisTargetURI = htmlRecord.getTargetURI(); String thisContentUtf8 = htmlRecord.getRawRecord().getContentUTF8();
List<String> retVec = new ArrayList<String>(); String baseURL = getTargetURI(); if ((baseURL == null) || (baseURL.length() == 0)) { return retVec; HashSet<String> retSet = getMatchesOutputSet(htmlTags, baseURL);
private HashSet<String> getMatchesOutputSet(List<String> tagSet, String baseURL) { HashSet<String> retSet = new HashSet<String>(); Iterator<String> vIter = tagSet.iterator(); while (vIter.hasNext()) { String thisCheckPiece = vIter.next(); Iterator<Pattern> pIter = patternSet.iterator(); boolean hasAdded = false; while (!hasAdded && pIter.hasNext()) { Pattern thisPattern = pIter.next(); Matcher matcher = thisPattern.matcher(thisCheckPiece); if (matcher.find() && (matcher.groupCount() > 0)) { String thisMatch = getNormalizedContentURL(baseURL, matcher.group(1)); if (HTTP_START_PATTERN.matcher(thisMatch).matches()) { if (!retSet.contains(thisMatch) && !baseURL.equals(thisMatch)) { retSet.add(thisMatch); hasAdded = true; } // end if (!retSet.contains(thisMatch)) } // end if // (HTTP_START_PATTERN.matcher(thisMatch).matches()) } // end if (matcher.find() && (matcher.groupCount() > 0)) matcher.reset(); } // end while (!hasAdded && pIter.hasNext()) } // end while (vIter.hasNext()) return retSet; }
/**
 * Creates an instance without supplying a source record.
 * <p>
 * The only work done here is the call to {@code createPatternSet()}.
 */
public WarcHTMLResponseRecord() {
  this.createPatternSet();
}
/**
 * Copy constructor.
 * <p>
 * Hands the {@code warcRecord} of {@code o} to this instance's
 * {@code warcRecord.set(...)} and then calls {@code createPatternSet()}.
 *
 * @param o the instance to copy from; must not be {@code null}
 */
public WarcHTMLResponseRecord(WarcHTMLResponseRecord o) {
  warcRecord.set(o.warcRecord);
  this.createPatternSet();
}
/**
 * Copy constructor.
 * <p>
 * Hands the {@code warcRecord} of {@code o} to this instance's
 * {@code warcRecord.set(...)} and then calls {@code createPatternSet()}.
 *
 * @param o the instance to copy from; must not be {@code null}
 */
public WarcHTMLResponseRecord(WarcHTMLResponseRecord o) {
  warcRecord.set(o.warcRecord);
  this.createPatternSet();
}
/**
 * Builds an instance from a generic WARC record.
 * <p>
 * The record is passed to {@code warcRecord.set(...)} only when its header
 * record type equals {@code "response"}, ignoring case; any other record is
 * not taken over. {@code createPatternSet()} is called in both cases.
 *
 * @param o the generic WARC record; neither it nor its header record type
 *          may be {@code null}
 */
public WarcHTMLResponseRecord(WarcRecord o) {
  boolean isResponse = o.getHeaderRecordType().equalsIgnoreCase("response");
  if (isResponse) {
    warcRecord.set(o);
  }
  this.createPatternSet();
}
/**
 * Builds an instance from a generic WARC record.
 * <p>
 * The record is passed to {@code warcRecord.set(...)} only when its header
 * record type equals {@code "response"}, ignoring case; any other record is
 * not taken over. {@code createPatternSet()} is called in both cases.
 *
 * @param o the generic WARC record; neither it nor its header record type
 *          may be {@code null}
 */
public WarcHTMLResponseRecord(WarcRecord o) {
  boolean isResponse = o.getHeaderRecordType().equalsIgnoreCase("response");
  if (isResponse) {
    warcRecord.set(o);
  }
  this.createPatternSet();
}