// Pass the value through TextUtils.unescapeHtml, then, when a codebase URI is
// available, replace it with the string form of that value resolved against
// the codebase URI.
// NOTE(review): the 'if' opened here is closed outside this excerpt.
res = (String) TextUtils.unescapeHtml(res); if (codebaseURI != null) { res = codebaseURI.resolve(res).toString();
/** * Get the 'form province' - either the configured (applicableSurtPrefix) * or inferred (full current server) range of URIs that is considered * covered by one form login * * @param curi * @return */ protected String getFormProvince(CrawlURI curi) { if (StringUtils.isNotBlank(getApplicableSurtPrefix())) { return getApplicableSurtPrefix(); } try { return curi.getUURI().resolve("/").getSurtForm(); } catch (URIException e) { logger.log(Level.WARNING,"error trimming to root",e); return curi.getClassKey(); // should never happen } }
/**
 * Build a key string for a URL.
 *
 * <p>Steps: add an implied "http://" if needed, parse the result, rebuild
 * it as scheme + "://" + authority + escaped path-and-query, pass that
 * through {@code SURT.fromURI}, cut everything from the first '#', and
 * drop a leading "http://". When {@code prefix} is true a trailing ",)/"
 * is removed as well.
 *
 * @param url    the URL to convert
 * @param prefix true to also strip a trailing ",)/" from the key
 * @return the resulting key
 * @throws URIException if the URL cannot be parsed
 */
private static String getKey(String url, boolean prefix)
        throws URIException {
    final UsableURI parsed =
            UsableURIFactory.getInstance(addImpliedHttpIfNecessary(url));
    final String rebuilt = parsed.getScheme() + "://"
            + parsed.getAuthority() + parsed.getEscapedPathQuery();

    String surt = SURT.fromURI(rebuilt);

    final int fragmentStart = surt.indexOf('#');
    if (fragmentStart >= 0) {
        surt = surt.substring(0, fragmentStart);
    }
    if (surt.startsWith("http://")) {
        surt = surt.substring("http://".length());
    }
    if (prefix && surt.endsWith(",)/")) {
        surt = surt.substring(0, surt.length() - ",)/".length());
    }
    return surt;
}
}
try { refuri = UsableURIFactory.getInstance(referer); host = refuri.getHost(); authority = refuri.getAuthority(); path = refuri.getPath(); } catch (URIException ex) { LOGGER.info("Ignoring unparsable Referer: " + referer); (matchPort != -1 && refuri.getPort() != -1 && matchPort != refuri .getPort())) { LOGGER.info("Server-Relative-Redirect: Skipping, Referer " + host + ":" + refuri.getPort() + " not from matching wayback host:port\t"); return null; url = ArchiveUtils.addImpliedHttpIfNecessary(url); final String root = refuri.getScheme() + "://" + authority; return new ArchivalUrlRef(root, collection, datespec, url);
/** * Return the referenced host in the UURI, if any, also extracting the * host of a DNS-lookup URI where necessary. * * @return the target or topic host of the URI * @throws URIException */ public String getReferencedHost() throws URIException { String referencedHost = this.getHost(); if(referencedHost==null && this.getScheme().equals("dns")) { // extract target domain of DNS lookup String possibleHost = this.getCurrentHierPath(); if(possibleHost != null && possibleHost.matches("[-_\\w\\.:]+")) { referencedHost = possibleHost; } } return referencedHost; }
UsableURI uuri = UsableURIFactory.getInstance(uri);
// Emit a host record only for http and https URIs. The scheme is compared
// with explicit equals checks rather than a substring test, so partial
// strings such as "h" or "tp" do not match and a null scheme is skipped
// instead of raising a NullPointerException.
if ("http".equals(uuri.getScheme()) || "https".equals(uuri.getScheme())) {
    output.collect(new Text("record-hosts"), new Text("HOSTS\t" + uuri.getAuthority()));
/**
 * Report the length of this URI's escaped string form.
 *
 * @return the number of characters in the value of getEscapedURI()
 */
public int length() {
    final String escaped = getEscapedURI();
    return escaped.length();
}
UsableURI url = UsableURIFactory.getInstance(fullUrl); solr.setField(SolrFields.SOLR_URL_PATH, url.getPath()); if (url.getPath().equals("/") || url.getPath().equals("") || url.getPath().matches("/index\\.[a-z]+$")) { solr.setField(SolrFields.SOLR_URL_TYPE, SolrFields.SOLR_URL_TYPE_SLASHPAGE); } else if (url.getPath().equalsIgnoreCase("/robots.txt")) { solr.setField(SolrFields.SOLR_URL_TYPE, SolrFields.SOLR_URL_TYPE_ROBOTS_TXT); String host = url.getHost(); if (CANONICALISE_HOST) host = Normalisation.canonicaliseHost(host); .URI(url.getEscapedURI(), false);
/**
 * Create a UURI by resolving a relative URI string against a base.
 *
 * <p>The relative string is first run through {@code fixup} using the
 * base's protocol charset, turned into a UURI, resolved against
 * {@code base}, and finally passed through {@code validityCheck}.
 *
 * @param base UURI to use as the base when resolving <code>relative</code>.
 * @param relative Relative URI.
 * @return the resolved, validity-checked UURI.
 * @throws URIException if the URI cannot be built or fails the check
 */
protected UsableURI create(UsableURI base, String relative)
        throws URIException {
    final String fixed = fixup(relative, base, base.getProtocolCharset());
    final UsableURI relativeUURI =
            makeOne(fixed, true, base.getProtocolCharset());
    final UsableURI resolved = makeOne(base, relativeUURI);

    if (logger.isLoggable(Level.FINE)) {
        logger.fine(" URI " + relative
                + " PRODUCT " + resolved.toString()
                + " CHARSET " + base.getProtocolCharset()
                + " BASE " + base);
    }
    return validityCheck(resolved);
}
/**
 * Check a freshly generated UURI.
 *
 * <p>At minimum this verifies the length of the raw (escaped) URI: a
 * string can be under {@code MAX_URL_LENGTH} before escaping yet exceed it
 * afterwards, and letting such an oversized URI through caused trouble
 * later in the processing chain.
 *
 * @param uuri Created uuri to check.
 * @return The passed <code>uuri</code>, so the check can be inlined.
 * @throws URIException if the raw URI is longer than
 *         {@code UsableURI.MAX_URL_LENGTH}
 */
protected UsableURI validityCheck(UsableURI uuri) throws URIException {
    final int rawLength = uuri.getRawURI().length;
    if (rawLength <= UsableURI.MAX_URL_LENGTH) {
        return uuri;
    }
    throw new URIException("Created (escaped) uuri > " +
        UsableURI.MAX_URL_LENGTH +": "+uuri.toString());
}
/**
 * Return the authority with any userinfo removed.
 *
 * <p>Everything up to and including the first '@' is dropped; an authority
 * without '@' is returned unchanged. The result is computed once and kept
 * in {@code cachedAuthorityMinusUserinfo}.
 *
 * @return The authority stripped of any userinfo if present.
 * @throws URIException if the authority cannot be read
 */
public String getAuthorityMinusUserinfo() throws URIException {
    if (this.cachedAuthorityMinusUserinfo != null) {
        return this.cachedAuthorityMinusUserinfo;
    }

    String authority = getAuthority();
    if (authority != null && !authority.isEmpty()) {
        final int at = authority.indexOf('@');
        if (at >= 0) {
            authority = authority.substring(at + 1);
        }
    }
    this.cachedAuthorityMinusUserinfo = authority;
    coalesceHostAuthorityStrings();
    // Re-read the field: the coalesce call runs after the assignment above.
    return this.cachedAuthorityMinusUserinfo;
}
/**
 * Resolve a URI string relative to this UURI.
 *
 * @param uri URI as string that is resolved relative to this UURI.
 * @param e True if <code>uri</code> is already escaped.
 * @param charset Charset to use when parsing <code>uri</code>.
 * @return a new UURI that uses this UURI as its base.
 * @throws URIException if either URI cannot be constructed
 */
public UsableURI resolve(String uri, boolean e, String charset)
        throws URIException {
    final UsableURI relative = new UsableURI(uri, e, charset);
    return new UsableURI(this, relative);
}
// URL-decode the input as UTF-8, then parse the decoded string with
// UsableURIFactory (also UTF-8) and take the host of the resulting URI.
// NOTE(review): the variable name implies the host comes back IDN-encoded;
// confirm against UsableURIFactory's behaviour.
String decodedUrlStr = URLDecoder.decode(urlStr, "UTF-8"); String idnEncodedHost = UsableURIFactory.getInstance(decodedUrlStr, "UTF-8").getHost();
/**
 * Return the escaped form of this URI, computing it on first use.
 *
 * <p>The value from the superclass is stored in {@code cachedEscapedURI}
 * and {@code coalesceUriStrings()} is invoked once after it is stored.
 *
 * @return the cached escaped URI string
 */
public synchronized String getEscapedURI() {
    if (this.cachedEscapedURI != null) {
        return this.cachedEscapedURI;
    }
    this.cachedEscapedURI = super.getEscapedURI();
    coalesceUriStrings();
    return this.cachedEscapedURI;
}
public synchronized String getHost() throws URIException { if (this.cachedHost == null) { // If this._host is null, 3.0 httpclient throws // illegalargumentexception. Don't go there. if (this._host != null) { this.cachedHost = super.getHost(); coalesceHostAuthorityStrings(); } } return this.cachedHost; }
/**
 * Build a key string for a URL.
 *
 * <p>Steps: add an implied "http://" if needed, parse the result, rebuild
 * it as scheme + "://" + authority + escaped path-and-query, pass that
 * through {@code SURT.fromURI}, cut everything from the first '#', and
 * drop a leading "http://". When {@code prefix} is true a trailing ",)/"
 * is removed as well.
 *
 * @param url    the URL to convert
 * @param prefix true to also strip a trailing ",)/" from the key
 * @return the resulting key
 * @throws URIException if the URL cannot be parsed
 */
private static String getKey(String url, boolean prefix)
        throws URIException {
    final UsableURI parsed = UsableURIFactory.getInstance(
            ArchiveUtils.addImpliedHttpIfNecessary(url));
    final String rebuilt = parsed.getScheme() + "://"
            + parsed.getAuthority() + parsed.getEscapedPathQuery();

    String surt = SURT.fromURI(rebuilt);

    final int fragmentStart = surt.indexOf('#');
    if (fragmentStart >= 0) {
        surt = surt.substring(0, fragmentStart);
    }
    if (surt.startsWith("http://")) {
        surt = surt.substring("http://".length());
    }
    if (prefix && surt.endsWith(",)/")) {
        surt = surt.substring(0, surt.length() - ",)/".length());
    }
    return surt;
}
}
try { refuri = UsableURIFactory.getInstance(referer); host = refuri.getHost(); authority = refuri.getAuthority(); path = refuri.getPath(); } catch (URIException ex) { LOGGER.info("Ignoring unparsable Referer: " + referer); (matchPort != -1 && refuri.getPort() != -1 && matchPort != refuri .getPort())) { LOGGER.info("Server-Relative-Redirect: Skipping, Referer " + host + ":" + refuri.getPort() + " not from matching wayback host:port\t"); return null; url = ArchiveUtils.addImpliedHttpIfNecessary(url); final String root = refuri.getScheme() + "://" + authority; return new ArchivalUrlRef(root, collection, datespec, url);
/** * Return the referenced host in the UURI, if any, also extracting the * host of a DNS-lookup URI where necessary. * * @return the target or topic host of the URI * @throws URIException */ public String getReferencedHost() throws URIException { String referencedHost = this.getHost(); if(referencedHost==null && this.getScheme().equals("dns")) { // extract target domain of DNS lookup String possibleHost = this.getCurrentHierPath(); if(possibleHost != null && possibleHost.matches("[-_\\w\\.:]+")) { referencedHost = possibleHost; } } return referencedHost; }
UsableURI uuri = UsableURIFactory.getInstance(uri);
// Emit a host record only for http and https URIs. The scheme is compared
// with explicit equals checks rather than a substring test, so partial
// strings such as "h" or "tp" do not match and a null scheme is skipped
// instead of raising a NullPointerException.
if ("http".equals(uuri.getScheme()) || "https".equals(uuri.getScheme())) {
    output.collect(new Text("record-hosts"), new Text("HOSTS\t" + uuri.getAuthority()));