/**
 * Builds an absolute URL by prepending the domain of {@code pageUrl} to {@code linkPath}.
 *
 * <p>A {@code "/"} is inserted between the two parts only when {@code linkPath} does not
 * already begin with one.
 *
 * @param pageUrl  URL of the page the link was found on; only the part returned by
 *                 {@code getFullDomainFromUrl} is used
 * @param linkPath path of the link, with or without a leading slash
 * @return the domain part of {@code pageUrl} followed by {@code linkPath}
 */
private static String getAbsoluteUrlFrom(String pageUrl, String linkPath) {
    final String base = getFullDomainFromUrl(pageUrl);
    final String separator = linkPath.startsWith("/") ? "" : "/";
    return base + separator + linkPath;
}
/**
 * Parses the outgoing URLs for the given referring page and stores them on this object.
 *
 * @param referringPage the page whose path and URL are used to resolve the extracted links
 * @throws UnsupportedEncodingException propagated from {@code parseOutgoingUrls}
 */
public void setOutgoingUrls(WebURL referringPage) throws UnsupportedEncodingException {
    // Delegates to the Set<WebURL> overload with the freshly parsed links.
    this.setOutgoingUrls(parseOutgoingUrls(referringPage));
}
/**
 * Extracts the URLs referenced by this object's text content and turns each of them into
 * an absolute {@link WebURL}.
 *
 * <p>Every extracted link is first reduced to a path via {@code getLinkRelativeTo} (using
 * the referring page's path) and then joined with the canonicalized URL of the referring
 * page via {@code getAbsoluteUrlFrom}.
 *
 * @param referringPage the page whose path and URL serve as the base for resolution
 * @return a set with one {@link WebURL} per extracted link; empty when nothing was extracted
 * @throws UnsupportedEncodingException as declared; presumably raised by URL
 *         canonicalization - confirm against {@code URLCanonicalizer}
 */
private Set<WebURL> parseOutgoingUrls(WebURL referringPage) throws UnsupportedEncodingException {
    Set<String> cssLinks = extractUrlInCssText(this.getTextContent());
    final String basePath = referringPage.getPath();
    final String baseUrl = referringPage.getURL();

    Set<WebURL> result = new HashSet<>();
    for (String cssLink : cssLinks) {
        String linkPath = getLinkRelativeTo(basePath, cssLink);
        // Canonicalization is performed per link, so it is never invoked when no links exist.
        String canonicalBase = URLCanonicalizer.getCanonicalURL(baseUrl);

        WebURL target = new WebURL();
        target.setURL(getAbsoluteUrlFrom(canonicalBase, linkPath));
        result.add(target);
    }
    return result;
}
CssParseData parseData = new CssParseData(); if (page.getContentCharset() == null) { parseData.setTextContent(new String(page.getContentData())); } else { parseData.setTextContent( new String(page.getContentData(), page.getContentCharset())); parseData.setOutgoingUrls(page.getWebURL()); page.setParseData(parseData); } catch (Exception e) {
/**
 * Reduces a link found in a stylesheet to a path on the referring page's server.
 *
 * <p>Handled forms, in order:
 * <ul>
 *   <li>root-relative ({@code /img/a.png}): returned unchanged;</li>
 *   <li>protocol-relative ({@code //host/img/a.png}): given an {@code http:} scheme and
 *       treated as an absolute URL;</li>
 *   <li>absolute ({@code http://...} or {@code https://...}): the result of
 *       {@code getPathFromUrl} is returned (presumably the path component - confirm
 *       against that helper);</li>
 *   <li>parent-relative ({@code ../img/a.png}): one directory of {@code pagePath} is
 *       dropped per leading {@code ../} segment;</li>
 *   <li>anything else: appended to {@code getDirsFromUrl(pagePath)} (presumably the
 *       directory part of the page path - confirm against that helper).</li>
 * </ul>
 *
 * @param pagePath path of the page that contains the link, e.g. {@code /css/site.css}
 * @param linkUrl  the link as written in the stylesheet
 * @return the resolved path for the link
 */
private static String getLinkRelativeTo(String pagePath, String linkUrl) {
    if (linkUrl.startsWith("/") && !linkUrl.startsWith("//")) {
        return linkUrl;
    }

    if (linkUrl.startsWith("//")) {
        // The scheme separator is ':' - without it the result is "http//host/...",
        // which is not a valid URL.
        linkUrl = "http:" + linkUrl;
    }

    // Require the full scheme prefix so that relative names which merely begin with
    // "http" (e.g. "httpdocs/a.png") are not mistaken for absolute URLs.
    if (linkUrl.startsWith("http://") || linkUrl.startsWith("https://")) {
        return getPathFromUrl(linkUrl);
    }

    if (linkUrl.startsWith("../")) {
        // Count only the leading "../" segments; a later "../" inside the link
        // (e.g. "../a/../b.png") must stay in the remainder rather than being
        // counted as another parent or cutting off the segments before it.
        int parents = 0;
        int pos = 0;
        while (linkUrl.startsWith("../", pos)) {
            parents++;
            pos += 3;
        }

        String[] parts = pagePath.split("/");
        // Drop the file name (last part) plus one directory per parent reference.
        int keep = parts.length - parents - 1;

        StringBuilder absolute = new StringBuilder();
        for (int i = 0; i < keep; i++) {
            String dir = parts[i];
            if (!dir.isEmpty()) {
                absolute.append('/').append(dir);
            }
        }
        return absolute.append('/').append(linkUrl, pos, linkUrl.length()).toString();
    }

    String root = getDirsFromUrl(pagePath);
    return root + linkUrl;
}