/**
 * Extracts text from the given raw input using {@code DefaultExtractor.INSTANCE}.
 *
 * @param rawText the raw input to extract text from; may be {@code null} or empty
 * @return the extracted text, or {@code null} when {@code StringUtils.isEmpty(rawText)}
 *         is true or when the extractor throws a {@code BoilerpipeProcessingException}
 */
protected String parse(String rawText) {
    // Nothing to extract: bail out before touching the extractor.
    if (StringUtils.isEmpty(rawText)) {
        return null;
    }

    String extracted = null;
    try {
        extracted = DefaultExtractor.INSTANCE.getText(rawText);
    } catch (BoilerpipeProcessingException e) {
        // Extraction failure is logged and reported to the caller as null.
        LOGGER.error(e.getMessage(), e);
    }
    return extracted;
}
/**
 * Builds an instance from a {@code TargetModelCbor} record.
 *
 * <p>The value stored under {@code "body"} in {@code model.response} is wrapped in a
 * {@code Page}, parsed via {@code PaginaURL}/{@code ParsedData}, and the resulting words,
 * meta words and title are copied into this object. Text is extracted on a best-effort
 * basis: if extraction fails for any reason, {@code text} is set to the empty string.
 *
 * <p>{@code topPrivateDomain} is the top private domain of the page's domain name when that
 * name is under a public suffix, otherwise the domain name itself. When the host is not a
 * syntactically valid domain name (for example an IP address), the host string is used
 * unchanged instead of failing construction.
 *
 * @param model the source record; its {@code url}, {@code timestamp} and {@code response}
 *              fields are read
 */
public TargetModelElasticSearch(TargetModelCbor model) {
    URL url = Urls.toJavaURL(model.url);
    String rawContent = (String) model.response.get("body");

    Page page = new Page(url, rawContent);
    page.setParsedData(new ParsedData(new PaginaURL(url, rawContent)));
    ParsedData parsedData = page.getParsedData();

    this.html = rawContent;
    this.url = model.url;
    // The long literal keeps the multiplication in 64-bit arithmetic whatever the declared
    // type of timestamp is. NOTE(review): presumably epoch seconds -> milliseconds; confirm.
    this.retrieved = new Date(model.timestamp * 1000L);
    this.words = parsedData.getWords();
    this.wordsMeta = parsedData.getWordsMeta();
    this.title = parsedData.getTitle();
    this.domain = url.getHost();

    try {
        this.text = DefaultExtractor.getInstance().getText(page.getContentAsString());
    } catch (Exception e) {
        // Deliberate best-effort: a page whose text cannot be extracted is still kept,
        // just with empty text.
        this.text = "";
    }

    String host = page.getDomainName();
    if (host != null && InternetDomainName.isValid(host)) {
        InternetDomainName domainName = InternetDomainName.from(host);
        if (domainName.isUnderPublicSuffix()) {
            this.topPrivateDomain = domainName.topPrivateDomain().toString();
        } else {
            this.topPrivateDomain = domainName.toString();
        }
    } else {
        // InternetDomainName.from would throw IllegalArgumentException here (e.g. for an
        // IP-address host); fall back to the raw host rather than abort construction.
        this.topPrivateDomain = host;
    }
}
public TargetModelElasticSearch(Page page) { this.url = page.getURL().toString(); this.retrieved = page.getFetchTime() > 0 ? new Date(page.getFetchTime()) : new Date(); this.domain = page.getDomainName(); this.responseHeaders = page.getResponseHeaders(); this.topPrivateDomain = LinkRelevance.getTopLevelDomain(page.getDomainName()); this.crawlerId = page.getCrawlerId(); this.isRelevant = page.getTargetRelevance().isRelevant() ? "relevant" : "irrelevant"; if (page.isHtml()) { String contentAsString = page.getContentAsString(); this.html = contentAsString; ParsedData parsedData = page.getParsedData(); if (parsedData != null) { this.words = parsedData.getWords(); this.wordsMeta = parsedData.getWordsMeta(); this.title = parsedData.getTitle(); } if (page.getTargetRelevance() != null) { this.relevance = page.getTargetRelevance().getRelevance(); } if (contentAsString != null) { try { this.text = DefaultExtractor.getInstance().getText(contentAsString); } catch (Exception e) { this.text = ""; } } } }