/**
 * Tests a page's canonical title against every pattern in {@code TITLE_BLACKLIST}.
 *
 * @param lp the page whose canonical title is checked
 * @return {@code true} if at least one blacklist pattern matches the whole
 *         canonical title, {@code false} otherwise
 */
private boolean isBlacklisted(LocalPage lp) {
    final String canonical = lp.getTitle().getCanonicalTitle();
    boolean blacklisted = false;
    for (Pattern pattern : TITLE_BLACKLIST) {
        // matches() requires the pattern to cover the entire title, not just a part of it.
        if (pattern.matcher(canonical).matches()) {
            blacklisted = true;
            break;
        }
    }
    return blacklisted;
}
/**
 * Reports whether a page looks like a list page, judged only by whether its
 * canonical title starts with "list" (case-insensitively).
 *
 * TODO: make multi lingual
 *
 * @param lp the page to inspect
 * @return {@code true} if the lower-cased canonical title starts with {@code "list"}
 */
private boolean isList(LocalPage lp) {
    String canonical = lp.getTitle().getCanonicalTitle();
    // Lower-case with a fixed locale: under the default locale an upper-case 'I'
    // can map to a dotless 'ı' (e.g. Turkish), which would break the prefix check.
    return canonical.toLowerCase(java.util.Locale.ROOT).startsWith("list");
}
/**
 * Logs, at info level, the titles of all pages in a cluster flagged as ambiguous.
 *
 * @param vertices ids of the pages that make up the cluster
 * @throws WikiBrainException wrapping any {@code DaoException} raised while
 *         looking up the pages
 */
private void printAmbiguousCluster(List<LocalId> vertices) throws WikiBrainException {
    try {
        List<String> names = Lists.newArrayList();
        for (LocalId id : vertices) {
            names.add(lpDao.getById(id).getTitle().toString());
        }
        String joined = StringUtils.join(names, ", ");
        LOG.info("Found ambiguous cluster: " + joined);
    } catch (DaoException e) {
        throw new WikiBrainException(e);
    }
}
/**
 * Builds a map describing a page with the keys {@code "articleId"},
 * {@code "title"} and {@code "lang"}.
 *
 * @param p the page to describe; may be {@code null}
 * @return the populated map, or {@code null} when {@code p} is {@code null}
 */
private Map pageJson(LocalPage p) {
    if (p != null) {
        Map fields = new HashMap();
        fields.put("articleId", p.getLocalId());
        fields.put("title", p.getTitle().getCanonicalTitle());
        fields.put("lang", p.getLanguage().getLangCode());
        return fields;
    }
    return null;
}
/**
 * Builds a compact URL of the form {@code /w/<langCode>/<localId>/<title>}.
 * Spaces in the canonical title become underscores and any other whitespace
 * is removed.
 *
 * @return the compact URL, for example "/w/en/1000/Hercule_Poirot"
 */
public String getCompactUrl() {
    String canonical = getTitle().getCanonicalTitle();
    // Underscore plain spaces first, then drop whatever whitespace is left (tabs, newlines, ...).
    String slug = canonical.replace(" ", "_").replaceAll("\\s+", "");
    String prefix = "/w/" + getLanguage().getLangCode() + "/" + getLocalId();
    return prefix + "/" + slug;
}
/**
 * Builds the URL {@code /w/<langCode>/<localId>/<title>} for a page, using the
 * {@code language} field for the language code and replacing spaces in the
 * canonical title with underscores.
 *
 * @param page the page to link to
 * @return the URL string for the page
 */
private String makeMentionUrl(LocalPage page) {
    String prefix = "/w/" + language.getLangCode() + "/" + page.getLocalId() + "/";
    String slug = page.getTitle().getCanonicalTitle().replace(' ', '_');
    return prefix + slug;
}
public static void main(String args[]) throws Exception { // Prepare the environment Env env = EnvBuilder.envFromArgs(args); // Get the configurator that creates components and a phraze analyzer from it Configurator configurator = env.getConfigurator(); PhraseAnalyzer pa = configurator.get(PhraseAnalyzer.class, "anchortext"); LocalPageDao pageDao = configurator.get(LocalPageDao.class); // get the most common phrases in simple LinkedHashMap<LocalId, Float> resolution = pa.resolve(Language.SIMPLE, "Apple", 20); // show the closest pages System.out.println("resolution of apple"); if (resolution == null) { System.out.println("\tno resolution !"); } else { for (LocalId p : resolution.keySet()) { Title title = pageDao.getById(p).getTitle(); System.out.println("\t" + title + ": " + resolution.get(p)); } } } }
public static void main(String args[]) throws Exception { // Prepare the environment Env env = EnvBuilder.envFromArgs(args); // Get the configurator that creates components and a phraze analyzer from it Configurator configurator = env.getConfigurator(); PhraseAnalyzer pa = configurator.get(PhraseAnalyzer.class, "anchortext"); LocalPageDao pageDao = configurator.get(LocalPageDao.class); // get the most common phrases in simple LinkedHashMap<LocalId, Float> resolution = pa.resolve(Language.SIMPLE, "Apple", 20); // show the closest pages System.out.println("resolution of apple"); if (resolution == null) { System.out.println("\tno resolution !"); } else { for (LocalId p : resolution.keySet()) { Title title = pageDao.getById(p).getTitle(); System.out.println("\t" + title + ": " + resolution.get(p)); } } } }
/**
 * Returns the title of this concept's page in the language chosen by
 * {@code getBestAvailableEnglishLang}. The first local entity found for that
 * language is the one whose page is looked up.
 *
 * @param lpDao dao used to fetch the local page
 * @param returnRandomLangIfEnglishNotAvailable passed straight through to
 *        {@code getBestAvailableEnglishLang}
 * @return the title of the selected local page
 * @throws WikiBrainException wrapping any {@code DaoException} from the lookup
 */
public Title getBestEnglishTitle(LocalPageDao lpDao, boolean returnRandomLangIfEnglishNotAvailable) throws WikiBrainException {
    try {
        Language bestLang = getLanguageSet().getBestAvailableEnglishLang(returnRandomLangIfEnglishNotAvailable);
        // Only the first local entity for the chosen language is consulted.
        return lpDao.getById(bestLang, getLocalEntities(bestLang).iterator().next().getId()).getTitle();
    } catch (DaoException e) {
        throw new WikiBrainException(e);
    }
}
public String formatExplanation(Explanation explanation) throws DaoException { String[] plaintextBuilder = explanation.getFormat().split("\\?", -1); if (explanation.getInformation().size()!=plaintextBuilder.length-1){ throw new IllegalStateException("Incorrect number of information objects in explanation. Expected "+(plaintextBuilder.length-1)+" but found "+explanation.getInformation().size()); } String plaintext = plaintextBuilder[0]; for (int i = 1; i<plaintextBuilder.length; i++){ Object object = explanation.getInformation().get(i-1); //Handle the different possible types of information. //Add additional handlers as appropriate if (object instanceof LocalPage){ plaintext+=((LocalPage) object).getTitle().getCanonicalTitle(); }else if(object instanceof UniversalPage){ Language defaultlang = ((UniversalPage) object).getLanguageSet().getDefaultLanguage(); LocalId nameId = (LocalId)((UniversalPage) object).getLocalEntities(defaultlang).toArray()[0]; LocalPage namePage = localPageDao.getById(nameId.getLanguage(), nameId.getId()); plaintext+=namePage.getTitle().getCanonicalTitle(); }else { plaintext+=object.toString(); } plaintext+=plaintextBuilder[i]; } return plaintext; }
/**
 * Runs {@code wikify} on the "Barack Obama" article and prints every detected
 * link together with the title of its destination page.
 *
 * @throws DaoException if a page lookup fails
 */
public void testWikify() throws DaoException {
    int barackId = lpd.getIdByTitle("Barack Obama", language, NameSpace.ARTICLE);
    RawPage raw = rpd.getById(language, barackId);

    // Single pass; the run index is still printed with the output.
    final int runs = 1;
    for (int run = 0; run < runs; run++) {
        List<LocalLink> links = wikify(raw.getLocalId());
        System.out.println("Links detected for " + raw.getTitle() + " (" + run + ")");
        for (LocalLink link : links) {
            System.out.println("\t" + link + " page " + lpd.getById(language, link.getDestId()).getTitle());
        }
    }
}
/**
 * Looks up the page with the given id in {@code language}, through the
 * metric's local page dao, and returns its title as a string.
 *
 * @param id identifier of the page; only its numeric id is used, the language
 *        comes from the {@code language} field
 * @return the string form of the page's title
 * @throws DaoException if the lookup fails
 */
private String getTitle(LocalId id) throws DaoException {
    String title = metric.getLocalPageDao()
            .getById(language, id.getId())
            .getTitle()
            .toString();
    return title;
}
/**
 * Handles a wikify request: runs the "websail" wikifier for the request's
 * language over the {@code text} parameter and writes a JSON response with
 * the text and one entry per detected link.
 *
 * @param req the incoming request; must carry a {@code text} parameter
 * @throws ConfigurationException if the wikifier cannot be obtained
 * @throws DaoException if a destination page lookup fails
 */
private void doWikify(WikiBrainWebRequest req) throws ConfigurationException, DaoException {
    Language lang = req.getLanguage();
    Wikifier wf = env.getConfigurator().get(Wikifier.class, "websail", "language", lang.getLangCode());
    String text = req.getParamOrDie("text");

    List references = new ArrayList();
    for (LocalLink link : wf.wikify(text)) {
        LocalPage dest = pageDao.getById(lang, link.getDestId());

        // A link whose destination page cannot be found is reported as "Unknown".
        String title;
        if (dest == null) {
            title = "Unknown";
        } else {
            title = dest.getTitle().getCanonicalTitle();
        }

        Map entry = new HashMap();
        entry.put("index", link.getLocation());
        entry.put("text", link.getAnchorText());
        entry.put("lang", lang.getLangCode());
        entry.put("articleId", link.getDestId());
        entry.put("title", title);
        references.add(entry);
    }
    req.writeJsonResponse("text", text, "references", references);
}
public static void main(String[] args) throws Exception{ // Initialize the WikiBrain environment and get the local page dao Env env = EnvBuilder.envFromArgs(args); Configurator conf = env.getConfigurator(); LocalPageDao lpDao = conf.get(LocalPageDao.class); Language simple = env.getDefaultLanguage(); // Retrieve the "milnewitten" sr metric for simple english SRMetric sr = conf.get( SRMetric.class, "prebuiltword2vec", "language", simple.getLangCode()); //Similarity between strings for (String phrase : Arrays.asList("Barack Obama", "US", "Canada", "vim")) { SRResultList similar = sr.mostSimilar(phrase, 3); List<String> pages = new ArrayList<String>(); for (int i = 0; i < similar.numDocs(); i++) { LocalPage page = lpDao.getById(simple, similar.getId(i)); pages.add((i+1) + ") " + page.getTitle()); } System.out.println("'" + phrase + "' is similar to " + StringUtils.join(pages, ", ")); } } }
public static void main(String[] args) throws Exception{ // Initialize the WikiBrain environment and get the local page dao Env env = EnvBuilder.envFromArgs(args); Configurator conf = env.getConfigurator(); LocalPageDao lpDao = conf.get(LocalPageDao.class); Language simple = env.getDefaultLanguage(); // Retrieve the "milnewitten" sr metric for simple english SRMetric sr = conf.get( SRMetric.class, "prebuiltword2vec", "language", simple.getLangCode()); //Similarity between strings for (String phrase : Arrays.asList("Barack Obama", "US", "Canada", "vim")) { SRResultList similar = sr.mostSimilar(phrase, 3); List<String> pages = new ArrayList<String>(); for (int i = 0; i < similar.numDocs(); i++) { LocalPage page = lpDao.getById(simple, similar.getId(i)); pages.add((i+1) + ") " + page.getTitle()); } System.out.println("'" + phrase + "' is similar to " + StringUtils.join(pages, ", ")); } } }
public static void main(String args[]) throws ConfigurationException, DaoException { // Setup environment Env env = EnvBuilder.envFromArgs(args); LocalPageDao pageDao = env.getConfigurator().get(LocalPageDao.class); UniversalPageDao conceptDao = env.getConfigurator().get(UniversalPageDao.class); // Get local and universal pages LocalPage page = pageDao.getByTitle(Language.EN, "Apple"); UniversalPage concept = conceptDao.getByLocalPage(page); // Translate to other languages. System.out.format("%s in other languages:\n", page.getTitle()); for (Language lang : concept.getLanguageSet()) { LocalPage page2 = pageDao.getById(lang, concept.getLocalId(lang)); System.out.format("%s: %s\n", lang.toString(), page2.getTitle().getCanonicalTitle()); } }
public static void main(String args[]) throws ConfigurationException, DaoException { // Setup environment Env env = EnvBuilder.envFromArgs(args); LocalPageDao pageDao = env.getConfigurator().get(LocalPageDao.class); UniversalPageDao conceptDao = env.getConfigurator().get(UniversalPageDao.class); // Get local and universal pages LocalPage page = pageDao.getByTitle(Language.EN, "Apple"); UniversalPage concept = conceptDao.getByLocalPage(page); // Translate to other languages. System.out.format("%s in other languages:\n", page.getTitle()); for (Language lang : concept.getLanguageSet()) { LocalPage page2 = pageDao.getById(lang, concept.getLocalId(lang)); System.out.format("%s: %s\n", lang.toString(), page2.getTitle().getCanonicalTitle()); } }
public static void main(String args[]) throws ConfigurationException, DaoException { // Get the pageview dao Env env = EnvBuilder.envFromArgs(args); PageViewDao viewDao = env.getConfigurator().get(PageViewDao.class); LocalPageDao pageDao = env.getConfigurator().get(LocalPageDao.class); Language lang = env.getDefaultLanguage(); // Download and import pageview stats if necessary. DateTime start = new DateTime(2014, 8, 14, 11, 0, 0); DateTime end = new DateTime(2014, 8, 14, 23, 0, 0); viewDao.ensureLoaded(start, end, env.getLanguages()); // Retrieve counts for all pageviews TIntIntMap allViews = viewDao.getAllViews(lang, start, end); int pageIds[] = WpCollectionUtils.sortMapKeys(allViews, true); System.out.println("Top pageviews in " + lang); for (int i = 0; i < 10; i++) { LocalPage page = pageDao.getById(lang, pageIds[i]); int n = allViews.get(pageIds[i]); System.out.format("%d. %s (nviews=%d)\n", (i+1), page.getTitle(), n); } } }
public static void main(String args[]) throws ConfigurationException, DaoException { // Get the pageview dao Env env = EnvBuilder.envFromArgs(args); PageViewDao viewDao = env.getConfigurator().get(PageViewDao.class); LocalPageDao pageDao = env.getConfigurator().get(LocalPageDao.class); Language lang = env.getDefaultLanguage(); // Download and import pageview stats if necessary. DateTime start = new DateTime(2014, 8, 14, 11, 0, 0); DateTime end = new DateTime(2014, 8, 14, 23, 0, 0); viewDao.ensureLoaded(start, end, env.getLanguages()); // Retrieve counts for all pageviews TIntIntMap allViews = viewDao.getAllViews(lang, start, end); int pageIds[] = WpCollectionUtils.sortMapKeys(allViews, true); System.out.println("Top pageviews in " + lang); for (int i = 0; i < 10; i++) { LocalPage page = pageDao.getById(lang, pageIds[i]); int n = allViews.get(pageIds[i]); System.out.format("%d. %s (nviews=%d)\n", (i+1), page.getTitle(), n); } } }
/**
 * Saves a local page by passing its fields to {@code insert}.
 *
 * @param page the page to save
 * @throws DaoException declared by this method; presumably raised by
 *         {@code insert} on failure -- confirm against its definition
 */
@Override public void save(LocalPage page) throws DaoException {
    // The argument order below is positional; NOTE(review): it presumably has to
    // match the column order insert() expects -- confirm against its definition.
    insert(
        page.getLanguage().getId(),            // id of the page's language
        page.getLocalId(),                     // page id local to that language
        page.getTitle().getCanonicalTitle(),   // canonical title text
        page.getNameSpace().getArbitraryId(),  // id of the page's namespace
        page.isRedirect(),                     // redirect flag
        page.isDisambig()                      // disambiguation flag
    );
}