public static void addTextPressoPipes(List<String> usedPipeNames, List<Pipe> pipes, boolean ignoreCase) throws FileNotFoundException, Exception { usedPipeNames.add("TextPresso"); // TEXTPRESSO files, files are split by how many tokens for (int i = 1; i < 8; i++) { pipes.add(new TrieLexiconMembership("textPresso" + i, new File( LEXICON_HOME + "TextPresso-wordLength-" + i + ".txt"), ignoreCase)); } pipes.add(new TrieLexiconMembership("textPressoAll", new File( LEXICON_HOME + "TextPresso-all.txt"), ignoreCase)); pipes.addAll(NGramPipeFactory.getAllGramsPipes("textPressoAll", new File(LEXICON_HOME + "TextPresso-all.txt"), ignoreCase)); }
private static void addAbbreviationLexiconPipes(List<String> usedPipeNames, List<Pipe> pipes) throws IOException { usedPipeNames.add("AbbrevLex"); File ratMouse = new File(LEXICON_HOME + "NN2007RatMouseAbbrev.txt"); File human = new File(LEXICON_HOME + "NN2002HumanAbbrev.txt"); boolean ignoreCase = true; // should be one word only but may not.. pipes.add(new TrieLexiconMembership("NNHumanAbbrev", human, ignoreCase)); pipes.add(new TrieLexiconMembership("NNRatMouseAbbrev", ratMouse, ignoreCase)); addPrefixPipes(pipes, ratMouse, "NNHumanAbbrevPrefix"); addPrefixPipes(pipes, human, "NNRatMouseAbbrevPrefix"); }
public static void addBrainRegionLexicons(List<String> usedPipeNames, List<Pipe> pipes, boolean ignoreCase) throws FileNotFoundException, Exception { usedPipeNames.add("BrainRegions"); // BRAINREGION Lexicons pipes.add(new TrieLexiconMembership("NNHu", new File(LEXICON_HOME + "NN2002Human.txt"), ignoreCase)); pipes.add(new TrieLexiconMembership("NNMouseRat", new File(LEXICON_HOME + "NN2007RatMouse.txt"), ignoreCase)); pipes.add(new TrieLexiconMembership("Allen", new File(LEXICON_HOME + "Allen.txt"), ignoreCase)); pipes.add(new TrieLexiconMembership("BAMS", new File(LEXICON_HOME + "BAMS.txt"), ignoreCase)); pipes.add(new TrieLexiconMembership("AllRegions", new File(LEXICON_HOME + "AllRegions.txt"), ignoreCase)); pipes.addAll(NGramPipeFactory.getAllGramsPipes("AllRegions", new File( LEXICON_HOME + "AllRegions.txt"), ignoreCase)); }
public static Pipe getNGramPipe(String name, File inputFile, boolean ignoreCase, int gram) throws Exception { File tempFile = File.createTempFile("ngram", ".txt"); // System.out.println( "Your temp file is " + // tempFile.getCanonicalPath() ); // Arrange for it to be deleted at exit. tempFile.deleteOnExit(); BufferedWriter bw = new BufferedWriter(new FileWriter(tempFile)); BufferedReader br = new BufferedReader(new FileReader(inputFile)); while (br.ready()) { String line = br.readLine().trim(); if (line.equals("")) continue; // ignore blank lines // check null String[] gramStrings = getGrams(line, gram); if (gramStrings != null) { for (String gramString : gramStrings) { bw.write(gramString); // System.out.println(gramString); bw.newLine(); } } } bw.close(); br.close(); return new TrieLexiconMembership(name + "-" + gram + "-gram", tempFile, ignoreCase); }
new TrieLexiconMembership (new File(conlllexdir + "conll/CONLLTWOPER")), new TrieLexiconMembership (new File(conlllexdir + "conll/CONLLTWOLOC")), new TrieLexiconMembership (new File(conlllexdir + "conll/CONLLTWOORG")), new TrieLexiconMembership (new File(conlllexdir + "conll/CONLLTWOMISC")), }); new TrieLexiconMembership (new File(conlllexdir + "googlesets/ORGSOCCER")), new TrieLexiconMembership (new File(conlllexdir + "googlesets/ORGGOVT")), new TrieLexiconMembership (new File(conlllexdir + "googlesets/ORGNGO")), new TrieLexiconMembership (new File(conlllexdir + "googlesets/ORGMILITARY")), new TrieLexiconMembership (new File(conlllexdir + "googlesets/ORGCOMPANY")), new TrieLexiconMembership (new File(conlllexdir + "googlesets/ORGBANK")), new TrieLexiconMembership (new File(conlllexdir + "googlesets/ORGTRADE")), new TrieLexiconMembership (new File(conlllexdir + "googlesets/ORGNEWS")), new TrieLexiconMembership (new File(conlllexdir + "googlesets/ORGOPERATINGSYSTEM")), new TrieLexiconMembership (new File(conlllexdir + "googlesets/ORGPOLITICALPARTY")), new TrieLexiconMembership (new File(conlllexdir + "googlesets/MISCRELIGION")), new TrieLexiconMembership (new File(conlllexdir + "googlesets/MISCGOVT")), new TrieLexiconMembership (new File(conlllexdir + "googlesets/MISCWAR")), new TrieLexiconMembership (new File(conlllexdir + "googlesets/MISCCURRENCY")), new TrieLexiconMembership (new File(conlllexdir + "googlesets/LOC")), new TrieLexiconMembership (new File(conlllexdir + "googlesets/PERFL")), new TrieLexiconMembership (new File(conlllexdir + "googlesets/MISCF")), new TrieLexiconMembership (new File(conlllexdir + "googlesets/ORGFRAWEDITEDSORTED")), }); new TrieLexiconMembership ("COUNTRYCAPITAL", new File(conlllexdir + "country-capitals"), true), new TrieLexiconMembership ("USSTATE", new File(conlllexdir + "US-states"), true),
new TrieLexiconMembership (new File(conlllexdir + "conll/CONLLTWOPER")), new TrieLexiconMembership (new File(conlllexdir + "conll/CONLLTWOLOC")), new TrieLexiconMembership (new File(conlllexdir + "conll/CONLLTWOORG")), new TrieLexiconMembership (new File(conlllexdir + "conll/CONLLTWOMISC")), }); new TrieLexiconMembership (new File(conlllexdir + "googlesets/ORGSOCCER")), new TrieLexiconMembership (new File(conlllexdir + "googlesets/ORGGOVT")), new TrieLexiconMembership (new File(conlllexdir + "googlesets/ORGNGO")), new TrieLexiconMembership (new File(conlllexdir + "googlesets/ORGMILITARY")), new TrieLexiconMembership (new File(conlllexdir + "googlesets/ORGCOMPANY")), new TrieLexiconMembership (new File(conlllexdir + "googlesets/ORGBANK")), new TrieLexiconMembership (new File(conlllexdir + "googlesets/ORGTRADE")), new TrieLexiconMembership (new File(conlllexdir + "googlesets/ORGNEWS")), new TrieLexiconMembership (new File(conlllexdir + "googlesets/ORGOPERATINGSYSTEM")), new TrieLexiconMembership (new File(conlllexdir + "googlesets/ORGPOLITICALPARTY")), new TrieLexiconMembership (new File(conlllexdir + "googlesets/MISCRELIGION")), new TrieLexiconMembership (new File(conlllexdir + "googlesets/MISCGOVT")), new TrieLexiconMembership (new File(conlllexdir + "googlesets/MISCWAR")), new TrieLexiconMembership (new File(conlllexdir + "googlesets/MISCCURRENCY")), new TrieLexiconMembership (new File(conlllexdir + "googlesets/LOC")), new TrieLexiconMembership (new File(conlllexdir + "googlesets/PERFL")), new TrieLexiconMembership (new File(conlllexdir + "googlesets/MISCF")), new TrieLexiconMembership (new File(conlllexdir + "googlesets/ORGFRAWEDITEDSORTED")), }); new TrieLexiconMembership ("COUNTRYCAPITAL", new File(conlllexdir + "country-capitals"), true), new TrieLexiconMembership ("USSTATE", new File(conlllexdir + "US-states"), true),
new TrieLexiconMembership (new File(conlllexdir + "conll/CONLLTWOPER")), new TrieLexiconMembership (new File(conlllexdir + "conll/CONLLTWOLOC")), new TrieLexiconMembership (new File(conlllexdir + "conll/CONLLTWOORG")), new TrieLexiconMembership (new File(conlllexdir + "conll/CONLLTWOMISC")), }); new TrieLexiconMembership (new File(conlllexdir + "googlesets/ORGSOCCER")), new TrieLexiconMembership (new File(conlllexdir + "googlesets/ORGGOVT")), new TrieLexiconMembership (new File(conlllexdir + "googlesets/ORGNGO")), new TrieLexiconMembership (new File(conlllexdir + "googlesets/ORGMILITARY")), new TrieLexiconMembership (new File(conlllexdir + "googlesets/ORGCOMPANY")), new TrieLexiconMembership (new File(conlllexdir + "googlesets/ORGBANK")), new TrieLexiconMembership (new File(conlllexdir + "googlesets/ORGTRADE")), new TrieLexiconMembership (new File(conlllexdir + "googlesets/ORGNEWS")), new TrieLexiconMembership (new File(conlllexdir + "googlesets/ORGOPERATINGSYSTEM")), new TrieLexiconMembership (new File(conlllexdir + "googlesets/ORGPOLITICALPARTY")), new TrieLexiconMembership (new File(conlllexdir + "googlesets/MISCRELIGION")), new TrieLexiconMembership (new File(conlllexdir + "googlesets/MISCGOVT")), new TrieLexiconMembership (new File(conlllexdir + "googlesets/MISCWAR")), new TrieLexiconMembership (new File(conlllexdir + "googlesets/MISCCURRENCY")), new TrieLexiconMembership (new File(conlllexdir + "googlesets/LOC")), new TrieLexiconMembership (new File(conlllexdir + "googlesets/PERFL")), new TrieLexiconMembership (new File(conlllexdir + "googlesets/MISCF")), new TrieLexiconMembership (new File(conlllexdir + "googlesets/ORGFRAWEDITEDSORTED")), }); new TrieLexiconMembership ("COUNTRYCAPITAL", new File(conlllexdir + "country-capitals"), true), new TrieLexiconMembership ("USSTATE", new File(conlllexdir + "US-states"), true),
// When the CoNLL lexicon option is enabled, chain the four CoNLL lexicon membership
// pipes (PER, LOC, ORG, MISC files under lexdir + "conll/") into one SerialPipes.
if (includeConllLexiconsOption.value) {
    conllLexiconsPipe = new SerialPipes(new Pipe[] {
        new TrieLexiconMembership(new File(lexdir + "conll/CONLLTWOPER")),
        new TrieLexiconMembership(new File(lexdir + "conll/CONLLTWOLOC")),
        new TrieLexiconMembership(new File(lexdir + "conll/CONLLTWOORG")),
        new TrieLexiconMembership(new File(lexdir + "conll/CONLLTWOMISC"))
    });
}
// When the CoNLL lexicon option is enabled, chain the four CoNLL lexicon membership
// pipes (PER, LOC, ORG, MISC files under lexdir + "conll/") into one SerialPipes.
if (includeConllLexiconsOption.value) {
    conllLexiconsPipe = new SerialPipes(new Pipe[] {
        new TrieLexiconMembership(new File(lexdir + "conll/CONLLTWOPER")),
        new TrieLexiconMembership(new File(lexdir + "conll/CONLLTWOLOC")),
        new TrieLexiconMembership(new File(lexdir + "conll/CONLLTWOORG")),
        new TrieLexiconMembership(new File(lexdir + "conll/CONLLTWOMISC"))
    });
}
// When the CoNLL lexicon option is enabled, chain the four CoNLL lexicon membership
// pipes (PER, LOC, ORG, MISC files under lexdir + "conll/") into one SerialPipes.
if (includeConllLexiconsOption.value) {
    conllLexiconsPipe = new SerialPipes(new Pipe[] {
        new TrieLexiconMembership(new File(lexdir + "conll/CONLLTWOPER")),
        new TrieLexiconMembership(new File(lexdir + "conll/CONLLTWOLOC")),
        new TrieLexiconMembership(new File(lexdir + "conll/CONLLTWOORG")),
        new TrieLexiconMembership(new File(lexdir + "conll/CONLLTWOMISC"))
    });
}