/**
 * Reads pair rules from a file.
 *
 * <p>The lines come from {@code TextIO.loadLines(path, "#")} (the {@code "#"} argument is
 * presumably a prefix for lines to ignore; confirm against {@code TextIO}). Every line is
 * handed to {@code PairRule.fromLine}; lines for which it returns {@code null} are left out.
 *
 * @param path file to read the rule lines from
 * @return the rules that could be parsed, in the order their lines were returned
 * @throws IOException if loading the lines fails
 */
List<PairRule> loadPairRule(Path path) throws IOException {
  List<PairRule> parsedRules = new ArrayList<>();
  for (String ruleLine : TextIO.loadLines(path, "#")) {
    PairRule parsed = PairRule.fromLine(ruleLine);
    if (parsed != null) {
      parsedRules.add(parsed);
    }
  }
  return parsedRules;
}
/**
 * Convenience overload: forwards to the two-argument {@code loadLinesFromResource} with
 * {@code null} as the second argument.
 *
 * @param resourcePath path of the resource to load
 * @return whatever the two-argument overload returns for this resource
 * @throws IOException if the two-argument overload fails
 */
public static List<String> loadLinesFromResource(String resourcePath)
    throws IOException {
  return loadLinesFromResource(resourcePath, null);
}
/**
 * Convenience overload: wraps {@code content} in a single-element list and forwards it to the
 * list-accepting {@code createTempFile}.
 *
 * @param content the only entry passed on to the list-based overload
 * @return the path returned by the list-based overload
 * @throws IOException if the list-based overload fails
 */
public static Path createTempFile(String content)
    throws IOException {
  return createTempFile(
      Collections.singletonList(content));
}
/**
 * Creates a disambiguator backed by the given analyzer and rule set.
 *
 * <p>Besides storing its arguments, the constructor loads the bundled resource
 * {@code /ambiguity/freq-100k.txt.gz} and builds the {@code wordFreq} histogram from its lines,
 * using a space character as the second argument of {@code Histogram.loadFromLines}.
 *
 * @param analyzer morphology instance kept in {@code this.analyzer}
 * @param rules rules kept in {@code this.rules}
 * @throws IOException if the frequency resource cannot be loaded
 */
public RuleBasedDisambiguator(TurkishMorphology analyzer, Rules rules) throws IOException {
  this.analyzer = analyzer;
  this.rules = rules;

  Log.info("Loading 100k word frequencies.");
  wordFreq = Histogram.loadFromLines(
      TextIO.loadLinesFromCompressedResource("/ambiguity/freq-100k.txt.gz"),
      ' ');
}
// Log how many files will be visited, then start iterating over them; each file's content is
// read into a String via TextIO.loadUtfAsString (presumably decoded as UTF-8, going by the
// helper's name; confirm). The loop body continues beyond this line.
Log.info("There are %d files", files.size()); for (Path file : files) { String s = TextIO.loadUtfAsString(file);
// Obtain a count for `input` from TextIO.charCount using UTF-8 (presumably the number of
// characters; confirm against TextIO), log that training begins, and start a stopwatch.
long charCount = TextIO.charCount(input, StandardCharsets.UTF_8); Log.info("Training started."); Stopwatch sw = Stopwatch.createStarted();
/**
 * Loads the lines of {@code path} and builds a multimap from them via the list-based
 * {@code loadMultiMap} overload.
 *
 * @param path file whose lines are read
 * @return the multimap produced by the list-based overload
 * @throws IOException if the lines cannot be loaded
 */
private ArrayListMultimap<String, String> loadMultiMap(Path path) throws IOException {
  return loadMultiMap(TextIO.loadLines(path));
}
/**
 * Builds a lexicon from several resources.
 *
 * <p>The lines of every resource are loaded with {@code TextIO.loadLinesFromResource(path, "##")}
 * (the {@code "##"} argument is presumably an ignore prefix; confirm against {@code TextIO}),
 * appended to one list in iteration order, and that list is passed to {@code load}.
 *
 * @param resourcePaths resource paths to read, in the order they should be concatenated
 * @return the lexicon returned by {@code load} for the combined lines
 * @throws IOException if any resource cannot be loaded
 */
public static RootLexicon loadFromResources(Collection<String> resourcePaths)
    throws IOException {
  List<String> combined = Lists.newArrayList();
  for (String path : resourcePaths) {
    List<String> resourceLines = TextIO.loadLinesFromResource(path, "##");
    combined.addAll(resourceLines);
  }
  return load(combined);
}
/**
 * Varargs convenience overload: views {@code content} as a list and forwards it to the
 * list-accepting {@code createTempFile}.
 *
 * @param content entries passed on, in order, to the list-based overload
 * @return the path returned by the list-based overload
 * @throws IOException if the list-based overload fails
 */
public static Path createTempFile(String... content)
    throws IOException {
  return createTempFile(
      Arrays.asList(content));
}
/**
 * Loads weights from a file by reading its lines and delegating to {@code loadFromLines}.
 *
 * @param file file whose lines are read
 * @return the weights built by {@code loadFromLines}
 * @throws IOException if the lines cannot be loaded
 */
public static Weights loadFromFile(Path file) throws IOException {
  return loadFromLines(TextIO.loadLines(file));
}
/**
 * Reads pair rules from a resource.
 *
 * <p>The lines come from {@code TextIO.loadLinesFromResource(resource, "#")} (the {@code "#"}
 * argument is presumably a prefix for lines to ignore; confirm against {@code TextIO}). Every
 * line is handed to {@code PairRule.fromLine}; lines for which it returns {@code null} are
 * left out.
 *
 * @param resource resource path to read the rule lines from
 * @return the rules that could be parsed, in the order their lines were returned
 * @throws IOException if loading the lines fails
 */
static List<PairRule> loadPairRule(String resource) throws IOException {
  List<PairRule> parsedRules = new ArrayList<>();
  for (String ruleLine : TextIO.loadLinesFromResource(resource, "#")) {
    PairRule parsed = PairRule.fromLine(ruleLine);
    if (parsed != null) {
      parsedRules.add(parsed);
    }
  }
  return parsedRules;
}
/**
 * Writes a labeled test set to {@code out}.
 *
 * <p>Each line of {@code p} is cleaned (parenthesized spans replaced by a space, whitespace runs
 * collapsed, ends trimmed) and emitted once with the {@code __label__none} prefix. After all of
 * those, each cleaned line is emitted a second time with the {@code __label__rec_notice} prefix
 * and a randomly chosen line of {@code labeled} inserted at a random token position. The random
 * generator uses a fixed seed, so the output is reproducible for the same inputs.
 *
 * @param p file providing the unlabeled base lines
 * @param labeled file providing the lines that get injected
 * @param out file the resulting lines are written to
 * @throws IOException if reading either input or writing the output fails
 */
static void createTestSet(Path p, Path labeled, Path out) throws IOException {
  List<String> cleaned = new ArrayList<>();
  List<String> output = new ArrayList<>();
  for (String raw : TextIO.loadLines(p)) {
    String withoutParens = raw.replaceAll("[(].+?[)]", " ");
    String sentence = withoutParens.replaceAll("\\s+", " ").trim();
    cleaned.add(sentence);
    output.add("__label__none " + sentence);
  }

  Random random = new Random(2345);
  List<String> labeledLines = TextIO.loadLines(labeled);
  for (String sentence : cleaned) {
    List<String> words = new ArrayList<>(Splitter.on(" ").splitToList(sentence));
    // Draw the injected line first and the insert position second, to keep the random sequence.
    String injected = labeledLines.get(random.nextInt(labeledLines.size()));
    int position = random.nextInt(words.size());
    words.add(position, injected);
    output.add("__label__rec_notice " + String.join(" ", words));
  }

  Files.write(out, output);
}
/**
 * Loads weights from a resource by reading its lines and delegating to {@code loadFromLines}.
 *
 * @param resource resource path whose lines are read
 * @return the weights built by {@code loadFromLines}
 * @throws IOException if the lines cannot be loaded
 */
public static Weights loadFromResource(String resource) throws IOException {
  return loadFromLines(TextIO.loadLinesFromResource(resource));
}
/**
 * Writes a shuffled training set to {@code out}.
 *
 * <p>The junk lines are shuffled and at most {@code junkCount} of them are emitted with the
 * {@code __label__none} prefix. Every line of {@code labeled} is emitted five times with the
 * {@code __label__rec_notice} prefix. The combined list is shuffled again before writing. Both
 * shuffles share one fixed-seed random generator, so the output is reproducible.
 *
 * @param labeled file providing the positive lines
 * @param junk file providing the negative lines
 * @param junkCount maximum number of junk lines to use; capped at the number available
 * @param out file the resulting lines are written to
 * @throws IOException if reading either input or writing the output fails
 */
static void generateTraining(Path labeled, Path junk, int junkCount, Path out)
    throws IOException {
  Random random = new Random(1234);
  List<String> positives = TextIO.loadLines(labeled);
  List<String> junkLines = TextIO.loadLines(junk);

  int junkLimit = Math.min(junkCount, junkLines.size());
  Collections.shuffle(junkLines, random);

  List<String> trainingSet = new ArrayList<>();
  for (String junkLine : junkLines.subList(0, junkLimit)) {
    trainingSet.add("__label__none " + junkLine);
  }
  for (int repeat = 0; repeat < 5; repeat++) {
    for (String positive : positives) {
      trainingSet.add("__label__rec_notice " + positive);
    }
  }

  Collections.shuffle(trainingSet, random);
  Files.write(out, trainingSet);
}
/**
 * Reports which keys of the abbreviation dictionary also occur in the proper-noun dictionary,
 * and writes the remaining proper-noun entries, sorted, to {@code zemberek.prop.sorted}.
 *
 * <p>Both dictionaries are loaded from resources and converted to maps with {@code putToMap}.
 * Each abbreviation key that is also a proper-noun key is logged and removed from the
 * proper-noun map; what is left is sorted with {@code Turkish.STRING_COMPARATOR_ASC}.
 *
 * @throws IOException if a resource cannot be loaded or the output file cannot be written
 */
public static void checkAbbreviations() throws IOException {
  LinkedHashSet<String> properLines =
      new LinkedHashSet<>(TextIO.loadLinesFromResource("tr/proper-from-corpus.dict"));
  LinkedHashSet<String> abbreviationLines =
      new LinkedHashSet<>(TextIO.loadLinesFromResource("tr/abbreviations.dict"));

  Map<String, String> properMap = new HashMap<>();
  putToMap(properLines, properMap);
  Map<String, String> abbreviationMap = new HashMap<>();
  putToMap(abbreviationLines, abbreviationMap);

  for (String key : abbreviationMap.keySet()) {
    if (!properMap.containsKey(key)) {
      continue;
    }
    Log.info(key);
    properMap.remove(key);
  }

  List<String> remaining = new ArrayList<>(properMap.values());
  remaining.sort(Turkish.STRING_COMPARATOR_ASC);
  Files.write(Paths.get("zemberek.prop.sorted"), remaining);
}
/**
 * Creates a loader over all regular files found directly inside a set of corpus folders.
 *
 * <p>The folder names are read from {@code folderListFile} via
 * {@code TextIO.loadLines(folderListFile, "#")} (the {@code "#"} argument is presumably an
 * ignore prefix; confirm against {@code TextIO}) and resolved against {@code corporaRoot}.
 * Each folder is walked to depth 1 and only entries that are files are kept. The collected
 * paths are sorted by absolute path before the loader is created.
 *
 * @param corporaRoot directory the listed folder names are resolved against
 * @param folderListFile file listing the folder names
 * @param blockSize block size handed to the {@code BlockTextLoader} constructor
 * @return a loader over the sorted corpus files
 * @throws IOException if the folder list cannot be read or a folder cannot be walked
 */
public static BlockTextLoader fromDirectoryRoot(
    Path corporaRoot, Path folderListFile, int blockSize) throws IOException {
  List<String> rootNames = TextIO.loadLines(folderListFile, "#");
  List<Path> roots = new ArrayList<>();
  rootNames.forEach(s -> roots.add(corporaRoot.resolve(s)));

  List<Path> corpora = new ArrayList<>();
  for (Path corpusRoot : roots) {
    // Files.walk keeps directory handles open until its stream is closed, so it must be
    // closed explicitly; otherwise handles leak for every corpus folder visited.
    try (java.util.stream.Stream<Path> entries = Files.walk(corpusRoot, 1)) {
      corpora.addAll(entries
          .filter(s -> s.toFile().isFile())
          .collect(Collectors.toList()));
    }
  }

  corpora.sort(Comparator.comparing(a -> a.toFile().getAbsolutePath()));
  Log.info("There are %d corpus files.", corpora.size());
  return new BlockTextLoader(corpora, blockSize);
}
/**
 * Adds the entries of several text dictionary resources to this builder's lexicon.
 *
 * <p>The resource names are logged, the lines of all resources are concatenated in iteration
 * order, and the result of {@code TurkishDictionaryLoader.load} on those lines is added to
 * {@code lexicon}.
 *
 * @param resources resource paths of the dictionaries to add
 * @return this builder
 * @throws IOException if a resource cannot be loaded
 */
public Builder addTextDictionaryResources(Collection<String> resources) throws IOException {
  Log.info("Dictionaries :%s", String.join(", ", resources));

  List<String> dictionaryLines = new ArrayList<>();
  for (String resourceName : resources) {
    List<String> loaded = TextIO.loadLinesFromResource(resourceName);
    dictionaryLines.addAll(loaded);
  }

  lexicon.addAll(TurkishDictionaryLoader.load(dictionaryLines));
  return this;
}
/**
 * Converts tab-separated lines into label-prefixed lines.
 *
 * <p>Each line is split at its first tab: the part before the tab is trimmed and normalized,
 * and the trimmed part from the tab onwards becomes the label, prefixed with
 * {@code __label__}. The result line is the label, a space, then the normalized content.
 * Lines without a tab are skipped.
 *
 * @param input file whose lines are converted
 * @return the converted lines, in input order
 * @throws IOException if the input cannot be read
 */
static List<String> addLabels(Path input) throws IOException {
  List<String> labeled = new ArrayList<>();
  for (String row : TextIO.loadLines(input)) {
    int tabIndex = row.indexOf('\t');
    if (tabIndex != -1) {
      String text = row.substring(0, tabIndex).trim();
      // The flag is (re)applied for every converted line, right before normalizing.
      normalizer.setAlwaysApplyDeasciifier(true);
      String normalized = normalizer.normalize(text);
      String labelName = row.substring(tabIndex).trim();
      labeled.add("__label__" + labelName + " " + normalized);
    }
  }
  return labeled;
}
/**
 * Loads a NER model bundled under {@code /ner/model/<name>}.
 *
 * <p>The type names are read from the {@code ner-types} resource in that folder. For each
 * type, the resource {@code <type>.ner.model} is loaded as a {@code ClassModel} and stored
 * under the model's {@code id}.
 *
 * @param name model folder name below {@code /ner/model/}
 * @param morphology morphology instance passed to the {@code PerceptronNer} constructor
 * @return a {@code PerceptronNer} built from the loaded class models
 * @throws RuntimeException wrapping the {@link IOException} if loading fails
 */
public static PerceptronNer loadModelFromResources(String name, TurkishMorphology morphology) {
  final String resourceRoot = "/ner/model/" + name;
  final Map<String, ClassModel> modelsById = new HashMap<>();
  try {
    for (String type : TextIO.loadLinesFromResource(resourceRoot + "/ner-types")) {
      ClassModel model =
          ClassModel.loadFromResource(type, resourceRoot + "/" + type + ".ner.model");
      modelsById.put(model.id, model);
    }
    return new PerceptronNer(modelsById, morphology);
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
}
/**
 * Splits the raw data under {@code root/raw3/all} into test and train files.
 *
 * <p>The lines are shuffled with a fixed-seed random generator. The first {@code testSize}
 * lines form the test split and all remaining lines form the train split. Both splits are
 * then filtered to lines containing {@code __label__}, stripped of a leading double quote,
 * and normalized, before being written to {@code testRaw} and {@code trainRaw}.
 *
 * @param testSize number of shuffled lines assigned to the test split; must be between 0 and
 *     the number of loaded lines, otherwise {@code subList} throws
 * @throws IOException if the raw file cannot be read or the outputs cannot be written
 */
void generateData(int testSize) throws IOException {
  Path raw = root.resolve("raw3/all");
  Random r = new Random(1);
  List<String> lines = TextIO.loadLines(raw);
  Collections.shuffle(lines, r);

  List<String> test = lines.subList(0, testSize);
  // subList's end index is exclusive: using lines.size() (not size() - 1) keeps the last
  // shuffled line in the training split instead of silently dropping it.
  List<String> train = lines.subList(testSize, lines.size());
  Log.info("Train = %d, Test = %d lines.", train.size(), test.size());

  train = train.stream()
      .filter(s -> s.contains("__label__"))
      .map(s -> s.replaceAll("^\"", ""))
      .map(s -> normalizer.normalize(s))
      .collect(Collectors.toList());
  test = test.stream()
      .filter(s -> s.contains("__label__"))
      .map(s -> s.replaceAll("^\"", ""))
      .map(s -> normalizer.normalize(s))
      .collect(Collectors.toList());
  Log.info("After pre-process, Train = %d, Test = %d lines.", train.size(), test.size());

  Files.createDirectories(t1out);
  Files.write(trainRaw, train);
  Files.write(testRaw, test);
}