/**
 * Returns a new corpus over the same files with an additional transformer attached.
 *
 * <p>If this corpus already has a transformer, the two are chained with
 * {@link BiConsumer#andThen(BiConsumer)}, so the existing one runs first and the supplied one
 * runs afterwards. Otherwise the supplied transformer is used as is. This corpus is not
 * modified.
 *
 * @param transformer
 *            the transformer to attach
 * @return a new corpus sharing this corpus' files and using the resulting transformer
 */
public Corpus transform(final BiConsumer<Path, KAFDocument> transformer) {
    if (this.transformer == null) {
        // Nothing to chain with: the supplied transformer becomes the only one
        return new Corpus(this.files, transformer);
    }
    return new Corpus(this.files, this.transformer.andThen(transformer));
}
public Corpus[] split(@Nullable final Long shuffleSeed, final float... percentages) { // Shuffle the files if necessary, using the supplied seed Path[] files = this.files; if (shuffleSeed != null) { final List<Path> list = Lists.newArrayList(files); final Random random = new Random(shuffleSeed); Collections.shuffle(list, random); files = list.toArray(new Path[list.size()]); } // Split the (shuffled) file array based on supplied percentages final Corpus[] corpora = new Corpus[percentages.length]; int index = 0; float cumulated = 0.0f; for (int i = 0; i < percentages.length; ++i) { cumulated += percentages[i]; if (cumulated > 1.0f) { throw new IllegalArgumentException("Invalid percentages (sum must be 1.0f): " + Arrays.toString(percentages)); } final int endIndex = (int) Math.ceil(files.length * cumulated); final Path[] partition = Arrays.copyOfRange(files, index, endIndex); if (shuffleSeed != null) { Arrays.sort(partition); } corpora[i] = new Corpus(partition, this.transformer); index = endIndex; } return corpora; }
public static Corpus create(final boolean recursive, final Iterable<?> filesOrDirs) { final List<Path> paths = Lists.newArrayList(); for (final Object fileOrDir : filesOrDirs) { if (fileOrDir instanceof Path) { paths.add((Path) fileOrDir); } else if (fileOrDir instanceof File) { paths.add(((File) fileOrDir).toPath()); } else { paths.add(Paths.get(fileOrDir.toString())); } } // todo: this uses Util, a class included in utils-svm final List<Path> files = Util.fileMatch(paths, ImmutableList.of(".naf", ".naf.gz", ".naf.bz2", ".naf.xz", ".xml", ".xml.gz", ".xml.bz2", ".xml.xz"), recursive); for (int i = 0; i < files.size(); ++i) { files.set(i, files.get(i).toAbsolutePath().normalize()); } if (files.isEmpty()) { return EMPTY; } else { return new Corpus(files.toArray(new Path[files.size()]), null); } }