/**
 * Convenience for {@link #map(String, Predicate, Projection)} which uses a
 * {@link DistributedFunction} as the projection function.
 */
@Nonnull
public static <T, K, V> BatchSource<T> map(
        @Nonnull String mapName,
        @Nonnull Predicate<? super K, ? super V> predicate,
        @Nonnull DistributedFunction<? super Map.Entry<K, V>, ? extends T> projectionFn
) {
    // Same diagnostic name as the Projection-based overload.
    String sourceName = "mapSource(" + mapName + ')';
    return batchFromProcessor(sourceName, readMapP(mapName, predicate, projectionFn));
}
/**
 * Convenience for {@link #remoteMap(String, ClientConfig, Predicate, Projection)}
 * which uses a {@link DistributedFunction} as the projection function.
 */
@Nonnull
public static <T, K, V> BatchSource<T> remoteMap(
        @Nonnull String mapName,
        @Nonnull ClientConfig clientConfig,
        @Nonnull Predicate<? super K, ? super V> predicate,
        @Nonnull DistributedFunction<? super Entry<K, V>, ? extends T> projectionFn
) {
    String sourceName = "remoteMapSource(" + mapName + ')';
    return batchFromProcessor(sourceName,
            readRemoteMapP(mapName, clientConfig, predicate, projectionFn));
}
/**
 * Returns a batch source backed by {@code CustomSourceP}, preferring a
 * local parallelism of one.
 */
public static BatchSource<User> customSource() {
    return Sources.batchFromProcessor(
            "custom-source",
            preferLocalParallelismOne(CustomSourceP::new)
    );
}
}
/**
 * Returns a source that emits items retrieved from a Hazelcast {@code
 * IList}. All elements are emitted on a single member — the one where the
 * entire list is stored by the IMDG.
 * <p>
 * If the {@code IList} is modified while being read, the source may miss
 * and/or duplicate some entries.
 * <p>
 * The source does not save any state to snapshot. If the job is restarted,
 * it will re-emit all entries.
 * <p>
 * The default local parallelism for this processor is 1.
 */
@Nonnull
public static <T> BatchSource<T> list(@Nonnull String listName) {
    String sourceName = "listSource(" + listName + ')';
    return batchFromProcessor(sourceName, readListP(listName));
}
/**
 * Returns a source that emits items retrieved from a Hazelcast {@code
 * IList} in a remote cluster identified by the supplied {@code
 * ClientConfig}. All elements are emitted on a single member.
 * <p>
 * If the {@code IList} is modified while being read, the source may miss
 * and/or duplicate some entries.
 * <p>
 * The source does not save any state to snapshot. If the job is restarted,
 * it will re-emit all entries.
 * <p>
 * The default local parallelism for this processor is 1.
 */
@Nonnull
public static <T> BatchSource<T> remoteList(@Nonnull String listName, @Nonnull ClientConfig clientConfig) {
    String sourceName = "remoteListSource(" + listName + ')';
    return batchFromProcessor(sourceName, readRemoteListP(listName, clientConfig));
}
/**
 * Returns a source that fetches entries from a local Hazelcast {@code IMap}
 * with the specified name and emits them as {@code Map.Entry}. It leverages
 * data locality by making each of the underlying processors fetch only those
 * entries that are stored on the member where it is running.
 * <p>
 * The source does not save any state to snapshot. If the job is restarted,
 * it will re-emit all entries.
 * <p>
 * If the {@code IMap} is modified while being read, or if there is a
 * cluster topology change (triggering data migration), the source may
 * miss and/or duplicate some entries.
 * <p>
 * The default local parallelism for this processor is 2 (or 1 if just 1
 * CPU is available).
 */
@Nonnull
public static <K, V> BatchSource<Entry<K, V>> map(@Nonnull String mapName) {
    String sourceName = "mapSource(" + mapName + ')';
    return batchFromProcessor(sourceName, readMapP(mapName));
}
/**
 * Returns a source that fetches entries from a Hazelcast {@code ICache}
 * with the given name and emits them as {@code Map.Entry}. It leverages
 * data locality by making each of the underlying processors fetch only
 * those entries that are stored on the member where it is running.
 * <p>
 * The source does not save any state to snapshot. If the job is restarted,
 * it will re-emit all entries.
 * <p>
 * If the {@code ICache} is modified while being read, or if there is a
 * cluster topology change (triggering data migration), the source may
 * miss and/or duplicate some entries.
 * <p>
 * The default local parallelism for this processor is 2 (or 1 if just 1
 * CPU is available).
 */
@Nonnull
public static <K, V> BatchSource<Entry<K, V>> cache(@Nonnull String cacheName) {
    String sourceName = "cacheSource(" + cacheName + ')';
    return batchFromProcessor(sourceName, readCacheP(cacheName));
}
/**
 * Returns a source that fetches entries from the Hazelcast {@code IMap}
 * with the specified name in a remote cluster identified by the supplied
 * {@code ClientConfig} and emits them as {@code Map.Entry}.
 * <p>
 * The source does not save any state to snapshot. If the job is restarted,
 * it will re-emit all entries.
 * <p>
 * If the {@code IMap} is modified while being read, or if there is a
 * cluster topology change (triggering data migration), the source may
 * miss and/or duplicate some entries.
 * <p>
 * The default local parallelism for this processor is 1.
 */
@Nonnull
public static <K, V> BatchSource<Entry<K, V>> remoteMap(
        @Nonnull String mapName,
        @Nonnull ClientConfig clientConfig
) {
    String sourceName = "remoteMapSource(" + mapName + ')';
    return batchFromProcessor(sourceName, readRemoteMapP(mapName, clientConfig));
}
/**
 * Builds a custom file {@link BatchSource} with supplied components and the
 * output function {@code mapOutputFn}.
 * <p>
 * The source does not save any state to snapshot. If the job is restarted,
 * it will re-emit all entries.
 * <p>
 * Any {@code IOException} will cause the job to fail. The files must not
 * change while being read; if they do, the behavior is unspecified.
 * <p>
 * The default local parallelism for this processor is 2 (or 1 if just 1
 * CPU is available).
 *
 * @param mapOutputFn the function which creates output object from each
 *                    line. Gets the filename and line as parameters
 * @param <T> the type of the items the source emits
 */
// Fix: added @Nonnull on the method and on mapOutputFn for consistency with
// the Avro builder's build() and the other factory methods in this file.
@Nonnull
public <T> BatchSource<T> build(@Nonnull DistributedBiFunction<String, String, ? extends T> mapOutputFn) {
    return batchFromProcessor("filesSource(" + new File(directory, glob) + ')',
            SourceProcessors.readFilesP(directory, charset, glob, sharedFileSystem, mapOutputFn));
}
/**
 * Builds a custom Avro file {@link BatchSource} with supplied components
 * and the output function {@code mapOutputFn}.
 * <p>
 * The source does not save any state to snapshot. If the job is restarted,
 * it will re-emit all entries.
 * <p>
 * Any {@code IOException} will cause the job to fail. The files must not
 * change while being read; if they do, the behavior is unspecified.
 * <p>
 * The default local parallelism for this processor is 2 (or 1 if just 1
 * CPU is available).
 *
 * @param mapOutputFn the function which creates output object from each
 *                    record. Gets the filename and record read by {@code
 *                    datumReader} as parameters
 * @param <T> the type of the items the source emits
 */
public <T> BatchSource<T> build(@Nonnull DistributedBiFunction<String, ? super D, T> mapOutputFn) {
    // The diagnostic name embeds the resolved directory + glob pattern.
    String sourceName = "avroFilesSource(" + new File(directory, glob) + ')';
    return batchFromProcessor(sourceName,
            AvroProcessors.readFilesP(directory, glob, sharedFileSystem, datumReaderSupplier, mapOutputFn));
}
/**
 * Returns a source that fetches entries from the Hazelcast {@code ICache}
 * with the specified name in a remote cluster identified by the supplied
 * {@code ClientConfig} and emits them as {@code Map.Entry}.
 * <p>
 * The source does not save any state to snapshot. If the job is restarted,
 * it will re-emit all entries.
 * <p>
 * If the {@code ICache} is modified while being read, or if there is a
 * cluster topology change (triggering data migration), the source may
 * miss and/or duplicate some entries.
 * <p>
 * The default local parallelism for this processor is 1.
 */
@Nonnull
public static <K, V> BatchSource<Entry<K, V>> remoteCache(
        @Nonnull String cacheName,
        @Nonnull ClientConfig clientConfig
) {
    String sourceName = "remoteCacheSource(" + cacheName + ')';
    return batchFromProcessor(sourceName, readRemoteCacheP(cacheName, clientConfig));
}
/**
 * Convenience for {@link Sources#jdbc(DistributedSupplier,
 * ToResultSetFunction, DistributedFunction)}.
 * A non-distributed, single-worker source which fetches the whole resultSet
 * with a single query on single member.
 * <p>
 * This method executes exactly one query in the target database. If the
 * underlying table is modified while being read, the behavior depends on
 * the configured transaction isolation level in the target database. Refer
 * to the documentation for the target database system.
 * <p>
 * Example: <pre>{@code
 * p.drawFrom(Sources.jdbc(
 *     DB_CONNECTION_URL,
 *     "select ID, NAME from PERSON",
 *     resultSet -> new Person(resultSet.getInt(1), resultSet.getString(2))))
 * }</pre>
 */
// Fix: added @Nonnull on the method, matching every other factory in this file.
@Nonnull
public static <T> BatchSource<T> jdbc(
        @Nonnull String connectionURL,
        @Nonnull String query,
        @Nonnull DistributedFunction<? super ResultSet, ? extends T> createOutputFn
) {
    return batchFromProcessor("jdbcSource",
            SourceProcessors.readJdbcP(connectionURL, query, createOutputFn));
}
}
@Nonnull Projection<? super Entry<K, V>, ? extends T> projection ) { return batchFromProcessor("remoteMapSource(" + mapName + ')', readRemoteMapP(mapName, clientConfig, predicate, projection));
@Nonnull DistributedFunction<? super ResultSet, ? extends T> createOutputFn ) { return batchFromProcessor("jdbcSource", SourceProcessors.readJdbcP(connectionSupplier, resultSetFn, createOutputFn));
@Nonnull DistributedBiFunction<K, V, E> projectionFn ) { return Sources.batchFromProcessor("readHdfs", new MetaSupplier<>(asSerializable(jobConf), projectionFn));
@Nonnull Projection<? super Entry<K, V>, ? extends T> projection ) { return batchFromProcessor("mapSource(" + mapName + ')', readMapP(mapName, predicate, projection));
@Nonnull DistributedBiFunction<K, V, E> projectionFn ) { return Sources.batchFromProcessor("readHdfs", new MetaSupplier<>(asSerializable(jobConf), projectionFn));
/**
 * Smoke test: runs a two-stage pipeline built from the custom source and
 * sink processors and blocks until the job completes (20 s timeout).
 */
@Test(timeout = 20000)
public void test() {
    Pipeline pipeline = Pipeline.create();
    pipeline
            .drawFrom(Sources.batchFromProcessor("source", preferLocalParallelismOne(CustomSourceP::new)))
            .drainTo(Sinks.fromProcessor("sink", preferLocalParallelismOne(CustomSinkP::new)));
    jetInstance.newJob(pipeline).join();
}