/**
 * Builds a split id for the given dataset and version using an empty split key.
 *
 * @param datasetId id of the dataset
 * @param version split version to embed in the id
 * @return the id produced by {@code DatasetSplitId.of} for an empty split key
 */
private static DatasetSplitId getId(EntityId datasetId, long version) {
  final String emptySplitKey = "";
  return DatasetSplitId.of(datasetId, version, emptySplitKey);
}
/**
 * Decodes the bytes with {@code StringSerializer.INSTANCE} and turns the resulting string into a
 * {@code DatasetSplitId} via {@code DatasetSplitId.of(String)}.
 */
@Override public DatasetSplitId revert(byte[] v) { return DatasetSplitId.of(StringSerializer.INSTANCE.revert(v)); } }
/**
 * Reads a split id from its JSON form: the value is first decoded by
 * {@code StringSerializer.INSTANCE} and the decoded string is then handed to
 * {@code DatasetSplitId.of(String)}.
 *
 * @param v JSON representation to decode
 * @return the split id built from the decoded string
 * @throws IOException declared by the signature; propagated from the string serializer
 */
@Override
public DatasetSplitId fromJson(String v) throws IOException {
  final String decoded = StringSerializer.INSTANCE.fromJson(v);
  return DatasetSplitId.of(decoded);
}
/**
 * Creates the id of a split from the owning dataset's id, the split version and the split's key.
 *
 * @param config dataset configuration supplying the dataset id
 * @param split split supplying the split key
 * @param splitVersion split version; must be greater than -1
 * @return the id built by the {@code of(datasetId, splitVersion, splitKey)} overload
 */
public static DatasetSplitId of(DatasetConfig config, DatasetSplit split, long splitVersion) {
  // Negative versions are rejected up front by Preconditions.checkArgument.
  Preconditions.checkArgument(splitVersion > -1);
  return of(config.getId(), splitVersion, split.getSplitKey());
}
// Key the new split by the split version currently recorded in the dataset's read definition.
final DatasetSplitId splitId = DatasetSplitId.of(dataset, newSplit, dataset.getReadDefinition().getSplitVersion());
/**
 * Verifies that {@code DatasetSplitId.of(String)} rejects malformed ids by throwing
 * {@link IllegalArgumentException}. Each case fails the test if no exception is raised.
 */
@Test
public void testInvalidIdFromString() throws Exception {
  // Only two underscore-separated segments.
  try {
    DatasetSplitId.of("ds1_1");
    fail("ds1_1 is an invalid dataset split id");
  } catch (IllegalArgumentException expected) {
    // Expected: the malformed id must be rejected.
  }

  // Trailing underscore with nothing after it.
  try {
    DatasetSplitId.of("ds2_2_");
    fail("ds2_2_ is an invalid dataset split id");
  } catch (IllegalArgumentException expected) {
    // Expected: the malformed id must be rejected.
  }
}
// Test matrix. Each newTestCase(...) row takes a boolean (presumably the expected outcome -- confirm
// against newTestCase), a retention policy, a metadata policy (null in some rows), a dataset, and a
// split id whose version is an offset from that dataset's split version constant.
// Patterns visible in the rows below:
//  - KEEP_CURRENT_VERSION_ONLY: true only when the split version equals the dataset's constant;
//    the -1 and +1 rows are false, for DATASET (with and without policy) and for OLD_DATASET.
//  - KEEP_VALID_SPLITS + SOURCE_METADATA_POLICY on DATASET: true at the constant, -1, +1 and
//    (constant - DAYS.toMillis(1) + SECONDS.toMillis(30)); false at (constant - DAYS.toMillis(1) - 1).
//  - KEEP_VALID_SPLITS + null policy on DATASET: every row is true, including version 0.
//  - KEEP_VALID_SPLITS + SOURCE_METADATA_POLICY on OLD_DATASET: true at the constant and +1, false at -1.
newTestCase(true, KEEP_CURRENT_VERSION_ONLY, SOURCE_METADATA_POLICY, DATASET, DatasetSplitId.of(DATASET_ID, DATASET_SPLIT_VERSION, "foo")), newTestCase(false, KEEP_CURRENT_VERSION_ONLY, SOURCE_METADATA_POLICY, DATASET, DatasetSplitId.of(DATASET_ID, DATASET_SPLIT_VERSION - 1, "foo")), newTestCase(false, KEEP_CURRENT_VERSION_ONLY, SOURCE_METADATA_POLICY, DATASET, DatasetSplitId.of(DATASET_ID, DATASET_SPLIT_VERSION + 1, "foo")), newTestCase(true, KEEP_CURRENT_VERSION_ONLY, null, DATASET, DatasetSplitId.of(DATASET_ID, DATASET_SPLIT_VERSION, "foo")), newTestCase(false, KEEP_CURRENT_VERSION_ONLY, null, DATASET, DatasetSplitId.of(DATASET_ID, DATASET_SPLIT_VERSION - 1, "foo")), newTestCase(false, KEEP_CURRENT_VERSION_ONLY, null, DATASET, DatasetSplitId.of(DATASET_ID, DATASET_SPLIT_VERSION + 1, "foo")), newTestCase(true, KEEP_CURRENT_VERSION_ONLY, SOURCE_METADATA_POLICY, OLD_DATASET, DatasetSplitId.of(OLD_DATASET_ID, OLD_DATASET_SPLIT_VERSION, "foo")), newTestCase(false, KEEP_CURRENT_VERSION_ONLY, SOURCE_METADATA_POLICY, OLD_DATASET, DatasetSplitId.of(OLD_DATASET_ID, OLD_DATASET_SPLIT_VERSION - 1, "foo")), newTestCase(false, KEEP_CURRENT_VERSION_ONLY, SOURCE_METADATA_POLICY, OLD_DATASET, DatasetSplitId.of(OLD_DATASET_ID, OLD_DATASET_SPLIT_VERSION + 1, "foo")), newTestCase(true, KEEP_VALID_SPLITS, SOURCE_METADATA_POLICY, DATASET, DatasetSplitId.of(DATASET_ID, DATASET_SPLIT_VERSION, "foo")), newTestCase(true, KEEP_VALID_SPLITS, SOURCE_METADATA_POLICY, DATASET, DatasetSplitId.of(DATASET_ID, DATASET_SPLIT_VERSION - 1, "foo")), newTestCase(true, KEEP_VALID_SPLITS, SOURCE_METADATA_POLICY, DATASET, DatasetSplitId.of(DATASET_ID, DATASET_SPLIT_VERSION + 1, "foo")), newTestCase(true, KEEP_VALID_SPLITS, SOURCE_METADATA_POLICY, DATASET, DatasetSplitId.of(DATASET_ID, DATASET_SPLIT_VERSION - TimeUnit.DAYS.toMillis(1) + TimeUnit.SECONDS.toMillis(30), "foo")), newTestCase(false, KEEP_VALID_SPLITS, SOURCE_METADATA_POLICY, DATASET, DatasetSplitId.of(DATASET_ID, DATASET_SPLIT_VERSION - 
TimeUnit.DAYS.toMillis(1) - 1, "foo")), newTestCase(true, KEEP_VALID_SPLITS, null, DATASET, DatasetSplitId.of(DATASET_ID, DATASET_SPLIT_VERSION, "foo")), newTestCase(true, KEEP_VALID_SPLITS, null, DATASET, DatasetSplitId.of(DATASET_ID, DATASET_SPLIT_VERSION - 1, "foo")), newTestCase(true, KEEP_VALID_SPLITS, null, DATASET, DatasetSplitId.of(DATASET_ID, DATASET_SPLIT_VERSION + 1, "foo")), newTestCase(true, KEEP_VALID_SPLITS, null, DATASET, DatasetSplitId.of(DATASET_ID, DATASET_SPLIT_VERSION - TimeUnit.DAYS.toMillis(1) + TimeUnit.SECONDS.toMillis(30), "foo")), newTestCase(true, KEEP_VALID_SPLITS, null, DATASET, DatasetSplitId.of(DATASET_ID, 0, "foo")), newTestCase(true, KEEP_VALID_SPLITS, SOURCE_METADATA_POLICY, OLD_DATASET, DatasetSplitId.of(OLD_DATASET_ID, OLD_DATASET_SPLIT_VERSION, "foo")), newTestCase(false, KEEP_VALID_SPLITS, SOURCE_METADATA_POLICY, OLD_DATASET, DatasetSplitId.of(OLD_DATASET_ID, OLD_DATASET_SPLIT_VERSION - 1, "foo")), newTestCase(true, KEEP_VALID_SPLITS, SOURCE_METADATA_POLICY, OLD_DATASET, DatasetSplitId.of(OLD_DATASET_ID, OLD_DATASET_SPLIT_VERSION + 1, "foo")) );
private void fixSplits(final KVStore<DatasetSplitId, DatasetSplit> splitsStore, DatasetConfig config) { final long version = config.getReadDefinition().getSplitVersion(); // Get old splits final FindByRange<DatasetSplitId> query = DatasetSplitId.unsafeGetSplitsRange(config); for (Entry<DatasetSplitId, DatasetSplit> entry : splitsStore.find(query)) { final DatasetSplitId oldId = entry.getKey(); final DatasetSplit split = entry.getValue(); // Generate new Id and compare with old id final DatasetSplitId newId = DatasetSplitId.of(config, split, version); if (oldId.equals(newId)) { continue; } // Delete the previous entry and add a new one splitsStore.delete(oldId); splitsStore.put(newId, split.setVersion(null)); } }
/**
 * Checks that parsing a split id string yields the expected dataset id, including ids whose
 * dataset part contains the escape sequences {@code %5F} and {@code %25}.
 */
@Test
public void testIdFromString() throws Exception {
  final String[] rawIds = {"ds1_1_s1", "ds2_2_s2", "ds3_3_s3", "ds4%5Ftest_4_s4", "ds5%25test_5_s5"};
  final String[] expectedDatasetIds = {"ds1", "ds2", "ds3", "ds4_test", "ds5%test"};

  // Parse every id before asserting anything, so a parse failure surfaces first.
  final DatasetSplitId[] parsed = new DatasetSplitId[rawIds.length];
  for (int i = 0; i < rawIds.length; i++) {
    parsed[i] = DatasetSplitId.of(rawIds[i]);
  }

  for (int i = 0; i < parsed.length; i++) {
    assertEquals(expectedDatasetIds[i], parsed[i].getDatasetId());
  }
}
/**
 * Saves a dataset whose read definition carries the given split version, then stores
 * {@code generateSplits(splitVersion, count)} splits for it in the splits store.
 *
 * @param path path of the dataset
 * @param type dataset type passed on to {@code saveDataset}
 * @param splitVersion split version set on the read definition and used in every split id
 * @param count passed to {@code generateSplits} together with the split version
 * @throws NamespaceException declared by the signature; propagated from the save
 */
public void savePhysicalDataset(List<String> path, DatasetType type, long splitVersion, int count) throws NamespaceException {
  final ReadDefinition definition = new ReadDefinition().setSplitVersion(splitVersion);
  final DatasetConfig saved = saveDataset(path, type, cfg -> cfg.setReadDefinition(definition));

  // Store each generated split under the id derived from the saved dataset and the split version.
  generateSplits(splitVersion, count).forEach(generated -> {
    final DatasetSplitId id = DatasetSplitId.of(saved, generated, splitVersion);
    splitsStore.put(id, generated);
  });
}
/**
 * Runs orphan cleanup with {@code KEEP_VALID_SPLITS} and checks the number of remaining splits,
 * then verifies that the splits for each of the last 24 hourly versions of "dataset1" (and for
 * the version {@code now}) are still retrievable from the store.
 */
@Test
public void testKeepValidSplits() throws Exception {
  namespaceService.deleteSplitOrphans(SplitOrphansRetentionPolicy.KEEP_VALID_SPLITS);

  final int expectedCount = 10 + 20 + 24 * 100 + 1000;
  assertThat(
      namespaceService.getSplitCount(new FindByCondition().setCondition(SearchQueryUtils.newMatchAllQuery())),
      is(expectedCount));

  // Walk the hourly versions from 23 hours ago up to now.
  final DatasetConfig dataset1 = namespaceService.getDataset(new NamespaceKey(Arrays.asList("test", "dataset1")));
  for (int hoursAgo = 23; hoursAgo >= 0; hoursAgo--) {
    final long version = now - TimeUnit.HOURS.toMillis(hoursAgo);
    generateSplits(version, 100).forEach(
        expected -> assertThat(splitsStore.get(DatasetSplitId.of(dataset1, expected, version)), is(expected)));
  }

  // NOTE(review): this lookup also uses the "dataset1" key although the variable is named
  // dataset2 -- possibly a copy-paste slip; confirm which dataset was intended.
  final DatasetConfig dataset2 = namespaceService.getDataset(new NamespaceKey(Arrays.asList("test", "dataset1")));
  generateSplits(now, 100).forEach(
      expected -> assertThat(splitsStore.get(DatasetSplitId.of(dataset2, expected, now)), is(expected)));
}
/**
 * Runs orphan cleanup with {@code KEEP_CURRENT_VERSION_ONLY} and checks the number of remaining
 * splits, then verifies that the splits of "dataset1" at version {@code now} are still
 * retrievable from the store.
 */
@Test
public void testKeepCurrentVersion() throws Exception {
  namespaceService.deleteSplitOrphans(KEEP_CURRENT_VERSION_ONLY);

  final int expectedCount = 10 + 20 + 100 + 1000;
  assertThat(
      namespaceService.getSplitCount(new FindByCondition().setCondition(SearchQueryUtils.newMatchAllQuery())),
      is(expectedCount));

  final DatasetConfig dataset1 = namespaceService.getDataset(new NamespaceKey(Arrays.asList("test", "dataset1")));
  generateSplits(now, 100).forEach(
      expected -> assertThat(splitsStore.get(DatasetSplitId.of(dataset1, expected, now)), is(expected)));

  // NOTE(review): this lookup also uses the "dataset1" key although the variable is named
  // dataset2, so the check below repeats the one above -- possibly a copy-paste slip; confirm.
  final DatasetConfig dataset2 = namespaceService.getDataset(new NamespaceKey(Arrays.asList("test", "dataset1")));
  generateSplits(now, 100).forEach(
      expected -> assertThat(splitsStore.get(DatasetSplitId.of(dataset2, expected, now)), is(expected)));
}
/**
 * Parses an id string with four underscore-separated segments and pins the outcome: the dataset
 * id is the first segment and the split version is reported as {@code Long.MIN_VALUE}.
 */
@Test
public void testUnsafeIdFromString() throws Exception {
  final DatasetSplitId parsed = DatasetSplitId.of("ds1_test_1_s1");

  assertEquals("ds1", parsed.getDatasetId());
  assertEquals(Long.MIN_VALUE, parsed.getSplitVersion());
}
// Build the id from the dataset, the split and the next split version, stamp that same version
// onto the split, then store the split under the id.
final DatasetSplitId splitId = DatasetSplitId.of(dataset, split, nextSplitVersion); split.setSplitVersion(nextSplitVersion); splitsStore.put(splitId, split);
/**
 * Checks the split id string produced for a dataset whose id contains a percent sign: the
 * percent sign appears as {@code %25} in the resulting id.
 */
@Test
public void testIdWithPercentageFromConfig() throws Exception {
  final EntityId entityId = new EntityId().setId("ds1%test");
  final ReadDefinition readDefinition = new ReadDefinition().setSplitVersion(0L);
  final DatasetConfig datasetConfig = new DatasetConfig().setId(entityId).setReadDefinition(readDefinition);

  final String[] splitKeys = {"s1", "s2", "s3"};
  final String[] expectedIds = {"ds1%25test_0_s1", "ds1%25test_0_s2", "ds1%25test_0_s3"};

  // Build every id first, then compare the string forms.
  final DatasetSplitId[] built = new DatasetSplitId[splitKeys.length];
  for (int i = 0; i < splitKeys.length; i++) {
    built[i] = DatasetSplitId.of(datasetConfig, new DatasetSplit().setSplitKey(splitKeys[i]), 0L);
  }

  for (int i = 0; i < built.length; i++) {
    assertEquals(expectedIds[i], built[i].getSplitId());
  }
}
/**
 * Checks the split id string produced from a dataset config, a split key and a split version:
 * for dataset "ds1", version 0 and key "sN" the id reads {@code ds1_0_sN}.
 */
@Test
public void testIdFromConfig() throws Exception {
  final EntityId entityId = new EntityId().setId("ds1");
  final ReadDefinition readDefinition = new ReadDefinition().setSplitVersion(0L);
  final DatasetConfig datasetConfig = new DatasetConfig().setId(entityId).setReadDefinition(readDefinition);

  final String[] splitKeys = {"s1", "s2", "s3"};
  final String[] expectedIds = {"ds1_0_s1", "ds1_0_s2", "ds1_0_s3"};

  // Build every id first, then compare the string forms.
  final DatasetSplitId[] built = new DatasetSplitId[splitKeys.length];
  for (int i = 0; i < splitKeys.length; i++) {
    built[i] = DatasetSplitId.of(datasetConfig, new DatasetSplit().setSplitKey(splitKeys[i]), 0L);
  }

  for (int i = 0; i < built.length; i++) {
    assertEquals(expectedIds[i], built[i].getSplitId());
  }
}
/**
 * Checks the split id string produced for a dataset whose id contains an underscore: the
 * underscore appears as {@code %5F} in the resulting id.
 */
@Test
public void testIdWithUnderscoreFromConfig() throws Exception {
  final EntityId entityId = new EntityId().setId("ds1_test");
  final ReadDefinition readDefinition = new ReadDefinition().setSplitVersion(0L);
  final DatasetConfig datasetConfig = new DatasetConfig().setId(entityId).setReadDefinition(readDefinition);

  final String[] splitKeys = {"s1", "s2", "s3"};
  final String[] expectedIds = {"ds1%5Ftest_0_s1", "ds1%5Ftest_0_s2", "ds1%5Ftest_0_s3"};

  // Build every id first, then compare the string forms.
  final DatasetSplitId[] built = new DatasetSplitId[splitKeys.length];
  for (int i = 0; i < splitKeys.length; i++) {
    built[i] = DatasetSplitId.of(datasetConfig, new DatasetSplit().setSplitKey(splitKeys[i]), 0L);
  }

  for (int i = 0; i < built.length; i++) {
    assertEquals(expectedIds[i], built[i].getSplitId());
  }
}