/**
 * Verifies that parsing a compound split id string yields the expected dataset id,
 * including ids whose underscore ({@code %5F}) or percent ({@code %25}) character
 * appears in escaped form inside the compound id.
 */
@Test
public void testIdFromString() throws Exception {
  // Each row: { raw compound split id, dataset id expected after parsing }
  final String[][] cases = {
    {"ds1_1_s1", "ds1"},
    {"ds2_2_s2", "ds2"},
    {"ds3_3_s3", "ds3"},
    {"ds4%5Ftest_4_s4", "ds4_test"},
    {"ds5%25test_5_s5", "ds5%test"},
  };

  for (String[] testCase : cases) {
    final DatasetSplitId parsed = DatasetSplitId.of(testCase[0]);
    assertEquals(testCase[1], parsed.getDatasetId());
  }
}
/**
 * Builds the search query for the splits of a dataset, using the split version
 * recorded in the dataset's read definition.
 *
 * @param datasetConfig dataset configuration; both its read definition and the
 *        read definition's split version are checked with
 *        {@code Preconditions.checkNotNull}
 * @return the query returned by the {@code getSplitsQuery} overload taking the
 *         dataset id and the split version
 */
public static SearchQuery getSplitsQuery(DatasetConfig datasetConfig) {
  // Fail early when the dataset carries no read definition at all
  Preconditions.checkNotNull(datasetConfig.getReadDefinition());

  final long currentSplitVersion =
      Preconditions.checkNotNull(datasetConfig.getReadDefinition().getSplitVersion());

  return getSplitsQuery(datasetConfig.getId(), currentSplitVersion);
}
/**
 * Looks up the splits through the namespace service, restricted to the key range
 * that {@code DatasetSplitId.getSplitsRange} builds from this object's
 * {@code datasetId} and {@code splitVersion}.
 *
 * @return the entries returned by the namespace service for that range
 */
@Override
protected Iterable<Entry<DatasetSplitId, DatasetSplit>> findSplits() {
  final FindByRange<DatasetSplitId> splitKeyRange =
      DatasetSplitId.getSplitsRange(datasetId, splitVersion);

  return getNamespaceService().findSplits(splitKeyRange);
}
/**
 * Creates a split id from its three components. The dataset id is escaped first,
 * then joined with the split version and the split key using
 * {@code SPLIT_ID_JOINER} to form the compound id.
 *
 * NOTE(review): the escaped dataset id is also what gets passed as the dataset id
 * component, whereas {@code of(String)} passes the unescaped value - confirm this
 * difference is intended.
 *
 * @param datasetId entity id of the dataset owning the split
 * @param splitVersion split version
 * @param splitKey split key
 * @return the new split id
 */
@VisibleForTesting
static DatasetSplitId of(EntityId datasetId, long splitVersion, String splitKey) {
  final String escapedDatasetId = escape(datasetId.getId());
  final String compoundId = SPLIT_ID_JOINER.join(escapedDatasetId, splitVersion, splitKey);

  return new DatasetSplitId(compoundId, escapedDatasetId, splitVersion, splitKey);
}
@JsonCreator public static DatasetSplitId of(String datasetSplitId) { final String[] ids = datasetSplitId.split(DELIMITER, 3); Preconditions.checkArgument(ids.length == 3 && !ids[0].isEmpty() && !ids[1].isEmpty() && !ids[2].isEmpty(), "Invalid dataset split id %s", datasetSplitId); // Some dataset split before upgrade might not have a valid version // but the compound key would still allow for the entry to be removed from the kvstore // so allowing it temporarily. // // See DX-13336 for details long version; try { version = Long.parseLong(ids[1]); } catch (NumberFormatException e) { version = Long.MIN_VALUE; } return new DatasetSplitId(datasetSplitId, unescape(ids[0]), version, ids[2]); }
private void fixSplits(final KVStore<DatasetSplitId, DatasetSplit> splitsStore, DatasetConfig config) { final long version = config.getReadDefinition().getSplitVersion(); // Get old splits final FindByRange<DatasetSplitId> query = DatasetSplitId.unsafeGetSplitsRange(config); for (Entry<DatasetSplitId, DatasetSplit> entry : splitsStore.find(query)) { final DatasetSplitId oldId = entry.getKey(); final DatasetSplit split = entry.getValue(); // Generate new Id and compare with old id final DatasetSplitId newId = DatasetSplitId.of(config, split, version); if (oldId.equals(newId)) { continue; } // Delete the previous entry and add a new one splitsStore.delete(oldId); splitsStore.put(newId, split.setVersion(null)); } }
for(Map.Entry<DatasetSplitId, DatasetSplit> entry : splitsStore.find(DatasetSplitId.unsafeGetSplitsRange(ds1))) { DatasetSplitId splitId = entry.getKey(); assertThat(splitId.getDatasetId(), is("foo")); assertThat(splitId.getSplitVersion(), is(Long.MIN_VALUE)); count++; for(Map.Entry<DatasetSplitId, DatasetSplit> entry : splitsStore.find(DatasetSplitId.unsafeGetSplitsRange(ds2))) { DatasetSplitId splitId = entry.getKey(); assertThat(splitId.getDatasetId(), startsWith("foo")); // unescaped dataset split id might generate invalid unicode when unescaped assertThat(splitId.getSplitVersion(), is(42L)); count++; for(Map.Entry<DatasetSplitId, DatasetSplit> entry : splitsStore.find(DatasetSplitId.unsafeGetSplitsRange(ds3))) { DatasetSplitId splitId = entry.getKey(); assertThat(splitId.getDatasetId(), is(ds3.getId().getId())); assertThat(splitId.getSplitVersion(), is(42L)); count++; for(Map.Entry<DatasetSplitId, DatasetSplit> entry : splitsStore.find(DatasetSplitId.getSplitsRange(ds1))) { DatasetSplitId splitId = entry.getKey(); assertThat(splitId.getDatasetId(), is("foo_bar")); assertThat(splitId.getSplitVersion(), is(42L)); count++; for(Map.Entry<DatasetSplitId, DatasetSplit> entry : splitsStore.find(DatasetSplitId.getSplitsRange(ds2))) { DatasetSplitId splitId = entry.getKey(); assertThat(splitId.getDatasetId(), startsWith("foo%bar")); // unescaped dataset split id might generate invalid unicode when unescaped assertThat(splitId.getSplitVersion(), is(42L));
/**
 * Creates a split id for the given dataset and version with an empty split key.
 *
 * @param datasetId entity id of the dataset
 * @param version split version
 * @return the id produced by {@code DatasetSplitId.of(datasetId, version, "")}
 */
private static DatasetSplitId getId(EntityId datasetId, long version) {
  final String emptySplitKey = "";
  return DatasetSplitId.of(datasetId, version, emptySplitKey);
}
/**
 * Verifies parsing of a compound id whose dataset id contains an unescaped
 * underscore: the dataset id stops at the first underscore and the version part,
 * which is not numeric, is reported as {@code Long.MIN_VALUE}.
 */
@Test
public void testUnsafeIdFromString() throws Exception {
  final DatasetSplitId parsed = DatasetSplitId.of("ds1_test_1_s1");

  assertEquals("ds1", parsed.getDatasetId());
  assertEquals(Long.MIN_VALUE, parsed.getSplitVersion());
}
/**
 * Verifies that a percent sign in the dataset id shows up as {@code %25} in the
 * compound split id built from a dataset config.
 */
@Test
public void testIdWithPercentageFromConfig() throws Exception {
  final DatasetConfig datasetConfig = new DatasetConfig()
      .setId(new EntityId().setId("ds1%test"))
      .setReadDefinition(new ReadDefinition().setSplitVersion(0L));

  // Each row: { split key, expected compound split id }
  final String[][] cases = {
    {"s1", "ds1%25test_0_s1"},
    {"s2", "ds1%25test_0_s2"},
    {"s3", "ds1%25test_0_s3"},
  };

  for (String[] testCase : cases) {
    final DatasetSplit split = new DatasetSplit().setSplitKey(testCase[0]);
    final DatasetSplitId splitId = DatasetSplitId.of(datasetConfig, split, 0L);
    assertEquals(testCase[1], splitId.getSplitId());
  }
}
!compareSplits(dataset, splits, splitsStore.find(DatasetSplitId.getSplitsRange(dataset)))) { addOrUpdateDataset(datasetPath, dataset, attributes); return; final DatasetSplitId splitId = DatasetSplitId.of(dataset, split, nextSplitVersion); split.setSplitVersion(nextSplitVersion); splitsStore.put(splitId, split);
@Override public void convert(DocumentWriter writer, DatasetSplitId key, DatasetSplit split) { writer.write(SPLIT_ID, key.getSplitId()); writer.write(DATASET_ID, key.getDatasetId()); writer.write(SPLIT_IDENTIFIER, key.getSplitIdentifier()); writer.write(SPLIT_VERSION, split.getSplitVersion()); writer.write(SPLIT_ROWS, split.getRowCount());
/**
 * Serializes a split id by converting its compound split id string with
 * {@code StringSerializer.INSTANCE}.
 *
 * @param v the split id to serialize
 * @return the bytes produced by the string serializer
 */
@Override
public byte[] convert(DatasetSplitId v) {
  final String compoundId = v.getSplitId();
  return StringSerializer.INSTANCE.convert(compoundId);
}
/**
 * Creates a split id WITHOUT escaping the dataset id: the raw value of
 * {@code datasetId.getId()} is joined with the split version and the split key
 * using {@code SPLIT_ID_JOINER}, and is also passed unchanged as the dataset id
 * component.
 *
 * NOTE(review): presumably kept so tests can reproduce keys written before
 * escaping was introduced - confirm against callers.
 *
 * @param datasetId entity id of the dataset owning the split
 * @param splitVersion split version
 * @param splitKey split key
 * @return the new split id
 */
@VisibleForTesting static DatasetSplitId ofUnsafe(EntityId datasetId, long splitVersion, String splitKey) { final String datasetIdAsString = datasetId.getId(); String compoundSplitId = SPLIT_ID_JOINER.join(datasetIdAsString, splitVersion, splitKey); return new DatasetSplitId(compoundSplitId, datasetIdAsString, splitVersion, splitKey); } private DatasetSplitId(String compoundSplitId, String datasetId, long splitVersion, String splitKey) {
/**
 * Deserializes a split id: the bytes are turned back into a string by
 * {@code StringSerializer.INSTANCE} and that string is parsed with
 * {@code DatasetSplitId.of(String)}.
 *
 * @param v serialized form of the split id
 * @return the parsed split id
 */
@Override
public DatasetSplitId revert(byte[] v) {
  final String compoundId = StringSerializer.INSTANCE.revert(v);
  return DatasetSplitId.of(compoundId);
}
}
/**
 * Verifies the compound split id built from a dataset config whose dataset id
 * needs no escaping: dataset id, split version and split key joined by
 * underscores.
 */
@Test
public void testIdFromConfig() throws Exception {
  final DatasetConfig datasetConfig = new DatasetConfig()
      .setId(new EntityId().setId("ds1"))
      .setReadDefinition(new ReadDefinition().setSplitVersion(0L));

  // Each row: { split key, expected compound split id }
  final String[][] cases = {
    {"s1", "ds1_0_s1"},
    {"s2", "ds1_0_s2"},
    {"s3", "ds1_0_s3"},
  };

  for (String[] testCase : cases) {
    final DatasetSplit split = new DatasetSplit().setSplitKey(testCase[0]);
    final DatasetSplitId splitId = DatasetSplitId.of(datasetConfig, split, 0L);
    assertEquals(testCase[1], splitId.getSplitId());
  }
}
/** * UNSAFE! Use {@code DatasetSplitId#getSplitRange(EntityId, long)} instead */ public static FindByRange<DatasetSplitId> unsafeGetSplitsRange(DatasetConfig config) { final long splitVersion = config.getReadDefinition().getSplitVersion(); final long nextSplitVersion = splitVersion + 1; final String datasetId = config.getId().getId(); // Unsafe way of constructing dataset split id!!! final DatasetSplitId start = new DatasetSplitId(SPLIT_ID_JOINER.join(datasetId, splitVersion, ""), datasetId, splitVersion, ""); final DatasetSplitId end = new DatasetSplitId(SPLIT_ID_JOINER.join(datasetId, nextSplitVersion, ""), datasetId, splitVersion, ""); return new FindByRange<DatasetSplitId>() .setStart(start, true) .setEnd(end, false); } /**
/**
 * Reads a split id from its JSON form: the value is decoded to a string by
 * {@code StringSerializer.INSTANCE} and that string is parsed with
 * {@code DatasetSplitId.of(String)}.
 *
 * @param v JSON representation of the split id
 * @return the parsed split id
 * @throws IOException declared by the serializer contract
 */
@Override
public DatasetSplitId fromJson(String v) throws IOException {
  final String compoundId = StringSerializer.INSTANCE.fromJson(v);
  return DatasetSplitId.of(compoundId);
}
/**
 * Verifies that an underscore in the dataset id shows up as {@code %5F} in the
 * compound split id built from a dataset config.
 */
@Test
public void testIdWithUnderscoreFromConfig() throws Exception {
  final DatasetConfig datasetConfig = new DatasetConfig()
      .setId(new EntityId().setId("ds1_test"))
      .setReadDefinition(new ReadDefinition().setSplitVersion(0L));

  // Each row: { split key, expected compound split id }
  final String[][] cases = {
    {"s1", "ds1%5Ftest_0_s1"},
    {"s2", "ds1%5Ftest_0_s2"},
    {"s3", "ds1%5Ftest_0_s3"},
  };

  for (String[] testCase : cases) {
    final DatasetSplit split = new DatasetSplit().setSplitKey(testCase[0]);
    final DatasetSplitId splitId = DatasetSplitId.of(datasetConfig, split, 0L);
    assertEquals(testCase[1], splitId.getSplitId());
  }
}
/**
 * Combines the caller's partition filter with the query selecting the splits of
 * this object's {@code datasetId} and {@code splitVersion}.
 *
 * @param partitionFilterQuery filter on partitions supplied by the caller
 * @return the result of {@code SearchQueryUtils.and} applied to the splits
 *         condition and {@code partitionFilterQuery}, in that order
 */
@Override
protected SearchQuery getPartitionQuery(SearchQuery partitionFilterQuery) {
  final FindByCondition splitFilter = new FindByCondition()
      .setCondition(DatasetSplitId.getSplitsQuery(datasetId, splitVersion));

  return SearchQueryUtils.and(
      splitFilter.getCondition(),
      partitionFilterQuery);
}