/**
 * Builds the key under which a relation is grouped when collapsing.
 *
 * <p>The key always carries the relation's data and program. Access, run and components are
 * added to the key only when the corresponding {@link CollapseType} is NOT requested, so that
 * relations differing only in a collapsed field end up with the same key.
 *
 * @param relation the relation to build a key for
 * @param collapseTypes the fields that are being collapsed and therefore left out of the key
 * @return the key for the given relation
 */
private static CollapseKey getCollapseKey(Relation relation, Set<CollapseType> collapseTypes) {
  CollapseKeyBuilder keyBuilder = new CollapseKeyBuilder(relation.getData(), relation.getProgram());

  boolean keepAccess = !collapseTypes.contains(CollapseType.ACCESS);
  if (keepAccess) {
    keyBuilder.setAccess(relation.getAccess());
  }

  boolean keepRun = !collapseTypes.contains(CollapseType.RUN);
  if (keepRun) {
    keyBuilder.setRun(relation.getRun());
  }

  boolean keepComponents = !collapseTypes.contains(CollapseType.COMPONENT);
  if (keepComponents) {
    keyBuilder.setComponents(relation.getComponents());
  }

  return keyBuilder.build();
}
/**
 * Wraps a relation and pre-computes its hash code from the relation's data, program, run and
 * components. The access type does not contribute to the hash.
 *
 * @param relation the relation this key wraps
 */
private RelationKey(Relation relation) {
  int precomputedHash = Objects.hash(
    relation.getData(),
    relation.getProgram(),
    relation.getRun(),
    relation.getComponents());
  this.relation = relation;
  this.hashCode = precomputedHash;
}
/**
 * Looks up the workflow run that the given relation's program run belongs to.
 *
 * <p>The run record for the relation's program run is fetched from {@code runRecordMap}. If it
 * exists and has a {@code "workflowrunid"} property, that property value is resolved through
 * {@code workflowIdMap}.
 *
 * @param relation the relation whose program run is inspected
 * @param runRecordMap run records keyed by program run id
 * @param workflowIdMap workflow program runs keyed by workflow run id
 * @return the workflow program run, or {@code null} if there is no run record, the record has
 *         no {@code "workflowrunid"} property, or the workflow run id is not in {@code workflowIdMap}
 */
@Nullable
private ProgramRunId getWorkflowProgramRunid(Relation relation,
                                             Map<ProgramRunId, RunRecordMeta> runRecordMap,
                                             Map<String, ProgramRunId> workflowIdMap) {
  RunRecordMeta record = runRecordMap.get(
    new ProgramRunId(relation.getProgram().getNamespace(),
                     relation.getProgram().getApplication(),
                     relation.getProgram().getType(),
                     relation.getProgram().getProgram(),
                     relation.getRun().getId()));

  // Nothing to resolve when the run is unknown or was not started by a workflow.
  if (record == null || !record.getProperties().containsKey("workflowrunid")) {
    return null;
  }

  String workflowRunId = record.getProperties().get("workflowrunid");
  return workflowIdMap.get(workflowRunId);
}
/**
 * Collapses {@link Relation}s on the requested {@link CollapseType}s.
 *
 * <p>Relations are first grouped by their collapse key. Each group then yields one
 * {@link CollapsedRelation} holding the key's data and program together with the union of the
 * access types, runs and components of every relation in the group.
 *
 * @param relations lineage relations
 * @param collapseTypes fields to collapse relations on
 * @return collapsed relations
 */
public static Set<CollapsedRelation> collapseRelations(Iterable<Relation> relations,
                                                       Set<CollapseType> collapseTypes) {
  Set<CollapsedRelation> result = new HashSet<>();

  // Group the relations that share a collapse key.
  Multimap<CollapseKey, Relation> grouped = HashMultimap.create();
  for (Relation candidate : relations) {
    grouped.put(getCollapseKey(candidate, collapseTypes), candidate);
  }
  LOG.trace("Collapsed relations: {}", grouped.asMap());

  // Merge every group into a single collapsed relation.
  for (Map.Entry<CollapseKey, Collection<Relation>> group : grouped.asMap().entrySet()) {
    CollapseKey key = group.getKey();
    NamespacedEntityId data = key.data;
    ProgramId program = key.program;

    Set<AccessType> mergedAccess = new HashSet<>();
    Set<RunId> mergedRuns = new HashSet<>();
    Set<NamespacedEntityId> mergedComponents = new HashSet<>();
    for (Relation member : group.getValue()) {
      mergedAccess.add(member.getAccess());
      mergedRuns.add(member.getRun());
      mergedComponents.addAll(member.getComponents());
    }

    result.add(toCollapsedRelation(data, program, mergedAccess, mergedRuns, mergedComponents));
  }
  return result;
}
private Multimap<RelationKey, Relation> getRollupRelations(Multimap<RelationKey, Relation> relations, Map<ProgramRunId, RunRecordMeta> runRecordMap, Map<String, ProgramRunId> workflowIdMap) { Multimap<RelationKey, Relation> relationsNew = HashMultimap.create(); for (Map.Entry<RelationKey, Collection<Relation>> entry : relations.asMap().entrySet()) { for (Relation relation : entry.getValue()) { ProgramRunId workflowProgramRunId = getWorkflowProgramRunid(relation, runRecordMap, workflowIdMap); if (workflowProgramRunId == null) { relationsNew.put(entry.getKey(), relation); } else { ProgramId workflowProgramId = new ProgramId(workflowProgramRunId.getNamespace(), workflowProgramRunId.getApplication(), workflowProgramRunId.getType(), workflowProgramRunId.getProgram()); NamespacedEntityId data = relation.getData(); if (!(data instanceof DatasetId)) { // This shouldn't happen throw new IllegalStateException("Unknown data type " + data); } Relation workflowRelation = new Relation((DatasetId) data, workflowProgramId, relation.getAccess(), RunIds.fromString(workflowProgramRunId.getRun())); relationsNew.put(entry.getKey(), workflowRelation); } } } return relationsNew; }
/** * Convert a set of runIds into a scan range based on earliest runtime and latest runtime of runIds. * Also, add a scan filter to include only runIds in the given set. * @param runIds input runIds set * @return scan range */ @VisibleForTesting static ScanRangeWithFilter getScanRange(final Set<RunId> runIds) { if (runIds.isEmpty()) { return new ScanRangeWithFilter(0, 0, x -> false); } // Pick the earliest start time and latest start time for lineage range long earliest = Long.MAX_VALUE; long latest = 0; for (RunId runId : runIds) { long runStartTime = RunIds.getTime(runId, TimeUnit.MILLISECONDS); if (runStartTime < earliest) { earliest = runStartTime; } if (runStartTime > latest) { latest = runStartTime; } } // scan end key is exclusive, so need to add 1 to to include the last runid return new ScanRangeWithFilter(earliest, latest + 1, input -> runIds.contains(input.getRun())); }
private Relation toRelation(Row row) { Map<Character, EntityId> rowInfo = new HashMap<>(4); MDSKey.Splitter splitter = new MDSKey(row.getRow()).split(); char marker = (char) splitter.getInt(); LOG.trace("Got marker {}", marker); EntityId id1 = toEntityId(splitter, marker); LOG.trace("Got id1 {}", id1); rowInfo.put(marker, id1); splitter.skipLong(); // inverted time - not required for relation marker = (char) splitter.getInt(); LOG.trace("Got marker {}", marker); EntityId id2 = toEntityId(splitter, marker); LOG.trace("Got id2 {}", id1); rowInfo.put(marker, id2); RunId runId = RunIds.fromString(splitter.getString()); LOG.trace("Got runId {}", runId); AccessType accessType = AccessType.fromType((char) splitter.getInt()); LOG.trace("Got access type {}", accessType); DatasetId datasetInstance = (DatasetId) rowInfo.get(DATASET_MARKER); LOG.trace("Got datasetInstance {}", datasetInstance); ProgramId program = (ProgramId) rowInfo.get(PROGRAM_MARKER); LOG.trace("Got program {}", program); NamespacedEntityId component = toComponent(splitter, program); LOG.trace("Got component {}", component); return new Relation(datasetInstance, program, accessType, runId, component == null ? Collections.emptySet() : Collections.singleton(component)); }
/**
 * Rolls program-level relations up to the workflow that ran the program.
 *
 * <p>A relation whose program run cannot be mapped to a workflow run is copied as-is. Otherwise
 * it is replaced by a relation between the same data entity and the workflow program, using the
 * workflow's run id and the original access type. Data that is not a {@link DatasetId} is cast
 * to {@link StreamId}. In both cases the entry is stored under the key it had in the input
 * multimap.
 *
 * @param relations relations grouped by key
 * @param runRecordMap run records keyed by program run id
 * @param workflowIdMap workflow program runs keyed by workflow run id
 * @return a new multimap containing the rolled-up relations
 * @throws NotFoundException declared by the signature and kept for compatibility with callers
 */
private Multimap<RelationKey, Relation> getRollupRelations(Multimap<RelationKey, Relation> relations,
                                                           Map<ProgramRunId, RunRecordMeta> runRecordMap,
                                                           Map<String, ProgramRunId> workflowIdMap)
  throws NotFoundException {
  Multimap<RelationKey, Relation> rolledUp = HashMultimap.create();
  for (Map.Entry<RelationKey, Collection<Relation>> group : relations.asMap().entrySet()) {
    RelationKey key = group.getKey();
    for (Relation current : group.getValue()) {
      ProgramRunId workflowRun = getWorkflowProgramRunid(current, runRecordMap, workflowIdMap);
      if (workflowRun == null) {
        // Not part of a workflow: keep the relation unchanged.
        rolledUp.put(key, current);
        continue;
      }

      ProgramId workflowProgram = new ProgramId(workflowRun.getNamespace(),
                                                workflowRun.getApplication(),
                                                workflowRun.getType(),
                                                workflowRun.getProgram());
      NamespacedEntityId data = current.getData();
      Relation workflowRelation = data instanceof DatasetId
        ? new Relation((DatasetId) data, workflowProgram, current.getAccess(),
                       RunIds.fromString(workflowRun.getRun()))
        : new Relation((StreamId) data, workflowProgram, current.getAccess(),
                       RunIds.fromString(workflowRun.getRun()));
      rolledUp.put(key, workflowRelation);
    }
  }
  return rolledUp;
}
/**
 * Collects the workflow run ids referenced by the run records of the given relations.
 *
 * <p>For each relation, the run record of its program run is looked up in {@code runRecordMap}.
 * When the record exists and has a {@code "workflowrunid"} property, that property value is
 * added to the result.
 *
 * @param relations relations grouped by key; only the values are inspected
 * @param runRecordMap run records keyed by program run id
 * @return the distinct workflow run ids that were found
 * @throws NotFoundException declared by the signature and kept for compatibility with callers
 */
private Set<String> getWorkflowIds(Multimap<RelationKey, Relation> relations,
                                   Map<ProgramRunId, RunRecordMeta> runRecordMap) throws NotFoundException {
  Set<String> collected = new HashSet<>();
  for (Relation current : relations.values()) {
    RunRecordMeta record = runRecordMap.get(
      new ProgramRunId(current.getProgram().getNamespace(),
                       current.getProgram().getApplication(),
                       current.getProgram().getType(),
                       current.getProgram().getProgram(),
                       current.getRun().getId()));

    // Skip runs that are unknown or were not started by a workflow.
    if (record == null || !record.getProperties().containsKey("workflowrunid")) {
      continue;
    }

    String workflowRunId = record.getProperties().get("workflowrunid");
    collected.add(workflowRunId);
  }
  return collected;
}
/**
 * Collapses {@link Relation}s on the requested {@link CollapseType}s.
 *
 * <p>Relations are first grouped by their collapse key. Each group then yields one
 * {@link CollapsedRelation} holding the key's data and program together with the union of the
 * access types, runs and components of every relation in the group.
 *
 * @param relations lineage relations
 * @param collapseTypes fields to collapse relations on
 * @return collapsed relations
 */
public static Set<CollapsedRelation> collapseRelations(Iterable<Relation> relations,
                                                       Set<CollapseType> collapseTypes) {
  Set<CollapsedRelation> result = new HashSet<>();

  // Group the relations that share a collapse key.
  Multimap<CollapseKey, Relation> grouped = HashMultimap.create();
  for (Relation candidate : relations) {
    grouped.put(getCollapseKey(candidate, collapseTypes), candidate);
  }
  LOG.trace("Collapsed relations: {}", grouped.asMap());

  // Merge every group into a single collapsed relation.
  for (Map.Entry<CollapseKey, Collection<Relation>> group : grouped.asMap().entrySet()) {
    CollapseKey key = group.getKey();
    NamespacedEntityId data = key.data;
    ProgramId program = key.program;

    Set<AccessType> mergedAccess = new HashSet<>();
    Set<RunId> mergedRuns = new HashSet<>();
    Set<NamespacedEntityId> mergedComponents = new HashSet<>();
    for (Relation member : group.getValue()) {
      mergedAccess.add(member.getAccess());
      mergedRuns.add(member.getRun());
      mergedComponents.addAll(member.getComponents());
    }

    result.add(toCollapsedRelation(data, program, mergedAccess, mergedRuns, mergedComponents));
  }
  return result;
}
/** * Convert a set of runIds into a scan range based on earliest runtime and latest runtime of runIds. * Also, add a scan filter to include only runIds in the given set. * @param runIds input runIds set * @return scan range */ @VisibleForTesting static ScanRangeWithFilter getScanRange(final Set<RunId> runIds) { if (runIds.isEmpty()) { return new ScanRangeWithFilter(0, 0, x -> false); } // Pick the earliest start time and latest start time for lineage range long earliest = Long.MAX_VALUE; long latest = 0; for (RunId runId : runIds) { long runStartTime = RunIds.getTime(runId, TimeUnit.MILLISECONDS); if (runStartTime < earliest) { earliest = runStartTime; } if (runStartTime > latest) { latest = runStartTime; } } // scan end key is exclusive, so need to add 1 to to include the last runid return new ScanRangeWithFilter(earliest, latest + 1, input -> runIds.contains(input.getRun())); }
/**
 * Wraps a relation and pre-computes its hash code from the relation's data, program, run and
 * components. The access type does not contribute to the hash.
 *
 * @param relation the relation this key wraps
 */
private RelationKey(Relation relation) {
  int precomputedHash = Objects.hash(
    relation.getData(),
    relation.getProgram(),
    relation.getRun(),
    relation.getComponents());
  this.relation = relation;
  this.hashCode = precomputedHash;
}
@Test public void testCollapseAccess() { Set<Relation> relations = ImmutableSet.of( new Relation(data1, service1, AccessType.READ, runId1), new Relation(data1, service1, AccessType.WRITE, runId1), new Relation(data1, service1, AccessType.READ, runId1) ); // Collapse on access Assert.assertEquals( toSet( new CollapsedRelation(data1, service1, toSet(AccessType.READ, AccessType.WRITE), toSet(runId1), Collections.emptySet()) ), LineageCollapser.collapseRelations(relations, ImmutableSet.of(CollapseType.ACCESS)) ); }
/**
 * Collects the workflow run ids referenced by the run records of the given relations.
 *
 * <p>For each relation, the run record of its program run is looked up in {@code runRecordMap}.
 * When the record exists and has a {@code "workflowrunid"} property, that property value is
 * added to the result.
 *
 * @param relations relations grouped by key; only the values are inspected
 * @param runRecordMap run records keyed by program run id
 * @return the distinct workflow run ids that were found
 * @throws NotFoundException declared by the signature and kept for compatibility with callers
 */
private Set<String> getWorkflowIds(Multimap<RelationKey, Relation> relations,
                                   Map<ProgramRunId, RunRecordMeta> runRecordMap) throws NotFoundException {
  Set<String> collected = new HashSet<>();
  for (Relation current : relations.values()) {
    RunRecordMeta record = runRecordMap.get(
      new ProgramRunId(current.getProgram().getNamespace(),
                       current.getProgram().getApplication(),
                       current.getProgram().getType(),
                       current.getProgram().getProgram(),
                       current.getRun().getId()));

    // Skip runs that are unknown or were not started by a workflow.
    if (record == null || !record.getProperties().containsKey("workflowrunid")) {
      continue;
    }

    String workflowRunId = record.getProperties().get("workflowrunid");
    collected.add(workflowRunId);
  }
  return collected;
}
@Override public boolean equals(Object o) { if (this == o) { return true; } if (o == null || getClass() != o.getClass()) { return false; } // Don't use AccessType for equals (same for hashCode) RelationKey other = (RelationKey) o; return Objects.equals(relation.getData(), other.relation.getData()) && Objects.equals(relation.getProgram(), other.relation.getProgram()) && Objects.equals(relation.getRun(), other.relation.getRun()) && Objects.equals(relation.getComponents(), other.relation.getComponents()); }
@Test public void testCollapseMulti() { Set<Relation> relations = ImmutableSet.of( new Relation(data1, service1, AccessType.READ, runId1), new Relation(data1, service1, AccessType.WRITE, runId1), new Relation(data1, service1, AccessType.READ, runId1), new Relation(data1, service2, AccessType.READ, runId1), new Relation(data1, service2, AccessType.READ, runId1), new Relation(data2, service1, AccessType.READ, runId1), new Relation(data2, service1, AccessType.READ, runId1) ); // Collapse on access Assert.assertEquals( toSet( new CollapsedRelation(data1, service1, toSet(AccessType.READ, AccessType.WRITE), toSet(runId1), Collections.emptySet()), new CollapsedRelation(data1, service2, toSet(AccessType.READ), toSet(runId1), Collections.emptySet()), new CollapsedRelation(data2, service1, toSet(AccessType.READ), toSet(runId1), Collections.emptySet()) ), LineageCollapser.collapseRelations(relations, ImmutableSet.of(CollapseType.ACCESS)) ); }
/**
 * Looks up the workflow run that the given relation's program run belongs to.
 *
 * <p>The run record for the relation's program run is fetched from {@code runRecordMap}. If it
 * exists and has a {@code "workflowrunid"} property, that property value is resolved through
 * {@code workflowIdMap}.
 *
 * @param relation the relation whose program run is inspected
 * @param runRecordMap run records keyed by program run id
 * @param workflowIdMap workflow program runs keyed by workflow run id
 * @return the workflow program run, or {@code null} if there is no run record, the record has
 *         no {@code "workflowrunid"} property, or the workflow run id is not in {@code workflowIdMap}
 */
@Nullable
private ProgramRunId getWorkflowProgramRunid(Relation relation,
                                             Map<ProgramRunId, RunRecordMeta> runRecordMap,
                                             Map<String, ProgramRunId> workflowIdMap) {
  RunRecordMeta record = runRecordMap.get(
    new ProgramRunId(relation.getProgram().getNamespace(),
                     relation.getProgram().getApplication(),
                     relation.getProgram().getType(),
                     relation.getProgram().getProgram(),
                     relation.getRun().getId()));

  // Nothing to resolve when the run is unknown or was not started by a workflow.
  if (record == null || !record.getProperties().containsKey("workflowrunid")) {
    return null;
  }

  String workflowRunId = record.getProperties().get("workflowrunid");
  return workflowIdMap.get(workflowRunId);
}
@Override public boolean equals(Object o) { if (this == o) { return true; } if (o == null || getClass() != o.getClass()) { return false; } // Don't use AccessType for equals (same for hashCode) RelationKey other = (RelationKey) o; return Objects.equals(relation.getData(), other.relation.getData()) && Objects.equals(relation.getProgram(), other.relation.getProgram()) && Objects.equals(relation.getRun(), other.relation.getRun()) && Objects.equals(relation.getComponents(), other.relation.getComponents()); }
@Test public void testCollapseComponent() { Set<Relation> relations = ImmutableSet.of( new Relation(data1, service1, AccessType.READ, runId1), new Relation(data1, service1, AccessType.WRITE, runId1), new Relation(data1, service1, AccessType.READ, runId1) ); // Collapse on component Assert.assertEquals( toSet( new CollapsedRelation(data1, service1, toSet(AccessType.READ), toSet(runId1), Collections.emptySet()), new CollapsedRelation(data1, service1, toSet(AccessType.WRITE), toSet(runId1), Collections.emptySet()) ), LineageCollapser.collapseRelations(relations, ImmutableSet.of(CollapseType.COMPONENT)) ); }