/**
 * Registers every column of the schema for inverted index creation.
 * Logs a warning and does nothing when the schema has not been set yet.
 */
public void createInvertedIndexForAllColumns() {
  if (_schema == null) {
    LOGGER.warn("Schema has not been set, will not create inverted index for all columns.");
    return;
  }
  _schema.getAllFieldSpecs().forEach(fieldSpec -> _invertedIndexCreationColumns.add(fieldSpec.getName()));
}
/**
 * Builds the Avro record schema equivalent to the given Pinot schema.
 * Each field spec contributes its own Avro field JSON; the assembled JSON is
 * then parsed by the Avro schema parser.
 */
public static org.apache.avro.Schema getAvroSchema(Schema schema) {
  // Collect one Avro field definition per Pinot field spec
  ArrayNode fieldNodes = JsonUtils.newArrayNode();
  for (FieldSpec fieldSpec : schema.getAllFieldSpecs()) {
    fieldNodes.add(fieldSpec.toAvroSchemaJsonObject());
  }
  ObjectNode avroSchemaNode = JsonUtils.newObjectNode();
  avroSchemaNode.put("name", "data_gen_record");
  avroSchemaNode.put("type", "record");
  avroSchemaNode.set("fields", fieldNodes);
  return new org.apache.avro.Schema.Parser().parse(avroSchemaNode.toString());
}
/**
 * Creates a sorter over the given segment.
 *
 * @param numDocs total number of documents in the segment
 * @param schema Pinot schema of the segment
 * @param columnReaderMap per-column readers used to fetch values
 */
public PinotSegmentSorter(int numDocs, Schema schema, Map<String, PinotSegmentColumnReader> columnReaderMap) {
  _numDocs = numDocs;
  _schema = schema;
  _columnReaderMap = columnReaderMap;
  _dimensionNames = new ArrayList<>();
  for (FieldSpec fieldSpec : _schema.getAllFieldSpecs()) {
    // Every field that is not a metric is treated as a dimension
    if (fieldSpec.getFieldType() == FieldSpec.FieldType.METRIC) {
      continue;
    }
    _dimensionNames.add(fieldSpec.getName());
    _numDimensions++;
  }
}
/**
 * Fills the reusable row with the value of every schema column for the given document.
 */
@Override
public GenericRow getRecord(int docId, GenericRow reuse) {
  for (FieldSpec fieldSpec : _schema.getAllFieldSpecs()) {
    String columnName = fieldSpec.getName();
    Object value = IndexSegmentUtils.getValue(docId, fieldSpec, _indexReaderWriterMap.get(columnName),
        _dictionaryMap.get(columnName), _maxNumValuesMap.getOrDefault(columnName, 0));
    reuse.putField(columnName, value);
  }
  return reuse;
}
/**
 * Decodes a JSON payload into the destination row, extracting one value per schema column.
 * Returns {@code null} when the payload cannot be decoded (the row is discarded).
 */
@Override
public GenericRow decode(byte[] payload, GenericRow destination) {
  try {
    JsonNode message = JsonUtils.bytesToJsonNode(payload);
    for (FieldSpec fieldSpec : schema.getAllFieldSpecs()) {
      String columnName = fieldSpec.getName();
      Object value = JsonUtils.extractValue(message.get(columnName), fieldSpec);
      destination.putField(columnName, value);
    }
    return destination;
  } catch (Exception e) {
    // Best-effort decoding: a malformed payload is logged and dropped instead of failing the consumer
    LOGGER.error("Caught exception while decoding row, discarding row.", e);
    return null;
  }
}
private void addInvertedIndex(int docId, Map<String, Object> dictIdMap) { // Update inverted index at last // NOTE: inverted index have to be updated at last because once it gets updated, the latest record will become // queryable for (FieldSpec fieldSpec : _schema.getAllFieldSpecs()) { String column = fieldSpec.getName(); RealtimeInvertedIndexReader invertedIndex = _invertedIndexMap.get(column); if (invertedIndex != null) { if (fieldSpec.isSingleValueField()) { invertedIndex.add(((Integer) dictIdMap.get(column)), docId); } else { int[] dictIds = (int[]) dictIdMap.get(column); for (int dictId : dictIds) { invertedIndex.add(dictId, docId); } } } } }
/**
 * Reads the next JSON record and converts each schema column into the reusable row.
 */
@Override
public GenericRow next(GenericRow reuse) {
  Map record = _iterator.next();
  for (FieldSpec fieldSpec : _schema.getAllFieldSpecs()) {
    String fieldName = fieldSpec.getName();
    Object jsonValue = record.get(fieldName);
    Object convertedValue;
    if (fieldSpec.isSingleValueField()) {
      // Single value: convert via the value's string form (a missing value stays null)
      convertedValue =
          RecordReaderUtils.convertToDataType(jsonValue == null ? null : jsonValue.toString(), fieldSpec);
    } else {
      convertedValue = RecordReaderUtils.convertToDataTypeArray((ArrayList) jsonValue, fieldSpec);
    }
    reuse.putField(fieldName, convertedValue);
  }
  return reuse;
}
/**
 * Returns a comma separated list of qualifying field name strings
 * @param type FieldType to filter on
 * @return Comma separate qualifying fields names.
 */
@JsonIgnore
private String getQualifyingFields(FieldType type, boolean excludeVirtualColumns) {
  List<String> matchingFields = new ArrayList<>();
  for (FieldSpec fieldSpec : getSchema().getAllFieldSpecs()) {
    String fieldName = fieldSpec.getName();
    // Optionally drop virtual columns, then keep only fields of the requested type
    if (excludeVirtualColumns && getSchema().isVirtualColumn(fieldName)) {
      continue;
    }
    if (fieldSpec.getFieldType() == type) {
      matchingFields.add(fieldName);
    }
  }
  Collections.sort(matchingFields);
  return StringUtils.join(matchingFields, ",");
}
}
/**
 * Serializes the given object into the reusable Pinot record by converting it to JSON
 * and extracting one value per schema column.
 */
@Override
public PinotRecord serialize(T t) {
  // Reset the shared record before repopulating it
  _record.clear();
  JsonNode jsonRecord = JsonUtils.objectToJsonNode(t);
  for (FieldSpec fieldSpec : _schema.getAllFieldSpecs()) {
    String columnName = fieldSpec.getName();
    Object value = JsonUtils.extractValue(jsonRecord.get(columnName), fieldSpec);
    _record.putField(columnName, value);
  }
  return _record;
}
/**
 * Validates the Pinot schema against the Avro file's schema:
 * - missing Avro fields and data-type mismatches only log a warning;
 * - a single-value / multi-value mismatch is fatal and throws.
 */
private void validateSchema() {
  org.apache.avro.Schema avroSchema = _avroReader.getSchema();
  for (FieldSpec fieldSpec : _schema.getAllFieldSpecs()) {
    String fieldName = fieldSpec.getName();
    Field avroField = avroSchema.getField(fieldName);
    if (avroField == null) {
      LOGGER.warn("Pinot field: {} does not exist in Avro Schema", fieldName);
      continue;
    }
    boolean pinotSingleValue = fieldSpec.isSingleValueField();
    boolean avroSingleValue = AvroUtils.isSingleValueField(avroField);
    if (pinotSingleValue != avroSingleValue) {
      // Cardinality mismatch cannot be reconciled at read time — fail fast
      String errorMessage = "Pinot field: " + fieldName + " is " + (pinotSingleValue ? "Single" : "Multi")
          + "-valued in Pinot schema but not in Avro schema";
      LOGGER.error(errorMessage);
      throw new IllegalStateException(errorMessage);
    }
    DataType pinotType = fieldSpec.getDataType();
    DataType avroType = AvroUtils.extractFieldDataType(avroField);
    if (pinotType != avroType) {
      LOGGER.warn("Pinot field: {} of type: {} mismatches with corresponding field in Avro Schema of type: {}",
          fieldName, pinotType, avroType);
    }
  }
}
/**
 * Return the row given a docId
 */
private GenericRow getRecord(GenericRow reuse, int docId) {
  for (FieldSpec fieldSpec : _schema.getAllFieldSpecs()) {
    String fieldName = fieldSpec.getName();
    // Single-value columns are read with their data type; multi-value columns as arrays
    Object value = fieldSpec.isSingleValueField()
        ? _columnReaderMap.get(fieldName).readSV(docId, fieldSpec.getDataType())
        : _columnReaderMap.get(fieldName).readMV(docId);
    reuse.putField(fieldName, value);
  }
  return reuse;
}
/**
 * Copies every schema column from the Avro record into the Pinot row.
 * The TIME column is read through the incoming (source-side) time field spec.
 */
@Nonnull
public GenericRow transform(@Nonnull GenericData.Record from, @Nonnull GenericRow to) {
  for (FieldSpec fieldSpec : _schema.getAllFieldSpecs()) {
    // For the TIME field, substitute the incoming time field spec for the outgoing one
    FieldSpec sourceFieldSpec =
        (fieldSpec.getFieldType() == FieldSpec.FieldType.TIME) ? _incomingTimeFieldSpec : fieldSpec;
    String fieldName = sourceFieldSpec.getName();
    Object avroValue = from.get(fieldName);
    if (sourceFieldSpec.isSingleValueField()) {
      to.putField(fieldName, AvroUtils.transformAvroValueToObject(avroValue, sourceFieldSpec));
    } else {
      to.putField(fieldName, AvroUtils.transformAvroArrayToObjectArray((Array) avroValue, sourceFieldSpec));
    }
  }
  return to;
}
}
/**
 * Reads the next CSV record and converts each schema column into the reusable row.
 * Columns absent from the record are treated as null.
 */
@Override
public GenericRow next(GenericRow reuse) {
  CSVRecord csvRecord = _iterator.next();
  for (FieldSpec fieldSpec : _schema.getAllFieldSpecs()) {
    String columnName = fieldSpec.getName();
    String rawValue = csvRecord.isSet(columnName) ? csvRecord.get(columnName) : null;
    Object convertedValue;
    if (fieldSpec.isSingleValueField()) {
      convertedValue = RecordReaderUtils.convertToDataType(rawValue, fieldSpec);
    } else {
      // Multi-value: split the raw token on the configured delimiter before conversion
      convertedValue = RecordReaderUtils.convertToDataTypeArray(
          rawValue == null ? null : StringUtils.split(rawValue, _multiValueDelimiter), fieldSpec);
    }
    reuse.putField(columnName, convertedValue);
  }
  return reuse;
}
/**
 * Fill the data in a {@link GenericRecord} to a {@link GenericRow}.
 */
public static void fillGenericRow(GenericRecord from, GenericRow to, Schema schema) {
  for (FieldSpec fieldSpec : schema.getAllFieldSpecs()) {
    String fieldName = fieldSpec.getName();
    Object avroValue = from.get(fieldName);
    // Single-value columns convert directly; multi-value columns convert element-wise
    Object convertedValue = fieldSpec.isSingleValueField()
        ? transformAvroValueToObject(avroValue, fieldSpec)
        : transformAvroArrayToObjectArray((GenericData.Array) avroValue, fieldSpec);
    to.putField(fieldName, convertedValue);
  }
}
/**
 * Fills the reusable row with the value of every schema column for the given document,
 * reading through each column's index container (forward index + optional dictionary).
 */
@Override
public GenericRow getRecord(int docId, GenericRow reuse) {
  for (FieldSpec fieldSpec : _segmentMetadata.getSchema().getAllFieldSpecs()) {
    String columnName = fieldSpec.getName();
    ColumnIndexContainer container = _indexContainerMap.get(columnName);
    Object value = IndexSegmentUtils.getValue(docId, fieldSpec, container.getForwardIndex(),
        container.getDictionary(), _segmentMetadata.getColumnMetadataFor(columnName).getMaxNumberOfMultiValues());
    reuse.putField(columnName, value);
  }
  return reuse;
}
}
/**
 * Generates {@code numRows} rows of random data conforming to the given schema.
 */
public static List<GenericRow> createTestData(Schema schema, int numRows) {
  Random random = new Random();
  List<GenericRow> rows = new ArrayList<>();
  for (int rowId = 0; rowId < numRows; rowId++) {
    Map<String, Object> fieldValues = new HashMap<>();
    for (FieldSpec fieldSpec : schema.getAllFieldSpecs()) {
      Object value = fieldSpec.isSingleValueField()
          ? generateSingleValue(random, fieldSpec.getDataType())
          : generateMultiValue(random, fieldSpec.getDataType());
      fieldValues.put(fieldSpec.getName(), value);
    }
    GenericRow row = new GenericRow();
    row.init(fieldValues);
    rows.add(row);
  }
  return rows;
}
/** * Complete the stats gathering process and store the stats information in indexCreationInfoMap. */ void buildIndexCreationInfo() throws Exception { for (FieldSpec spec : dataSchema.getAllFieldSpecs()) { String column = spec.getName(); // Skip adding virtual columns, so that they don't get an on-disk representation if (dataSchema.isVirtualColumn(column)) { continue; } ColumnStatistics columnProfile = segmentStats.getColumnProfileFor(column); indexCreationInfoMap.put(column, new ColumnIndexCreationInfo(columnProfile, true/*createDictionary*/, ForwardIndexType.FIXED_BIT_COMPRESSED, InvertedIndexType.ROARING_BITMAPS, false/*isAutoGenerated*/, dataSchema.getFieldSpecFor(column).getDefaultNullValue())); } segmentIndexCreationInfo.setTotalDocs(totalDocs); segmentIndexCreationInfo.setTotalRawDocs(totalRawDocs); segmentIndexCreationInfo.setTotalAggDocs(totalAggDocs); segmentIndexCreationInfo.setStarTreeEnabled(createStarTree); }
/**
 * Verifies that the mutable segment reports the same segment- and column-level
 * metadata as the equivalent immutable segment.
 */
@Test
public void testMetadata() {
  SegmentMetadata actualMetadata = _mutableSegmentImpl.getSegmentMetadata();
  SegmentMetadata expectedMetadata = _immutableSegment.getSegmentMetadata();
  Assert.assertEquals(actualMetadata.getTotalDocs(), expectedMetadata.getTotalDocs());
  for (FieldSpec fieldSpec : _schema.getAllFieldSpecs()) {
    String columnName = fieldSpec.getName();
    DataSourceMetadata actual = _mutableSegmentImpl.getDataSource(columnName).getDataSourceMetadata();
    DataSourceMetadata expected = _immutableSegment.getDataSource(columnName).getDataSourceMetadata();
    Assert.assertEquals(actual.getDataType(), expected.getDataType());
    Assert.assertEquals(actual.isSingleValue(), expected.isSingleValue());
    Assert.assertEquals(actual.getNumDocs(), expected.getNumDocs());
    if (!expected.isSingleValue()) {
      // Max number of multi-values is only meaningful for MV columns
      Assert.assertEquals(actual.getMaxNumMultiValues(), expected.getMaxNumMultiValues());
    }
  }
}
/**
 * Verifies that, for every single-value column, the mutable and immutable segments
 * yield the same dictionary size and the same dictionary values doc by doc.
 */
@Test
public void testDataSourceForSVColumns() {
  for (FieldSpec fieldSpec : _schema.getAllFieldSpecs()) {
    if (!fieldSpec.isSingleValueField()) {
      continue;
    }
    String columnName = fieldSpec.getName();
    DataSource actualDataSource = _mutableSegmentImpl.getDataSource(columnName);
    DataSource expectedDataSource = _immutableSegment.getDataSource(columnName);
    Dictionary actualDictionary = actualDataSource.getDictionary();
    Dictionary expectedDictionary = expectedDataSource.getDictionary();
    Assert.assertEquals(actualDictionary.length(), expectedDictionary.length());
    BlockSingleValIterator actualIterator =
        (BlockSingleValIterator) actualDataSource.nextBlock().getBlockValueSet().iterator();
    BlockSingleValIterator expectedIterator =
        (BlockSingleValIterator) expectedDataSource.nextBlock().getBlockValueSet().iterator();
    // Walk both columns in lock step, comparing dictionary values (dict ids may differ)
    while (expectedIterator.hasNext()) {
      Assert.assertTrue(actualIterator.hasNext());
      Assert.assertEquals(actualDictionary.get(actualIterator.nextIntVal()),
          expectedDictionary.get(expectedIterator.nextIntVal()));
    }
    Assert.assertFalse(actualIterator.hasNext());
  }
}
// Verifies that, for every multi-value column, the mutable and immutable segments yield
// the same per-doc value counts and the same dictionary values, walking both columns in
// lock step. Dict ids may differ between segments; comparison is on dictionary values.
@Test public void testDataSourceForMVColumns() { for (FieldSpec fieldSpec : _schema.getAllFieldSpecs()) { if (!fieldSpec.isSingleValueField()) { String column = fieldSpec.getName(); DataSource actualDataSource = _mutableSegmentImpl.getDataSource(column); DataSource expectedDataSource = _immutableSegment.getDataSource(column); Dictionary actualDictionary = actualDataSource.getDictionary(); Dictionary expectedDictionary = expectedDataSource.getDictionary(); Assert.assertEquals(actualDictionary.length(), expectedDictionary.length()); BlockMultiValIterator actualMVIterator = (BlockMultiValIterator) actualDataSource.nextBlock().getBlockValueSet().iterator(); BlockMultiValIterator expectedMVIterator = (BlockMultiValIterator) expectedDataSource.nextBlock().getBlockValueSet().iterator(); int numMaxMultiValues = expectedDataSource.getDataSourceMetadata().getMaxNumMultiValues(); // Reusable buffers sized to the widest row; nextIntVal fills them and returns the count used int[] actualDictIds = new int[numMaxMultiValues]; int[] expectedDictIds = new int[numMaxMultiValues]; while (expectedMVIterator.hasNext()) { Assert.assertTrue(actualMVIterator.hasNext()); int actualNumMultiValues = actualMVIterator.nextIntVal(actualDictIds); int expectedNumMultiValues = expectedMVIterator.nextIntVal(expectedDictIds); Assert.assertEquals(actualNumMultiValues, expectedNumMultiValues); for (int i = 0; i < expectedNumMultiValues; i++) { Assert.assertEquals(actualDictionary.get(actualDictIds[i]), expectedDictionary.get(expectedDictIds[i])); } } Assert.assertFalse(actualMVIterator.hasNext()); } } }