/**
 * Reads three records joined by the delimiter {@code <DELIM>} using a buffer size of 12.
 * The records contain the text {@code <DEL?NO!>}, which begins like the delimiter but is
 * not one, and must be returned unchanged.
 *
 * @throws IOException if creating, opening, reading or closing the input fails
 */
@Test
public void testDelimiterOnBufferBoundary() throws IOException {
    final String separator = "<DELIM>";
    final String[] expectedRecords = {
        "1234567890<DEL?NO!>1234567890",
        "1234567890<DEL?NO!>1234567890",
        "<DEL?NO!>"
    };

    final FileInputSplit split = createTempFile(StringUtils.join(expectedRecords, separator));

    format.setBufferSize(12);
    format.setDelimiter(separator);
    format.configure(new Configuration());
    format.open(split);

    // every record must come back exactly as written, in order
    for (int i = 0; i < expectedRecords.length; i++) {
        final String actual = format.nextRecord(null);
        assertEquals(expectedRecords[i], actual);
    }

    // after the last record the format reports end of input
    assertNull(format.nextRecord(null));
    assertTrue(format.reachedEnd());

    format.close();
}
/**
 * Checks that a delimiter supplied through the {@code delimited-format.delimiter}
 * configuration key is picked up by {@code configure}, first for a newline and then
 * for a multi-character delimiter, reusing the same configuration object.
 */
@Test
public void testConfigure() {
    final Configuration config = new Configuration();

    for (String expected : new String[] {"\n", "&-&"}) {
        config.setString("delimited-format.delimiter", expected);
        format.configure(config);

        // decode the stored delimiter bytes with the format's own charset
        final String actual = new String(format.getDelimiter(), format.getCharset());
        assertEquals(expected, actual);
    }
}
// Applies the delimiter and charset under test to the format, configures it with an empty
// Configuration, and opens `split`. Exactly one record equal to `record` is expected,
// followed by a null record and reachedEnd() == true.
// NOTE(review): the file path set here is a placeholder (its name says it will not be read);
// presumably the data comes from `split`, which is created outside this excerpt — confirm.
format.setFilePath("file:///some/file/that/will/not/be/read"); format.setDelimiter(delimiter); format.setCharset(charset); format.configure(new Configuration()); format.open(split); String value = format.nextRecord(null); assertEquals(record, value); assertNull(format.nextRecord(null)); assertTrue(format.reachedEnd());
/** * Opens the given input split. This method opens the input stream to the specified file, allocates read buffers * and positions the stream at the correct position, making sure that any partial record at the beginning is skipped. * * @param split The input split to open. * * @see org.apache.flink.api.common.io.FileInputFormat#open(org.apache.flink.core.fs.FileInputSplit) */ @Override public void open(FileInputSplit split) throws IOException { super.open(split); initBuffers(); this.offset = splitStart; if (this.splitStart != 0) { this.stream.seek(offset); readLine(); // if the first partial record already pushes the stream over // the limit of our split, then no record starts within this split if (this.overLimit) { this.end = true; } } else { fillBuffer(0); } }
String delimString = parameters.getString(RECORD_DELIMITER, null); if (delimString != null) { setDelimiter(delimString); if (samplesString != null) { try { setNumLineSamples(Integer.parseInt(samplesString)); } catch (NumberFormatException e) { if (LOG.isWarnEnabled()) { LOG.warn("Invalid value for number of samples to take: " + samplesString + ". Skipping sampling."); setNumLineSamples(0);
if (this.readPos >= this.limit) { if (!fillBuffer(delimPos)) { int countInReadBuffer = delimPos; if (countInWrapBuffer + countInReadBuffer > 0) { setResult(this.wrapBuffer, 0, countInWrapBuffer); return true; } else { System.arraycopy(this.readBuffer, 0, this.wrapBuffer, countInWrapBuffer, count); setResult(this.wrapBuffer, 0, countInWrapBuffer + count); return true; } else { setResult(this.readBuffer, startPos, count); return true;
final FileBaseStatistics stats = getFileStats(cachedFileStats, getFilePaths(), allFiles); if (stats == null) { return null; open(split); if (readLine()) { totalNumBytes += this.currLen + this.delimiter.length; samplesTaken++; LOG.warn("Could not determine statistics for files '" + Arrays.toString(getFilePaths()) + "' " + "due to an io error: " + ioex.getMessage()); LOG.error("Unexpected problem while getting the file statistics for files '" + Arrays.toString(getFilePaths()) + "': " + t.getMessage(), t);
/**
 * Serializes a configured format with Java serialization, reads it back, and checks that the
 * number of line samples, the line length limit, the buffer size and the delimiter bytes
 * are preserved.
 *
 * @throws Exception if serialization or deserialization fails
 */
@Test
public void testSerialization() throws Exception {
    final byte[] delimiterBytes = {1, 2, 3, 4};
    final int lineSamples = 7;
    final int lineLengthLimit = 12345;
    final int bufferSize = 178;

    DelimitedInputFormat<String> original = new MyTextInputFormat();
    original.setDelimiter(delimiterBytes);
    original.setNumLineSamples(lineSamples);
    original.setLineLengthLimit(lineLengthLimit);
    original.setBufferSize(bufferSize);

    // write the format into an in-memory byte array
    ByteArrayOutputStream bytesOut = new ByteArrayOutputStream(4096);
    ObjectOutputStream objectOut = new ObjectOutputStream(bytesOut);
    objectOut.writeObject(original);
    objectOut.flush();
    objectOut.close();

    // read it back from those bytes
    byte[] serialized = bytesOut.toByteArray();
    ObjectInputStream objectIn = new ObjectInputStream(new ByteArrayInputStream(serialized));

    @SuppressWarnings("unchecked")
    DelimitedInputFormat<String> copy = (DelimitedInputFormat<String>) objectIn.readObject();

    assertEquals(lineSamples, copy.getNumLineSamples());
    assertEquals(lineLengthLimit, copy.getLineLengthLimit());
    assertEquals(bufferSize, copy.getBufferSize());
    assertArrayEquals(delimiterBytes, copy.getDelimiter());
}
this.open(split); } finally { this.offset = state; this.end = true; } else if (state > split.getStart()) { initBuffers(); fillBuffer(0); } else { this.splitLength = this.splitStart + split.getLength() - this.offset;
/**
 * Opens the split via the superclass and then creates the field parser for
 * {@code primitiveClass}.
 *
 * @param split the input split to open
 * @throws IOException if the superclass fails to open the split
 * @throws IllegalArgumentException if no parser is registered for {@code primitiveClass}
 */
@Override
public void open(FileInputSplit split) throws IOException {
    super.open(split);

    final Class<? extends FieldParser<OT>> parserClass = FieldParser.getParserForType(primitiveClass);

    if (parserClass != null) {
        parser = InstantiationUtil.instantiate(parserClass, FieldParser.class);
        return;
    }

    // no parser known for the requested type
    throw new IllegalArgumentException(
        "The type '" + primitiveClass.getName() + "' is not supported for the primitive input format.");
}
/**
 * Opens a two-line temp file with a buffer size of 5 and checks the resulting split start,
 * split length and buffer size of the format.
 *
 * @throws IOException if creating or opening the temp file fails
 */
@Test
public void testOpen() throws IOException {
    final int bufferSize = 5;
    final String content = "my mocked line 1\nmy mocked line 2\n";
    final FileInputSplit split = createTempFile(content);

    format.setBufferSize(bufferSize);
    format.open(split);

    assertEquals(0, format.splitStart);
    // after open, the split length is expected to be reduced by one buffer's worth
    assertEquals(content.length() - bufferSize, format.splitLength);
    assertEquals(bufferSize, format.getBufferSize());
}
/**
 * Sets the delimiter from a string. The string is encoded to bytes with the charset returned
 * by {@code getCharset()}, and the string itself is stored alongside the bytes.
 *
 * @param delimiter the delimiter string; must not be {@code null}
 * @throws IllegalArgumentException if {@code delimiter} is {@code null}
 */
public void setDelimiter(String delimiter) {
    if (delimiter == null) {
        throw new IllegalArgumentException("Delimiter must not be null");
    }

    final byte[] encoded = delimiter.getBytes(getCharset());
    this.delimiter = encoded;
    this.delimiterString = delimiter;
}
/**
 * Configures the format through the superclass and then validates the charset name held in
 * {@code charsetName}.
 *
 * <p>A {@code null} name, a name the JVM has no charset for, and a syntactically illegal name
 * are all rejected with the same {@link RuntimeException}. For an illegal name the underlying
 * {@code IllegalCharsetNameException} is attached as the cause.
 *
 * @param parameters the configuration passed on to the superclass
 * @throws RuntimeException if the charset name is missing, illegal, or not supported
 */
@Override
public void configure(Configuration parameters) {
    super.configure(parameters);

    boolean supported;
    try {
        supported = charsetName != null && Charset.isSupported(charsetName);
    } catch (java.nio.charset.IllegalCharsetNameException e) {
        // Charset.isSupported throws (rather than returning false) for malformed names;
        // report it the same way as any other unusable charset, keeping the cause
        throw new RuntimeException("Unsupported charset: " + charsetName, e);
    }

    if (!supported) {
        throw new RuntimeException("Unsupported charset: " + charsetName);
    }
}
// Exercises getStatistics() four times, re-setting the file path and re-configuring the format
// before each call:
//   1. no cached statistics (null): a non-null result whose total input size equals `size`;
//   2. the result of step 1 passed as cache: the returned statistics must equal the cached ones;
//   3. `fakeStats` passed as cache: the reported size is `fakeSize`;
//   4. `outDatedFakeStats` passed as cache: the reported size is the real `size` again.
// NOTE(review): `fakeStats` and `outDatedFakeStats` are built outside this excerpt; presumably
// they differ in how recent the cached statistics claim to be — confirm against their definitions.
format.setFilePath(tempFile); format.configure(new Configuration()); FileBaseStatistics stats = format.getStatistics(null); assertNotNull(stats); assertEquals("The file size from the statistics is wrong.", size, stats.getTotalInputSize()); format.setFilePath(tempFile); format.configure(new Configuration()); FileBaseStatistics newStats = format.getStatistics(stats); assertEquals("Statistics object was changed.", newStats, stats); format.setFilePath(tempFile); format.configure(new Configuration()); BaseStatistics latest = format.getStatistics(fakeStats); assertEquals("The file size from the statistics is wrong.", fakeSize, latest.getTotalInputSize()); format.setFilePath(tempFile); format.configure(new Configuration()); BaseStatistics reGathered = format.getStatistics(outDatedFakeStats); assertEquals("The file size from the statistics is wrong.", size, reGathered.getTotalInputSize());
/**
 * Closes the format. Before delegating to the superclass, logs how many invalid lines
 * (warn level) and comment lines (info level) were skipped for the current split, provided
 * the respective count is positive and the log level is enabled.
 *
 * @throws IOException if the superclass fails to close
 */
@Override
public void close() throws IOException {
    if (this.invalidLineCount > 0 && LOG.isWarnEnabled()) {
        LOG.warn("In file \"" + currentSplit.getPath() + "\" (split start: " + this.splitStart + ") "
            + this.invalidLineCount + " invalid line(s) were skipped.");
    }

    if (this.commentCount > 0 && LOG.isInfoEnabled()) {
        LOG.info("In file \"" + currentSplit.getPath() + "\" (split start: " + this.splitStart + ") "
            + this.commentCount + " comment line(s) were skipped.");
    }

    super.close();
}
/**
 * Sets the charset through the superclass and then re-encodes the field delimiter and the
 * comment prefix from their string forms, when those are set, using the charset returned
 * by {@code getCharset()}.
 *
 * @param charset the charset name handed to the superclass
 */
@Override
public void setCharset(String charset) {
    super.setCharset(charset);

    // refresh the field delimiter bytes if a string form is available
    final String fieldDelimText = this.fieldDelimString;
    if (fieldDelimText != null) {
        this.fieldDelim = fieldDelimText.getBytes(getCharset());
    }

    // refresh the comment prefix bytes if a string form is available
    final String commentPrefixText = this.commentPrefixString;
    if (commentPrefixText != null) {
        this.commentPrefix = commentPrefixText.getBytes(getCharset());
    }
}
/**
 * Sets the delimiter from a single character by converting it to a one-character string
 * and delegating to {@code setDelimiter(String)}.
 *
 * @param delimiter the delimiter character
 */
public void setDelimiter(char delimiter) {
    final String asString = Character.toString(delimiter);
    setDelimiter(asString);
}
final FileBaseStatistics stats = getFileStats(cachedFileStats, getFilePaths(), allFiles); if (stats == null) { return null; open(split); if (readLine()) { totalNumBytes += this.currLen + this.delimiter.length; samplesTaken++; LOG.warn("Could not determine statistics for files '" + Arrays.toString(getFilePaths()) + "' " + "due to an io error: " + ioex.getMessage()); LOG.error("Unexpected problem while getting the file statistics for files '" + Arrays.toString(getFilePaths()) + "': " + t.getMessage(), t);
this.open(split); } finally { this.offset = state; this.end = true; } else if (state > split.getStart()) { initBuffers(); fillBuffer(0); } else { this.splitLength = this.splitStart + split.getLength() - this.offset;
@Override public void open(FileInputSplit split) throws IOException { super.open(split);