Collection<Record> batch = new ArrayList(); long start = System.currentTimeMillis(); while (it2.hasNext()) { Record record = it2.next(); batch.add(record); count++; srcread += (System.currentTimeMillis() - start); deduplicate(batch); it2.batchProcessed(); batch = new ArrayList(); start = System.currentTimeMillis(); it2.batchProcessed(); it2.close();
/**
 * Prints every record of every data source in the configuration to
 * standard output.
 *
 * <p>The data sources are gathered from {@code getDataSources()},
 * {@code getDataSources(1)} and {@code getDataSources(2)}, in that order.
 * Each record is written with {@code PrintMatchListener.prettyPrint}
 * using the configuration's properties, followed by an empty line.
 *
 * @param config the configuration whose data sources are dumped
 */
private static void showdata(Configuration config) {
  List<Property> props = config.getProperties();

  List<DataSource> sources = new ArrayList<DataSource>();
  sources.addAll(config.getDataSources());
  sources.addAll(config.getDataSources(1));
  sources.addAll(config.getDataSources(2));

  for (DataSource src : sources) {
    RecordIterator it = src.getRecords();
    try {
      while (it.hasNext()) {
        Record r = it.next();
        PrintMatchListener.prettyPrint(r, props);
        System.out.println("");
      }
    } finally {
      // release the iterator even if reading or printing a record fails
      it.close();
    }
  }
}
@Test public void testTripleModeNoUri() { setupTripleConfig1("?url"); // NOTE: url, not uri source.setRows(new String[][] {{"http://a/1", "http://a/name", "1"}}); try { RecordIterator it = source.getRecords(); it.next(); fail("No config exception, despite missing '?uri' column"); } catch (DukeConfigException e) { } }
/**
 * Tells whether there is another element, by asking {@code recit}.
 *
 * @return whatever {@code recit.hasNext()} reports
 */
public boolean hasNext() {
  final boolean more = recit.hasNext();
  return more;
}
/**
 * Retrieve new records from data sources, and match them to
 * previously indexed records in the given database. This method
 * does <em>not</em> index the new records.
 *
 * <p>Records are read from each source in turn and handed to
 * {@code linkBatch} whenever {@code batch_size} of them have been
 * collected; whatever is left over when a source is exhausted is
 * handed over as a final, smaller batch. The source's iterator is
 * always closed, also when reading or linking fails. Once all
 * sources are done {@code endProcessing()} is called.
 *
 * @param dbno Which database to match against.
 * @param sources The data sources to read new records from.
 * @param matchall If true, all matching records are accepted. If false,
 * only the single best match for each record is accepted.
 * @param batch_size The batch size to use. With a value of zero no
 * batch is ever flushed mid-stream, so each source is linked as one
 * batch.
 * @since 1.3
 */
public void linkRecords(int dbno, Collection<DataSource> sources,
                        boolean matchall, int batch_size) {
  for (DataSource source : sources) {
    source.setLogger(logger);

    Collection<Record> batch = new ArrayList<Record>(batch_size);
    RecordIterator it = source.getRecords();
    try {
      while (it.hasNext()) {
        batch.add(it.next());
        if (batch.size() == batch_size) {
          linkBatch(dbno, batch, matchall);
          batch.clear();
        }
      }
    } finally {
      // do not leak the iterator if next() or linkBatch() throws
      it.close();
    }

    // leftover records that did not fill a complete batch
    if (!batch.isEmpty()) {
      linkBatch(dbno, batch, matchall);
    }
  }

  endProcessing();
}
/**
 * Reads one row whose F2 cell is "b;d;e", with F2 configured to split
 * on ";", and checks that F2 yields the three values b, d and e while
 * F1 and F3 keep their single values.
 */
@Test
public void testSplitting() throws IOException {
  source.addColumn(new Column("F1", null, null, null));
  Column splitColumn = new Column("F2", null, null, null);
  splitColumn.setSplitOn(";");
  source.addColumn(splitColumn);
  source.addColumn(new Column("F3", null, null, null));

  Record first = read("F1,F2,F3\na,b;d;e,c").next();
  assertEquals("a", first.getValue("F1"));
  assertEquals("c", first.getValue("F3"));

  Collection<String> parts = first.getValues("F2");
  assertEquals(3, parts.size());
  for (String wanted : new String[] {"b", "d", "e"}) {
    assertTrue(parts.contains(wanted));
  }
}
/**
 * Reading an empty string must produce an iterator with no records.
 */
@Test
public void testEmpty() {
  boolean anyRecords = read("").hasNext();
  assertFalse("empty data source contains records", anyRecords);
}
/**
 * Builds an in-memory data source from three records with IDs 1, 2
 * and 3, and checks that the iterator returns exactly those records
 * in the order they were added.
 */
@Test
public void testSimple() {
  Collection<Record> records = new ArrayList<Record>();
  records.add(TestUtils.makeRecord("ID", "1"));
  records.add(TestUtils.makeRecord("ID", "2"));
  records.add(TestUtils.makeRecord("ID", "3"));

  InMemoryDataSource src = new InMemoryDataSource(records);
  RecordIterator it = src.getRecords();

  // assertEquals takes (message, expected, actual): the literal goes first
  // so that a failure reports expected and actual the right way round
  assertTrue("record missing", it.hasNext());
  assertEquals("wrong record", "1", it.next().getValue("ID"));
  assertTrue("record missing", it.hasNext());
  assertEquals("wrong record", "2", it.next().getValue("ID"));
  assertTrue("record missing", it.hasNext());
  assertEquals("wrong record", "3", it.next().getValue("ID"));
  assertFalse("too many records", it.hasNext());
}
} // closes the enclosing test class, whose header lies outside this chunk
/**
 * Index all new records from the given data sources into the given
 * database. This method does <em>not</em> do any matching.
 *
 * <p>{@code batchReady(batch_size)} is called every time another
 * {@code batch_size} records have been indexed (counted across all
 * sources). If the total number of records is not a multiple of
 * {@code batch_size}, the remaining records are reported with one
 * final {@code batchReady} call. The database is committed once at
 * the end. Each source's iterator is always closed, also when
 * reading or indexing fails.
 *
 * @param dbno Which database to index into.
 * @param sources The data sources to read records from.
 * @param batch_size The number of records per reported batch; it is
 * used as a divisor, so zero causes an ArithmeticException.
 * @since 1.3
 */
public void index(int dbno, Collection<DataSource> sources, int batch_size) {
  Database thedb = getDB(dbno);

  int count = 0;
  for (DataSource source : sources) {
    source.setLogger(logger);

    RecordIterator it2 = source.getRecords();
    try {
      while (it2.hasNext()) {
        Record record = it2.next();
        if (logger.isDebugEnabled()) {
          logger.debug("Indexing record " + record);
        }
        thedb.index(record);
        count++;
        if (count % batch_size == 0) {
          batchReady(batch_size);
        }
      }
    } finally {
      // do not leak the iterator if next() or index() throws
      it2.close();
    }
  }

  // Report the final, incomplete batch. The previous check was inverted
  // (== 0): it announced an empty batch when everything had already been
  // reported, and never announced a real partial batch.
  int remainder = count % batch_size;
  if (remainder != 0) {
    batchReady(remainder);
  }
  thedb.commit();
}
Collection<Record> batch = new ArrayList(); long start = System.currentTimeMillis(); while (it2.hasNext()) { Record record = it2.next(); batch.add(record); count++; srcread += (System.currentTimeMillis() - start); deduplicate(batch); it2.batchProcessed(); batch = new ArrayList(); start = System.currentTimeMillis(); it2.batchProcessed(); it2.close();
/**
 * Reads one row whose F2 cell is "b;;e", with F2 configured to split
 * on ";", and checks that the empty piece between the two separators
 * does not become a value: only b and e are expected.
 */
@Test
public void testNoValueForEmptySplit() throws IOException {
  source.addColumn(new Column("F1", null, null, null));
  Column splitColumn = new Column("F2", null, null, null);
  splitColumn.setSplitOn(";");
  source.addColumn(splitColumn);
  source.addColumn(new Column("F3", null, null, null));

  Record first = read("F1,F2,F3\na,b;;e,c").next();
  assertEquals("a", first.getValue("F1"));
  assertEquals("c", first.getValue("F3"));

  Collection<String> parts = first.getValues("F2");
  assertEquals(2, parts.size());
  for (String wanted : new String[] {"b", "e"}) {
    assertTrue(parts.contains(wanted));
  }
}
/**
 * Input consisting only of two line breaks must produce an iterator
 * with no records.
 */
@Test
public void testEmptyBlank() {
  boolean anyRecords = read("\n\n").hasNext();
  assertFalse("empty data source contains records", anyRecords);
}
/**
 * Prints every record of every data source in the configuration to
 * standard output.
 *
 * <p>The data sources are gathered from {@code getDataSources()},
 * {@code getDataSources(1)} and {@code getDataSources(2)}, in that order.
 * Each record is written with {@code PrintMatchListener.prettyPrint}
 * using the configuration's properties, followed by an empty line.
 *
 * @param config the configuration whose data sources are dumped
 */
private static void showdata(Configuration config) {
  List<Property> props = config.getProperties();

  List<DataSource> sources = new ArrayList<DataSource>();
  sources.addAll(config.getDataSources());
  sources.addAll(config.getDataSources(1));
  sources.addAll(config.getDataSources(2));

  for (DataSource src : sources) {
    RecordIterator it = src.getRecords();
    try {
      while (it.hasNext()) {
        Record r = it.next();
        PrintMatchListener.prettyPrint(r, props);
        System.out.println("");
      }
    } finally {
      // release the iterator even if reading or printing a record fails
      it.close();
    }
  }
}
/**
 * Reads one row whose F2 cell is " b ; d ; e ", with F2 configured to
 * split on ";" and to use a LowerCaseNormalizeCleaner, and checks that
 * the resulting values are exactly b, d and e without the surrounding
 * blanks.
 */
@Test
public void testSplittingCleaning() throws IOException {
  source.addColumn(new Column("F1", null, null, null));
  Column splitColumn =
      new Column("F2", null, null, new LowerCaseNormalizeCleaner());
  splitColumn.setSplitOn(";");
  source.addColumn(splitColumn);
  source.addColumn(new Column("F3", null, null, null));

  Record first = read("F1,F2,F3\na, b ; d ; e ,c").next();
  assertEquals("a", first.getValue("F1"));
  assertEquals("c", first.getValue("F3"));

  Collection<String> parts = first.getValues("F2");
  assertEquals(3, parts.size());
  for (String wanted : new String[] {"b", "d", "e"}) {
    assertTrue(parts.contains(wanted));
  }
}
/**
 * An InMemoryDataSource created without any records must produce an
 * iterator with no records.
 */
@Test
public void testEmpty() {
  RecordIterator records = new InMemoryDataSource().getRecords();
  boolean anyRecords = records.hasNext();
  assertFalse("empty data source contains records", anyRecords);
}
/**
 * Feeds a single (subject, property, value) row to the source and
 * checks that it comes back as exactly one record carrying the ID and
 * NAME values, no AGE value, and that the source reports two pages.
 */
@Test
public void testSingleTriple() {
  setupTripleConfig1("?uri");
  String[][] rows = {{"http://a/1", "http://a/name", "1"}};
  source.setRows(rows);

  RecordIterator records = source.getRecords();
  assertTrue("data source contains no records", records.hasNext());

  Record only = records.next();
  assertEquals("wrong ID", "http://a/1", only.getValue("ID"));
  assertEquals("wrong NAME", "1", only.getValue("NAME"));
  assertEquals("wrong AGE", null, only.getValue("AGE"));

  assertFalse("data source contains more than one record",
              records.hasNext());
  assertEquals("wrong number of pages", 2, source.getPages());
}
/**
 * Retrieve new records from data sources, and match them to
 * previously indexed records. This method does <em>not</em> index
 * the new records.
 *
 * <p>Records are read from each source in turn and handed to
 * {@code linkBatch} whenever {@code batch_size} of them have been
 * collected; whatever is left over when a source is exhausted is
 * handed over as a final, smaller batch. The source's iterator is
 * always closed, also when reading or linking fails. Once all
 * sources are done {@code endProcessing()} is called.
 *
 * @param sources The data sources to read new records from.
 * @param matchall If true, all matching records are accepted. If false,
 * only the single best match for each record is accepted.
 * @param batch_size The batch size to use. With a value of zero no
 * batch is ever flushed mid-stream, so each source is linked as one
 * batch.
 * @since 1.0
 */
public void linkRecords(Collection<DataSource> sources, boolean matchall,
                        int batch_size) {
  for (DataSource source : sources) {
    source.setLogger(logger);

    Collection<Record> batch = new ArrayList<Record>(batch_size);
    RecordIterator it = source.getRecords();
    try {
      while (it.hasNext()) {
        batch.add(it.next());
        if (batch.size() == batch_size) {
          linkBatch(batch, matchall);
          batch.clear();
        }
      }
    } finally {
      // do not leak the iterator if next() or linkBatch() throws
      it.close();
    }

    // leftover records that did not fill a complete batch
    if (!batch.isEmpty()) {
      linkBatch(batch, matchall);
    }
  }

  endProcessing();
}
/**
 * Sets up a "?uri" column plus three property columns, then supplies
 * result rows with four variables (S, P, O, C) and checks that
 * fetching the first record raises a DukeConfigException.
 */
@Test
public void testTripleModeFourColumns() {
  source.addColumn(new Column("?uri", "ID", null, null));
  source.addColumn(new Column("http://a/name", "NAME", null, null));
  source.addColumn(new Column("http://a/age", "AGE", null, null));
  source.addColumn(new Column("http://a/blubb", "BLUBB", null, null));

  String[] variables = {"S", "P", "O", "C"};
  source.setVariables(variables);
  String[][] rows = {{"http://a/1", "http://a/name", "1", "blubb"}};
  source.setRows(rows);

  try {
    source.getRecords().next();
    fail("Didn't catch four result columns in triple mode");
  } catch (DukeConfigException expected) {
    // this is the outcome the test is looking for
  }
}