/** * Deduplicates a newly arrived batch of records. The records may * have been seen before. */ public void deduplicate(Collection<Record> records) { logger.info("Deduplicating batch of " + records.size() + " records"); batchReady(records.size()); // prepare long start = System.currentTimeMillis(); for (Record record : records) database1.index(record); database1.commit(); indexing += System.currentTimeMillis() - start; // then match match(records, true); batchDone(); }
/**
 * Adds every record of the batch to the index of the database selected
 * by {@code dbno}, then commits. <em>No</em> matching is performed here.
 *
 * @param dbno handed to {@code getDB} to select the target database
 * @param batch the records to index
 * @since 1.3
 */
public void index(int dbno, Collection<Record> batch) {
  final Database target = getDB(dbno);

  for (Record entry : batch) {
    // guard so the message string is only built when debug logging is on
    if (logger.isDebugEnabled()) {
      logger.debug("Indexing record " + entry);
    }
    target.index(entry);
  }

  target.commit();
}
/**
 * Indexes a single record and checks that a candidate search for that
 * same record yields exactly one hit carrying the expected ID.
 */
@Test
public void testTrivialFind() throws IOException {
  Record indexed = TestUtils.makeRecord("ID", "1",
                                        "NAME", "AND",
                                        "EMAIL", "BBBBB");
  db.index(indexed);
  db.commit();

  Collection<Record> candidates = db.findCandidateMatches(indexed);
  assertEquals("no record found", 1, candidates.size());

  Record hit = candidates.iterator().next();
  assertEquals("wrong ID", "1", hit.getValue("ID"));
}
/**
 * Indexes a record, loads it back by ID, and verifies both the
 * individual property values and the set of property names.
 */
@Test
public void testRecordImplementation() throws IOException {
  Record original = TestUtils.makeRecord("ID", "1",
                                         "NAME", "AND",
                                         "EMAIL", "BBBBB");
  db.index(original);
  db.commit();

  Record stored = db.findRecordById("1");

  // values of the three properties
  assertEquals("wrong ID", "1", stored.getValue("ID"));
  assertEquals("wrong NAME", "AND", stored.getValue("NAME"));
  assertEquals("wrong EMAIL", "BBBBB", stored.getValue("EMAIL"));

  // names of the properties: exactly these three
  Collection<String> names = stored.getProperties();
  assertEquals("wrong number of properties", 3, names.size());
  assertTrue("no ID", names.contains("ID"));
  assertTrue("no NAME", names.contains("NAME"));
  assertTrue("no EMAIL", names.contains("EMAIL"));
}
/**
 * Checks that a record whose ID has the form {@code _:RHUKdfPM299}
 * can be indexed and then looked up again by that ID.
 */
@Test
public void testBNode() throws IOException {
  final String bnodeId = "_:RHUKdfPM299";

  Record indexed = TestUtils.makeRecord("ID", bnodeId,
                                        "NAME", "AND",
                                        "EMAIL", "BBBBB");
  db.index(indexed);
  db.commit();

  Record found = db.findRecordById(bnodeId);
  assertTrue("no record found", found != null);
  assertEquals("wrong ID", bnodeId, found.getValue("ID"));
}
/**
 * Indexes one record and verifies that a lookup by ID returns it with
 * the expected ID and EMAIL values.
 */
@Test
public void testTrivial() throws IOException {
  Record indexed = TestUtils.makeRecord("ID", "1",
                                        "NAME", "AND",
                                        "EMAIL", "BBBBB");
  db.index(indexed);
  db.commit();

  Record found = db.findRecordById("1");
  assertTrue("no record found", found != null);
  assertEquals("wrong ID", "1", found.getValue("ID"));
  assertEquals("wrong EMAIL", "BBBBB", found.getValue("EMAIL"));
}
/**
 * Checks that a record whose ID is an HTTP URI can be indexed and then
 * looked up again by that same URI.
 */
@Test
public void testURI() throws IOException {
  final String uri = "http://norman.walsh.name/knows/who/robin-berjon";

  Record indexed = TestUtils.makeRecord("ID", uri,
                                        "NAME", "AND",
                                        "EMAIL", "BBBBB");
  db.index(indexed);
  db.commit();

  Record found = db.findRecordById(uri);
  assertTrue("no record found", found != null);
  assertEquals("wrong ID", uri, found.getValue("ID"));
}
@Test public void testPersistence() throws IOException { // can we index a record, close and reopen the database, and find // the same record again afterwards? assertTrue("database claims to be in-memory", !db.isInMemory()); Record record = TestUtils.makeRecord("ID", "1", "NAME", "AND", "EMAIL", "BBBBB"); db.index(record); db.commit(); db.close(); db = createDatabase(config); Record r = db.findRecordById("1"); assertTrue("record not found after reopening", r != null); assertEquals("wrong ID", "1", r.getValue("ID")); assertEquals("wrong NAME", "AND", r.getValue("NAME")); assertEquals("wrong EMAIL", "BBBBB", r.getValue("EMAIL")); Collection<Record> recs = db.findCandidateMatches(record); assertEquals("wrong number of records found", 1, recs.size()); r = recs.iterator().next(); assertEquals("wrong ID", "1", r.getValue("ID")); assertEquals("wrong NAME", "AND", r.getValue("NAME")); assertEquals("wrong EMAIL", "BBBBB", r.getValue("EMAIL")); }
@Test public void testBoostAt1() throws IOException { // make own config ExactComparator comp = new ExactComparator(); List<Property> props = new ArrayList(); props.add(new PropertyImpl("ID")); props.add(new PropertyImpl("NAME", comp, 0.3, 1.0)); // 1.0 !!! props.add(new PropertyImpl("EMAIL", comp, 0.3, 0.8)); config = new ConfigurationImpl(); config.setProperties(props); config.setThreshold(0.85); config.setMaybeThreshold(0.8); db = createDatabase(config); // now we can try Record record = TestUtils.makeRecord("ID", "1", "NAME", "George", "EMAIL", "BBBBB"); db.index(record); db.commit(); Collection<Record> cands = db.findCandidateMatches(record); assertEquals("no record found", 1, cands.size()); assertEquals("wrong ID", "1", cands.iterator().next().getValue("ID")); } }
@Test public void testOverwrite() throws IOException { // can we index a record, close and reopen the database with overwrite // set, and not find it again? assertTrue("database claims to be in-memory", !db.isInMemory()); Record record = TestUtils.makeRecord("ID", "1", "NAME", "AND", "EMAIL", "BBBBB"); db.index(record); db.commit(); db.close(); db = createDatabase(config); db.setOverwrite(true); Record r = db.findRecordById("1"); assertTrue("record found after reopening, despite overwrite", r == null); }
/**
 * Indexes a record whose NAME contains quotes, backslashes and
 * parentheses, then runs a candidate search with a similar NAME.
 * There are no assertions; the test passes if nothing is thrown.
 */
@Test
public void testBackslash() throws IOException {
  final String indexedName = "\"Lastname, Firstname \\(external\\)\"";
  Record indexed = TestUtils.makeRecord("ID", "1",
                                        "NAME", indexedName,
                                        "EMAIL", "BBBBB");
  db.index(indexed);
  db.commit();

  Record probe =
    TestUtils.makeRecord("NAME", "\"lastname, firstname \\(external\\)\"");
  db.findCandidateMatches(probe);
}
database.index(it.next()); database.commit();
/**
 * Index all new records from the given data sources into the given
 * database. This method does <em>not</em> do any matching.
 *
 * <p>{@code batchReady(batch_size)} is called each time another
 * {@code batch_size} records have been indexed. If the total record
 * count is not a multiple of {@code batch_size}, {@code batchReady} is
 * called once more with the size of the trailing partial batch.
 *
 * @param dbno handed to {@code getDB} to select the target database
 * @param sources the data sources whose records are indexed
 * @param batch_size number of records per batch notification; must be
 *        non-zero, because it is used as a modulus
 * @since 1.3
 */
public void index(int dbno, Collection<DataSource> sources, int batch_size) {
  Database thedb = getDB(dbno);

  int count = 0;
  for (DataSource source : sources) {
    source.setLogger(logger);

    RecordIterator it2 = source.getRecords();
    try {
      while (it2.hasNext()) {
        Record record = it2.next();
        if (logger.isDebugEnabled())
          logger.debug("Indexing record " + record);
        thedb.index(record);
        count++;
        if (count % batch_size == 0)
          batchReady(batch_size);
      }
    } finally {
      // close the iterator even if indexing a record throws
      it2.close();
    }
  }

  // Report the leftover records that did not fill a whole batch. The
  // previous check (== 0) was inverted: it reported an empty batch when
  // nothing was left over and skipped the partial batch otherwise.
  int remainder = count % batch_size;
  if (remainder != 0)
    batchReady(remainder);

  thedb.commit();
}
/** * Deduplicates a newly arrived batch of records. The records may * have been seen before. */ public void deduplicate(Collection<Record> records) { logger.info("Deduplicating batch of " + records.size() + " records"); batchReady(records.size()); // prepare long start = System.currentTimeMillis(); for (Record record : records) database.index(record); database.commit(); indexing += System.currentTimeMillis() - start; // then match match(records, true); batchDone(); }
/**
 * Index all new records from the given data sources. This method
 * does <em>not</em> do any matching.
 *
 * <p>{@code batchReady(batch_size)} is called each time another
 * {@code batch_size} records have been indexed. If the total record
 * count is not a multiple of {@code batch_size}, {@code batchReady} is
 * called once more with the size of the trailing partial batch.
 *
 * @param sources the data sources whose records are indexed
 * @param batch_size number of records per batch notification; must be
 *        non-zero, because it is used as a modulus
 * @since 0.4
 */
public void index(Collection<DataSource> sources, int batch_size) {
  int count = 0;
  for (DataSource source : sources) {
    source.setLogger(logger);

    RecordIterator it2 = source.getRecords();
    try {
      while (it2.hasNext()) {
        Record record = it2.next();
        database.index(record);
        count++;
        if (count % batch_size == 0)
          batchReady(batch_size);
      }
    } finally {
      // close the iterator even if indexing a record throws
      it2.close();
    }
  }

  // Report the leftover records that did not fill a whole batch. The
  // previous check (== 0) was inverted: it reported an empty batch when
  // nothing was left over and skipped the partial batch otherwise.
  int remainder = count % batch_size;
  if (remainder != 0)
    batchReady(remainder);

  database.commit();
}
database.index(it.next()); database.commit();