assertTrue("database claims to be in-memory", !db.isInMemory()); db.index(record); db.commit(); db.close(); db.index(record2); db.commit(); Record r = db.findRecordById("1"); assertTrue("record not found", r != null); assertEquals("wrong ID", "1", r.getValue("ID")); assertEquals("wrong EMAIL", "BARS", r.getValue("EMAIL")); Collection<Record> recs = db.findCandidateMatches(record2); assertEquals("wrong number of records found", 1, recs.size()); r = recs.iterator().next(); assertEquals("wrong EMAIL", "BARS", r.getValue("EMAIL")); recs = db.findCandidateMatches(record); assertEquals("wrong number of records found", 0, recs.size());
public Database getDatabase(int groupno, boolean overwrite) { Database thedb; if (groupno == 1) { if (database1 == null) // not set, so use default with is in memory database1 = new no.priv.garshol.duke.databases.InMemoryDatabase(); thedb = database1; } else if (groupno == 2) thedb = database2; // no default for no 2 else throw new DukeException("Can only have two databases"); if (thedb != null) { thedb.setConfiguration(this); thedb.setOverwrite(overwrite); // hmmm? } return thedb; }
/**
 * Adds every record in the batch to the index of the chosen database
 * and commits once when the whole batch has been indexed. This method
 * does <em>not</em> perform any matching.
 * @since 1.3
 */
public void index(int dbno, Collection<Record> batch) {
  Database target = getDB(dbno);

  for (Record rec : batch) {
    if (logger.isDebugEnabled()) {
      logger.debug("Indexing record " + rec);
    }
    target.index(rec);
  }

  target.commit();
}
@Test public void testOverwrite() throws IOException { // can we index a record, close and reopen the database with overwrite // set, and not find it again? assertTrue("database claims to be in-memory", !db.isInMemory()); Record record = TestUtils.makeRecord("ID", "1", "NAME", "AND", "EMAIL", "BBBBB"); db.index(record); db.commit(); db.close(); db = createDatabase(config); db.setOverwrite(true); Record r = db.findRecordById("1"); assertTrue("record found after reopening, despite overwrite", r == null); }
/**
 * Checks that a record whose ID looks like a blank node label
 * ("_:...") can be indexed and then retrieved by that ID.
 */
@Test
public void testBNode() throws IOException {
  Record indexed = TestUtils.makeRecord("ID", "_:RHUKdfPM299",
                                        "NAME", "AND",
                                        "EMAIL", "BBBBB");
  db.index(indexed);
  db.commit();

  Record found = db.findRecordById("_:RHUKdfPM299");
  assertTrue("no record found", found != null);
  assertEquals("wrong ID", "_:RHUKdfPM299", found.getValue("ID"));
}
/**
 * Indexes a record whose NAME contains quotes, backslashes and
 * parentheses, then runs a candidate search with a similar value.
 * Nothing is asserted about the result; the search just has to
 * complete without throwing.
 */
@Test
public void testBackslash() throws IOException {
  String storedName = "\"Lastname, Firstname \\(external\\)\"";
  Record stored = TestUtils.makeRecord("ID", "1",
                                       "NAME", storedName,
                                       "EMAIL", "BBBBB");
  db.index(stored);
  db.commit();

  Record probe =
    TestUtils.makeRecord("NAME", "\"lastname, firstname \\(external\\)\"");
  db.findCandidateMatches(probe);
}
/**
 * Reports, for each configuration in the population, whether it scores
 * the two identified records above its own threshold. A record not
 * found in the database is looked up in secondary instead.
 *
 * @return one flag per configuration, in the order of
 *         population.getConfigs()
 */
private boolean[] whoThinksThisIsTrue(String id1, String id2) {
  Record first = database.findRecordById(id1);
  if (first == null)
    first = secondary.get(id1);

  Record second = database.findRecordById(id2);
  if (second == null)
    second = secondary.get(id2);

  List<GeneticConfiguration> candidates = population.getConfigs();
  boolean[] believers = new boolean[candidates.size()];

  int pos = 0;
  for (GeneticConfiguration candidate : candidates) {
    Configuration cfg = candidate.getConfiguration();
    Processor proc = new Processor(cfg, database);
    believers[pos++] = proc.compare(first, second) > cfg.getThreshold();
  }

  return believers;
}
}
/**
 * Looks up candidate matches for the record in the given database and
 * compares the record against them. Time spent searching is added to
 * the searching counter and time spent comparing to the comparing
 * counter.
 *
 * @param matchall if true the candidates go to compareCandidatesSimple,
 *                 otherwise to compareCandidatesBest
 */
private void match(int dbno, Record record, boolean matchall) {
  long searchStart = System.currentTimeMillis();
  Collection<Record> candidates = getDB(dbno).findCandidateMatches(record);
  searching += System.currentTimeMillis() - searchStart;

  if (logger.isDebugEnabled()) {
    logger.debug("Matching record " +
                 PrintMatchListener.toString(record, config.getProperties()) +
                 " found " + candidates.size() + " candidates");
  }

  long compareStart = System.currentTimeMillis();
  if (matchall) {
    compareCandidatesSimple(record, candidates);
  } else {
    compareCandidatesBest(record, candidates);
  }
  comparing += System.currentTimeMillis() - compareStart;
}
// Force reindexing whenever the database reports that it is in-memory;
// there is no other way to do it in this case.
if (database.isInMemory()) reindex = true;
/**
 * Creates a new KeyValueDatabase and hands it the given configuration.
 *
 * @param config the configuration the database should use
 * @return the newly created database
 */
public Database createDatabase(Configuration config) {
  Database keyValueDb = new KeyValueDatabase();
  keyValueDb.setConfiguration(config);
  return keyValueDb;
}
/**
 * Checks that a record whose ID is a full HTTP URI can be indexed and
 * then retrieved by that ID.
 */
@Test
public void testURI() throws IOException {
  Record indexed =
    TestUtils.makeRecord("ID", "http://norman.walsh.name/knows/who/robin-berjon",
                         "NAME", "AND",
                         "EMAIL", "BBBBB");
  db.index(indexed);
  db.commit();

  Record found =
    db.findRecordById("http://norman.walsh.name/knows/who/robin-berjon");
  assertTrue("no record found", found != null);
  assertEquals("wrong ID",
               "http://norman.walsh.name/knows/who/robin-berjon",
               found.getValue("ID"));
}
/** * Deduplicates a newly arrived batch of records. The records may * have been seen before. */ public void deduplicate(Collection<Record> records) { logger.info("Deduplicating batch of " + records.size() + " records"); batchReady(records.size()); // prepare long start = System.currentTimeMillis(); for (Record record : records) database1.index(record); database1.commit(); indexing += System.currentTimeMillis() - start; // then match match(records, true); batchDone(); }
/**
 * Checks that a candidate search using an indexed record as the query
 * returns exactly that one record.
 */
@Test
public void testTrivialFind() throws IOException {
  Record indexed = TestUtils.makeRecord("ID", "1",
                                        "NAME", "AND",
                                        "EMAIL", "BBBBB");
  db.index(indexed);
  db.commit();

  Collection<Record> candidates = db.findCandidateMatches(indexed);
  assertEquals("no record found", 1, candidates.size());

  Record only = candidates.iterator().next();
  assertEquals("wrong ID", "1", only.getValue("ID"));
}
public synchronized void noMatchFor(Record r) { // we missed all of the correct links for this record (if any). // count, and tell the user. for (Link link : golddb.getAllLinksFor(getid(r))) { if (link.getKind() != LinkKind.SAME) continue; // it's a bad link, so never mind missed++; Record r1 = database.findRecordById(link.getID1()); Record r2 = database.findRecordById(link.getID2()); if (r1 != null && r2 != null) { if (debug && !showmatches) PrintMatchListener.show(r1, r2, processor.compare(r1, r2), "\nNOT FOUND", props, pretty); } else if (debug && !showmatches) { System.out.println("\nIDENTITIES IN TEST FILE NOT FOUND IN DATA"); System.out.println("ID1: " + link.getID1() + " -> " + r1); System.out.println("ID2: " + link.getID2() + " -> " + r2); } } }
public void run(String[] argv) throws IOException, SAXException { Collection<CommandLineParser.Option> options = Collections.singleton((CommandLineParser.Option) new CommandLineParser.StringOption("maxhits", 'H')); argv = init(argv, 3, 3, options); int max_hits = 10000; if (parser.getOptionValue("maxhits") != null) max_hits = Integer.parseInt(parser.getOptionValue("maxhits")); // build record RecordImpl prototype = new RecordImpl(); prototype.addValue(argv[1], argv[2]); // search Collection<Record> records = database.findCandidateMatches(prototype); int hitno = 1; for (Record record : records) { PrintMatchListener.prettyPrint(record, config.getProperties()); System.out.println(); if (hitno++ == max_hits) break; } }
if (noreindex && processor.getDatabase().isInMemory()) { System.out.println("Option --noreindex not available with in-memory " + "database");
/**
 * Creates a new InMemoryDatabase and hands it the given configuration.
 *
 * @param config the configuration the database should use
 * @return the newly created database
 */
public Database createDatabase(Configuration config) {
  Database inMemoryDb = new InMemoryDatabase();
  inMemoryDb.setConfiguration(config);
  return inMemoryDb;
}
/**
 * Commits all state to disk and frees up resources. This is done by
 * delegating to the close method of the underlying database.
 */
public void close() {
  database.close();
}
@Test public void testPersistence() throws IOException { // can we index a record, close and reopen the database, and find // the same record again afterwards? assertTrue("database claims to be in-memory", !db.isInMemory()); Record record = TestUtils.makeRecord("ID", "1", "NAME", "AND", "EMAIL", "BBBBB"); db.index(record); db.commit(); db.close(); db = createDatabase(config); Record r = db.findRecordById("1"); assertTrue("record not found after reopening", r != null); assertEquals("wrong ID", "1", r.getValue("ID")); assertEquals("wrong NAME", "AND", r.getValue("NAME")); assertEquals("wrong EMAIL", "BBBBB", r.getValue("EMAIL")); Collection<Record> recs = db.findCandidateMatches(record); assertEquals("wrong number of records found", 1, recs.size()); r = recs.iterator().next(); assertEquals("wrong ID", "1", r.getValue("ID")); assertEquals("wrong NAME", "AND", r.getValue("NAME")); assertEquals("wrong EMAIL", "BBBBB", r.getValue("EMAIL")); }