/**
 * Creates a {@link TokenNameFinderFactory} that provides the default implementation
 * of the resources.
 * <p>
 * The {@code seqCodec} field is initialised with a freshly created {@link BioCodec}.
 */
public TokenNameFinderFactory() { this.seqCodec = new BioCodec(); }
/**
 * Decoding an empty list of outcomes is expected to produce an empty {@link Span} array.
 */
@Test public void testDecodeEmpty() { Span[] expected = new Span[] {}; Span[] actual = codec.decode(new ArrayList<String>()); Assert.assertArrayEquals(expected, actual); } /**
/**
 * Turns a sequence of outcome tags into the name spans they describe.
 * <p>
 * A tag ending in {@code BilouCodec.START} opens a span, a tag ending in
 * {@code BilouCodec.CONTINUE} stretches it, and a tag ending in {@code LAST} emits the
 * span if one is open. A tag ending in {@code UNIT} emits a one-token span by itself.
 * Tags matching none of these suffixes are skipped.
 *
 * @param c the outcome tags to decode
 * @return the decoded spans, in the order in which they were completed
 */
@Override
public Span[] decode(List<String> c) {
  List<Span> decoded = new ArrayList<>(c.size());
  int spanStart = -1;
  int spanEnd = -1;

  for (int pos = 0; pos < c.size(); pos++) {
    String tag = c.get(pos);

    if (tag.endsWith(BilouCodec.START)) {
      spanStart = pos;
      spanEnd = pos + 1;
      continue;
    }

    if (tag.endsWith(BilouCodec.CONTINUE)) {
      spanEnd = pos + 1;
      continue;
    }

    if (tag.endsWith(LAST)) {
      if (spanStart != -1) {
        // spanEnd sits just past the most recent START/CONTINUE tag; adding one extends
        // the span over this LAST tag when it follows directly. The span type is taken
        // from the tag right before the LAST tag.
        decoded.add(new Span(spanStart, spanEnd + 1,
            BioCodec.extractNameType(c.get(pos - 1))));
        spanStart = -1;
        spanEnd = -1;
      }
      continue;
    }

    if (tag.endsWith(UNIT)) {
      decoded.add(new Span(pos, pos + 1, BioCodec.extractNameType(c.get(pos))));
    }
  }

  return decoded.toArray(new Span[decoded.size()]);
}
public synchronized static List<INDArray> mapToLabelVectors(NameSample sample, int windowSize, String[] labelStrings) { Map<String, Integer> labelToIndex = IntStream.range(0, labelStrings.length).boxed() .collect(Collectors.toMap(i -> labelStrings[i], i -> i)); List<INDArray> vectors = new ArrayList<INDArray>(); // encode the outcome as one-hot-representation String outcomes[] = new BioCodec().encode(sample.getNames(), sample.getSentence().length); for (int i = 0; i < sample.getSentence().length; i++) { INDArray labels = Nd4j.create(1, labelStrings.length, windowSize); labels.putScalar(new int[] { 0, labelToIndex.get(outcomes[i]), windowSize - 1 }, 1.0d); vectors.add(labels); } return vectors; }
/**
 * A sentence without name spans is expected to be encoded as {@code OTHER} for all four
 * tokens.
 */
@Test
public void testEncodeNoNames() {
  String[] tokens = "Once upon a time.".split(" ");
  NameSample sample = new NameSample(tokens, new Span[] {}, true);

  String[] encoded = codec.encode(sample.getNames(), sample.getSentence().length);

  Assert.assertArrayEquals("Only 'Other' is expected.",
      new String[] {OTHER, OTHER, OTHER, OTHER}, encoded);
}
/**
 * The outcome sets {@code {A_CONTINUE, OTHER}} and {@code {B_START, A_CONTINUE, OTHER}}
 * are both expected to be reported as incompatible.
 */
@Test
public void testCompatibilityContinueOther() {
  String[] continueAndOther = {A_CONTINUE, OTHER};
  Assert.assertFalse(codec.areOutcomesCompatible(continueAndOther));

  String[] otherTypeStartThenContinue = {B_START, A_CONTINUE, OTHER};
  Assert.assertFalse(codec.areOutcomesCompatible(otherTypeStartThenContinue));
}
/**
 * The validator created by the codec is expected to be a
 * {@link NameFinderSequenceValidator}.
 */
@Test
public void testCreateSequenceValidator() {
  Object validator = codec.createSequenceValidator();
  Assert.assertTrue(validator instanceof NameFinderSequenceValidator);
}
@Override public Span[] find(String[] tokens) { List<INDArray> featureMatrices = DeepLearningUtils.mapToFeatureMatrices(wordVectors, tokens, windowSize); String[] outcomes = new String[tokens.length]; for (int i = 0; i < tokens.length; i++) { INDArray predictionMatrix = network.output(featureMatrices.get(i), false); INDArray outcomeVector = predictionMatrix.get(NDArrayIndex.point(0), NDArrayIndex.all(), NDArrayIndex.point(windowSize - 1)); outcomes[i] = labels[max(outcomeVector)]; } // Delete invalid spans ... for (int i = 0; i < outcomes.length; i++) { if (outcomes[i].endsWith("cont") && (i == 0 || "other".equals(outcomes[i - 1]))) { outcomes[i] = "other"; } } return new BioCodec().decode(Arrays.asList(outcomes)); }
static List<INDArray> mapToLabelVectors(NameSample sample, int windowSize, String[] labelStrings) { Map<String, Integer> labelToIndex = IntStream.range(0, labelStrings.length).boxed() .collect(Collectors.toMap(i -> labelStrings[i], i -> i)); List<INDArray> vectors = new ArrayList<INDArray>(); for (int i = 0; i < sample.getSentence().length; i++) { // encode the outcome as one-hot-representation String outcomes[] = new BioCodec().encode(sample.getNames(), sample.getSentence().length); INDArray labels = Nd4j.create(1, labelStrings.length, windowSize); labels.putScalar(new int[]{0, labelToIndex.get(outcomes[i]), windowSize - 1}, 1.0d); vectors.add(labels); } return vectors; }
/**
 * A span covering only the token at index 2 is expected to be encoded as a single
 * {@code A_START}, with {@code OTHER} for every remaining token.
 */
@Test
public void testEncodeSingleTokenSpan() {
  String[] tokens = "I called Julie again.".split(" ");
  Span julie = new Span(2, 3, A_TYPE);
  NameSample sample = new NameSample(tokens, new Span[] {julie}, true);

  String[] encoded = codec.encode(sample.getNames(), sample.getSentence().length);

  String[] wanted = {OTHER, OTHER, A_START, OTHER};
  Assert.assertArrayEquals("'Julie' should be 'start' only, the rest should be 'other'.",
      wanted, encoded);
}
/**
 * The outcome sets {@code {A_CONTINUE}} and {@code {B_START, A_CONTINUE}} are both
 * expected to be reported as incompatible.
 */
@Test
public void testCompatibilitySingleContinue() {
  String[] loneContinue = {A_CONTINUE};
  Assert.assertFalse(codec.areOutcomesCompatible(loneContinue));

  String[] otherTypeStartThenContinue = {B_START, A_CONTINUE};
  Assert.assertFalse(codec.areOutcomesCompatible(otherTypeStartThenContinue));
}
/**
 * Convenience constructor that delegates to the overload taking one additional argument,
 * supplying a newly created {@link BioCodec} for it.
 * <p>
 * All parameters are forwarded unchanged to that overload.
 *
 * @param languageCode forwarded to the delegate constructor
 * @param type forwarded to the delegate constructor
 * @param trainParams forwarded to the delegate constructor
 * @param featureGeneratorBytes forwarded to the delegate constructor
 * @param resources forwarded to the delegate constructor
 * @param listeners forwarded to the delegate constructor
 */
public TokenNameFinderCrossValidator(String languageCode, String type, TrainingParameters trainParams, byte[] featureGeneratorBytes, Map<String, Object> resources, TokenNameFinderEvaluationMonitor... listeners) { this(languageCode, type, trainParams, featureGeneratorBytes, resources, new BioCodec(), listeners); }
@Override public Span[] find(String[] tokens) { List<INDArray> featureMartrices = mapToFeatureMatrices(wordVectors, tokens, windowSize); String[] outcomes = new String[tokens.length]; for (int i = 0; i < tokens.length; i++) { INDArray predictionMatrix = network.output(featureMartrices.get(i), false); INDArray outcomeVector = predictionMatrix.get(NDArrayIndex.point(0), NDArrayIndex.all(), NDArrayIndex.point(windowSize - 1)); outcomes[i] = labels[max(outcomeVector)]; } // Delete invalid spans ... for (int i = 0; i < outcomes.length; i++) { if (outcomes[i].endsWith("cont") && (i == 0 || "other".equals(outcomes[i - 1]))) { outcomes[i] = "other"; } } return new BioCodec().decode(Arrays.asList(outcomes)); }
/**
 * Sequence: Start, Other. The expected result is a single span covering token 0 only.
 */
@Test
public void testDecodeSingletonFirst() {
  Span[] decoded = codec.decode(Arrays.asList(B_START, OTHER));

  Span singleton = new Span(0, 1, B_TYPE);
  Assert.assertArrayEquals(new Span[] {singleton}, decoded);
}
/**
 * A span covering the tokens at index 2 and 3 is expected to be encoded as
 * {@code A_START} followed by {@code A_CONTINUE}, with {@code OTHER} everywhere else.
 */
@Test
public void testEncodeDoubleTokenSpan() {
  String[] tokens = "I saw Stefanie Schmidt today.".split(" ");
  Span fullName = new Span(2, 4, A_TYPE);
  NameSample sample = new NameSample(tokens, new Span[] {fullName}, true);

  String[] encoded = codec.encode(sample.getNames(), sample.getSentence().length);

  String[] wanted = {OTHER, OTHER, A_START, A_CONTINUE, OTHER};
  Assert.assertArrayEquals("'Stefanie' should be 'start' only, 'Schmidt' is "
      + "'continue' and the rest should be 'other'.", wanted, encoded);
}
/**
 * Turns a sequence of outcome tags into the name spans they describe.
 * <p>
 * A tag ending in {@code BioCodec.START} opens a span, first emitting the span that was
 * still open, if any. A tag ending in {@code BioCodec.CONTINUE} stretches the span, and a
 * tag ending in {@code BioCodec.OTHER} emits and closes an open span. A span still open
 * after the last tag is emitted as well.
 *
 * @param c the outcome tags to decode
 * @return the decoded spans, in the order in which they were completed
 */
public Span[] decode(List<String> c) {
  List<Span> decoded = new ArrayList<>(c.size());
  int spanStart = -1;
  int spanEnd = -1;

  for (int pos = 0; pos < c.size(); pos++) {
    String tag = c.get(pos);

    if (tag.endsWith(BioCodec.START)) {
      if (spanStart != -1) {
        // A new span begins while another is open: emit the open one first, typed by
        // the tag right before this one.
        decoded.add(new Span(spanStart, spanEnd, extractNameType(c.get(pos - 1))));
      }
      spanStart = pos;
      spanEnd = pos + 1;
      continue;
    }

    if (tag.endsWith(BioCodec.CONTINUE)) {
      spanEnd = pos + 1;
      continue;
    }

    if (spanStart != -1 && tag.endsWith(BioCodec.OTHER)) {
      decoded.add(new Span(spanStart, spanEnd, extractNameType(c.get(pos - 1))));
      spanStart = -1;
      spanEnd = -1;
    }
  }

  // A span reaching the end of the sequence is typed by the final tag.
  if (spanStart != -1) {
    decoded.add(new Span(spanStart, spanEnd, extractNameType(c.get(c.size() - 1))));
  }

  return decoded.toArray(new Span[decoded.size()]);
}
/**
 * An outcome set in which every outcome appears twice is expected to be reported as
 * compatible.
 */
@Test
public void testCompatibilityRepeated() {
  String[] repeatedOutcomes = {
      A_START, A_START, A_CONTINUE, A_CONTINUE, B_START, B_START, OTHER, OTHER};

  boolean compatible = codec.areOutcomesCompatible(repeatedOutcomes);

  Assert.assertTrue(compatible);
}
/**
 * Convenience constructor that delegates to the overload taking one additional argument,
 * supplying a newly created {@link BioCodec} for it.
 *
 * @param psi forwarded unchanged to the delegate constructor
 * @param pcg forwarded unchanged to the delegate constructor
 * @param useOutcomes forwarded unchanged to the delegate constructor
 * @throws IOException declared because of the delegate constructor
 */
public NameSampleSequenceStream(ObjectStream<NameSample> psi, NameContextGenerator pcg, boolean useOutcomes) throws IOException { this(psi, pcg, useOutcomes, new BioCodec()); }
/**
 * Sequence: Start, Continue, Continue, Other. The expected result is a single span
 * covering tokens 0 to 2.
 */
@Test
public void testDecodeTripletFirst() {
  Span[] decoded = codec.decode(Arrays.asList(B_START, B_CONTINUE, B_CONTINUE, OTHER));

  Span triplet = new Span(0, 3, B_TYPE);
  Assert.assertArrayEquals(new Span[] {triplet}, decoded);
}