public void testNestedXMLTokenizationFilter () { LabelAlphabet dict = new LabelAlphabet (); String document = "the quick brown fox leapt over the lazy dog"; StringTokenization toks = new StringTokenization (document, new CharSequenceLexer ()); Label O = dict.lookupLabel ("O"); Label ANML = dict.lookupLabel ("ANIMAL"); Label ANML_MAMM = dict.lookupLabel ("ANIMAL|MAMMAL"); Label VB = dict.lookupLabel ("VERB"); Label ANML_JJ = dict.lookupLabel ("ANIMAL|ADJ"); Label ANML_JJ_MAMM = dict.lookupLabel ("ANIMAL|ADJ|MAMMAL"); LabelSequence tags = new LabelSequence (new Label[] { O, ANML, ANML, ANML_MAMM, VB, O, ANML, ANML_JJ, ANML_JJ_MAMM }); DocumentExtraction extr = new DocumentExtraction ("Test", dict, toks, tags, null, "O", new HierarchicalTokenizationFilter ()); String actualXml = extr.toXmlString(); String expectedXml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\r\n" + "<doc>the <ANIMAL>quick brown <MAMMAL>fox </MAMMAL></ANIMAL><VERB>leapt </VERB>over <ANIMAL>the <ADJ>lazy <MAMMAL>dog</MAMMAL></ADJ></ANIMAL></doc>\r\n"; assertEquals (expectedXml, actualXml); // Test the ignore function extr = new DocumentExtraction ("Test", dict, toks, tags, null, "O", new HierarchicalTokenizationFilter (Pattern.compile ("AD.*"))); actualXml = extr.toXmlString(); expectedXml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\r\n" + "<doc>the <ANIMAL>quick brown <MAMMAL>fox </MAMMAL></ANIMAL><VERB>leapt </VERB>over <ANIMAL>the lazy <MAMMAL>dog</MAMMAL></ANIMAL></doc>\r\n"; assertEquals (expectedXml, actualXml); }
public void testNestedXMLTokenizationFilter () { LabelAlphabet dict = new LabelAlphabet (); String document = "the quick brown fox leapt over the lazy dog"; StringTokenization toks = new StringTokenization (document, new CharSequenceLexer ()); Label O = dict.lookupLabel ("O"); Label ANML = dict.lookupLabel ("ANIMAL"); Label ANML_MAMM = dict.lookupLabel ("ANIMAL|MAMMAL"); Label VB = dict.lookupLabel ("VERB"); Label ANML_JJ = dict.lookupLabel ("ANIMAL|ADJ"); Label ANML_JJ_MAMM = dict.lookupLabel ("ANIMAL|ADJ|MAMMAL"); LabelSequence tags = new LabelSequence (new Label[] { O, ANML, ANML, ANML_MAMM, VB, O, ANML, ANML_JJ, ANML_JJ_MAMM }); DocumentExtraction extr = new DocumentExtraction ("Test", dict, toks, tags, null, "O", new HierarchicalTokenizationFilter ()); String actualXml = extr.toXmlString(); String expectedXml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\r\n" + "<doc>the <ANIMAL>quick brown <MAMMAL>fox </MAMMAL></ANIMAL><VERB>leapt </VERB>over <ANIMAL>the <ADJ>lazy <MAMMAL>dog</MAMMAL></ADJ></ANIMAL></doc>\r\n"; assertEquals (expectedXml, actualXml); // Test the ignore function extr = new DocumentExtraction ("Test", dict, toks, tags, null, "O", new HierarchicalTokenizationFilter (Pattern.compile ("AD.*"))); actualXml = extr.toXmlString(); expectedXml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\r\n" + "<doc>the <ANIMAL>quick brown <MAMMAL>fox </MAMMAL></ANIMAL><VERB>leapt </VERB>over <ANIMAL>the lazy <MAMMAL>dog</MAMMAL></ANIMAL></doc>\r\n"; assertEquals (expectedXml, actualXml); }