cc.mallet.extract.DocumentExtraction.toXmlString java code examples

/**
 * Tags a nine-word sentence with plain O / ANIMAL / VERB labels, wraps it in a
 * DocumentExtraction and verifies that toXmlString() produces the expected markup.
 */
public void testToXml () {
  final String text = "the quick brown fox leapt over the lazy dog";
  LabelAlphabet alphabet = new LabelAlphabet ();
  StringTokenization tokens = new StringTokenization (text, new CharSequenceLexer ());

  // Keep this lookup order; presumably it fixes each label's position in the alphabet.
  Label outside = alphabet.lookupLabel ("O");
  Label animal = alphabet.lookupLabel ("ANIMAL");
  Label verb = alphabet.lookupLabel ("VERB");

  // One label per word of the sentence.
  Label[] perWord = {
      outside, animal, animal, animal, verb, outside, outside, animal, animal
  };
  LabelSequence labelling = new LabelSequence (perWord);

  DocumentExtraction extraction = new DocumentExtraction ("Test", alphabet, tokens, labelling, "O");

  String prolog = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\r\n";
  String body =
      "<doc>the <ANIMAL>quick brown fox </ANIMAL><VERB>leapt </VERB>over the <ANIMAL>lazy dog</ANIMAL></doc>\r\n";

  assertEquals (prolog + body, extraction.toXmlString ());
}

/**
 * Exercises DocumentExtraction.toXmlString() with a HierarchicalTokenizationFilter,
 * using labels whose names contain '|' separators (e.g. "ANIMAL|ADJ|MAMMAL").
 * The first assertion uses the no-argument filter; the second passes the pattern
 * "AD.*" to the filter and expects the ADJ element to be absent from the output.
 */
public void testNestedXMLTokenizationFilter ()
{
  final String text = "the quick brown fox leapt over the lazy dog";
  final String prolog = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\r\n";

  LabelAlphabet alphabet = new LabelAlphabet ();
  StringTokenization tokens = new StringTokenization (text, new CharSequenceLexer ());

  // Keep this lookup order; presumably it fixes each label's position in the alphabet.
  Label outside = alphabet.lookupLabel ("O");
  Label animal = alphabet.lookupLabel ("ANIMAL");
  Label animalMammal = alphabet.lookupLabel ("ANIMAL|MAMMAL");
  Label verb = alphabet.lookupLabel ("VERB");
  Label animalAdj = alphabet.lookupLabel ("ANIMAL|ADJ");
  Label animalAdjMammal = alphabet.lookupLabel ("ANIMAL|ADJ|MAMMAL");

  Label[] perWord = {
      outside, animal, animal, animalMammal, verb, outside, animal, animalAdj, animalAdjMammal
  };
  LabelSequence labelling = new LabelSequence (perWord);

  // Case 1: filter built with its no-argument constructor.
  DocumentExtraction nested =
      new DocumentExtraction ("Test", alphabet, tokens, labelling, null, "O", new HierarchicalTokenizationFilter ());
  String nestedXml = nested.toXmlString ();
  String nestedExpected = prolog +
      "<doc>the <ANIMAL>quick brown <MAMMAL>fox </MAMMAL></ANIMAL><VERB>leapt </VERB>over <ANIMAL>the <ADJ>lazy <MAMMAL>dog</MAMMAL></ADJ></ANIMAL></doc>\r\n";
  assertEquals (nestedExpected, nestedXml);

  // Case 2: the ignore function, filter built with the pattern "AD.*".
  DocumentExtraction ignoring =
      new DocumentExtraction ("Test", alphabet, tokens, labelling, null, "O", new HierarchicalTokenizationFilter (Pattern.compile ("AD.*")));
  String ignoringXml = ignoring.toXmlString ();
  String ignoringExpected = prolog +
      "<doc>the <ANIMAL>quick brown <MAMMAL>fox </MAMMAL></ANIMAL><VERB>leapt </VERB>over <ANIMAL>the lazy <MAMMAL>dog</MAMMAL></ANIMAL></doc>\r\n";
  assertEquals (ignoringExpected, ignoringXml);
}

/**
 * Verifies the XML produced by DocumentExtraction.toXmlString() when the extraction
 * is built with a HierarchicalTokenizationFilter over '|'-separated label names.
 * Two configurations are asserted: the no-argument filter, and a filter constructed
 * with the pattern "AD.*", for which the expected XML has no ADJ element.
 */
public void testNestedXMLTokenizationFilter ()
{
  LabelAlphabet labels = new LabelAlphabet ();
  String sentence = "the quick brown fox leapt over the lazy dog";
  StringTokenization tokenization = new StringTokenization (sentence, new CharSequenceLexer ());

  // Lookups stay in this order; presumably the order decides the labels' alphabet positions.
  Label bg = labels.lookupLabel ("O");
  Label anml = labels.lookupLabel ("ANIMAL");
  Label anmlMamm = labels.lookupLabel ("ANIMAL|MAMMAL");
  Label vb = labels.lookupLabel ("VERB");
  Label anmlJj = labels.lookupLabel ("ANIMAL|ADJ");
  Label anmlJjMamm = labels.lookupLabel ("ANIMAL|ADJ|MAMMAL");

  LabelSequence tagging = new LabelSequence (
      new Label[] { bg, anml, anml, anmlMamm, vb, bg, anml, anmlJj, anmlJjMamm });

  String header = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\r\n";

  // Full hierarchy rendered.
  HierarchicalTokenizationFilter plainFilter = new HierarchicalTokenizationFilter ();
  DocumentExtraction extraction =
      new DocumentExtraction ("Test", labels, tokenization, tagging, null, "O", plainFilter);
  String produced = extraction.toXmlString ();
  String wanted = header +
      "<doc>the <ANIMAL>quick brown <MAMMAL>fox </MAMMAL></ANIMAL><VERB>leapt </VERB>over <ANIMAL>the <ADJ>lazy <MAMMAL>dog</MAMMAL></ADJ></ANIMAL></doc>\r\n";
  assertEquals (wanted, produced);

  // Test the ignore function
  HierarchicalTokenizationFilter ignoreFilter = new HierarchicalTokenizationFilter (Pattern.compile ("AD.*"));
  extraction = new DocumentExtraction ("Test", labels, tokenization, tagging, null, "O", ignoreFilter);
  produced = extraction.toXmlString ();
  wanted = header +
      "<doc>the <ANIMAL>quick brown <MAMMAL>fox </MAMMAL></ANIMAL><VERB>leapt </VERB>over <ANIMAL>the lazy <MAMMAL>dog</MAMMAL></ANIMAL></doc>\r\n";
  assertEquals (wanted, produced);
}

/**
 * Checks DocumentExtraction.toXmlString() for a flat labelling (O, ANIMAL, VERB)
 * of a nine-word sentence against a hard-coded expected XML string.
 */
public void testToXml () {
  LabelAlphabet labels = new LabelAlphabet ();
  String sentence = "the quick brown fox leapt over the lazy dog";
  StringTokenization tokenization = new StringTokenization (sentence, new CharSequenceLexer ());

  // Lookups stay in this order; presumably the order decides the labels' alphabet positions.
  Label bg = labels.lookupLabel ("O");
  Label anml = labels.lookupLabel ("ANIMAL");
  Label vb = labels.lookupLabel ("VERB");

  LabelSequence tagging = new LabelSequence (
      new Label[] { bg, anml, anml, anml, vb, bg, bg, anml, anml });

  DocumentExtraction extraction =
      new DocumentExtraction ("Test", labels, tokenization, tagging, "O");
  String produced = extraction.toXmlString ();

  String header = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\r\n";
  String wanted = header +
      "<doc>the <ANIMAL>quick brown fox </ANIMAL><VERB>leapt </VERB>over the <ANIMAL>lazy dog</ANIMAL></doc>\r\n";

  assertEquals (wanted, produced);
}

 /**
  * Builds a DocumentExtraction with a BIOTokenizationFilter over labels that carry
  * "B-" / "I-" prefixes (plus an unprefixed "ANIMAL") and verifies the XML returned
  * by toXmlString(). The expected output holds two adjacent ANIMAL elements and a
  * single VERB element spanning "leapt over".
  */
 public void testToXmlBIO () {
   final String text = "the quick brown fox leapt over the lazy dog";
   LabelAlphabet alphabet = new LabelAlphabet ();
   StringTokenization tokens = new StringTokenization (text, new CharSequenceLexer ());

   // Keep this lookup order; presumably it fixes each label's position in the alphabet.
   Label outside = alphabet.lookupLabel ("O");
   Label beginAnimal = alphabet.lookupLabel ("B-ANIMAL");
   Label animal = alphabet.lookupLabel ("ANIMAL");
   Label beginVerb = alphabet.lookupLabel ("B-VERB");
   Label insideVerb = alphabet.lookupLabel ("I-VERB");

   Label[] perWord = {
       outside, beginAnimal, animal, beginAnimal, beginVerb, insideVerb, outside, animal, animal
   };
   LabelSequence labelling = new LabelSequence (perWord);

   DocumentExtraction extraction =
       new DocumentExtraction ("Test", alphabet, tokens, labelling, null, "O", new BIOTokenizationFilter());

   String prolog = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\r\n";
   String body =
       "<doc>the <ANIMAL>quick brown </ANIMAL><ANIMAL>fox </ANIMAL><VERB>leapt over </VERB>the <ANIMAL>lazy dog</ANIMAL></doc>\r\n";

   assertEquals (prolog + body, extraction.toXmlString ());
 }

 /**
  * Verifies toXmlString() for an extraction created with a BIOTokenizationFilter.
  * The tag sequence mixes "B-"-prefixed, "I-"-prefixed and unprefixed labels; the
  * asserted XML shows where element boundaries are expected to fall.
  */
 public void testToXmlBIO () {
   LabelAlphabet labels = new LabelAlphabet ();
   String sentence = "the quick brown fox leapt over the lazy dog";
   StringTokenization tokenization = new StringTokenization (sentence, new CharSequenceLexer ());

   // Lookups stay in this order; presumably the order decides the labels' alphabet positions.
   Label bg = labels.lookupLabel ("O");
   Label bAnml = labels.lookupLabel ("B-ANIMAL");
   Label anml = labels.lookupLabel ("ANIMAL");
   Label bVb = labels.lookupLabel ("B-VERB");
   Label iVb = labels.lookupLabel ("I-VERB");

   LabelSequence tagging = new LabelSequence (
       new Label[] { bg, bAnml, anml, bAnml, bVb, iVb, bg, anml, anml });

   BIOTokenizationFilter bioFilter = new BIOTokenizationFilter();
   DocumentExtraction extraction =
       new DocumentExtraction ("Test", labels, tokenization, tagging, null, "O", bioFilter);
   String produced = extraction.toXmlString ();

   String header = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\r\n";
   String wanted = header +
       "<doc>the <ANIMAL>quick brown </ANIMAL><ANIMAL>fox </ANIMAL><VERB>leapt over </VERB>the <ANIMAL>lazy dog</ANIMAL></doc>\r\n";

   assertEquals (wanted, produced);
 }

/**
 * Verifies toXmlString() when extra spans are added on top of the spans produced
 * by DefaultTokenizationFilter. A MAMMAL span over tokens [3, 4) and an ADJ span
 * over tokens [7, 8) are added to the base ANIMAL / VERB spans; the expected XML
 * shows them as child elements of the enclosing ANIMAL elements.
 */
public void testNestedToXML ()
{
  final String text = "the quick brown fox leapt over the lazy dog";
  LabelAlphabet alphabet = new LabelAlphabet ();
  StringTokenization tokens = new StringTokenization (text, new CharSequenceLexer ());

  // Keep this lookup order; presumably it fixes each label's position in the alphabet.
  Label outside = alphabet.lookupLabel ("O");
  Label animal = alphabet.lookupLabel ("ANIMAL");
  Label verb = alphabet.lookupLabel ("VERB");
  Label adjective = alphabet.lookupLabel ("ADJ");
  Label mammal = alphabet.lookupLabel ("MAMMAL");

  Label[] perWord = {
      outside, animal, animal, animal, verb, outside, animal, animal, animal
  };
  LabelSequence labelling = new LabelSequence (perWord);

  // Base spans derived from the flat labelling.
  DefaultTokenizationFilter baseFilter = new DefaultTokenizationFilter ();
  LabeledSpans allSpans = baseFilter.constructLabeledSpans (alphabet, text, outside, tokens, labelling);

  // Extra spans, added in the same order: "fox" as MAMMAL, then "lazy" as ADJ.
  allSpans.add (new LabeledSpan (tokens.subspan (3, 4), mammal, false));
  allSpans.add (new LabeledSpan (tokens.subspan (7, 8), adjective, false));

  DocumentExtraction extraction = new DocumentExtraction ("Test", alphabet, tokens, allSpans, null, "O");

  String prolog = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\r\n";
  String body =
      "<doc>the <ANIMAL>quick brown <MAMMAL>fox </MAMMAL></ANIMAL><VERB>leapt </VERB>over <ANIMAL>the <ADJ>lazy </ADJ>dog</ANIMAL></doc>\r\n";

  assertEquals (prolog + body, extraction.toXmlString ());
}

/**
 * Builds LabeledSpans from a flat labelling via DefaultTokenizationFilter, adds two
 * further single-token spans (MAMMAL at token 3, ADJ at token 7), and asserts the
 * XML that DocumentExtraction.toXmlString() returns for the combined span set.
 */
public void testNestedToXML ()
{
  LabelAlphabet labels = new LabelAlphabet ();
  String sentence = "the quick brown fox leapt over the lazy dog";
  StringTokenization tokenization = new StringTokenization (sentence, new CharSequenceLexer ());

  // Lookups stay in this order; presumably the order decides the labels' alphabet positions.
  Label bg = labels.lookupLabel ("O");
  Label anml = labels.lookupLabel ("ANIMAL");
  Label vb = labels.lookupLabel ("VERB");
  Label jj = labels.lookupLabel ("ADJ");
  Label mamm = labels.lookupLabel ("MAMMAL");

  LabelSequence tagging = new LabelSequence (
      new Label[] { bg, anml, anml, anml, vb, bg, anml, anml, anml });
  LabeledSpans spanSet =
      new DefaultTokenizationFilter ().constructLabeledSpans (labels, sentence, bg, tokenization, tagging);

  // Token 3 is "fox": tag it additionally as MAMMAL.
  Span foxSpan = tokenization.subspan (3, 4);
  LabeledSpan foxAsMammal = new LabeledSpan (foxSpan, mamm, false);
  spanSet.add (foxAsMammal);

  // Token 7 is "lazy": tag it additionally as ADJ.
  Span lazySpan = tokenization.subspan (7, 8);
  LabeledSpan lazyAsAdj = new LabeledSpan (lazySpan, jj, false);
  spanSet.add (lazyAsAdj);

  DocumentExtraction extraction =
      new DocumentExtraction ("Test", labels, tokenization, spanSet, null, "O");
  String produced = extraction.toXmlString ();

  String header = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\r\n";
  String wanted = header +
      "<doc>the <ANIMAL>quick brown <MAMMAL>fox </MAMMAL></ANIMAL><VERB>leapt </VERB>over <ANIMAL>the <ADJ>lazy </ADJ>dog</ANIMAL></doc>\r\n";

  assertEquals (wanted, produced);
}

Popular in Java

Running tasks concurrently on multiple threads
notifyDataSetChanged (ArrayAdapter)
setRequestProperty (URLConnection)
getContentResolver (Context)
BigDecimal (java.math)
An immutable arbitrary-precision signed decimal. A value is represented by an arbitrary-precision "un…
Servlet (javax.servlet)
Defines methods that all servlets must implement. A servlet is a small Java program that runs within
DataSource (javax.sql)
An interface for the creation of Connection objects which represent a connection to a database. This
Base64 (org.apache.commons.codec.binary)
Provides Base64 encoding and decoding as defined by RFC 2045. This class implements section 6.8. Base…
Menu (java.awt)
Location (org.springframework.beans.factory.parsing)
Class that models an arbitrary location in a Resource. Typically used to track the location of proble…
Top plugins for WebStorm

How to use the toXmlString method in cc.mallet.extract.DocumentExtraction

Best Java code snippets using cc.mallet.extract.DocumentExtraction.toXmlString (Showing top 8 results out of 315)

How to use the toXmlString method in cc.mallet.extract.DocumentExtraction