// Build a KAFDocument from f, then read the title field of its file
// descriptor and the document's raw text.
// NOTE(review): getFileDesc() and getRawText() are dereferenced/used without
// null checks here — confirm both are always present for the files handled.
KAFDocument document = KAFDocument.createFromFile(f); String title = document.getFileDesc().title; String text = document.getRawText();
// Build a KAFDocument from f and read its title, its raw text with the first
// title.length() + 1 characters removed (then trimmed), and its public id.
// NOTE(review): presumably the raw text starts with the title followed by one
// separator character — confirm. substring() throws
// StringIndexOutOfBoundsException when the raw text is shorter than
// title.length() + 1, and a null title or raw text causes a
// NullPointerException.
KAFDocument document = KAFDocument.createFromFile(f); String title = document.getFileDesc().title; String text = document.getRawText().substring(title.length() + 1).trim(); String id = document.getPublic().publicId;
try { KAFDocument document = KAFDocument.createFromFile(file); if (document.getRawText() == null || document.getRawText().trim().length() == 0) { logger.info("File is empty: " + file); logger.info("Writing empty file " + outputFile);
// Reduce the raw text to its ASCII letters only (everything matching
// [^a-zA-Z] is removed), then store it in textToFile together with the
// file's name.
// NOTE(review): if textToFile is a plain Map, two files whose letter-only
// text is identical share a key and the later put replaces the earlier —
// confirm that is intended.
String text = document.getRawText(); text = text.replaceAll("[^a-zA-Z]", ""); textToFile.put(text, file.toFile().getName());
/**
 * Builds a new document containing the high-level annotations of every
 * document in {@code nafs}.
 *
 * <p>The language, version and raw text of the result are taken from the
 * first element of the list only. For each input document, in list order,
 * every type in {@code highLevelAnnotationTypes} is visited; multi-layer
 * types are gathered group by group, other types in a single call. The
 * annotation objects are handed to {@code addExistingAnnotation} as they
 * are; this method does not copy them itself.
 *
 * @param nafs the documents to merge; the first element supplies the
 *             language, version and raw text of the result
 * @return a newly created document holding the merged annotations
 * @throws IndexOutOfBoundsException if {@code nafs} is empty
 * @throws NullPointerException if {@code nafs} is {@code null}
 */
public static KAFDocument join(List<KAFDocument> nafs) {
    final KAFDocument head = nafs.get(0);
    final KAFDocument merged = new KAFDocument(head.getLang(), head.getVersion());
    merged.setRawText(head.getRawText());

    for (KAFDocument part : nafs) {
        for (AnnotationType annType : highLevelAnnotationTypes) {
            final Layer targetLayer = highLevelAnnotationType2Layer.get(annType);

            final List<Annotation> collected;
            if (!isMultiLayerAnnotationType(annType)) {
                collected = part.getAnnotations(annType);
            } else {
                // Multi-layer types are stored per group: gather all groups
                // first, then add them to the merged document in one pass.
                collected = new ArrayList<Annotation>();
                for (String group : part.annotationContainer.getGroupIDs(annType)) {
                    collected.addAll(part.getAnnotations(annType, group));
                }
            }

            for (Annotation item : collected) {
                merged.addExistingAnnotation(item, targetLayer, annType);
            }
        }
    }
    return merged;
}
/**
 * Splits this document into one new document per paragraph.
 *
 * <p>Paragraphs are numbered from 1 up to {@code getNumParagraphs()}
 * inclusive. Every resulting document receives this document's language,
 * version and complete raw text (the raw text is not cut down to the
 * paragraph). Only annotation types accepted by
 * {@code isParagraphLevelAnnotationType} are transferred; the annotation
 * objects are handed to {@code addExistingAnnotation} as they are, this
 * method does not copy them itself.
 *
 * @return the per-paragraph documents, in paragraph order; empty when the
 *         paragraph count is below 1
 */
public List<KAFDocument> splitInParagraphs() {
    final List<KAFDocument> result = new ArrayList<KAFDocument>();
    final int total = this.getNumParagraphs();

    for (int para = 1; para <= total; para++) {
        final KAFDocument piece = new KAFDocument(this.getLang(), this.getVersion());
        piece.setRawText(this.getRawText());

        for (AnnotationType annType : highLevelAnnotationTypes) {
            final Layer targetLayer = highLevelAnnotationType2Layer.get(annType);
            if (!isParagraphLevelAnnotationType(annType)) {
                continue;
            }

            final List<Annotation> selected;
            if (!isMultiLayerAnnotationType(annType)) {
                selected = this.getByPara(annType, para);
            } else {
                // Multi-layer types are stored per group: collect this
                // paragraph's annotations from every group.
                selected = new ArrayList<Annotation>();
                for (String group : annotationContainer.getGroupIDs(annType)) {
                    selected.addAll(this.getByPara(annType, group, para));
                }
            }

            for (Annotation item : selected) {
                piece.addExistingAnnotation(item, targetLayer, annType);
            }
        }
        result.add(piece);
    }
    return result;
}
/**
 * Splits this document into one new document per sentence.
 *
 * <p>Paragraphs 1 through {@code getNumParagraphs()} are visited in order,
 * and within each paragraph the sentences returned by
 * {@code getSentsByParagraph} are visited in the order given. Every
 * resulting document receives this document's language, version and
 * complete raw text (the raw text is not cut down to the sentence). Only
 * annotation types accepted by {@code isSentenceLevelAnnotationType} are
 * transferred; the annotation objects are handed to
 * {@code addExistingAnnotation} as they are, this method does not copy
 * them itself.
 *
 * @return the per-sentence documents, in traversal order
 */
public List<KAFDocument> splitInSentences() {
    final List<KAFDocument> result = new ArrayList<KAFDocument>();
    final int total = this.getNumParagraphs();

    for (int para = 1; para <= total; para++) {
        final List<Integer> sentIds = this.getSentsByParagraph(para);

        for (Integer sentId : sentIds) {
            final KAFDocument piece = new KAFDocument(this.getLang(), this.getVersion());
            piece.setRawText(this.getRawText());

            for (AnnotationType annType : highLevelAnnotationTypes) {
                final Layer targetLayer = highLevelAnnotationType2Layer.get(annType);
                if (!isSentenceLevelAnnotationType(annType)) {
                    continue;
                }

                final List<Annotation> selected;
                if (!isMultiLayerAnnotationType(annType)) {
                    selected = this.getBySent(annType, sentId);
                } else {
                    // Multi-layer types are stored per group: collect this
                    // sentence's annotations from every group.
                    selected = new ArrayList<Annotation>();
                    for (String group : annotationContainer.getGroupIDs(annType)) {
                        selected.addAll(this.getBySent(annType, group, sentId));
                    }
                }

                for (Annotation item : selected) {
                    piece.addExistingAnnotation(item, targetLayer, annType);
                }
            }
            result.add(piece);
        }
    }
    return result;
}
// Put the document's raw text into a new text node under textElement, then
// attach textElement to sentenceElement.
textElement.appendChild(doc.createTextNode(document.getRawText())); sentenceElement.appendChild(textElement);
try { KAFDocument document = KAFDocument.createFromFile(nafFile); text = document.getRawText(); } catch (Exception e) { text = Files.toString(nafFile, Charsets.UTF_8);
// Log the file being loaded, build a KAFDocument from it, take its raw text
// and pass it through StringEscapeUtils.unescapeHtml, then fetch the
// document's terms.
// NOTE(review): after unescaping, the length of text may differ from the
// original raw text; if the terms are later aligned to text by character
// offset, confirm the offsets still match.
LOGGER.info(String.format("Loading file %s", file)); KAFDocument document = KAFDocument.createFromFile(file); text = document.getRawText(); text = StringEscapeUtils.unescapeHtml(text); List<Term> terms = document.getTerms();
if (this.document.getRawText() != null) { final String rawText = this.document.getRawText(); final StringBuilder builder = new StringBuilder(); boolean addSpace = false;
if (this.document.getRawText() != null) { final String rawText = this.document.getRawText(); final StringBuilder builder = new StringBuilder(); boolean addSpace = false;