public static boolean checkSegments(Segment sseg, Segment tseg) { // tseg is allowed to be null return sseg != null && (sseg.getContent().hasText() || (tseg != null && tseg.getContent().hasText())); }
/**
 * Appends the coded text of a segment's content to the given builder,
 * character by character and without any transformation.
 *
 * @param segment the segment whose coded text is read
 * @param textContentBuilder the builder receiving the characters
 */
private void assembleSegmentContent(Segment segment, StringBuilder textContentBuilder) {
    String codedText = segment.getContent().getCodedText();
    for (char ch : codedText.toCharArray()) {
        textContentBuilder.append(ch);
    }
}
// Closes an enclosing scope that was opened before this excerpt.
}
private void harvestTextAnalysisAnnotations (ITextUnit tu) { for ( Segment seg : tu.getSource().getSegments() ) { if ( !seg.getContent().hasAnnotation(GenericAnnotationType.GENERIC) ) continue; // Else: check if it's a TA annotation List<AnnotatedSpan> aspans = seg.getContent().getAnnotatedSpans(GenericAnnotationType.GENERIC); for ( AnnotatedSpan aspan : aspans ) { String term = aspan.span.toText(); if ( termsFromAnnotations.containsKey(term) ) { termsFromAnnotations.put(term, termsFromAnnotations.get(term)+1); } else { termsFromAnnotations.put(term, 1); } } } }
/**
 * Converts whitespace ({tab}, {space}, {CR}, {LF}) to single space by
 * unwrapping the content of the source segment and then the content of the
 * matching target segment.
 * <p>
 * NOTE(review): the target segment is looked up by the source segment's id
 * and dereferenced without a null check; confirm that callers guarantee it
 * exists.
 *
 * @param tu the text unit containing the segments to update
 * @param seg the source segment to update; its id selects the target segment
 * @param targetLocale the locale of the target segment to update
 */
protected void normalizeWhitespace(ITextUnit tu, Segment seg, LocaleId targetLocale) {
    // Source side first, then the target segment sharing the same id
    TextFragment sourceContent = seg.getContent();
    TextFragment.unwrap(sourceContent);

    TextFragment targetContent = tu.getTargetSegment(targetLocale, seg.getId(), false).getContent();
    TextFragment.unwrap(targetContent);
}
private void checkUnusualCharacters(ITextUnit tu, Segment seg, LocaleId targetLocale) { TextFragment trgFrag = tu.getTargetSegment(targetLocale, seg.getId(), false).getContent(); StringBuilder srcText = new StringBuilder(seg.getContent().getCodedText()); StringBuilder trgText = new StringBuilder(trgFrag.getCodedText()); boolean isFound = false; Pattern pattern; // any 3 extended characters together pattern = Pattern.compile("[\\u00C0-\\u00FF]{3}"); // check source if ((pattern.matcher(srcText).find() == true) && (isFound == false)) { isFound = true; markSegmentForRemoval(tu, seg, targetLocale); } // check target if ((pattern.matcher(trgText).find() == true) && (isFound == false)) { isFound = true; markSegmentForRemoval(tu, seg, targetLocale); } }
TextFragment trgFragment = tu.getTargetSegment(targetLocale, seg.getId(), false).getContent(); String srcText = seg.getContent().getCodedText(); String trgText = trgFragment.getCodedText(); seg.getContent().setCodedText(srcText); trgFragment.setCodedText(trgText);
/** * Attempts to detect character corruption from either the source or target. * If any corruption are detected, the segment is marked for removal. * * @param tu the text unit to be modified * @param seg the source segment to be modified * @param targetLocale the locale used to fetch the target text */ private void removeCorruptions(ITextUnit tu, Segment seg, LocaleId targetLocale) { TextFragment trgFrag = tu.getTargetSegment(targetLocale, seg.getId(), false).getContent(); StringBuilder srcText = new StringBuilder(seg.getContent().getCodedText()); StringBuilder trgText = new StringBuilder(trgFrag.getCodedText()); Matcher matcher; String corruptionRegex = "\\u00C3[\\u00A4-\\u00B6]|\\u00C3\\u201E|\\u00C3\\u2026|\\u00C3\\u2013"; // find corruption in source matcher = Pattern.compile(corruptionRegex).matcher(srcText); if (matcher.find() == true) { this.markSegmentForRemoval(tu, seg, targetLocale); } // find corruption in target matcher = Pattern.compile(corruptionRegex).matcher(trgText); if (matcher.find() == true) { this.markSegmentForRemoval(tu, seg, targetLocale); } }
/**
 * Walks the coded text of a segment and forwards each element in order:
 * plain characters go to {@code addChar}, inline codes go to {@code addCode}.
 * <p>
 * A code occupies two characters in the coded text: a marker character
 * followed by a character from which the index into the code list is derived.
 *
 * @param segment the segment whose content is merged
 */
private void mergeSegment(Segment segment) {
    TextFragment textFragment = segment.getContent();
    String codedText = textFragment.getCodedText();
    List<Code> codes = textFragment.getCodes();

    int pos = 0;
    while (pos < codedText.length()) {
        char current = codedText.charAt(pos++);
        if (TextFragment.isMarker(current)) {
            // The character right after the marker encodes the code's index
            int codeIndex = TextFragment.toIndex(codedText.charAt(pos++));
            addCode(codes.get(codeIndex));
        } else {
            addChar(current);
        }
    }
}
/**
 * Effectively marks the segment for removal by emptying the content of the
 * matching segment in the given target. The text unit will be pruned by a
 * different method ({@link #pruneTextUnit(ITextUnit, LocaleId)}).
 * <p>
 * NOTE(review): the target segment is dereferenced without a null check;
 * confirm that callers guarantee it exists.
 *
 * @param tu the text unit containing the content
 * @param seg the segment to be marked for removal; its id selects the target segment
 * @param targetLocale the locale for which the segment should be removed
 */
protected void markSegmentForRemoval(ITextUnit tu, Segment seg, LocaleId targetLocale) {
    Segment targetSegment = tu.getTargetSegment(targetLocale, seg.getId(), false);
    targetSegment.getContent().clear();
}
/**
 * Writes the content of a segment in order: plain characters through
 * {@code writeChar} and inline codes through {@code writeCode}.
 * <p>
 * A code occupies two characters in the coded text: a marker character
 * followed by a character from which the index into the code list is derived.
 * Any exception raised while writing is logged with the segment id and text,
 * then rethrown unchanged.
 *
 * @param segment the segment to write
 */
private void writeSegment(Segment segment) {
    try {
        TextFragment fragment = segment.getContent();
        String text = fragment.getCodedText();
        List<Code> inlineCodes = fragment.getCodes();

        for (int pos = 0; pos < text.length(); pos++) {
            char current = text.charAt(pos);
            if (!TextFragment.isMarker(current)) {
                writeChar(current);
                continue;
            }
            // Skip ahead to the index character that follows the marker
            pos++;
            int codeIndex = TextFragment.toIndex(text.charAt(pos));
            writeCode(inlineCodes.get(codeIndex));
        }
    } catch (Exception failure) {
        LOGGER.error("Threw {} writing segment id {} '{}'",
                failure.getClass().getSimpleName(), segment.getId(), segment.toString());
        throw failure;
    }
}
/**
 * Returns the filtered tokens of a segment.
 * <p>
 * Tokens are taken from the segment's {@code TokensAnnotation} when one is
 * attached; otherwise the segment content is tokenized for the given locale.
 * Either way the result is passed through {@code filterTokens}.
 *
 * @param segment the segment to get the tokens for
 * @param locale the locale used when the content has to be tokenized
 * @return the tokens returned by {@code filterTokens}
 */
protected Tokens getTokens(Segment segment, LocaleId locale) {
    TokensAnnotation annotation = segment.getAnnotation(TokensAnnotation.class);

    Tokens unfiltered;
    if (annotation == null) {
        // No tokens attached to the segment: tokenize its content now
        unfiltered = Tokenizer.tokenize(segment.getContent(), locale);
    } else {
        unfiltered = annotation.getTokens();
    }
    return filterTokens(unfiltered);
}
Segment srcSegment = srcSegIterator.next(); Segment trgSegment = bilingualSegs.getCorrespondingTarget(srcSegment, trgLocale); List<Code> srcCodes = srcSegment.getContent().getCodes(); List<Code> trgCodes = trgSegment.getContent().getCodes();
writeQuotedContent(srcSeg.getContent()); writer.write("\t"); Segment trgSeg = trgSegs.get(srcSeg.getId()); if ( trgSeg != null ) { writeQuotedContent(trgSeg.getContent());
if ( !compareTextFragments(seg1.getContent(), seg2.getContent()) ){ return false;
int unmatchedDelta = 0; for ( Segment seg : tc.getSegments() ) { TextFragment tf = seg.getContent(); int count = tf.getCodes().size(); if ( count == 0 ) continue;
int unmatchedMinId = 0; for ( Segment seg : tc.getSegments() ) { TextFragment tf = seg.getContent(); if (RenumberingUtil.containsOnlyMatchingCodes(tf)) { inUnmatched = false;
newSkel.add(skelWriter.getContent(srcSeg.getContent(), null, EncoderContext.TEXT)); // Source goes to skeleton TextContainer s = new TextContainer(skelWriter.getContent(srcSeg.getContent(), null, EncoderContext.TEXT)); TextContainer t = new TextContainer(skelWriter.getContent(trgSeg.getContent(), part.getLocale(), EncoderContext.TEXT)); ITextUnit segTu = TextUnitUtil.buildTU(null, null, s, t, part.getLocale(), null); newSkel.add(skelWriter.getContent(trgSeg.getContent(), part.getLocale(), EncoderContext.TEXT)); // Target goes to skeleton
return doCount(seg.getContent(), language);
return doFullCount(seg.getContent(), language);
Segment trgSeg = trgSegs.get(srcSeg.id); if (trgSeg != null && trgSeg.getContent().hasText()) { TranslationUnitVariant source = new TranslationUnitVariant(srcLoc, srcSeg.text); TranslationUnitVariant target = new TranslationUnitVariant(trgLoc, trgSeg.text);