/**
 * Flags a segment for removal by emptying its target content for the given
 * locale. Nothing is deleted here; the text unit is pruned separately by
 * {@link #pruneTextUnit(ITextUnit, LocaleId)}.
 *
 * @param tu the text unit that holds the segment
 * @param seg the segment to flag; its id is used to look up the target segment
 * @param targetLocale the locale whose target content is emptied
 */
protected void markSegmentForRemoval(ITextUnit tu, Segment seg, LocaleId targetLocale) {
    // Look up the existing target segment (the 'false' flag is passed through unchanged).
    final Segment targetSegment = tu.getTargetSegment(targetLocale, seg.getId(), false);
    final TextFragment targetContent = targetSegment.getContent();
    targetContent.clear();
}
private void checkUnusualCharacters(ITextUnit tu, Segment seg, LocaleId targetLocale) { TextFragment trgFrag = tu.getTargetSegment(targetLocale, seg.getId(), false).getContent(); StringBuilder srcText = new StringBuilder(seg.getContent().getCodedText()); StringBuilder trgText = new StringBuilder(trgFrag.getCodedText()); boolean isFound = false; Pattern pattern; // any 3 extended characters together pattern = Pattern.compile("[\\u00C0-\\u00FF]{3}"); // check source if ((pattern.matcher(srcText).find() == true) && (isFound == false)) { isFound = true; markSegmentForRemoval(tu, seg, targetLocale); } // check target if ((pattern.matcher(trgText).find() == true) && (isFound == false)) { isFound = true; markSegmentForRemoval(tu, seg, targetLocale); } }
TextFragment trgFragment = tu.getTargetSegment(targetLocale, seg.getId(), false).getContent(); String srcText = seg.getContent().getCodedText(); String trgText = trgFragment.getCodedText();
String itsAllowedCharsPattern = "\u0000"; StringBuilder trgOri = new StringBuilder(tu.getTargetSegment(targetLocale, seg.getId(), false).text);
/** * Attempts to detect character corruption from either the source or target. * If any corruption are detected, the segment is marked for removal. * * @param tu the text unit to be modified * @param seg the source segment to be modified * @param targetLocale the locale used to fetch the target text */ private void removeCorruptions(ITextUnit tu, Segment seg, LocaleId targetLocale) { TextFragment trgFrag = tu.getTargetSegment(targetLocale, seg.getId(), false).getContent(); StringBuilder srcText = new StringBuilder(seg.getContent().getCodedText()); StringBuilder trgText = new StringBuilder(trgFrag.getCodedText()); Matcher matcher; String corruptionRegex = "\\u00C3[\\u00A4-\\u00B6]|\\u00C3\\u201E|\\u00C3\\u2026|\\u00C3\\u2013"; // find corruption in source matcher = Pattern.compile(corruptionRegex).matcher(srcText); if (matcher.find() == true) { this.markSegmentForRemoval(tu, seg, targetLocale); } // find corruption in target matcher = Pattern.compile(corruptionRegex).matcher(trgText); if (matcher.find() == true) { this.markSegmentForRemoval(tu, seg, targetLocale); } }
StringBuilder trgText = new StringBuilder(tu.getTargetSegment(targetLocale, seg.getId(), false).text); boolean alreadyFound = false;
/**
 * Collapses whitespace ({tab}, {space}, {CR}, {LF}) to a single space in both
 * the source segment and its matching target segment, by delegating to
 * {@code TextFragment.unwrap}.
 *
 * @param tu the text unit containing the segments to update
 * @param seg the source segment to update
 * @param targetLocale the locale whose target segment is updated
 */
protected void normalizeWhitespace(ITextUnit tu, Segment seg, LocaleId targetLocale) {
    // Source side first, then the target segment that shares the same id.
    final TextFragment sourceContent = seg.getContent();
    TextFragment.unwrap(sourceContent);

    final Segment targetSegment = tu.getTargetSegment(targetLocale, seg.getId(), false);
    final TextFragment targetContent = targetSegment.getContent();
    TextFragment.unwrap(targetContent);
}
ISegments srcSegs = tu.getSourceSegments(); for (Segment srcSeg : srcSegs) { Segment trgSeg = tu.getTargetSegment(targetLocale, srcSeg.getId(), false);
Segment trgSeg = tu.getTargetSegment(targetLocale, srcSeg.getId(), false);
for(Segment s : tu.getAlignedSegments()) { params.getCodeFinder().process(s.text); params.getCodeFinder().process(tu.getTargetSegment(trgLoc, s.id, false).text);