/**
 * Normalizes a word for morphological analysis: lower-cases with Turkish locale rules,
 * normalizes circumflexed characters, removes dot characters and normalizes apostrophe
 * variants.
 *
 * @param word input word.
 * @return normalized word. If the word consists only of dots, dots are kept.
 */
public static String normalizeForAnalysis(String word) {
  String lowered = word.toLowerCase(Turkish.LOCALE);
  lowered = TurkishAlphabet.INSTANCE.normalizeCircumflex(lowered);
  String dotless = lowered.replace(".", "");
  // Removing dots from an all-dot word leaves an empty string; fall back to the lowered form.
  if (dotless.isEmpty()) {
    dotless = lowered;
  }
  return TextUtil.normalizeApostrophes(dotless);
}
/**
 * Applies the full HTML cleanup pipeline: converts known ampersand escape strings,
 * removes the remaining ampersand strings, then strips HTML tags and comments.
 *
 * @param input html content to clean.
 * @return cleaned content.
 */
public static String cleanAllHtmlRelated(String input) {
  String ampersandsConverted = convertAmpersandStrings(input);
  String ampersandsRemoved = removeAmpresandStrings(ampersandsConverted);
  return cleanHtmlTagsAndComments(ampersandsRemoved);
}
/**
 * Generates an HTML page containing only a bare head with a UTF-8 charset meta tag and the
 * body content of the input. Script tags are removed from the body.
 *
 * @param htmlToReduce html file to reduce.
 * @return reduced html file with charset set to utf-8. If the html body cannot be extracted,
 *     the input is returned unchanged.
 */
public static String reduceHtmlFixedUTF8Charset(String htmlToReduce) {
  String htmlBody = getHtmlBody(htmlToReduce);
  // Mirror reduceHtml(): without this guard a null body would be concatenated as "null"
  // (or cleanScripts could throw an NPE).
  if (htmlBody == null) {
    Log.warn("Cannot get html body. ");
    return htmlToReduce;
  }
  return HTML_START + "<html><head>" + META_CHARSET_UTF8 + "</head>\n"
      + cleanScripts(htmlBody) + "</html>";
}
// Normalize apostrophe variants, quote/hyphen variants, and spaces/soft hyphens in the line.
line = TextUtil.normalizeApostrophes(line); line = TextUtil.normalizeQuotesHyphens(line); line = TextUtil.normalizeSpacesAndSoftHyphens(line);
/**
 * Tokenizes the sentence after quote-hyphen normalization and runs morphological analysis
 * on every token.
 *
 * @param sentence input sentence.
 * @return a WordAnalysis per token, in token order.
 */
public List<WordAnalysis> analyzeSentence(String sentence) {
  String normalized = TextUtil.normalizeQuotesHyphens(sentence);
  List<WordAnalysis> analyses = new ArrayList<>();
  for (Token t : tokenizer.tokenize(normalized)) {
    analyses.add(analyze(t));
  }
  return analyses;
}
/**
 * Extracts the value captured by group 2 of the given pattern from the content; double
 * quote characters are replaced with spaces and the result is trimmed, then known
 * ampersand escape strings are converted. A missing match yields an empty string.
 */
private static String getAttribute(Pattern pattern, String content) {
  String value = Regexps.firstMatch(pattern, content, 2);
  if (value == null) {
    value = "";
  } else {
    value = value.replace('"', ' ').trim();
  }
  return TextUtil.convertAmpersandStrings(value);
}
/**
 * Reads the input file, extracts sentences from its paragraphs, runs NER over every
 * sentence and writes the annotated result to {@code <input-name>.ne} in the output
 * directory. Logs token count and throughput statistics when finished.
 *
 * @throws Exception if the input cannot be read, the model cannot be loaded, or the output
 *     cannot be written.
 */
@Override
public void run() throws Exception {
  initializeOutputDir();
  IOUtil.checkDirectoryArgument(modelRoot, "Model Root");
  IOUtil.checkFileArgument(inputPath, "Input File");
  Path outputPath = outDir.resolve(inputPath.toFile().getName() + ".ne");
  List<String> lines = Files.readAllLines(inputPath, StandardCharsets.UTF_8);
  List<String> sentences = TurkishSentenceExtractor.DEFAULT.fromParagraphs(lines);
  Log.info("There are %d lines and about %d sentences", lines.size(), sentences.size());
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
  PerceptronNer ner = PerceptronNer.loadModel(modelRoot, morphology);
  Stopwatch stopwatch = Stopwatch.createStarted();
  int tokenCount = 0;
  try (PrintWriter writer = new PrintWriter(outputPath.toFile(), "UTF-8")) {
    for (String sentence : sentences) {
      // Normalize apostrophes, quotes/hyphens and spaces/soft hyphens before tokenization.
      sentence = TextUtil.normalizeApostrophes(sentence);
      sentence = TextUtil.normalizeQuotesHyphens(sentence);
      sentence = TextUtil.normalizeSpacesAndSoftHyphens(sentence);
      List<String> tokens = TurkishTokenizer.DEFAULT.tokenizeToStrings(sentence);
      tokenCount += tokens.size();
      NerSentence nerResult = ner.findNamedEntities(sentence, tokens);
      writer.println(nerResult.getAsTrainingSentence(annotationStyle));
    }
  }
  double elapsedSeconds = stopwatch.elapsed(TimeUnit.MILLISECONDS) / 1000d;
  Log.info("Token count = %s", tokenCount);
  Log.info("File processed in %.4f seconds.", elapsedSeconds);
  Log.info("Speed = %.2f tokens/sec", tokenCount / elapsedSeconds);
  Log.info("Result is written in %s", outputPath);
}
/**
 * Reads all lines from the given file with a trimming UTF-8 reader, normalizes quote and
 * hyphen variants in each line, tokenizes it, and joins the tokens with single spaces.
 *
 * @param filename path of the file to read.
 * @return one space-joined tokenized string per input line.
 * @throws IOException if the file cannot be read.
 */
public List<String> readAll(String filename) throws IOException {
  List<String> result = new ArrayList<>();
  LineIterator iterator =
      SimpleTextReader.trimmingUTF8Reader(new File(filename)).getLineIterator();
  while (iterator.hasNext()) {
    String normalized = TextUtil.normalizeQuotesHyphens(iterator.next());
    result.add(Joiner.on(" ").join(lexer.tokenizeToStrings(normalized)));
  }
  return result;
}
// Normalize apostrophes, quote/hyphen variants and spaces/soft hyphens, then collapse repeated symbols.
s = TextUtil.normalizeApostrophes(s); s = TextUtil.normalizeQuotesHyphens(s); s = TextUtil.normalizeSpacesAndSoftHyphens(s); s = removeMultipleSymbols(s);
/**
 * Lower-cases the input with Turkish locale rules, normalizes apostrophe variants, and
 * replaces every character that is not in the letter map (and is not '.' or '-') with a
 * question mark.
 *
 * @param input text to normalize.
 * @return normalized text of the same character sequence length as the lowered input.
 */
public String normalize(String input) {
  String lowered = TextUtil.normalizeApostrophes(input.toLowerCase(TR));
  StringBuilder result = new StringBuilder(lowered.length());
  for (int i = 0; i < lowered.length(); i++) {
    char c = lowered.charAt(i);
    boolean keep = letterMap.containsKey(c) || c == '.' || c == '-';
    result.append(keep ? c : '?');
  }
  return result.toString();
}
/**
 * Reduces an HTML page to its meta content tags and body; script tags are removed from the
 * body. If the body cannot be extracted, a warning is logged and the input is returned
 * unchanged.
 *
 * @param htmlToReduce html content to reduce.
 * @return reduced html content.
 */
public static String reduceHtml(String htmlToReduce) {
  String body = getHtmlBody(htmlToReduce);
  if (body == null) {
    Log.warn("Cannot get html body. ");
    return htmlToReduce;
  }
  List<String> metaTags = Regexps.allMatches(HTML_META_CONTENT_TAG, htmlToReduce);
  StringBuilder sb = new StringBuilder(HTML_START);
  sb.append("<html><head>")
      .append(Joiner.on(" ").join(metaTags))
      .append("</head>\n")
      .append(cleanScripts(body))
      .append("</html>");
  return sb.toString();
}
/**
 * Adds shape features of a single word to the feature list, each tagged with the given
 * prefix: first-letter capitalization, single-character flag, all-caps flag, apostrophe
 * presence, and — when an apostrophe exists — the stem and ending around it.
 *
 * @param word word to extract features from. Null words contribute no features.
 * @param featurePrefix prefix prepended to every feature name.
 * @param features target list the features are appended to.
 */
void wordFeatures(String word, String featurePrefix, List<String> features) {
  if (word == null) {
    return;
  }
  features.add(featurePrefix + "Upper:" + Character.isUpperCase(word.charAt(0)));
  features.add(featurePrefix + "Punct:" + (word.length() == 1));
  boolean allUpperCase = true;
  for (int i = 0; i < word.length(); i++) {
    if (!Character.isUpperCase(word.charAt(i))) {
      allUpperCase = false;
      break;
    }
  }
  features.add(featurePrefix + "AllCap:" + allUpperCase);
  // NOTE(review): the apostrophe is located in the normalized copy but the substrings are
  // taken from the original word — assumes normalizeApostrophes preserves character
  // positions; confirm against TextUtil.
  String normalized = TextUtil.normalizeApostrophes(word);
  int apostropheIndex = normalized.indexOf('\'');
  features.add(featurePrefix + "Apost:" + (apostropheIndex >= 0));
  if (apostropheIndex >= 0) {
    features.add(featurePrefix + "Stem:" + word.substring(0, apostropheIndex));
    features.add(featurePrefix + "Ending:" + word.substring(apostropheIndex + 1));
  }
}
}