/**
 * Normalizes a word for morphological analysis: lower-cases with Turkish locale rules,
 * normalizes circumflexed characters, removes dot characters and normalizes apostrophe
 * variants.
 *
 * @param word input word.
 * @return normalized word. If the word consists only of dots, dots are kept.
 */
public static String normalizeForAnalysis(String word) {
  String lowered = word.toLowerCase(Turkish.LOCALE);
  lowered = TurkishAlphabet.INSTANCE.normalizeCircumflex(lowered);
  String dotless = lowered.replace(".", "");
  // Removing dots from an all-dot word leaves an empty string; fall back to the lowered form.
  if (dotless.isEmpty()) {
    dotless = lowered;
  }
  return TextUtil.normalizeApostrophes(dotless);
}
/**
 * Applies the full HTML cleanup pipeline: converts known ampersand escape strings,
 * removes the remaining ampersand strings, then strips HTML tags and comments.
 *
 * @param input html content to clean.
 * @return cleaned content.
 */
public static String cleanAllHtmlRelated(String input) {
  String ampersandsConverted = convertAmpersandStrings(input);
  String ampersandsRemoved = removeAmpresandStrings(ampersandsConverted);
  return cleanHtmlTagsAndComments(ampersandsRemoved);
}
/**
 * Generates an HTML page containing only a bare head with a UTF-8 charset meta tag and the
 * body content of the input. Script tags are removed from the body.
 *
 * @param htmlToReduce html file to reduce.
 * @return reduced html file with charset set to utf-8. If the html body cannot be extracted,
 *     the input is returned unchanged.
 */
public static String reduceHtmlFixedUTF8Charset(String htmlToReduce) {
  String htmlBody = getHtmlBody(htmlToReduce);
  // Mirror reduceHtml(): without this guard a null body would be concatenated as "null"
  // (or cleanScripts could throw an NPE).
  if (htmlBody == null) {
    Log.warn("Cannot get html body. ");
    return htmlToReduce;
  }
  return HTML_START + "<html><head>" + META_CHARSET_UTF8 + "</head>\n"
      + cleanScripts(htmlBody) + "</html>";
}
// Normalize apostrophe variants, quote/hyphen variants, and spaces/soft hyphens in the line.
line = TextUtil.normalizeApostrophes(line); line = TextUtil.normalizeQuotesHyphens(line); line = TextUtil.normalizeSpacesAndSoftHyphens(line);
/**
 * Tokenizes the sentence after quote-hyphen normalization and runs morphological analysis
 * on every token.
 *
 * @param sentence input sentence.
 * @return a WordAnalysis per token, in token order.
 */
public List<WordAnalysis> analyzeSentence(String sentence) {
  String normalized = TextUtil.normalizeQuotesHyphens(sentence);
  List<WordAnalysis> analyses = new ArrayList<>();
  for (Token t : tokenizer.tokenize(normalized)) {
    analyses.add(analyze(t));
  }
  return analyses;
}
/**
 * Extracts the value captured by group 2 of the given pattern from the content; double
 * quote characters are replaced with spaces and the result is trimmed, then known
 * ampersand escape strings are converted. A missing match yields an empty string.
 */
private static String getAttribute(Pattern pattern, String content) {
  String value = Regexps.firstMatch(pattern, content, 2);
  if (value == null) {
    value = "";
  } else {
    value = value.replace('"', ' ').trim();
  }
  return TextUtil.convertAmpersandStrings(value);
}
/**
 * Reads the input file, extracts sentences from its paragraphs, runs NER over every
 * sentence and writes the annotated result to {@code <input-name>.ne} in the output
 * directory. Logs token count and throughput statistics when finished.
 *
 * @throws Exception if the input cannot be read, the model cannot be loaded, or the output
 *     cannot be written.
 */
@Override
public void run() throws Exception {
  initializeOutputDir();
  IOUtil.checkDirectoryArgument(modelRoot, "Model Root");
  IOUtil.checkFileArgument(inputPath, "Input File");
  Path outputPath = outDir.resolve(inputPath.toFile().getName() + ".ne");
  List<String> lines = Files.readAllLines(inputPath, StandardCharsets.UTF_8);
  List<String> sentences = TurkishSentenceExtractor.DEFAULT.fromParagraphs(lines);
  Log.info("There are %d lines and about %d sentences", lines.size(), sentences.size());
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
  PerceptronNer ner = PerceptronNer.loadModel(modelRoot, morphology);
  Stopwatch stopwatch = Stopwatch.createStarted();
  int tokenCount = 0;
  try (PrintWriter writer = new PrintWriter(outputPath.toFile(), "UTF-8")) {
    for (String sentence : sentences) {
      // Normalize apostrophes, quotes/hyphens and spaces/soft hyphens before tokenization.
      sentence = TextUtil.normalizeApostrophes(sentence);
      sentence = TextUtil.normalizeQuotesHyphens(sentence);
      sentence = TextUtil.normalizeSpacesAndSoftHyphens(sentence);
      List<String> tokens = TurkishTokenizer.DEFAULT.tokenizeToStrings(sentence);
      tokenCount += tokens.size();
      NerSentence nerResult = ner.findNamedEntities(sentence, tokens);
      writer.println(nerResult.getAsTrainingSentence(annotationStyle));
    }
  }
  double elapsedSeconds = stopwatch.elapsed(TimeUnit.MILLISECONDS) / 1000d;
  Log.info("Token count = %s", tokenCount);
  Log.info("File processed in %.4f seconds.", elapsedSeconds);
  Log.info("Speed = %.2f tokens/sec", tokenCount / elapsedSeconds);
  Log.info("Result is written in %s", outputPath);
}
/**
 * Reads all lines from the given file with a trimming UTF-8 reader, normalizes quote and
 * hyphen variants in each line, tokenizes it, and joins the tokens with single spaces.
 *
 * @param filename path of the file to read.
 * @return one space-joined tokenized string per input line.
 * @throws IOException if the file cannot be read.
 */
public List<String> readAll(String filename) throws IOException {
  List<String> result = new ArrayList<>();
  LineIterator iterator =
      SimpleTextReader.trimmingUTF8Reader(new File(filename)).getLineIterator();
  while (iterator.hasNext()) {
    String normalized = TextUtil.normalizeQuotesHyphens(iterator.next());
    result.add(Joiner.on(" ").join(lexer.tokenizeToStrings(normalized)));
  }
  return result;
}
// Normalize apostrophes, quote/hyphen variants and spaces/soft hyphens, then collapse repeated symbols.
s = TextUtil.normalizeApostrophes(s); s = TextUtil.normalizeQuotesHyphens(s); s = TextUtil.normalizeSpacesAndSoftHyphens(s); s = removeMultipleSymbols(s);
/**
 * Lower-cases the input with Turkish locale rules, normalizes apostrophe variants, and
 * replaces every character that is not in the letter map (and is not '.' or '-') with a
 * question mark.
 *
 * @param input text to normalize.
 * @return normalized text of the same character sequence length as the lowered input.
 */
public String normalize(String input) {
  String lowered = TextUtil.normalizeApostrophes(input.toLowerCase(TR));
  StringBuilder result = new StringBuilder(lowered.length());
  for (int i = 0; i < lowered.length(); i++) {
    char c = lowered.charAt(i);
    boolean keep = letterMap.containsKey(c) || c == '.' || c == '-';
    result.append(keep ? c : '?');
  }
  return result.toString();
}
/**
 * Reduces an HTML page to its meta content tags and body; script tags are removed from the
 * body. If the body cannot be extracted, a warning is logged and the input is returned
 * unchanged.
 *
 * @param htmlToReduce html content to reduce.
 * @return reduced html content.
 */
public static String reduceHtml(String htmlToReduce) {
  String body = getHtmlBody(htmlToReduce);
  if (body == null) {
    Log.warn("Cannot get html body. ");
    return htmlToReduce;
  }
  List<String> metaTags = Regexps.allMatches(HTML_META_CONTENT_TAG, htmlToReduce);
  StringBuilder sb = new StringBuilder(HTML_START);
  sb.append("<html><head>")
      .append(Joiner.on(" ").join(metaTags))
      .append("</head>\n")
      .append(cleanScripts(body))
      .append("</html>");
  return sb.toString();
}
/**
 * Adds shape features of a single word to the feature list, each tagged with the given
 * prefix: first-letter capitalization, single-character flag, all-caps flag, apostrophe
 * presence, and — when an apostrophe exists — the stem and ending around it.
 *
 * @param word word to extract features from. Null words contribute no features.
 * @param featurePrefix prefix prepended to every feature name.
 * @param features target list the features are appended to.
 */
void wordFeatures(String word, String featurePrefix, List<String> features) {
  if (word == null) {
    return;
  }
  features.add(featurePrefix + "Upper:" + Character.isUpperCase(word.charAt(0)));
  features.add(featurePrefix + "Punct:" + (word.length() == 1));
  boolean allUpperCase = true;
  for (int i = 0; i < word.length(); i++) {
    if (!Character.isUpperCase(word.charAt(i))) {
      allUpperCase = false;
      break;
    }
  }
  features.add(featurePrefix + "AllCap:" + allUpperCase);
  // NOTE(review): the apostrophe is located in the normalized copy but the substrings are
  // taken from the original word — assumes normalizeApostrophes preserves character
  // positions; confirm against TextUtil.
  String normalized = TextUtil.normalizeApostrophes(word);
  int apostropheIndex = normalized.indexOf('\'');
  features.add(featurePrefix + "Apost:" + (apostropheIndex >= 0));
  if (apostropheIndex >= 0) {
    features.add(featurePrefix + "Stem:" + word.substring(0, apostropheIndex));
    features.add(featurePrefix + "Ending:" + word.substring(apostropheIndex + 1));
  }
}
}