net.htmlparser.jericho.OutputDocument Java code examples

 /**
  * Removes every tag named {@code span} from the given HTML text.
  * <p>
  * Only the tags themselves are removed from the output document; text located between them is not touched.
  *
  * @param html  the HTML text to process.
  * @return the text of the output document after all {@code span} tags have been removed.
  */
 public static String removeSpans(String html) {
   final Source parsed = new Source(html);
   parsed.fullSequentialParse();
   final OutputDocument result = new OutputDocument(parsed);
   for (final Tag candidate : parsed.getAllTags()) {
     final boolean isSpan = "span".equals(candidate.getName().toLowerCase());
     if (isSpan) {
       // drop the tag itself, leaving the surrounding text alone
       result.remove(candidate);
     }
   }
   return result.toString();
 }

  /**
   * Passes every form control held in {@code formControls} to the {@code replace} method of the
   * specified output document.
   *
   * @param outputDocument  the output document that receives the form controls.
   */
  void replaceInOutputDocument(final OutputDocument outputDocument) {
    for (final FormControl control : formControls) {
      outputDocument.replace(control);
    }
  }
}

/**
 * Register mandatory attributes for Scoped CSS.
 * <p>
 * Each entry returned by {@code context.getMandatoryAttributes()} is inserted into the output document
 * at the beginning of the given attributes segment. An entry with a {@code null} value is written as a
 * bare attribute name; any other entry is written as {@code name="value"}.
 *
 * @param attributes  the attributes segment whose begin position is used as the insertion point.
 */
private void registerMandatoryAttributes(Attributes attributes) {
 // see https://sourceforge.net/p/jerichohtml/discussion/350024/thread/501a7d05/
 Map<String, String> attrs = context.getMandatoryAttributes();
 // Iterating an empty map is a no-op, so no separate emptiness check is needed.
 for (Entry<String, String> entry : attrs.entrySet()) {
  String name = entry.getKey();
  String value = entry.getValue();
  String markup;
  if (value == null) {
   markup = " " + name + " ";
  } else {
   // The '=' separator is required between name and value; without it the output is malformed.
   // NOTE(review): the value is inserted verbatim - confirm callers never supply a double quote in it.
   markup = " " + name + "=\"" + value + "\" ";
  }
  outputDocument.insert(attributes.getBegin(), markup);
 }
}

/**
 * Builds a source document from {@code value}, hands it to {@code replaceChildren} together with an
 * output document created from it, and returns the text of that output document.
 *
 * @param key  not read by this method.
 * @param value  the text to parse.
 * @return the text of the output document after {@code replaceChildren} has been called.
 */
public String filter(String key, String value) {
  final Source parsed = new Source(value);
  final OutputDocument rewritten = new OutputDocument(parsed);
  replaceChildren(parsed, parsed, rewritten);
  return rewritten.toString();
}

/**
 * Marks a range of the source text so that it is ignored when parsing.
 * <p>
 * The range is replaced with spaces in a working output document that is created from the current
 * parse text the first time this method is called.
 * See the documentation of the {@link Segment#ignoreWhenParsing()} method for more information.
 *
 * @param begin  the beginning character position in the source text.
 * @param end  the end character position in the source text.
 * @throws IllegalStateException if a full sequential parse has already been performed.
 */
public void ignoreWhenParsing(final int begin, final int end) {
  if (wasFullSequentialParseCalled()) {
    throw new IllegalStateException("ignoreWhenParsing can not be used after a full sequential parse has been performed");
  }
  final boolean workingDocumentMissing = (parseTextOutputDocument == null);
  if (workingDocumentMissing) {
    // Seed the working document from the current parse text, then discard the cached parse text.
    parseTextOutputDocument = new OutputDocument(getParseText());
    parseText = null;
  }
  parseTextOutputDocument.replaceWithSpaces(begin, end);
}

private String removeNotAllowedTags(String htmlFragment, URI docUri) {
  Source source = new Source(htmlFragment);
  OutputDocument outputDocument = new OutputDocument(source);
  List<Element> elements = source.getAllElements();
    Map<String, String> attrsUpdate = outputDocument.replace(attrs, true);
    if (!element.getName().contains("a")) {
      attrsUpdate.clear();
            outputDocument.remove(element);
          || element.getName() == "style"
          || element.getName() == "form") {
        outputDocument.remove(content);
      outputDocument.remove(element.getStartTag());
        outputDocument.remove(element.getEndTag());
  String out = outputDocument.toString();
  out = out.replaceAll("\\n", "");
  out = out.replaceAll("\\t", "");

OutputDocument outputDocument = new OutputDocument(source);
      CharacterReference characterReference = (CharacterReference) segment;
    } else {
      outputDocument.replace(segment, doProcessText(segment.toString()));
return outputDocument.toString();

/**
 * Registers this control's changes in the specified output document according to {@code outputStyle}.
 * <ul>
 *  <li>{@code REMOVE}: the control's element is removed.
 *  <li>{@code DISPLAY_VALUE}: the element is replaced with the result of {@code getDisplayValueHTML}.
 *  <li>otherwise: modified attributes are replaced, and the element content is replaced with the
 *      encoded {@code value} unless {@code value} is the {@code UNCHANGED} marker.
 * </ul>
 *
 * @param outputDocument  the output document to modify.
 */
void replaceInOutputDocument(final OutputDocument outputDocument) {
  if (outputStyle == FormControlOutputStyle.REMOVE) {
    outputDocument.remove(getElement());
    return;
  }
  if (outputStyle == FormControlOutputStyle.DISPLAY_VALUE) {
    outputDocument.replace(getElement(), getDisplayValueHTML(getValue(), true));
    return;
  }
  replaceAttributesInOutputDocumentIfModified(outputDocument);
  // Identity comparison against the UNCHANGED marker object.
  final boolean contentChanged = (value != UNCHANGED);
  if (contentChanged) {
    outputDocument.replace(getElement().getContent(), CharacterReference.encode(value, false));
  }
}
private String getValue() {

  buf.append(")");
document.insert(element.getBegin(), buf.toString()); // 插入块指令
document.remove(new Segment(source, attribute.getBegin() - 1, attribute.getEnd())); // 移除属性
      if (oriattr != null) {
        String buf = String.format("#if(%s)%s=\"%s\"#end()", expression, oriattr.getName(), oriattr.getValue());
        document.replace(new Segment(source, oriattr.getBegin(), oriattr.getEnd()), buf);
        document.remove(new Segment(source, attribute.getBegin(), attribute.getEnd())); // 移除ifattr控制属性
      document.replace(new Segment(source, attribute.getBegin(), attribute.getEnd()), buf);
        document.remove(new Segment(source, attribute.getBegin(), attribute.getEnd())); // 移除setattr控制属性
document.insert(element.getEnd(), buf.toString()); // 插入结束指令

OutputDocument outputDocument=new OutputDocument(source);
for (Element element : source.getAllElementsByClass("constantValuesContainer"))
          outputDocument.replace(childContent, String.valueOf((char)value));
outputDocument.writeTo(
  new OutputStreamWriter(new FileOutputStream(file.toFile())));
System.out.println("Processing "+file+" DONE");

/**
 * Rewrites a single tag in the output document if it is allowed.
 * <p>
 * A normal start tag is replaced with the result of {@code getStartTagHTML}; when its element has an
 * optional end tag that is missing, an end tag is inserted at the end of the element. A normal end tag
 * that belongs to an element is replaced with the result of {@code getEndTagHTML}.
 *
 * @param tag  the tag to process.
 * @param outputDocument  the output document in which replacements are registered.
 * @return {@code true} if the tag was rewritten; {@code false} if its name is not in {@code allowedTags},
 *         if a required end tag is missing, if an end tag has no element, or if the tag is of another type.
 */
private boolean processTag(Tag tag, OutputDocument outputDocument) {
  final String name = tag.getName().toLowerCase();
  if (!allowedTags.contains(name)) {
    return false;
  }

  if (tag.getTagType() != StartTagType.NORMAL) {
    // Only a normal end tag that belongs to an element is rewritten here.
    final boolean rewritableEndTag = tag.getTagType() == EndTagType.NORMAL && tag.getElement() != null;
    if (rewritableEndTag) {
      outputDocument.replace(tag, getEndTagHTML(name));
    }
    return rewritableEndTag;
  }

  final Element owner = tag.getElement();
  final boolean endTagRequired = HTMLElements.getEndTagRequiredElementNames().contains(name);
  if (endTagRequired && owner.getEndTag() == null) {
    return false;
  }
  if (!endTagRequired
      && HTMLElements.getEndTagOptionalElementNames().contains(name)
      && owner.getEndTag() == null) {
    // Supply the omitted optional end tag at the end of the element.
    outputDocument.insert(owner.getEnd(), getEndTagHTML(name));
  }
  outputDocument.replace(tag, getStartTagHTML(owner.getStartTag()));
  return true;
}

/**
 * Creates a new output document based on {@code fSource}.
 *
 * @return a newly constructed {@link OutputDocument}; a separate instance is created on every call.
 */
public OutputDocument getOutputDocument() {
  final OutputDocument fresh = new OutputDocument(fSource);
  return fresh;
}

/**
 * Removes each of the specified source {@linkplain Segment segments} from this output document.
 * <p>
 * Calling this method has the same effect as calling {@link #remove(Segment)} once for every
 * element of the collection, in iteration order.
 *
 * @param segments  the source {@link Segment} objects to remove.
 */
public void remove(final Collection<? extends Segment> segments) {
  for (final Segment current : segments) {
    remove(current);
  }
}

/**
 * Returns the {@linkplain ParseText parse text} of this source document, creating it on demand.
 * <p>
 * This method is normally only of interest to users who wish to create <a href="TagType.html#Custom">custom tag types</a>.
 * <p>
 * The parse text is defined as the entire text of the source document in lower case, with all
 * {@linkplain Segment#ignoreWhenParsing() ignored} segments replaced by space characters.
 * <p>
 * When a working output document of ignored ranges exists, the parse text is built from it and the
 * working document is then discarded; otherwise the parse text is built from the source text.
 *
 * @return the {@linkplain ParseText parse text} of this source document.
 */
public final ParseText getParseText() {
  if (parseText != null) {
    return parseText;
  }
  if (parseTextOutputDocument == null) {
    parseText = new CharSequenceParseText(sourceText);
  } else {
    parseText = new CharSequenceParseText(parseTextOutputDocument.toString());
    parseTextOutputDocument = null;
  }
  return parseText;
}

Source sourceHtml = new Source(source);
sourceHtml.setLogger(null);
OutputDocument outputDocument = new OutputDocument(sourceHtml);
List<StartTag> tags = sourceHtml.getAllStartTags(FORMULA_TAG_NAME);
for (StartTag tag : tags) {
      continue;
    outputDocument.replace(texElement.getStartTag(), TEX_SCRIPT_TAG_START);
    String content = texElement.getContent().toString().trim();
    Pair<Integer, Integer> bounds = getBounds(content);
    if(bounds.getRight() == 0){
      logger.info("Empty source in Tex tag");
      outputDocument.replace(texElement.getContent(), StringUtils.EMPTY);
    } else {
      String strippedContent = content.substring(bounds.getLeft(), bounds.getRight());
      String unescapedContent = StringEscapeUtils.unescapeHtml4(strippedContent);
      outputDocument.replace(texElement.getContent(), unescapedContent);
    outputDocument.replace(texElement.getEndTag(), TEX_SCRIPT_TAG_END);
  outputDocument.remove(tag);
  outputDocument.remove(endTag);
return outputDocument.toString();

/**
 * Builds a source document from {@code value}, hands it to {@code replaceChildren} together with an
 * output document created from it, and returns the text of that output document.
 *
 * @param key  not read by this method.
 * @param value  the text to parse.
 * @return the text of the output document after {@code replaceChildren} has been called.
 */
public String filter(String key, String value) {
  final Source parsed = new Source(value);
  final OutputDocument rewritten = new OutputDocument(parsed);
  replaceChildren(parsed, parsed, rewritten);
  return rewritten.toString();
}

  /**
   * Registers this control's changes in the specified output document according to {@code outputStyle}.
   * <ul>
   *  <li>{@code REMOVE}: the control's element is removed.
   *  <li>{@code DISPLAY_VALUE}: the element is replaced with the display HTML of its {@code value}
   *      attribute, or with {@code null} for a {@code HIDDEN} control. For a {@code PASSWORD} control
   *      with a value, a string produced from the configured password character and the value's length
   *      is displayed instead of the value.
   *  <li>otherwise: modified attributes are replaced.
   * </ul>
   *
   * @param outputDocument  the output document to modify.
   */
  void replaceInOutputDocument(final OutputDocument outputDocument) {
    if (outputStyle == FormControlOutputStyle.REMOVE) {
      outputDocument.remove(getElement());
      return;
    }
    if (outputStyle != FormControlOutputStyle.DISPLAY_VALUE) {
      replaceAttributesInOutputDocumentIfModified(outputDocument);
      return;
    }
    String replacement = null;
    if (formControlType != FormControlType.HIDDEN) {
      String shown = elementContainer.getAttributeValue(Attribute.VALUE);
      final boolean maskPassword = formControlType == FormControlType.PASSWORD && shown != null;
      if (maskPassword) {
        shown = getString(FormControlOutputStyle.ConfigDisplayValue.PasswordChar, shown.length());
      }
      replacement = getDisplayValueHTML(shown, false);
    }
    outputDocument.replace(getElement(), replacement);
  }
}

  buf.append(")");
document.insert(element.getBegin(), buf.toString()); // 插入块指令
document.remove(new Segment(source, attribute.getBegin() - 1, attribute.getEnd())); // 移除属性
      if (oriattr != null) {
        String buf = String.format("#if(%s)%s=\"%s\"#end()", expression, oriattr.getName(), oriattr.getValue());
        document.replace(new Segment(source, oriattr.getBegin(), oriattr.getEnd()), buf);
        document.remove(new Segment(source, attribute.getBegin(), attribute.getEnd())); // 移除ifattr控制属性
      document.replace(new Segment(source, attribute.getBegin(), attribute.getEnd()), buf);
        document.remove(new Segment(source, attribute.getBegin(), attribute.getEnd())); // 移除setattr控制属性
document.insert(element.getEnd(), buf.toString()); // 插入结束指令

OutputDocument doc = new OutputDocument(source);
List<Tag> tags = source.getAllTags();
int pos = 0;

/**
 * Constructs a new output document based on the specified {@link Segment}.
 * <p>
 * The source of the segment is used as the source text, and any part of that source located before
 * the segment's begin position or after its end position is registered for removal.
 *
 * @param segment  the original {@link Segment}.
 * @throws IllegalArgumentException if {@code segment} is {@code null}.
 */
public OutputDocument(final Segment segment) {
  if (segment == null) {
    throw new IllegalArgumentException("segment argument must not be null");
  }
  this.segment = segment;
  final Source owner = segment.source;
  this.sourceText = owner;
  // Cut away the leading part of the source that precedes the segment.
  if (segment.begin > 0) {
    remove(0, segment.begin);
  }
  // Cut away the trailing part of the source that follows the segment.
  if (segment.end < owner.end) {
    remove(segment.end, owner.end);
  }
}

Javadoc

Represents a modified version of an original Source document or Segment.

An OutputDocument represents an original Source document or Segment that has been modified by substituting segments of it with other text. Each of these substitutions must be registered in the output document, which is most commonly done using the various replace, remove or insert methods in this class. These methods internally register (see #register(OutputSegment)) one or more OutputSegment objects to define each substitution.

If a Segment is used to construct the output document, all character positions are relative to the source document of the specified segment.

After all of the substitutions have been registered, the modified text can be retrieved using the #writeTo(Writer) or #toString() methods.

The registered OutputSegment objects may be adjacent and may also overlap. An output segment that is completely enclosed by another output segment is not included in the output. An output segment that starts at the same position as a longer output segment is also not included in the output, unless the former has zero length. See the documentation of OutputSegment#COMPARATOR for a more precise description.

If unexpected results are being generated from an OutputDocument, the #getDebugInfo() method provides information on each registered output segment (see #getRegisteredOutputSegments()), which should provide enough information to determine the cause of the problem. In most cases the problem will be caused by overlapping output segments.

The following example converts all externally referenced style sheets to internal style sheets:

 
// Converts every externally referenced style sheet (<link rel="stylesheet" href="...">)
// into an internal <style> element holding the downloaded style sheet text.
URL sourceUrl=new URL(sourceUrlString);
Source source=new Source(sourceUrl);
OutputDocument outputDocument=new OutputDocument(source);
StringBuilder sb=new StringBuilder();
// The element type must be declared so that the for-each loop below can iterate StartTag objects.
List<StartTag> linkStartTags=source.getAllStartTags(HTMLElementName.LINK);
for (StartTag startTag : linkStartTags) {
  Attributes attributes=startTag.getAttributes();
  String rel=attributes.getValue("rel");
  if (!"stylesheet".equalsIgnoreCase(rel)) continue; // not a style sheet link
  String href=attributes.getValue("href");
  if (href==null) continue; // nothing to load
  String styleSheetContent;
  try {
    // href is resolved relative to the URL of the source document
    styleSheetContent=Util.getString(new InputStreamReader(new URL(sourceUrl,href).openStream()));
  } catch (Exception ex) {
    continue; // don't convert if URL is invalid
  }
  sb.setLength(0); // reuse the builder for each link
  sb.append("<style");
  Attribute typeAttribute=attributes.get("type");
  if (typeAttribute!=null) sb.append(' ').append(typeAttribute); // carry over the original type attribute
  sb.append(">\n").append(styleSheetContent).append("\n</style>");
  outputDocument.replace(startTag,sb.toString());
}
String convertedHtmlText=outputDocument.toString();

Most used methods

<init>
Constructs a new output document based on the specified source document.
replace
Replaces the specified Segment in this output document with the specified text. Specifying a null a
toString
Returns the final content of this output document as a String.
remove
Removes the specified Segment from this output document. This is equivalent to #replace(Segment,Char
insert
Inserts the specified text at the specified character position in this output document.
appendTo
Appends the specified portion of the final content of this output document to the specified Appendab
getRegisteredOutputSegments
Returns a list all of the #register(OutputSegment) OutputSegment objects in this output document. Th
register
Registers the specified OutputSegment in this output document. Use this method if you want to use a
replaceWithSpaces
Replaces the specified segment of this output document with a string of spaces of the same length. T
writeTo
Writes the specified portion of the final content of this output document to the specified Writer. A

Popular in Java

Updating database using SQL prepared statement
setScale (BigDecimal)
getSharedPreferences (Context)
compareTo (BigDecimal)
System (java.lang)
Provides access to system-related information and resources including standard input and output. Ena
Comparator (java.util)
A Comparator is used to compare two objects to determine their ordering with respect to each other.
LinkedHashMap (java.util)
LinkedHashMap is an implementation of Map that guarantees iteration order. All optional operations a
List (java.util)
An ordered collection (also known as a sequence). The user of this interface has precise control ove
IOUtils (org.apache.commons.io)
General IO stream manipulation utilities. This class provides static utility methods for input/outpu
Logger (org.slf4j)
The org.slf4j.Logger interface is the main user entry point of SLF4J API. It is expected that loggin
Top plugins for WebStorm

How to use OutputDocument in net.htmlparser.jericho

Best Java code snippets using net.htmlparser.jericho.OutputDocument (Showing top 20 results out of 315)

How to use
OutputDocument
in
net.htmlparser.jericho