import java.util.Vector;
import org.htmlparser.*;
// DelHtml (declared on the next line, after the import): demo that builds a small
// HTML string, parses it with HTMLParser, tries to strip the tags named in Tags
// and the JS event attributes named in Event, and prints the HTML before and after.
import org.htmlparser.util.*;public class DelHtml { public static void main(String[] args) throws ParserException {
// Sample input: a <script> element, a <p><font> element, an <ifream> tag and a <link> tag.
StringBuffer sbHtml = new StringBuffer(); sbHtml.append("<script>alert(\"123\");</script>");
sbHtml
.append("<p><font face=\"宋\" color=\"red\"size=\"30\">font1</font></p>");
sbHtml.append("<ifream>");
sbHtml
.append("<link rel=\"stylesheet\" type=\"text/css\" href=\"head.css\"/>"); System.out.println("*&&&&&&&START&&&&&&&&&*");
System.out.println(sbHtml.toString());
System.out.println("*&&&&&&&START&&&&&&&&&*");
// Parse the string with no node filter (null), then filter the resulting list in place.
Parser parser = Parser.createParser(sbHtml.toString(), "utf-8");
NodeList list = parser.parse(null);
visitNodeList(list);
System.out.println("*&&&&&&&END&&&&&&&&&*");
System.out.println(list.toHtml());
System.out.println("*&&&&&&&END&&&&&&&&&*");
} // Recursive traversal
// Walks `list` by index. For each Tag node: removes it from `list` when its tag name
// starts with one of the entries in Tags, then calls visitTag on it. For every node
// (Tag or not) it then recurses into the node's children, if any.
// NOTE(review): list.remove(i) is not followed by any index adjustment. Presumably
// remove(i) shifts the following elements down by one, in which case the element
// that moves into slot i is skipped by the loop's i++ and is neither filtered nor
// visited. The removed node is also still passed to visitTag and still has its
// children traversed.
private static void visitNodeList(NodeList list) {
for (int i = 0; i < list.size(); i++) {
Node node = list.elementAt(i); if (node instanceof Tag) {
Tag _tag = (Tag) node;
String tagName = _tag.getTagName();
for (String filterTag : Tags) {
// indexOf(...) == 0 is a prefix test: the tag name must start with filterTag.
if (tagName != null && tagName.indexOf(filterTag) == 0)
list.remove(i);
}
visitTag((Tag) node);
} NodeList children = node.getChildren();
if (children != null && children.size() > 0)
visitNodeList(children); }
} // Visit a tag
// Iterates over the vector returned by tag.getAttributesEx() and hands every
// element that is an Attribute to visitAttribute, together with the owning tag.
private static void visitTag(Tag tag) {
Vector attrs = tag.getAttributesEx();
for (int i = 0; i < attrs.size(); i++) {
Object obj = attrs.elementAt(i);
if (obj != null && obj instanceof Attribute) {
visitAttribute((Attribute) obj, tag);
}
}
} // Visit a tag attribute
// Prints the attribute's name and value between marker lines, then removes the
// attribute from `tag` when its name equals (ignoring case) one of the
// comma-separated names in Event.
private static void visitAttribute(Attribute attribute, Tag tag) {
String attName = attribute.getName();
System.out.println("********visitAttribute*********");
System.out.println(attName);
System.out.println(attribute.getValue());
System.out.println("********visitAttribute*********");
String[] jsEvents = Event.split(",");
// Remove JS event-handler attributes
for (String jsEvent : jsEvents) {
if (jsEvent.equalsIgnoreCase(attName))
tag.removeAttribute(jsEvent);
// The next line closes the loop and the method, then declares Tags (tag-name
// prefixes to remove) and Event (comma-separated JS event attribute names),
// and closes the class.
} } private static final String Tags[] = { "SCRIPT", "IFREAM", "LINK" }; private static final String Event = "onClick";}上面的字符串是
<script>alert("123");</script>
<p><font face="宋" color="red"size="30">font1</font></p>
<ifream>
<link rel="stylesheet" type="text/css" href="head.css"/>我想得到结果:<p><font face="宋" color="red"size="30">font1</font></p>
我在
if (tagName != null && tagName.indexOf(filterTag) == 0)
list.remove(i);
这里把名字匹配 ifream、script、link 的 tag 都 remove 了!
可实际得到结果是
<p><font face="宋" color="red"size="30">font1</font></p>
<link rel="stylesheet" type="text/css" href="head.css"/>
而且如果我不去掉 list.remove(i); 这句,很多属性就打印不出来,如:
********visitAttribute*********
face
宋
********visitAttribute*********
如果我去掉list.remove(i);就可以打印出上面的属性!其实我只是想把tag名字是 "SCRIPT", "IFREAM", "LINK"的tag都整个删除而已!
是不是不能list.remove(i);这样删除啊?应该怎么做呢?才能用htmlparser实现字符串是
<script>alert("123");</script>
<p><font face="宋" color="red"size="30">font1</font></p>
<ifream>
<link rel="stylesheet" type="text/css" href="head.css"/>我想得到结果:<p><font face="宋" color="red"size="30">font1</font></p>
2. 用 String.replace("1235234", ""); 把不要的去掉
list.remove(i);//元素删除了之后,之后的所有元素index都会减1.
i--;
}
还有很多其他需求必须用htmlparser啊!
import org.htmlparser.util.*;
这个包是做什么的....不懂惭愧呀!
以及接口什么的啊必须用htmlparser啊
以及接口什么的啊必须用htmlparser啊
那就其他需求你在用你的htmlParser吧,至少我觉得你这个需求用正则会很方便
可以用下OrFilter
// Fragment: builds an OrFilter from a list of tag class names read from a config
// file and parses a URL with it. parser, urlStr, ConfigManager and TAGSFILE are
// declared outside this snippet.
parser = new Parser();
parser.setURL(urlStr);
OrFilter lastFilter = new OrFilter();
List parseTagsList = null;
try {
// Each entry is later passed to Class.forName, so entries are expected to be
// fully qualified class names.
parseTagsList = ConfigManager.readTags(TAGSFILE);
} catch (IOException e) {
// NOTE(review): the failure is only printed; parseTagsList stays null, so the
// parseTagsList.size() call below would throw NullPointerException.
e.printStackTrace();
}
// One NodeClassFilter per configured class name.
NodeFilter[] nodeFilterArrays = new NodeFilter[parseTagsList.size()];
for (int i = 0; i < nodeFilterArrays.length; i++) {
try {
nodeFilterArrays[i] = new NodeClassFilter(Class.forName(parseTagsList.get(i).toString()));
} catch (ClassNotFoundException e) {
// NOTE(review): an unknown class name leaves nodeFilterArrays[i] null; confirm
// that OrFilter tolerates null predicates.
e.printStackTrace();
}
}
// Install the per-class filters as the OrFilter's predicates and parse with it.
lastFilter.setPredicates(nodeFilterArrays);
NodeList nodelist = parser.parse(lastFilter);
这是配置文件的内容
#Below is the tag type list which I will parse
#Please be careful not to add any new tags
#You can only enable it or disable it(#)
#For example, if you do want to ignore the div part, then just add "#" before Div line
org.htmlparser.tags.Bullet
org.htmlparser.tags.BulletList
org.htmlparser.tags.Div
replaceAll("<link " + "(.*?)" + "/>", "");应该可以把那个link去掉吧
import java.util.Vector;
import org.htmlparser.*;
import org.htmlparser.util.*;

/**
 * Demo that strips unwanted tags and inline JavaScript event attributes from an
 * HTML fragment using HTMLParser.
 *
 * <p>The fragment is parsed into a {@code NodeList}, the list is filtered in
 * place by {@link #visitNodeList(NodeList)}, and the remaining HTML is printed.
 */
public class DelHtml {

    /**
     * Tag-name prefixes to delete. Compared against {@code Tag.getTagName()} with
     * a prefix test, as in the original code. "IFREAM" matches the (misspelled)
     * {@code <ifream>} tag in the sample input; a real iframe would need "IFRAME".
     */
    private static final String[] FILTERED_TAGS = { "SCRIPT", "IFREAM", "LINK" };

    /** Attribute names (matched case-insensitively) that are removed from every kept tag. */
    private static final String[] JS_EVENTS = "onClick".split(",");

    /**
     * Builds the sample HTML, filters it, and prints the result.
     *
     * @param args unused
     * @throws ParserException if HTMLParser fails to parse the sample string
     */
    public static void main(String[] args) throws ParserException {
        StringBuilder sbHtml = new StringBuilder();
        sbHtml.append("<script>alert(\"123\");</script>");
        sbHtml.append("<p><font face=\"宋\" color=\"red\"size=\"30\">font1</font></p>");
        sbHtml.append("<ifream>");
        sbHtml.append("<link rel=\"stylesheet\" type=\"text/css\" href=\"head.css\"/>");

        Parser parser = Parser.createParser(sbHtml.toString(), "utf-8");
        NodeList list = parser.parse(null);
        visitNodeList(list);
        System.out.println(list.toHtml());
    }

    /**
     * Recursively filters {@code list} in place: tags whose name starts with one
     * of {@link #FILTERED_TAGS} are removed entirely; every other tag has its JS
     * event attributes stripped and its children filtered the same way.
     *
     * @param list the node list to filter; {@code null} is treated as empty
     */
    private static void visitNodeList(NodeList list) {
        if (list == null) {
            return;
        }
        int i = 0;
        while (i < list.size()) {
            Node node = list.elementAt(i);
            if (node instanceof Tag) {
                Tag tag = (Tag) node;
                if (isFilteredTag(tag)) {
                    // Removing shifts the following nodes down by one, so the next
                    // node is now at index i: do not advance. The removed node is
                    // dropped as a whole, so neither its attributes nor its
                    // children need any further processing.
                    list.remove(i);
                    continue;
                }
                visitTag(tag);
            }
            NodeList children = node.getChildren();
            if (children != null && children.size() > 0) {
                visitNodeList(children);
            }
            i++;
        }
    }

    /** Returns true when the tag's name starts with any entry of {@link #FILTERED_TAGS}. */
    private static boolean isFilteredTag(Tag tag) {
        String tagName = tag.getTagName();
        if (tagName == null) {
            return false;
        }
        for (String filterTag : FILTERED_TAGS) {
            if (tagName.startsWith(filterTag)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Passes every attribute of {@code tag} to
     * {@link #visitAttribute(Attribute, Tag)}.
     *
     * @param tag the tag whose attributes are inspected
     */
    private static void visitTag(Tag tag) {
        Vector<?> attrs = tag.getAttributesEx();
        if (attrs == null) {
            return;
        }
        // Iterate over a snapshot: visitAttribute may remove attributes from the
        // tag, and walking the live vector by index would then skip the element
        // that slides into the freed slot.
        Object[] snapshot = attrs.toArray();
        for (Object obj : snapshot) {
            if (obj instanceof Attribute) {
                visitAttribute((Attribute) obj, tag);
            }
        }
    }

    /**
     * Removes {@code attribute} from {@code tag} when its name matches one of
     * {@link #JS_EVENTS}, ignoring case.
     *
     * @param attribute the attribute being inspected
     * @param tag       the tag that owns the attribute
     */
    private static void visitAttribute(Attribute attribute, Tag tag) {
        String attName = attribute.getName();
        for (String jsEvent : JS_EVENTS) {
            if (jsEvent.equalsIgnoreCase(attName)) {
                tag.removeAttribute(jsEvent);
            }
        }
    }
}