谁有用java写的html全解析程序. DOM已经支持html解析,Xerces也已经提供了DOM3的实现。BTW:no pains, no gains 解决方案 » 免费领取超大流量手机卡,每月29元包185G流量+100分钟通话, 中国电信官方发货 ^_^ 有现成的,自己搜索去吧.no search ,no gains:) import java.io.*;import org.w3c.tidy.Tidy;/** * HTML文件整理 * * 从Test16.java修改而来 * * 运行参数 * java Test17 <html_file> <new_html_file> <log_file> <config_file> * */public class Test17 implements Runnable { private String srcFileName; private String outFileName; private String errOutFileName; private String configFileName; public Test17(String srcFileName, String outFileName, String errOutFileName, String confName) { this.srcFileName = srcFileName; this.outFileName = outFileName; this.errOutFileName = errOutFileName; this.configFileName= confName; } public void run() { BufferedInputStream in; FileOutputStream out; Tidy tidy = new Tidy(); tidy.setConfigurationFromFile(configFileName); try { tidy.setErrout(new PrintWriter(new FileWriter(errOutFileName), true)); in = new BufferedInputStream(new FileInputStream(srcFileName)); out = new FileOutputStream(outFileName); tidy.parseDOM(in, out); } catch (IOException e) { System.out.println(this.toString() + e.toString()); } } public static void main(String[] args) { Test17 t1 = new Test17(args[0], args[1], args[2], args[3]); Thread th1 = new Thread(t1); th1.start(); }} 配置文件和错误日志可以为空! 下面是配置文件参考://JTidy设置文件参考如下:// sample config file for HTML tidyindent: autoindent-spaces: 4wrap: 72up: yesoutput-xml: yesinput-xml: noshow-warnings: yesnumeric-entities: yesquote-s: yesquote-nbsp: yesquote-ampersand: nobreak-before-br: nouppercase-tags: nouppercase-attributes: nochar-encoding: rawnew-inline-tags: cfif, cfelse, math, mroot, mrow, mi, mn, mo, msqrt, mfrac, msubsup, munderover,munder, mover, mmultiscripts, msup, msub, mtext,mprescripts, mtable, mtr, mtd, mthnew-blocklevel-tags: cfoutput, cfquerynew-empty-tags: cfelse 将html转换成DOM树以后,就可以用任何XML解析器进行解析了! 
比如说遍历此html文档(当然是转换后的,也就是XHtml):Document document = tidy.parseDOM( in,out );travers( document );/** * Traverse DOM Tree * */public void traverse( Node cNode ) { String eleName = null; switch( cNode.getNodeType() ) { case Node.DOCUMENT_NODE: System.out.println( "Element " + cNode.getNodeName() ); processChildren( cNode.getChildNodes() ); break; case Node.ELEMENT_NODE: eleName = cNode.getNodeName(); System.out.println("Element " + eleName); NamedNodeMap attributeMap = cNode.getAttributes(); int numAttrs = attributeMap.getLength(); for(int i = 0; i < attributeMap.getLength(); i++ ) { Attr attribute = (Attr)attributeMap.item(i); String attrName = attribute.getNodeName(); String attrValue = attribute.getNodeValue(); System.out.println( attrName + " = " + attrValue ); } processChildren(cNode.getChildNodes()); break; case Node.CDATA_SECTION_NODE: case Node.TEXT_NODE: System.out.println( "Text " + cNode.getNodeValue() ); if( !cNode.getNodeValue().trim().equals("") ) { System.out.println( "eleName " + eleName ); System.out.println( "Text " + cNode.getNodeValue() ); } break; }}private void processChildren( NodeList nList ){ if( nList.getLength()!=0 ){ for( int i = 0; i < nList.getLength(); i++ ) { traverse( nList.item(i) ); } }} 标准的JDK里面就肯定有,理由:javax.swing.JComponent本身就能够显示(相对简单的,符合w3c规范的)HTML,具体是哪个class作为Parser不太清楚,但肯定在jdk内 see javax.swing.text.html.parser.DocumentParser 上面javax.swing.JComponent说错了,是JEditorPane 关于字符串分割的问题 正则表达式截取字符串问题 怎么经常说找不到符号啊? 如何建立不同的Node实例 非法表达的错误 取得当天日期最简单方法&&字符串默认值设置 如何去掉JTree节点的图标? 在txt文件中写入中文正常但是读取出来的时候是乱码 java中建立一个List的效率/性能问题 如何在数据库表的中间插入一条语句?(指定id) 请问在java中农历如何转换为新历 请教关于jlist的问题
import java.io.BufferedInputStream;
import java.io.Closeable;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.PrintWriter;

import org.w3c.tidy.Tidy;
/**
 * Tidies an HTML file with JTidy.
 *
 * Adapted from Test16.java.
 *
 * Command line:
 *   java Test17 <html_file> <new_html_file> <log_file> <config_file>
 */
public class Test17 implements Runnable {
    private String srcFileName;
    private String outFileName;
    private String errOutFileName;
    private String configFileName;

    /**
     * Stores the file names used by {@link #run()}.
     *
     * @param srcFileName    path of the HTML file to read
     * @param outFileName    path the tidied output is written to
     * @param errOutFileName path of the file that receives Tidy's error output
     * @param confName       path of the Tidy configuration file
     */
    public Test17(String srcFileName, String outFileName, String errOutFileName,
            String confName) {
        this.srcFileName = srcFileName;
        this.outFileName = outFileName;
        this.errOutFileName = errOutFileName;
        this.configFileName = confName;
    }

    /**
     * Loads the configuration file, parses the source file with Tidy and
     * writes the result to the output file. An {@link IOException} raised
     * while opening the files is reported on standard output. All streams
     * opened here are closed before the method returns.
     */
    public void run() {
        Tidy tidy = new Tidy();
        tidy.setConfigurationFromFile(configFileName);

        PrintWriter errOut = null;
        InputStream in = null;
        OutputStream out = null;
        try {
            errOut = new PrintWriter(new FileWriter(errOutFileName), true);
            tidy.setErrout(errOut);
            in = new BufferedInputStream(new FileInputStream(srcFileName));
            out = new FileOutputStream(outFileName);
            tidy.parseDOM(in, out);
        } catch (IOException e) {
            System.out.println(this.toString() + e.toString());
        } finally {
            // Close in a finally block so the handles are released (and the
            // output flushed) even when opening a later file or parsing fails.
            closeQuietly(in);
            closeQuietly(out);
            closeQuietly(errOut);
        }
    }

    /** Closes the resource if it was opened; a failure while closing is ignored. */
    private static void closeQuietly(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException ignored) {
            // Nothing useful can be done about a failed close at this point.
        }
    }

    /**
     * Runs one tidy job on a new thread.
     *
     * @param args html_file, new_html_file, log_file, config_file (in that order)
     */
    public static void main(String[] args) {
        if (args.length < 4) {
            System.err.println(
                    "Usage: java Test17 <html_file> <new_html_file> <log_file> <config_file>");
            return;
        }
        Test17 t1 = new Test17(args[0], args[1], args[2], args[3]);
        Thread th1 = new Thread(t1);
        th1.start();
    }
}
下面是配置文件参考:
//JTidy设置文件参考如下:
// sample config file for HTML tidy
indent: auto
indent-spaces: 4
wrap: 72
markup: yes
output-xml: yes
input-xml: no
show-warnings: yes
numeric-entities: yes
quote-marks: yes
quote-nbsp: yes
quote-ampersand: no
break-before-br: no
uppercase-tags: no
uppercase-attributes: no
char-encoding: raw
new-inline-tags: cfif, cfelse, math, mroot,
mrow, mi, mn, mo, msqrt, mfrac, msubsup, munderover,
munder, mover, mmultiscripts, msup, msub, mtext,
mprescripts, mtable, mtr, mtd, mth
new-blocklevel-tags: cfoutput, cfquery
new-empty-tags: cfelse
// Usage: parse the HTML into a DOM document, then walk it.
Document document = tidy.parseDOM( in,out );
traverse( document );

/**
 * Walks the DOM subtree rooted at the given node and prints it to standard
 * output.
 *
 * Document and element nodes are printed as "Element <name>"; an element's
 * attributes follow as "<name> = <value>" lines, and then the children are
 * visited recursively. Text and CDATA nodes are printed as "Text <value>";
 * when the text is not blank, the name of the enclosing node and the text
 * are printed as well. Other node types are ignored.
 *
 * @param cNode the node to start from
 */
public void traverse( Node cNode ) {
    switch( cNode.getNodeType() ) {
    case Node.DOCUMENT_NODE:
        System.out.println( "Element " + cNode.getNodeName() );
        processChildren( cNode.getChildNodes() );
        break;
    case Node.ELEMENT_NODE:
        System.out.println( "Element " + cNode.getNodeName() );
        NamedNodeMap attributeMap = cNode.getAttributes();
        for( int i = 0; i < attributeMap.getLength(); i++ ) {
            Node attribute = attributeMap.item(i);
            System.out.println( attribute.getNodeName() + " = " + attribute.getNodeValue() );
        }
        processChildren( cNode.getChildNodes() );
        break;
    case Node.CDATA_SECTION_NODE:
    case Node.TEXT_NODE:
        String text = cNode.getNodeValue();
        System.out.println( "Text " + text );
        if( text != null && text.trim().length() > 0 ) {
            // The enclosing element is this node's parent. A local variable
            // set in the ELEMENT_NODE branch belongs to a different call and
            // would always be null here.
            Node parent = cNode.getParentNode();
            String eleName = ( parent != null ) ? parent.getNodeName() : null;
            System.out.println( "eleName " + eleName );
            System.out.println( "Text " + text );
        }
        break;
    }
}

/**
 * Calls {@link #traverse(Node)} on every node of the list, in list order.
 *
 * @param nList the child nodes to visit; an empty list is a no-op
 */
private void processChildren( NodeList nList ) {
    for( int i = 0; i < nList.getLength(); i++ ) {
        traverse( nList.item(i) );
    }
}