[code=HTML] <DIV class=book_right>
<DIV id=author_>作 者: 萧言生 著 </DIV>
<DIV id=publisher_>出 版 社: 东方出版社 </DIV>
<UL class=nostyle>
<LI>出版时间: 2007-3-1 </LI>
<LI>字 数: 190000 </LI>
<LI>版 次: 1 </LI>
<LI>页 数: 266 </LI>
<LI>印刷时间: 2007/03/01 </LI>
<LI>开 本: </LI>
<LI>印 次: </LI>
<LI>纸 张: 胶版纸 </LI>
<LI>I S B N : 9787506021715 </LI>
<LI>包 装: 平装 </LI> </UL>
[code]
怎么利用htmlparser解析出本地的html来呢,并保存?本人菜鸟,要是有代码支持,立刻给分。
<DIV id=author_>作 者: 萧言生 著 </DIV>
<DIV id=publisher_>出 版 社: 东方出版社 </DIV>
<UL class=nostyle>
<LI>出版时间: 2007-3-1 </LI>
<LI>字 数: 190000 </LI>
<LI>版 次: 1 </LI>
<LI>页 数: 266 </LI>
<LI>印刷时间: 2007/03/01 </LI>
<LI>开 本: </LI>
<LI>印 次: </LI>
<LI>纸 张: 胶版纸 </LI>
<LI>I S B N : 9787506021715 </LI>
<LI>包 装: 平装 </LI> </UL>
[code]
怎么利用htmlparser解析出本地的html来呢,并保存?本人菜鸟,要是有代码支持,立刻给分。
你是要把html的文字内容取出来然后再匹配,还是要按标签去把数据取出来?..
我自己以前做这类东西是先把文字取出来再用正则匹配.
htmlparser的一些常用用法我自己又写过些类,你看看对你有用不,正则我有时间可以帮你看看..你先用这个试试自己写下吧.
import java.net.URL;
import java.net.URLConnection;import org.apache.log4j.Logger;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.beans.StringBean;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.tags.ImageTag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.tags.ScriptTag;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;/***/
/**
 * HtmlParser — fetches a web page over HTTP and extracts its plain text,
 * link URLs, image URLs and script code using the htmlparser library.
 *
 * @author shadowlin
 */
public class HtmlParser {
private static Logger logger = Logger.getLogger(HtmlParser.class);
private String url;
private static final String DEFAULT_ENCODING ="gb2312";
private String pageTime;
private String textContent;
private String strLinks;
private String strImgs;
private String strScriptCodes;
private String lastModifyDate;
private int contentSize;
private Page page;
private int readTimeout;
private int connectTimeout;
private int retryTimes;
/**
* Constructor
*/
public HtmlParser() {
this.connectTimeout=10000;
this.readTimeout=10000;
this.retryTimes=0;
}
/**
* Constructor
* @param url URL of page
* @param readTimeout set timeout of connect of HTTP connection unit is millisecond
* @param connectTimeout set timeout of read of HTTP connection unit is millisecond
* @throws UnreachablePageException
*/
public HtmlParser(String url,int connectTimeout,int readTimeout,int retryTimes) throws UnreachablePageException{
this.connectTimeout=connectTimeout;
this.readTimeout=readTimeout;
this.retryTimes=retryTimes;
this.url = url;
this.initWithHandleException();
}
/***/
/**
* Set page URL
* (if you want to set timeout please use setTimeOut()
* method first,or the timeout will be the default one)
* @param url URL of page
* @throws UnreachablPageException
*/
public void setUrl(String url) throws UnreachablePageException{
this.url = url;
this.initWithHandleException();
}
/**
* initialize and throw exception
* @throws UnreachablePageException
*/
private void initWithHandleException() throws UnreachablePageException{
boolean flag=true;
int counter=-1;
while(flag){
try {
this.initialize();
flag=false;
} catch (Exception e) {
counter++;
if(counter>=this.retryTimes){
flag=false;
String exceptionURL=this.url;
this.url=null;
throw new UnreachablePageException(exceptionURL,e.toString());
}
try{
Thread.sleep(2000);
}catch(Exception ee){
ee.printStackTrace();
}
}
}
}
* initialize
*/
private void initialize() throws Exception {
//set connection for stringbean
URLConnection sbConnection = null;
try {
URL mUrl = new URL(this.url);
sbConnection = (URLConnection) mUrl.openConnection();
sbConnection.setConnectTimeout(this.connectTimeout);
sbConnection.setReadTimeout(this.readTimeout);
this.lastModifyDate=sbConnection.getHeaderField("Last-Modified");
String strContentSize=sbConnection.getHeaderField("Content-Length");
if(strContentSize!=null){
contentSize=Integer.parseInt(strContentSize);
} else{
contentSize=0;
}
} catch (Exception e) {
logger.warn("Set connection failed");
logger.warn(e.toString());
}
//get text content
StringBean sb = new StringBean();
sb.setLinks(false);
sb.setReplaceNonBreakingSpaces(true);
sb.setCollapse(true);
sb.setConnection(sbConnection);
//format text content
textContent = sb.getStrings().replace("\r\n", "\n");
textContent = textContent.replace("\r", "\n");
textContent = textContent.replace("');", "");
//get links,image URLs,script codes
URLConnection pConnection = null;//set connection
try {
URL mUrl = new URL(this.url);
pConnection = (URLConnection) mUrl.openConnection();
pConnection.setConnectTimeout(this.connectTimeout);
pConnection.setReadTimeout(this.readTimeout);
} catch (Exception e) {
logger.warn("Set connection failed");
logger.warn(e.toString());
}
Parser parser = new Parser();
parser.setConnection(pConnection);
parser.setEncoding(DEFAULT_ENCODING);
NodeFilter linkFilter = new NodeClassFilter(LinkTag.class);
NodeFilter imgFilter = new NodeClassFilter(ImageTag.class);
NodeFilter scriptFilter = new NodeClassFilter(ScriptTag.class);
NodeList links = new NodeList();
NodeList imgs = new NodeList();
NodeList scripts = new NodeList();
NodeIterator nodeIterator = parser.elements();
StringBuffer sbLinks = new StringBuffer();
StringBuffer sbImgs = new StringBuffer();
StringBuffer sbScriptCodes = new StringBuffer();
while (nodeIterator.hasMoreNodes()) {
Node node = nodeIterator.nextNode();
node.collectInto(links, linkFilter);
node.collectInto(imgs, imgFilter);
node.collectInto(scripts, scriptFilter);
}
//get links
for (int i = 0; i < links.size(); i++) {
LinkTag linkTag = (LinkTag) links.elementAt(i);
sbLinks.append(linkTag.getLink() + "\n");//test
}
if(sbLinks!=null){
strLinks=sbLinks.toString();
}else{
strLinks=null;
}
//get image URLs
for (int i = 0; i < imgs.size(); i++) {
ImageTag imageTag = (ImageTag) imgs.elementAt(i);
sbImgs.append(imageTag.getImageURL() + "\n");
}
if(sbImgs!=null){
strImgs=sbImgs.toString();
}else{
strImgs=null;
}
//get script codes
for (int i = 0; i < scripts.size(); i++) {
ScriptTag scriptTag = (ScriptTag) scripts.elementAt(i);
sbScriptCodes.append(scriptTag.getScriptCode() + "\n");
}
if(sbScriptCodes!=null){
strScriptCodes=sbScriptCodes.toString();
}else{
strScriptCodes=null;
}
SBCAndDBCParser sbcAndDBCParser=new SBCAndDBCParser();
this.textContent=sbcAndDBCParser.toDBC(textContent);//parse all text into DBC
}
/***/
/**
* Get text from HTML
* @return Text
*/
public String getTextContent() {
if (url == null) {
logger.warn("please set url of page firset");
return null;
} else {
return this.textContent;
}
} /***/
/**
* Get all links
* @return Links(split with"\n")
*/
public String getPageLinks() {
if (url == null) {
logger.warn("please set url of page firset");
return null;
} else {
return this.strLinks;
}
} /***/
/**
* Get all image URLs
* @return Image URLs(split with"\n")
*/
public String getImgUrls() {
if (url == null) {
logger.warn("please set url of page firset");
return null;
} else {
return this.strImgs;
} } /***/
/**
* Get script codes
*
* @return Script codes
*/
public String getScriptCodes() {
if (url == null) {
logger.warn("please set url of page firset");
return null;
} else {
return this.strScriptCodes;
} } /***/
/**
* Get the time for parsing page
* @return time for parsing page
*/
public String getPageTime() {
if (page == null) {
logger.warn("got no page");
return null;
} else {
return this.pageTime;
}
} /***/
/**
* Seal page informations into Page object
* @return Page object
*/
public Page getPage() {
if (this.url == null) {
return null;
} else {
page = new Page();
page.setImgUrls(this.getImgUrls());
page.setLinks(this.getPageLinks());
page.setScriptCodes(this.getScriptCodes());
page.setText(this.getTextContent());
page.setUrl(this.url);
page.setLastModifyDate(this.lastModifyDate);
page.setContentSize(this.contentSize);
return page;
}
} public String getLastModifyDate() {
return lastModifyDate;
} public void setReadTimeout(int readTimeout) {
this.readTimeout = readTimeout;
} public void setConnectTimeout(int connectTimeout) {
this.connectTimeout = connectTimeout;
}
public void setRetryTimes(int retryTimes) {
this.retryTimes = retryTimes;
} public int getContentSize() {
return contentSize;
}
public static void main(String[] args){//test
HtmlParser htmlParser=new HtmlParser();
Page page=null;
try{
htmlParser.setRetryTimes(3);
htmlParser.setUrl("地址");
page=htmlParser.getPage();
}catch(Exception e){
e.printStackTrace();
}
System.out.println(page.getText());
}}
// NOTE(review): these are the fields of the Page bean; its "public class Page {"
// header is missing from this paste — restore it before compiling.
private String text; // plain text extracted from the page
private String links; // link URLs, one per line
private String imgUrls; // image URLs, one per line
private String url; // source URL of the page
private String scriptCodes; // script bodies, one per line
private String lastModifyDate; // "Last-Modified" response header, may be null
private int contentSize; // "Content-Length" header value, 0 when absent
/** @return the "Content-Length" header value recorded for this page */
public int getContentSize() {
return contentSize;
}
/** @param contentSize the "Content-Length" header value for this page */
public void setContentSize(int contentSize) {
this.contentSize = contentSize;
}
/** @return the "Last-Modified" response header, may be null */
public String getLastModifyDate() {
return lastModifyDate;
}
/** @param lastModifyDate the "Last-Modified" response header value */
public void setLastModifyDate(String lastModifyDate) {
this.lastModifyDate = lastModifyDate;
}
/** @return the page's script bodies, one per line */
public String getScriptCodes() {
return scriptCodes;
}
/** @param scriptCodes the page's script bodies, one per line */
public void setScriptCodes(String scriptCodes) {
this.scriptCodes = scriptCodes;
}
/** @return the plain text extracted from the page */
public String getText() {
return text;
}
/** @param text the plain text extracted from the page */
public void setText(String text) {
this.text = text;
}
/** @return the page's link URLs, one per line */
public String getLinks() {
return links;
}
/** @param links the page's link URLs, one per line */
public void setLinks(String links) {
this.links = links;
}
/** @return the page's image URLs, one per line */
public String getImgUrls() {
return imgUrls;
}
/** @param imgUrls the page's image URLs, one per line */
public void setImgUrls(String imgUrls) {
this.imgUrls = imgUrls;
}
/** @return the source URL of the page */
public String getUrl() {
return url;
}
/** @param url the source URL of the page */
public void setUrl(String url) {
this.url = url;
}
}
// 异常的定义 (definition of the exception)
/**
 * Thrown when a page cannot be fetched from its URL after all retry attempts.
 * Carries the offending URL and a description of the underlying failure.
 */
public class UnreachablePageException extends Exception {

    // Exception implements Serializable; pin the version to avoid
    // deserialization breakage across recompiles.
    private static final long serialVersionUID = 1L;

    /** URL that could not be reached. */
    protected String url;
    /** Description of the underlying failure. */
    protected String reason;

    /**
     * @param url    the unreachable URL
     * @param reason description of the underlying failure (e.g. the cause's toString)
     */
    public UnreachablePageException(String url, String reason) {
        super("Can not get page from this url " + url + "\n" + reason);
        this.reason = reason;
        this.url = url;
    }

    /** @return the URL that could not be reached */
    public String getExceptionUrl() {
        return this.url;
    }

    /** @return the description of the underlying failure */
    public String getReason() {
        return this.reason;
    }
}
楼主要解析成什么样子的,能说清楚些吗
你是要把html的文字内容取出来然后再匹配,还是要按标签去把数据取出来?..
我自己以前做这类东西是先把文字取出来再用正则匹配.
htmlparser的一些常用用法我自己又写过些类,你看看对你有用不,正则我有时间可以帮你看看..你先用这个试试自己写下吧.