这是我写的一个解析HTML的类,想要实现的是去掉所有的标签只获取其中的文本内容。但是现在出现了几个个问题
1.文本中的空格符解析不了,打印出来的时候会变成问号?
2.有的地方加的空格太多了,怎么把一串空格变成一个空格?
3.当调用以下类的时候:parser = new ParserDelegator; parser.parse(r,callback,ignoreCharSet);当把ignoreCharSet设成false时会抛出异常,设成true时有些网页下下来又会变成乱码?望高手帮忙解决?
package hkp.spider.parse;import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Enumeration;import javax.swing.text.AttributeSet;
import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTML.Tag;
import javax.swing.text.html.HTMLEditorKit.ParserCallback;/**
* 实现页面解析,获取页面中的链接,获取页面的标题,获取不含标签的页面文本
* 重写ParserCallback中的几个方法
*
* @作者:HKP
* @创建日期:2010-4-20
*
* @版本:V0.1
*/
public class PageParser extends ParserCallback {
/*
* 返回解析得到的地址
*/
private ArrayList<URL> urllist = new ArrayList<URL>();
/*
* 要解析的网页的基地址,用于把标签中不完整的地址装配完整
*/
private URL base;
/*
* 存储此HTML的标题,用于返回
*/
private StringBuffer title;
/*
* 存储此HTML去掉标签后的文本,用于返回
*/
private StringBuffer body;
/*
* 控制解析器以获取标签内的内容
*/
private boolean titleStatue = false;
private boolean isStyle = false;
private boolean isScript = false;
public PageParser(String base,ArrayList<URL> urllist){
this.urllist = urllist;
try {
this.base = new URL(base);
} catch (MalformedURLException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
public PageParser(String base,ArrayList<URL> urllist,StringBuffer title,StringBuffer body){
this.urllist = urllist;
this.title = title;
this.body = body;
try {
this.base = new URL(base);
} catch (MalformedURLException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
/**
* 从handleStartTag()方法分析得到的开始标签集中,
* 选择特定的标签,获取其中和URL有关的参数,装配成
* 完整的URL
*/
private void getUrlList(AttributeSet attributes){
Enumeration e = attributes.getAttributeNames();
while(e.hasMoreElements()){
Object name = e.nextElement();
Object v = attributes.getAttribute(name);
String value = null;
if(v.getClass()==String.class){
value = (String)v;
value = value.toLowerCase();
if(!value.contains("javascript:")){
try {
//获取几个特定标签中的值
if(name == HTML.Attribute.HREF ||
name == HTML.Attribute.SRC ||
name == HTML.Attribute.LOWSRC){
//装配成完整的URL
URL u = new URL(base,value);
urllist.add(u);
}
//此处留待处理input标签
//
//
//
//
} catch (MalformedURLException e1) {
// TODO Auto-generated catch block
//System.err.println(e1);
System.err.println("解析页面错误!");
System.err.println("基地址是:"+base);
System.err.println("值是:"+value);
System.out.println();
//e1.printStackTrace();
}
}
}
}
}
/* (non-Javadoc)
* @see javax.swing.text.html.HTMLEditorKit.ParserCallback#handleStartTag(javax.swing.text.html.HTML.Tag, javax.swing.text.MutableAttributeSet, int)
*/
@Override
public void handleStartTag(Tag t, MutableAttributeSet a, int pos) {
// TODO Auto-generated method stub
this.getUrlList(a);
if(t == HTML.Tag.TITLE)
if(!this.titleStatue)
this.titleStatue = true;
if(t == HTML.Tag.STYLE)
if(!this.isScript)
this.isStyle = true;
if(t == HTML.Tag.SCRIPT)
if(!this.isScript)
this.isScript = true;
}
/* (non-Javadoc)
* @see javax.swing.text.html.HTMLEditorKit.ParserCallback#handleEndTag(javax.swing.text.html.HTML.Tag, int)
*/
@Override
public void handleEndTag(Tag t, int pos) {
// TODO Auto-generated method stub
if(t == HTML.Tag.TITLE)
if(this.titleStatue)
this.titleStatue = false;
if(t == HTML.Tag.STYLE)
if(this.isStyle)
this.isStyle = false;
if(t == HTML.Tag.SCRIPT)
if(this.isScript)
this.isScript = false;
if(t.isBlock()){
body.append(" ");
}else if(t.breaksFlow()){
body.append(" ");
}
}
/* (non-Javadoc)
* @see javax.swing.text.html.HTMLEditorKit.ParserCallback#handleSimpleTag(javax.swing.text.html.HTML.Tag, javax.swing.text.MutableAttributeSet, int)
*/
@Override
public void handleSimpleTag(Tag t, MutableAttributeSet a, int pos) {
// TODO Auto-generated method stub
if(t.isBlock()){
body.append(" ");
}
else if(t.breaksFlow()){
body.append(" ");
}
else {
body.append(" ");
}
}
/* (non-Javadoc)
* @see javax.swing.text.html.HTMLEditorKit.ParserCallback#handleText(char[], int)
*/
@Override
public void handleText(char[] data, int pos) {
// TODO Auto-generated method stub
if(this.titleStatue){
title.append(data);
}
if(!this.isStyle&&!this.isScript)
body.append(data);
}
}
1.文本中的空格符解析不了,打印出来的时候会变成问号?
2.有的地方加的空格太多了,怎么把一串空格变成一个空格?
3.当调用以下类的时候:parser = new ParserDelegator; parser.parse(r,callback,ignoreCharSet);当把ignoreCharSet设成false时会抛出异常,设成true时有些网页下下来又会变成乱码?望高手帮忙解决?
package hkp.spider.parse;import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Enumeration;import javax.swing.text.AttributeSet;
import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTML.Tag;
import javax.swing.text.html.HTMLEditorKit.ParserCallback;/**
* 实现页面解析,获取页面中的链接,获取页面的标题,获取不含标签的页面文本
* 重写ParserCallback中的几个方法
*
* @作者:HKP
* @创建日期:2010-4-20
*
* @版本:V0.1
*/
public class PageParser extends ParserCallback {
/*
* 返回解析得到的地址
*/
private ArrayList<URL> urllist = new ArrayList<URL>();
/*
* 要解析的网页的基地址,用于把标签中不完整的地址装配完整
*/
private URL base;
/*
* 存储此HTML的标题,用于返回
*/
private StringBuffer title;
/*
* 存储此HTML去掉标签后的文本,用于返回
*/
private StringBuffer body;
/*
* 控制解析器以获取标签内的内容
*/
private boolean titleStatue = false;
private boolean isStyle = false;
private boolean isScript = false;
public PageParser(String base,ArrayList<URL> urllist){
this.urllist = urllist;
try {
this.base = new URL(base);
} catch (MalformedURLException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
public PageParser(String base,ArrayList<URL> urllist,StringBuffer title,StringBuffer body){
this.urllist = urllist;
this.title = title;
this.body = body;
try {
this.base = new URL(base);
} catch (MalformedURLException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
/**
* 从handleStartTag()方法分析得到的开始标签集中,
* 选择特定的标签,获取其中和URL有关的参数,装配成
* 完整的URL
*/
private void getUrlList(AttributeSet attributes){
Enumeration e = attributes.getAttributeNames();
while(e.hasMoreElements()){
Object name = e.nextElement();
Object v = attributes.getAttribute(name);
String value = null;
if(v.getClass()==String.class){
value = (String)v;
value = value.toLowerCase();
if(!value.contains("javascript:")){
try {
//获取几个特定标签中的值
if(name == HTML.Attribute.HREF ||
name == HTML.Attribute.SRC ||
name == HTML.Attribute.LOWSRC){
//装配成完整的URL
URL u = new URL(base,value);
urllist.add(u);
}
//此处留待处理input标签
//
//
//
//
} catch (MalformedURLException e1) {
// TODO Auto-generated catch block
//System.err.println(e1);
System.err.println("解析页面错误!");
System.err.println("基地址是:"+base);
System.err.println("值是:"+value);
System.out.println();
//e1.printStackTrace();
}
}
}
}
}
/* (non-Javadoc)
* @see javax.swing.text.html.HTMLEditorKit.ParserCallback#handleStartTag(javax.swing.text.html.HTML.Tag, javax.swing.text.MutableAttributeSet, int)
*/
@Override
public void handleStartTag(Tag t, MutableAttributeSet a, int pos) {
// TODO Auto-generated method stub
this.getUrlList(a);
if(t == HTML.Tag.TITLE)
if(!this.titleStatue)
this.titleStatue = true;
if(t == HTML.Tag.STYLE)
if(!this.isScript)
this.isStyle = true;
if(t == HTML.Tag.SCRIPT)
if(!this.isScript)
this.isScript = true;
}
/* (non-Javadoc)
* @see javax.swing.text.html.HTMLEditorKit.ParserCallback#handleEndTag(javax.swing.text.html.HTML.Tag, int)
*/
@Override
public void handleEndTag(Tag t, int pos) {
// TODO Auto-generated method stub
if(t == HTML.Tag.TITLE)
if(this.titleStatue)
this.titleStatue = false;
if(t == HTML.Tag.STYLE)
if(this.isStyle)
this.isStyle = false;
if(t == HTML.Tag.SCRIPT)
if(this.isScript)
this.isScript = false;
if(t.isBlock()){
body.append(" ");
}else if(t.breaksFlow()){
body.append(" ");
}
}
/* (non-Javadoc)
* @see javax.swing.text.html.HTMLEditorKit.ParserCallback#handleSimpleTag(javax.swing.text.html.HTML.Tag, javax.swing.text.MutableAttributeSet, int)
*/
@Override
public void handleSimpleTag(Tag t, MutableAttributeSet a, int pos) {
// TODO Auto-generated method stub
if(t.isBlock()){
body.append(" ");
}
else if(t.breaksFlow()){
body.append(" ");
}
else {
body.append(" ");
}
}
/* (non-Javadoc)
* @see javax.swing.text.html.HTMLEditorKit.ParserCallback#handleText(char[], int)
*/
@Override
public void handleText(char[] data, int pos) {
// TODO Auto-generated method stub
if(this.titleStatue){
title.append(data);
}
if(!this.isStyle&&!this.isScript)
body.append(data);
}
}
解决方案 »
免费领取超大流量手机卡,每月29元包185G流量+100分钟通话, 中国电信官方发货