怎么使用以下HTML抽取信息? 怎么 解析本地的 网页信息 ,并储存为txt文本信息求高手帮助 解决方案 » 免费领取超大流量手机卡,每月29元包185G流量+100分钟通话, 中国电信官方发货 主要 提取 //书名 String title; //作者 String author; // 价格 String price; // 优惠价 String dprice1; // vip价 String dprice2; // // ISBN String isbn; // String publish; String publishdate; String content; 图片连接 String imgurl; 网页连接// String url; 类型 String type; //页数 String pagecount; JAVA有很多解析HMTL的框架,GOOGLE下。 htmlparser对吧 但问题没解决呀 package com.mysearch.parser; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.File; import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; import org.htmlparser.Node; import org.htmlparser.NodeFilter; import org.htmlparser.Parser; import org.htmlparser.filters.AndFilter; import org.htmlparser.filters.HasAttributeFilter; import org.htmlparser.filters.RegexFilter; import org.htmlparser.filters.StringFilter; import org.htmlparser.filters.TagNameFilter; import org.htmlparser.util.NodeList; import org.htmlparser.util.ParserException; public class ParserFor2 { public static void main(String args[]) { String NEWLINE = "\r\n"; String SEPARATOR = "================================"; String title; String author; // �г��۸� String price; // String dprice1; // String dprice2; // // ISBN String isbn; // ����� String publish; String publishdate; String content; String imgurl; String url; String type; String pagecount; String url1; try { BufferedReader br = new BufferedReader(new FileReader("D:\\tt.txt")); String line; BufferedWriter bw = null; while (null != (line = br.readLine())) { try { StringBuffer t1 = new StringBuffer(); Parser parser = new Parser(line); parser.setEncoding("gb2312"); NodeFilter filter_title = new AndFilter(new TagNameFilter( "a"), new HasAttributeFilter("name", "top_bk")); NodeList nodelist = parser.parse(filter_title); Node node_title = nodelist.elementAt(0); title = node_title.getNextSibling().getNextSibling() .getText(); parser.reset(); NodeFilter filter_author = new AndFilter(new TagNameFilter( 
"div"), new HasAttributeFilter("id", "author_")); nodelist = parser.parse(filter_author); Node node_author = nodelist.elementAt(0); author = node_author.getChildren().asString().substring(6); parser.reset(); NodeFilter filter_publish = new AndFilter( new TagNameFilter("div"), new HasAttributeFilter( "id", "publisher_")); nodelist = parser.parse(filter_publish); Node node_publish = nodelist.elementAt(0); publish = node_publish.getChildren().asString() .substring(7); parser.reset(); NodeFilter filter_isbn = new RegexFilter( "I S B N �� \\d\\d\\d\\d\\d\\d\\d\\d\\d\\d"); nodelist = parser.parse(filter_isbn); Node node_isbn = nodelist.elementAt(0); isbn = node_isbn.getText().substring(13); parser.reset(); NodeFilter filter_publishdate = new StringFilter("���ʱ�䣺"); nodelist = parser.parse(filter_publishdate); Node node_publishdate = nodelist.elementAt(1); publishdate = node_publishdate.getText().substring(6); parser.reset(); NodeFilter filter_type = new StringFilter(">> "); nodelist = parser.parse(filter_type); Node node_type = nodelist.elementAt(0); type = node_type.getNextSibling().getChildren().asString(); parser.reset(); NodeFilter filter_imgurl = new AndFilter(new TagNameFilter( "img"), new HasAttributeFilter("id", "img_show_prd")); nodelist = parser.parse(filter_imgurl); Node node_imgurl = nodelist.elementAt(0); imgurl = node_imgurl.getText(); imgurl = imgurl.substring(imgurl.indexOf("images") - 1, imgurl.indexOf(".jpg") + 4); parser.reset(); NodeFilter filter_price = new StringFilter("���ۣ�"); nodelist = parser.parse(filter_price); Node node_price = nodelist.elementAt(0); price = node_price.getText().substring(4); parser.reset(); NodeFilter filter_dprice1 = new StringFilter("�����ۣ�"); nodelist = parser.parse(filter_dprice1); Node node_dprice1 = nodelist.elementAt(0); dprice1 = node_dprice1.getNextSibling().getNextSibling() .getText().substring(1); parser.reset(); NodeFilter filter_dprice2 = new StringFilter("��ʯvip�ۣ�"); nodelist = parser.parse(filter_dprice2); 
Node node_dprice2 = nodelist.elementAt(0); try { dprice2 = node_dprice2.getText(); dprice2 = dprice2.substring( dprice2.indexOf("��ʯvip�ۣ�") + 8).trim(); } catch (NullPointerException e) { dprice2 = "null"; } parser.reset(); NodeFilter filter_pagecount = new StringFilter("ҳ������"); nodelist = parser.parse(filter_pagecount); Node node_pagecount = nodelist.elementAt(0); pagecount = node_pagecount.getText().substring(6).trim(); parser.reset(); NodeFilter filter_content = new StringFilter("���ݼ��"); nodelist = parser.parse(filter_content); Node node_content = nodelist.elementAt(0); try { content = node_content.getParent().getNextSibling() .getNextSibling().getChildren().asString(); } catch (NullPointerException e) { content = "null"; } t1.append("dangdang").append("-").append(isbn); bw = new BufferedWriter(new FileWriter( new File("E:\\book\\"+t1 + ".txt"))); bw.write(line); bw.write(NEWLINE); bw.write(imgurl); bw.write(NEWLINE); bw.write(title); bw.write(NEWLINE); bw.write(author); bw.write(NEWLINE); bw.write(price); bw.write(NEWLINE); bw.write(dprice1); bw.write(NEWLINE); bw.write(dprice2); bw.write(NEWLINE); bw.write(isbn); bw.write(NEWLINE); bw.write(type); bw.write(NEWLINE); bw.write(pagecount); bw.write(NEWLINE); bw.write(publish); bw.write(NEWLINE); bw.write(publishdate); bw.write(NEWLINE); bw.write(SEPARATOR); bw.write(NEWLINE); bw.write(content); } catch (ParserException e) { e.printStackTrace(); } try { if (bw != null) { bw.close(); } } catch (IOException e) { e.printStackTrace(); } } } catch (Exception e) { e.printStackTrace(); } } } 可是有错误,帮忙改正 编译小错误?? 简单的问题,需要大家帮我看下 一个非常棘手的JAVA程序 怎么样能遍历类的所有属性和所有方法? 我的处女作%%%%%%%%%%%……%¥……% 关于数字类型自动转换,请问怎么使用Number类,请大家帮忙看看,谢谢. "Hello".getBytes("UTF-16")=? 我想开始学JAVA啦 如何替换字符串中的连字符.为其他字符 java编写了一段简单的代码,编译后显示错误,请高手看看 java中关于文件问题 JAVA-api 打印的问题!(程序是调试通过的),麻烦高人们来看看
//书名
String title;
//作者
String author;
// 价格
String price;
// 优惠价
String dprice1;
// vip价
String dprice2;
//
// ISBN
String isbn;
//
String publish; String publishdate; String content;
// 图片链接
String imgurl;
// 网页链接
String url;
// 类型
String type;
//页数
String pagecount;
但问题没解决呀
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.RegexFilter;
import org.htmlparser.filters.StringFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException; public class ParserFor2 { public static void main(String args[]) {
String NEWLINE = "\r\n"; String SEPARATOR = "================================"; String title; String author;
// �г��۸�
String price;
//
String dprice1;
//
String dprice2;
//
// ISBN
String isbn;
// �����
String publish; String publishdate; String content; String imgurl; String url; String type; String pagecount; String url1; try { BufferedReader br = new BufferedReader(new FileReader("D:\\tt.txt")); String line; BufferedWriter bw = null; while (null != (line = br.readLine())) {
try { StringBuffer t1 = new StringBuffer(); Parser parser = new Parser(line); parser.setEncoding("gb2312"); NodeFilter filter_title = new AndFilter(new TagNameFilter(
"a"), new HasAttributeFilter("name", "top_bk")); NodeList nodelist = parser.parse(filter_title); Node node_title = nodelist.elementAt(0); title = node_title.getNextSibling().getNextSibling()
.getText(); parser.reset(); NodeFilter filter_author = new AndFilter(new TagNameFilter(
"div"), new HasAttributeFilter("id", "author_")); nodelist = parser.parse(filter_author); Node node_author = nodelist.elementAt(0); author = node_author.getChildren().asString().substring(6); parser.reset(); NodeFilter filter_publish = new AndFilter(
new TagNameFilter("div"), new HasAttributeFilter(
"id", "publisher_")); nodelist = parser.parse(filter_publish); Node node_publish = nodelist.elementAt(0); publish = node_publish.getChildren().asString()
.substring(7); parser.reset(); NodeFilter filter_isbn = new RegexFilter(
"I S B N �� \\d\\d\\d\\d\\d\\d\\d\\d\\d\\d"); nodelist = parser.parse(filter_isbn); Node node_isbn = nodelist.elementAt(0); isbn = node_isbn.getText().substring(13); parser.reset(); NodeFilter filter_publishdate = new StringFilter("���ʱ�䣺"); nodelist = parser.parse(filter_publishdate); Node node_publishdate = nodelist.elementAt(1); publishdate = node_publishdate.getText().substring(6); parser.reset(); NodeFilter filter_type = new StringFilter(">> "); nodelist = parser.parse(filter_type); Node node_type = nodelist.elementAt(0); type = node_type.getNextSibling().getChildren().asString(); parser.reset(); NodeFilter filter_imgurl = new AndFilter(new TagNameFilter(
"img"),
new HasAttributeFilter("id", "img_show_prd")); nodelist = parser.parse(filter_imgurl); Node node_imgurl = nodelist.elementAt(0); imgurl = node_imgurl.getText(); imgurl = imgurl.substring(imgurl.indexOf("images") - 1,
imgurl.indexOf(".jpg") + 4); parser.reset(); NodeFilter filter_price = new StringFilter("���ۣ�"); nodelist = parser.parse(filter_price); Node node_price = nodelist.elementAt(0); price = node_price.getText().substring(4); parser.reset(); NodeFilter filter_dprice1 = new StringFilter("�����ۣ�"); nodelist = parser.parse(filter_dprice1); Node node_dprice1 = nodelist.elementAt(0); dprice1 = node_dprice1.getNextSibling().getNextSibling()
.getText().substring(1); parser.reset(); NodeFilter filter_dprice2 = new StringFilter("��ʯvip�ۣ�"); nodelist = parser.parse(filter_dprice2); Node node_dprice2 = nodelist.elementAt(0); try {
dprice2 = node_dprice2.getText();
dprice2 = dprice2.substring(
dprice2.indexOf("��ʯvip�ۣ�") + 8).trim();
} catch (NullPointerException e) {
dprice2 = "null";
} parser.reset(); NodeFilter filter_pagecount = new StringFilter("ҳ������"); nodelist = parser.parse(filter_pagecount); Node node_pagecount = nodelist.elementAt(0); pagecount = node_pagecount.getText().substring(6).trim(); parser.reset(); NodeFilter filter_content = new StringFilter("���ݼ��"); nodelist = parser.parse(filter_content); Node node_content = nodelist.elementAt(0); try {
content = node_content.getParent().getNextSibling()
.getNextSibling().getChildren().asString();
} catch (NullPointerException e) {
content = "null";
}
t1.append("dangdang").append("-").append(isbn);
bw = new BufferedWriter(new FileWriter(
new File("E:\\book\\"+t1 + ".txt"))); bw.write(line);
bw.write(NEWLINE);
bw.write(imgurl);
bw.write(NEWLINE);
bw.write(title);
bw.write(NEWLINE);
bw.write(author);
bw.write(NEWLINE);
bw.write(price);
bw.write(NEWLINE);
bw.write(dprice1);
bw.write(NEWLINE);
bw.write(dprice2);
bw.write(NEWLINE);
bw.write(isbn);
bw.write(NEWLINE);
bw.write(type);
bw.write(NEWLINE);
bw.write(pagecount);
bw.write(NEWLINE);
bw.write(publish);
bw.write(NEWLINE);
bw.write(publishdate);
bw.write(NEWLINE);
bw.write(SEPARATOR);
bw.write(NEWLINE);
bw.write(content); } catch (ParserException e) {
e.printStackTrace();
}
try {
if (bw != null) {
bw.close();
}
} catch (IOException e) {
e.printStackTrace();
}
}
} catch (Exception e) {
e.printStackTrace();
} }
}
可是有错误,帮忙改正