现在用下面的代码获取了一篇网文的源码内容,现想获取到里面的每个章节的标题,代码要怎样写?各位大哥大姐帮帮忙!
package cn.test;import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Downloads the chapter index page of a web novel and extracts the chapter
 * titles from its HTML.
 */
public class CaiJi01 {

    /** Address of the index page fetched by {@link #getOneHtml()}. */
    private static final String INDEX_URL = "http://www.luoqiu.com/html/38/38320/index.html";

    /**
     * Matches one chapter link of the form
     * {@code <span><a href="...">title</a></span>}; group 1 is the link text.
     * The text may not contain nested tags.
     */
    private static final Pattern CHAPTER_LINK =
            Pattern.compile("<span><a\\b[^>]*>([^<]*)</a></span>");

    /**
     * Fetches the index page and prints its HTML to standard output.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        CaiJi01 cj = new CaiJi01();
        String s = cj.getOneHtml();
        System.out.println(s);
    }

    /**
     * Downloads the index page and returns its content as a single string.
     *
     * <p>Lines are concatenated without any separator, and the bytes are
     * decoded with the platform default charset. If an I/O error occurs the
     * stack trace is printed and whatever was read so far is returned, so the
     * result may be empty or partial.
     *
     * @return the page content read so far, never {@code null}
     */
    public String getOneHtml() {
        final StringBuilder sb = new StringBuilder();
        BufferedReader in = null;
        try {
            final URL url = new URL(INDEX_URL);
            in = new BufferedReader(new InputStreamReader(url.openStream()));
            String line;
            while ((line = in.readLine()) != null) {
                sb.append(line);
            }
        } catch (IOException e) {
            // Best effort: report the failure and return what was read.
            e.printStackTrace();
        } finally {
            // Close on every path; the stream used to leak when readLine() threw.
            if (in != null) {
                try {
                    in.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if closing fails.
                }
            }
        }
        return sb.toString();
    }

    /**
     * Extracts the text of every {@code <span><a ...>text</a></span>} link
     * from the given HTML, in document order.
     *
     * @param html page source, for example the result of {@link #getOneHtml()};
     *             may be {@code null}
     * @return the link texts; empty when {@code html} is {@code null} or
     *         contains no such link
     */
    public static List<String> extractTitles(String html) {
        final List<String> titles = new ArrayList<String>();
        if (html == null) {
            return titles;
        }
        final Matcher m = CHAPTER_LINK.matcher(html);
        while (m.find()) {
            titles.add(m.group(1));
        }
        return titles;
    }
}
package cn.test;import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Downloads the chapter index page of a web novel and extracts the chapter
 * titles from its HTML.
 */
public class CaiJi01 {

    /** Address of the index page fetched by {@link #getOneHtml()}. */
    private static final String INDEX_URL = "http://www.luoqiu.com/html/38/38320/index.html";

    /**
     * Matches one chapter link of the form
     * {@code <span><a href="...">title</a></span>}; group 1 is the link text.
     * The text may not contain nested tags.
     */
    private static final Pattern CHAPTER_LINK =
            Pattern.compile("<span><a\\b[^>]*>([^<]*)</a></span>");

    /**
     * Fetches the index page and prints its HTML to standard output.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        CaiJi01 cj = new CaiJi01();
        String s = cj.getOneHtml();
        System.out.println(s);
    }

    /**
     * Downloads the index page and returns its content as a single string.
     *
     * <p>Lines are concatenated without any separator, and the bytes are
     * decoded with the platform default charset. If an I/O error occurs the
     * stack trace is printed and whatever was read so far is returned, so the
     * result may be empty or partial.
     *
     * @return the page content read so far, never {@code null}
     */
    public String getOneHtml() {
        final StringBuilder sb = new StringBuilder();
        BufferedReader in = null;
        try {
            final URL url = new URL(INDEX_URL);
            in = new BufferedReader(new InputStreamReader(url.openStream()));
            String line;
            while ((line = in.readLine()) != null) {
                sb.append(line);
            }
        } catch (IOException e) {
            // Best effort: report the failure and return what was read.
            e.printStackTrace();
        } finally {
            // Close on every path; the stream used to leak when readLine() threw.
            if (in != null) {
                try {
                    in.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if closing fails.
                }
            }
        }
        return sb.toString();
    }

    /**
     * Extracts the text of every {@code <span><a ...>text</a></span>} link
     * from the given HTML, in document order.
     *
     * @param html page source, for example the result of {@link #getOneHtml()};
     *             may be {@code null}
     * @return the link texts; empty when {@code html} is {@code null} or
     *         contains no such link
     */
    public static List<String> extractTitles(String html) {
        final List<String> titles = new ArrayList<String>();
        if (html == null) {
            return titles;
        }
        final Matcher m = CHAPTER_LINK.matcher(html);
        while (m.find()) {
            titles.add(m.group(1));
        }
        return titles;
    }
}
解决方案 »
- webservice的一个问题,求救!(急!急!急!)
- 关于SetCharacterEncodingFilter很纠结的一个问题,求指教
- jsp中日期怎么转换啊!!!!!!
- axis2 发布webservice url路径的问题
- 怎么才能够把validate()返回的ActionErrors显示在html页面上?
- Mysql数据库能胜任 易趣eachnet.com 这样的数据量吗?
- tomcat4.1.18的root的问题
- 求助:关于一个下拉框的小问题,在线等,真的很急!!!!
- 输入“——”符号时,再从数据库读出来显示到网页上,就变成了??,其他一切正常?
- weblogic 6.2 中修改servlet后总是要重启服务,如何才能取消???
- 关于lucene在弹出窗口中高亮的问题
- 求救:type Exception report
可以用 jsoup:先用 Parser.parse(sb.toString(), "") 生成 Document 对象,再用选择器取出需要的元素,例如:
Document doc = Parser.parse(sb.toString(), "");
Elements trs = doc.select("tr[class=smallText]");
能不能举个小例子?例如:Html源代码中有这些
<span><a href="4015239.html">第一章 星空中的青铜巨棺</a></span>
<span><a href="4026955.html">第二章 素问</a></span>
<span><a href="4052138.html">第三章 今昔</a></span>
我想获得所有像“第一章 星空中的青铜巨棺”这样的标题,要怎么写?
// Parse the downloaded page and print the text of every <span> element.
// NOTE(review): Parser/Document/Elements look like the jsoup API - confirm the library.
Document doc = Parser.parse(html, "");
Elements spans = doc.select("span");
// Elements is a list, not an array, so use size()/get(i) instead of length/[i].
for (int i = 0; i < spans.size(); i++) {
    System.err.println(spans.get(i).text());
}
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;

/**
 * Downloads a web novel's chapter index page and prints its chapter titles,
 * one per line.
 *
 * <p>A title is the text that starts at the first U+7B2C character (the
 * ordinal prefix of a chapter number) inside a
 * {@code <span><a ...>...</a></span>} link and runs up to the closing
 * {@code </a></span>}. Links without that character are skipped.
 *
 * @author Sugar.Tan
 * 2011-08-05
 */
public class ReadNetXml {

    /** Page read when no URL is given on the command line. */
    private static final String DEFAULT_URL = "http://www.luoqiu.com/html/38/38320/index.html";

    /** Start of a chapter link. */
    private static final String LINK_OPEN = "<span><a";

    /** End of a chapter link. */
    private static final String LINK_CLOSE = "</a></span>";

    /** First character of a chapter title (U+7B2C), escaped to keep the source ASCII-only. */
    private static final String TITLE_PREFIX = "\u7b2c";

    /**
     * Prints the chapter titles found on the index page.
     *
     * @param args optional: {@code args[0]} is the page URL (defaults to the
     *             built-in index page) and {@code args[1]} is the charset name
     *             used to decode the page (defaults to the platform charset)
     * @throws Exception if the page cannot be read or the charset name is not supported
     */
    public static void main(String[] args) throws Exception {
        String address = (args != null && args.length > 0) ? args[0] : DEFAULT_URL;
        String charsetName = (args != null && args.length > 1) ? args[1] : null;
        for (String title : extractTitles(readPage(address, charsetName))) {
            System.out.println(title);
        }
    }

    /**
     * Reads the whole page at {@code address} into one string; lines are
     * concatenated without a separator.
     *
     * @param address     URL of the page
     * @param charsetName charset used to decode the bytes, or {@code null}
     *                    for the platform default
     * @return the page content
     * @throws IOException if the URL is malformed, the charset is unsupported,
     *                     or reading fails
     */
    private static String readPage(String address, String charsetName) throws IOException {
        InputStream raw = new URL(address).openStream();
        try {
            // If the decoding charset differs from the page's, TITLE_PREFIX will never match.
            InputStreamReader decoder = (charsetName == null)
                    ? new InputStreamReader(raw)
                    : new InputStreamReader(raw, charsetName);
            BufferedReader in = new BufferedReader(decoder);
            StringBuilder page = new StringBuilder();
            String line;
            while ((line = in.readLine()) != null) {
                page.append(line);
            }
            return page.toString();
        } finally {
            // The readers hold no resource of their own; closing the stream is enough.
            raw.close();
        }
    }

    /**
     * Extracts the chapter titles from the page source, in document order.
     *
     * <p>Scans with a moving cursor rather than repeatedly deleting the
     * consumed prefix, so the work is linear in the page size.
     *
     * @param html page source; may be {@code null}
     * @return the titles found; empty when {@code html} is {@code null} or
     *         has no chapter link
     */
    static List<String> extractTitles(String html) {
        List<String> titles = new ArrayList<String>();
        if (html == null) {
            return titles;
        }
        int cursor = 0;
        while (true) {
            int open = html.indexOf(LINK_OPEN, cursor);
            if (open == -1) {
                break; // no more links
            }
            int close = html.indexOf(LINK_CLOSE, open);
            if (close == -1) {
                break; // unterminated link: no complete title can follow
            }
            int titleStart = html.indexOf(TITLE_PREFIX, open);
            // The prefix must lie inside this link; a later match belongs to another link.
            if (titleStart != -1 && titleStart < close) {
                titles.add(html.substring(titleStart, close));
            }
            cursor = close + LINK_CLOSE.length();
        }
        return titles;
    }
}