小弟最近在做网页抓取，想实现多线程抓取多个网页。下面的代码只是多线程抓取同一个页面，想请哪位大侠指点：如何用多线程来抓取不同的网页？
import java.net.*;
import java.io.*;
import java.util.regex.*;

/**
 * Fetches a web page, extracts the region between {@code beginStr} and
 * {@code endStr}, then scans that region for Sina news anchors and prints
 * each link's title and absolute URL.
 */
public class urls { // NOTE(review): Java convention is UpperCamelCase ("Urls"); name kept for compatibility

    String sourceURL;     // URL of the page to crawl
    String sourceContent; // raw page content (lines concatenated, newlines dropped)
    String beginStr;      // start marker of the region to extract
    String endStr;        // end marker of the region to extract
    String matchContent;  // extracted region between beginStr and endStr

    /**
     * Demo entry point: starts two crawler threads.
     * BUG FIX: the original passed one shared Runnable built from an ignored
     * string ("morethreads"), so both threads fetched the same hard-coded
     * page. Each clay now carries its own URL, so different pages can be
     * crawled concurrently — which is what the poster asked for.
     */
    public static void main(String[] args) {
        clay c = new clay("http://news.sina.com.cn/society/wx/index.html");
        clay c1 = new clay("http://news.sina.com.cn/world/index.html");
        new Thread(c).start();
        new Thread(c1).start();
    }

    /**
     * @param sourceURL1 page to crawl
     * @param beginStr1  start marker of the region to extract
     * @param endStr1    end marker of the region to extract
     */
    public urls(String sourceURL1, String beginStr1, String endStr1) {
        sourceURL = sourceURL1;
        beginStr = beginStr1;
        endStr = endStr1;
    }

    /**
     * Downloads {@code URLStr} and stores the concatenated lines in
     * {@code sourceContent}. On failure the stack trace is printed and
     * sourceContent is left unchanged, exactly as before.
     * (Method name keeps the original "Sourse" spelling for compatibility.)
     */
    public void getSourseContent(String URLStr) {
        StringBuilder sb = new StringBuilder(); // no synchronization needed; was StringBuffer
        // BUG FIX: try-with-resources — the original never closed the reader,
        // leaking a connection/stream on every call.
        try (BufferedReader br = new BufferedReader(
                new InputStreamReader(new URL(URLStr).openStream()))) {
            String line;
            while ((line = br.readLine()) != null) {
                sb.append(line); // newlines intentionally dropped, as in the original
            }
            sourceContent = sb.toString();
        } catch (IOException e) { // MalformedURLException is an IOException — single catch suffices
            e.printStackTrace();
        }
    }

    /**
     * Returns the first substring of {@code sourceContent} spanning from
     * {@code beginStr} to {@code endStr} (markers included), caching it in
     * {@code matchContent}; returns null when there is no match.
     */
    public String getMatchContent(String beginStr, String endStr) {
        // NOTE(review): the markers are interpolated into the regex verbatim,
        // so regex metacharacters in them are interpreted — presumably intended.
        String regex = beginStr + ".*?" + endStr;
        // BUG FIX: clay.run() passes "<body" and "</BODY>"; without
        // CASE_INSENSITIVE the mixed-case pair could never both match.
        Pattern pt = Pattern.compile(regex, Pattern.CASE_INSENSITIVE);
        Matcher mt = pt.matcher(sourceContent);
        if (mt.find()) {
            return matchContent = mt.group();
        } else {
            System.out.println("i'm null");
            return null;
        }
    }

    /**
     * Scans {@code s} for Sina article anchors shaped like
     * {@code /<sect>/yyyy-MM-dd/<id>.shtml TARGET=_blank>…/a>}, printing each
     * title and absolute URL, then the total match count.
     */
    public void getString(String s) {
        int counter = 0; // number of anchors matched
        String regexURL = "/.{1,5}/\\d{4}-\\d{2}-\\d{2}/\\w{10,15}.shtml\\sTARGET=_blank>.*?/a>";
        Matcher mt = Pattern.compile(regexURL).matcher(s);
        while (mt.find()) {
            String anchor = mt.group();
            counter++;
            System.out.println(anchor);
            // Title: text between '>' and '</a>', with any <font> markup stripped.
            Matcher mt1 = Pattern.compile(">.*?</a>").matcher(anchor);
            while (mt1.find()) {
                String title = mt1.group().replaceAll(">|</a>|<font.*?>|</font>", "");
                System.out.println("标题:" + title);
            }
            // URL: relative article path, printed as an absolute Sina URL.
            Matcher mt2 = Pattern.compile("/.{1,5}/\\d{4}-\\d{2}-\\d{2}/\\w{10,15}.shtml").matcher(anchor);
            while (mt2.find()) {
                String url = mt2.group().replaceAll("<a href=|>", "");
                System.out.println("网址:http://news.sina.com.cn" + url);
            }
            System.out.println(); // blank line between entries
        }
        System.out.println("共有" + counter + "个符合结果");
    }
}

/** One crawl task: fetch its page, extract the body region, print the links. */
class clay implements Runnable {

    private final String url; // page this task crawls

    /**
     * @param s the URL to crawl. For backward compatibility with the original
     *          call site (clay("morethreads")), any value that is not an
     *          http(s) URL falls back to the original hard-coded page.
     */
    clay(String s) {
        this.url = (s != null && s.startsWith("http"))
                ? s
                : "http://news.sina.com.cn/society/wx/index.html";
    }

    public void run() {
        urls u = new urls(url, "<body", "</BODY>");
        u.getSourseContent(u.sourceURL);
        u.matchContent = u.getMatchContent(u.beginStr, u.endStr);
        u.getString(u.matchContent);
    }
}
import java.net.*;
import java.io.*;
import java.util.regex.*;

/**
 * Fetches a web page, extracts the region between {@code beginStr} and
 * {@code endStr}, then scans that region for Sina news anchors and prints
 * each link's title and absolute URL.
 * (This was a duplicate paste of the class above; same fixes applied.)
 */
public class urls { // NOTE(review): Java convention is UpperCamelCase ("Urls"); name kept for compatibility

    String sourceURL;     // URL of the page to crawl
    String sourceContent; // raw page content (lines concatenated, newlines dropped)
    String beginStr;      // start marker of the region to extract
    String endStr;        // end marker of the region to extract
    String matchContent;  // extracted region between beginStr and endStr

    /**
     * Demo entry point: starts two crawler threads.
     * BUG FIX: the original passed one shared Runnable built from an ignored
     * string ("morethreads"), so both threads fetched the same hard-coded
     * page. Each clay now carries its own URL, so different pages can be
     * crawled concurrently — which is what the poster asked for.
     */
    public static void main(String[] args) {
        clay c = new clay("http://news.sina.com.cn/society/wx/index.html");
        clay c1 = new clay("http://news.sina.com.cn/world/index.html");
        new Thread(c).start();
        new Thread(c1).start();
    }

    /**
     * @param sourceURL1 page to crawl
     * @param beginStr1  start marker of the region to extract
     * @param endStr1    end marker of the region to extract
     */
    public urls(String sourceURL1, String beginStr1, String endStr1) {
        sourceURL = sourceURL1;
        beginStr = beginStr1;
        endStr = endStr1;
    }

    /**
     * Downloads {@code URLStr} and stores the concatenated lines in
     * {@code sourceContent}. On failure the stack trace is printed and
     * sourceContent is left unchanged, exactly as before.
     * (Method name keeps the original "Sourse" spelling for compatibility.)
     */
    public void getSourseContent(String URLStr) {
        StringBuilder sb = new StringBuilder(); // no synchronization needed; was StringBuffer
        // BUG FIX: try-with-resources — the original never closed the reader,
        // leaking a connection/stream on every call.
        try (BufferedReader br = new BufferedReader(
                new InputStreamReader(new URL(URLStr).openStream()))) {
            String line;
            while ((line = br.readLine()) != null) {
                sb.append(line); // newlines intentionally dropped, as in the original
            }
            sourceContent = sb.toString();
        } catch (IOException e) { // MalformedURLException is an IOException — single catch suffices
            e.printStackTrace();
        }
    }

    /**
     * Returns the first substring of {@code sourceContent} spanning from
     * {@code beginStr} to {@code endStr} (markers included), caching it in
     * {@code matchContent}; returns null when there is no match.
     */
    public String getMatchContent(String beginStr, String endStr) {
        // NOTE(review): the markers are interpolated into the regex verbatim,
        // so regex metacharacters in them are interpreted — presumably intended.
        String regex = beginStr + ".*?" + endStr;
        // BUG FIX: clay.run() passes "<body" and "</BODY>"; without
        // CASE_INSENSITIVE the mixed-case pair could never both match.
        Pattern pt = Pattern.compile(regex, Pattern.CASE_INSENSITIVE);
        Matcher mt = pt.matcher(sourceContent);
        if (mt.find()) {
            return matchContent = mt.group();
        } else {
            System.out.println("i'm null");
            return null;
        }
    }

    /**
     * Scans {@code s} for Sina article anchors shaped like
     * {@code /<sect>/yyyy-MM-dd/<id>.shtml TARGET=_blank>…/a>}, printing each
     * title and absolute URL, then the total match count.
     */
    public void getString(String s) {
        int counter = 0; // number of anchors matched
        String regexURL = "/.{1,5}/\\d{4}-\\d{2}-\\d{2}/\\w{10,15}.shtml\\sTARGET=_blank>.*?/a>";
        Matcher mt = Pattern.compile(regexURL).matcher(s);
        while (mt.find()) {
            String anchor = mt.group();
            counter++;
            System.out.println(anchor);
            // Title: text between '>' and '</a>', with any <font> markup stripped.
            Matcher mt1 = Pattern.compile(">.*?</a>").matcher(anchor);
            while (mt1.find()) {
                String title = mt1.group().replaceAll(">|</a>|<font.*?>|</font>", "");
                System.out.println("标题:" + title);
            }
            // URL: relative article path, printed as an absolute Sina URL.
            Matcher mt2 = Pattern.compile("/.{1,5}/\\d{4}-\\d{2}-\\d{2}/\\w{10,15}.shtml").matcher(anchor);
            while (mt2.find()) {
                String url = mt2.group().replaceAll("<a href=|>", "");
                System.out.println("网址:http://news.sina.com.cn" + url);
            }
            System.out.println(); // blank line between entries
        }
        System.out.println("共有" + counter + "个符合结果");
    }
}

/** One crawl task: fetch its page, extract the body region, print the links. */
class clay implements Runnable {

    private final String url; // page this task crawls

    /**
     * @param s the URL to crawl. For backward compatibility with the original
     *          call site (clay("morethreads")), any value that is not an
     *          http(s) URL falls back to the original hard-coded page.
     */
    clay(String s) {
        this.url = (s != null && s.startsWith("http"))
                ? s
                : "http://news.sina.com.cn/society/wx/index.html";
    }

    public void run() {
        urls u = new urls(url, "<body", "</BODY>");
        u.getSourseContent(u.sourceURL);
        u.matchContent = u.getMatchContent(u.beginStr, u.endStr);
        u.getString(u.matchContent);
    }
}
//获取并打印网址
String urlsRegex="/.{1,5}/\\d{4}-\\d{2}-\\d{2}/\\w{10,15}.shtml";
String urls;
Matcher mt2=Pattern.compile(urlsRegex).matcher(s2);
while(mt2.find())
{
urls=mt2.group().replaceAll("<a href=|>","");
System.out.println("网址:http://news.sina.com.cn"+urls);
}
System.out.println();//空行
这个地方获取到新的 URL 后，可以启动一个新的线程对象来开始新的抓取。不过建议你建立一个 URL 缓冲池（用集合就可以），在抓取过程中不停地向这个缓冲池中添加 URL；当然，如果缓冲池中已经有该 URL，就不用再添加了。
另外启动一个线程，不停地从这个 URL 缓冲池中读取 URL 并生成线程对象开始抓取；抓取过程中同样会不停地向缓冲池中添加新的 URL。
当然你要判断 URL 是否已经添加过、是否允许外部站点的 URL；最后某个 URL 抓取完成后，还要为它做一个完成标志。
这样就形成了典型的生产者-消费者模型。