花了2天时间写了个 Java 版的多线程网络爬虫,可以通过配置文件设定根网址、需要过滤的网站以及保存目录。

package com.yinwf.main;import com.yinwf.tool.ContentThread;
import com.yinwf.tool.ParseThread;
import com.yinwf.util.Constant;
import com.yinwf.util.UrlQueue;

/**
 * Entry point of the crawler: loads the configuration, seeds both URL queues
 * with the root URL and starts the two worker threads.
 */
public class Start {

    /**
     * Loads the configuration through {@code Constant.getConfig()}, puts the
     * root URL into the parse queue and the content queue, then starts a
     * {@code ParseThread} and a {@code ContentThread}.
     *
     * <p>If the configuration cannot be loaded, the error is reported and the
     * method returns without starting any worker, because there is no root
     * URL to crawl from.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        try {
            Constant.getConfig();
        } catch (Exception e) {
            e.printStackTrace();
            System.out.println("加载配置文件出错,无法获得根网址");
            // Without a configuration there is no root URL; starting the
            // workers would only enqueue an unset value and fail later.
            return;
        }
        System.out.println("获取根网址是:" + Constant.rootUrl);
        System.out.println("获取过滤信息是:" + Constant.fiter);
        System.out.println("获取保存目录是:" + Constant.saveDir);
        System.out.println("配置文件读取结束");

        UrlQueue urlQueue = Constant.getUrlQueue();
        System.out.println("创建网址队列,初始化长度为0");

        // Seed both queues with the root URL on the first pass.
        urlQueue.getParseLinkedList().addLast(Constant.rootUrl);
        urlQueue.getContentLinkedList().addLast(Constant.rootUrl);
        System.out.println("把根网址加入爬取网址队列");
        System.out.println("把根网址加入下载网址队列");
        System.out.println("现在爬取网址队列长度是"
                + Constant.getUrlQueue().getParseLinkedList().size());
        System.out.println("现在下载网址队列长度是"
                + Constant.getUrlQueue().getContentLinkedList().size());

        // The main thread's work ends here; two worker threads take over.
        // Per the original author's note, one collects URLs and the other
        // downloads content.
        ParseThread parseThread = new ParseThread();
        ContentThread contentThread = new ContentThread();
        parseThread.start();
        System.out.println("put线程启动");
        contentThread.start();
        System.out.println("get线程启动");
    }
}

解决方案 »

免费领取超大流量手机卡，每月29元包185G流量+100分钟通话, 中国电信官方发货

package com.yinwf.tool;import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import com.yinwf.util.Constant;

/**
 * Downloads the text of web pages for the crawler, recording each visited
 * URL in the bookkeeping maps exposed by {@code Constant}.
 */
public class ConnectWeb {

    /**
     * Removes the first URL from the parse list, records it in the parse map
     * and downloads its text.
     *
     * @return the page text with all lines concatenated (line terminators
     *         are dropped)
     * @throws Exception if the parse list is empty, the URL is malformed, or
     *         the download fails
     */
    public String getUrlParse() throws Exception {
        String realUrl = (String) Constant.getUrlQueue().getParseLinkedList()
                .removeFirst();
        System.out.println("------" + Constant.getUrlQueue().getParseLinkedList().size());
        System.out.println("------" + Constant.getUrlQueue().getContentLinkedList().size());

        Constant.getParseMap().put(realUrl, "");
        return fetch(realUrl);
    }

    /**
     * Assigns the next numbered file name to {@code realUrl}, records the
     * mapping in the content map and downloads the page text.
     *
     * @param realUrl the URL to download
     * @return the page text with all lines concatenated (line terminators
     *         are dropped)
     * @throws Exception if the URL is malformed or the download fails
     */
    public String getUrlContent(String realUrl) throws Exception {
        System.out.println("写入" + realUrl + "到hashmap和队列中");
        // The counter is incremented before use, so file names start at
        // (initial value + 1).
        String fileName = ++Constant.fileNameNumber + ".htm";
        Constant.getContentMap().put(realUrl, fileName);
        return fetch(realUrl);
    }

    /**
     * Opens a connection to {@code realUrl} and reads the whole response as text.
     *
     * NOTE(review): the response is decoded with the platform default charset
     * and line terminators are discarded, as in the original code; confirm
     * that this is what the callers expect.
     */
    private String fetch(String realUrl) throws Exception {
        URL url = new URL(realUrl);
        URLConnection conn = url.openConnection();
        BufferedReader reader = new BufferedReader(
                new InputStreamReader(conn.getInputStream()));
        try {
            StringBuffer document = new StringBuffer();
            String line;
            while ((line = reader.readLine()) != null) {
                document.append(line);
            }
            return document.toString();
        } finally {
            // Close the stream even when reading fails part-way through, so
            // the underlying connection is not leaked.
            reader.close();
        }
    }
}
package com.yinwf.tool;

import com.yinwf.util.Constant;

/**
 * Worker thread that repeatedly calls {@code Constant.getUrlQueue().get()}
 * for as long as the content list is observed to be non-empty.
 */
public class ContentThread extends Thread {

    /**
     * Loops until the content list reports a size of zero. A failure in a
     * single {@code get()} call is printed and the loop moves on to the next
     * check.
     */
    public void run() {
        for (;;) {
            boolean hasPending =
                    Constant.getUrlQueue().getContentLinkedList().size() > 0;
            if (!hasPending) {
                return;
            }
            try {
                Constant.getUrlQueue().get();
            } catch (Exception failure) {
                // Report the failure and keep going with the next iteration.
                failure.printStackTrace();
            }
        }
    }
}
package com.yinwf.tool;import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import com.yinwf.util.Constant;

/**
 * Writes downloaded page text to a numbered {@code .htm} file inside the
 * configured save directory.
 */
public class DownContent {

    /**
     * Appends {@code content} to the file
     * {@code <Constant.saveDir>/<Constant.fileNameNumber>.htm}, creating the
     * file first if it does not exist.
     *
     * NOTE(review): the file name is derived from the shared counter at the
     * moment of writing; if the counter can change between the download and
     * this call, the name may not match the one recorded for the URL. Verify
     * against the callers.
     *
     * @param content the text to write
     * @throws Exception if the file cannot be created or written
     */
    public void down(String content) throws Exception {
        File downFile = new File(Constant.saveDir, Constant.fileNameNumber + ".htm");
        downFile.createNewFile();
        // Open in append mode, as the original code did.
        BufferedWriter bw = new BufferedWriter(
                new FileWriter(downFile.getPath(), true));
        try {
            bw.write(content);
            bw.flush();
        } finally {
            // Release the file handle even when the write fails.
            bw.close();
        }
    }
}
这个太弱了,要实现的话用htmlparser,效果要远比这个好