Go take a look at this article:
http://blog.csdn.net/zzr173/archive/2006/12/11/1438691.aspx
Once a String is created it cannot be changed; any "modification" (as we see it) actually creates a new String.
If a String's value changes often, switch to StringBuffer to cut down on the constant heap allocation of new Strings.
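A minimal sketch of the difference (the loop count and contents are made up):

String s = "";
for (int i = 0; i < 1000; i++) {
    s += "line\n"; // each += builds a temporary StringBuilder plus a brand-new String
}

StringBuffer sb = new StringBuffer();
for (int i = 0; i < 1000; i++) {
    sb.append("line\n"); // grows a single backing char[] in place
}
String result = sb.toString(); // one final copy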
In fact it may well be an unclosed connection; those Strings could be temporary SQL statements and result data held inside the connection.
1. I'm not using string = string + string anywhere; in those situations I always use StringBuffer.append.
2. I haven't forgotten to close connections either; for things like FileChannel I always call close() when I'm done (see the sketch after this list).
3. After I'm done with a String I also set its reference to null, so the GC should reclaim it.
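On point 2: calling close() only on the success path still leaks the stream whenever an exception is thrown mid-read. A minimal sketch of the close-in-finally idiom (assumes java.io.* and java.net.* are imported; the URL is a placeholder):

BufferedReader reader = null;
try {
    URL url = new URL("http://example.com/");
    reader = new BufferedReader(new InputStreamReader(url.openStream()));
    String line;
    while ((line = reader.readLine()) != null) {
        // process line
    }
} finally {
    if (reader != null)
        reader.close(); // runs even when readLine() throws
}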
Maybe it's the declared StringBuffer objects that are taking up the space.
The crux is that there are far too many char[] arrays: with the JVM at 70 MB, char[] alone accounts for 50 MB. What is going on??
The program doesn't have that many Strings; even where I do use a String I set its reference to null afterwards, so the GC should reclaim it.
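Every char[] on the heap is reachable from something, usually a String or StringBuffer; the useful question is what is holding those. If your JDK ships the standard jmap tool, a class histogram and a heap dump will show you (pid is your Java process id):

jmap -histo <pid>
jmap -dump:format=b,file=heap.hprof <pid>

The dump can then be browsed (for example with jhat heap.hprof on JDK 6) to follow the reference chains that keep the char[] arrays alive.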
StringBuffer pageBuffer = new StringBuffer();
while ((line = reader.readLine()) != null) {
pageBuffer.append(line + "\n");
}
Is there a problem with this code?
Every time the program reads a page's content it uses this code:
BufferedReader reader = new BufferedReader(new InputStreamReader(con.getInputStream()));
String line = null;
StringBuffer pageBuffer = new StringBuffer();
while ((line = reader.readLine()) != null) {
pageBuffer.append(line + "\n");
}
Then pageBuffer.toString() is passed to writeFile, which writes the page to disk, and to analyse, which parses the page content. At around three or four hundred pages I get an OutOfMemoryError!
Each StringBuffer holds the content of just one page.
BufferedReader reader = new BufferedReader(new InputStreamReader(con.getInputStream()));
String line = null;
StringBuffer pageBuffer = new StringBuffer();
while ((line = reader.readLine()) != null) {
pageBuffer.append(line + "\n");
}
In this code line is recreated on every iteration, so over a long run the JVM may be accumulating objects that haven't been cleaned up yet.
Without seeing the program's actual runtime state I can only guess that this is the problem.
Start by measuring how the program's memory footprint behaves over runs of different lengths.
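One low-tech way to take those measurements from inside the program (a sketch; the one-minute interval is arbitrary):

// Print the used heap once per minute from a daemon thread.
Thread monitor = new Thread(new Runnable() {
    public void run() {
        Runtime rt = Runtime.getRuntime();
        while (true) {
            long usedMB = (rt.totalMemory() - rt.freeMemory()) / (1024 * 1024);
            System.out.println("used heap: " + usedMB + " MB");
            try {
                Thread.sleep(60 * 1000);
            } catch (InterruptedException e) {
                return;
            }
        }
    }
});
monitor.setDaemon(true);
monitor.start();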
From startup right up to the OutOfMemoryError, char[] occupies most of the heap, about 80%; the profiler attributes over 80% of it to BufferedReader.readLine() as the allocation site.
Since I pass String s = pageBuffer.toString() into writeFile (writes the file) and analyse (parses the page content), I suspect those Strings are not being collected and that this is what causes the OutOfMemoryError.
The program skeleton is as follows:

private String downloadPage(URL url) { // download a page
.........
BufferedReader reader = new BufferedReader(new InputStreamReader(con.getInputStream()));
String line = null;
StringBuffer pageBuffer = new StringBuffer();
while ((line = reader.readLine()) != null) {
pageBuffer.append(line + "\n");
}
return pageBuffer.toString();
}

private void writeFile(String page) {
Matcher m = pattern.matcher(page);
...... // preprocessing
// write the page to disk
}

private void retrieveLinks(URL pageUrl, String pageContents) {
Matcher m = pattern.matcher(pageContents);
...... // extract the URLs in the page
}

// main routine
public void search() throws Exception{
.....
sURL = urlQueue.getHead();
.....
pageContents = downloadPage(url);
writeFile(pageContents);
retrieveLinks(url, pageContents);
}

Testing confirmed that at the time of the OutOfMemoryError these pageContents had not been released, and they accounted for over 80% of memory.
Why isn't pageContents being released?? Once Matcher m = pattern.matcher(pageContents) runs, does pageContents become impossible to free? Or is there a bug in my program???
Gurus, please save me!!!!!
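On the Matcher question: a Matcher does hold a reference to its input CharSequence, but the matchers here are local variables, so both the Matcher and pageContents become unreachable once the method returns; that alone shouldn't pin anything. A more likely suspect on JVMs of this era (a hypothesis, not something the thread confirms): String.substring(), and therefore trim() and Matcher.group(), returned a String that shared the original's backing char[]. Every link extracted with m.group(1).trim() can then keep the entire page's char[] alive for as long as that link sits in urlQueue. The classic workaround is to copy the match into a fresh String before queueing it:

// In retrieveLinks: force a private, minimally-sized backing array so the
// queued link no longer shares (and pins) the whole page's char[].
link = new String(m.group(1).trim());

On those JVMs the String(String) constructor trims the backing array down to just the characters actually in use.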
OP, could you change
pageBuffer.append(line + "\n");
to
pageBuffer.append(line).append("\n");
and give it a try? line + "\n" compiles into an extra temporary StringBuilder and String per line, so chaining the appends saves two allocations per iteration.
I left that out of the post: the BufferedReader is closed in the program.
King_liubin (悠游键客): changing it to pageBuffer.append(line).append("\n"); still ends in the same OutOfMemoryError.
import java.net.*;
import java.io.*;
import java.util.regex.*;
import file.FileOP;

public class SearchOneWeb extends Thread {
private String webName;
private MutexList urlQueue; // MutexList is a thread-safe list class I wrote
// matches <a href="...">, <a href='...'> and unquoted hrefs; note that '|'
// inside a character class is a literal character, not alternation, so the
// original [\"|'] and ['|\"|>] classes were trimmed to [\"'] and ['\">].
private static Pattern pattern = Pattern.compile("<a\\s+href\\s*=\\s*[\"']?(.*?)['\">]", Pattern.CASE_INSENSITIVE);
public SearchOneWeb(String webName, MutexList urlQueue) {
this.webName = removewww(webName);
this.urlQueue = urlQueue;
urlQueue.add("http://" + webName + "/");
start();
}
public SearchOneWeb(SearchOneWeb sow) {
this.webName = sow.webName;
this.urlQueue = sow.urlQueue;
start();
}

public void run() {
try {
search();
} catch (Exception e) {
e.printStackTrace();
}
}
/**
* Strip "www." from a URL, e.g. removewww("http://www.foo.com/") returns "http://foo.com/"
* @param url
*/
private static String removewww(String url) {
if(url != null) {
int index = url.indexOf("www.");
if (index != -1)
return url.substring(0, index) + url.substring(index + 4);
else
return url;
}
else
return null;
}
/**
* Construct a URL object from a string, or return null if it is not a usable HTTP URL
* @param url
*/
private static URL makeURL(String url) {
if(url == null)
return null;
// only handle HTTP
if (!url.toLowerCase().startsWith("http://"))
return null;
URL verifiedUrl = null;
try {
verifiedUrl = new URL(url);
if(verifiedUrl.getPath() == null)
verifiedUrl = new URL(verifiedUrl.toString() + "/");
} catch (Exception e) {
return null;
}
return verifiedUrl;
}
/**
* Test whether a URL belongs to this site
* @param url
*/
private boolean isInHost(URL url) {
boolean result = false;
if(url != null) {
String host = url.getHost().toLowerCase();
if(host.indexOf(webName) != -1)
result = true;
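// Note: indexOf() is a loose containment test (it would also match
// webName embedded in some unrelated host); host.endsWith(webName)
// would be a stricter check.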
}
return result;
}
/**
* Download a page
* @param url
* @return the page content, or null if the connection failed
*/
private String downloadPage(URL url) {
String sURL = url.toString();
String pageContent = null;
try {
HttpURLConnection con = (HttpURLConnection)url.openConnection();
con.connect();
BufferedReader reader = new BufferedReader(new InputStreamReader(con.getInputStream()));
String line;
StringBuffer pageBuffer = new StringBuffer();
while ((line = reader.readLine()) != null) {
pageBuffer.append(line).append('\n');
}
pageContent = pageBuffer.toString();
reader.close();
con.disconnect();
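// Note: if readLine() throws, reader.close() and con.disconnect() above
// are skipped; moving them into a finally block would stop half-read
// connections (and their buffers) from piling up.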
} catch(IOException e) {
System.out.println("error: can't connect to " + sURL);
}
return pageContent;
}
/**
* Write the file
* @param url
* @param content
* @param charSet
*/
private void writeFile(String url, String content, String charSet) {
try {
FileOP.write(url, content, charSet);
} catch (IOException e) {
e.printStackTrace();
System.out.println("can't write file " + url);
}
}
/**
* Parse a page and extract its links
* @param pageUrl
* @param pageContents
*/
private void retrieveLinks(URL pageUrl, String pageContents) {
Matcher m = pattern.matcher(pageContents);
int index1 = 0;
int index2 = 0;
String link = null;
String path = null;
String dir = null;
URL url = null;
while (m.find()) {
link = m.group(1).trim();
if (link.length() == 0)
continue;
// skip links that point within this same page
if (link.charAt(0) == '#')
continue;
// skip mailto: links
if (link.indexOf("mailto:") != -1)
continue;
// skip javascript links
if (link.toLowerCase().indexOf("javascript") != -1)
continue;
if (link.indexOf("://") == -1) {
if (link.charAt(0) == '/') { // handle root-relative paths
link = "http://" + webName + link;
} else { // handle relative paths
path = pageUrl.getPath();
dir = path.substring(0, path.lastIndexOf('/') + 1);
if (dir.length() == 0)
dir = "/"; // the page sits at the site root
link = "http://" + webName + dir + link; // dir already starts with '/', so no extra separator
}
} else {
if (!link.startsWith("http"))
continue;
else {
index1 = link.indexOf('/', 7);
if (index1 == -1)
link = link + '/';
}
}
index2 = link.indexOf('#');
if (index2 != -1) {
link = link.substring(0, index2);
}
link = removewww(link);
// skip URLs outside this site
url = makeURL(link);
if (url == null)
continue;
if (!isInHost(url))
continue;
if (urlQueue.contains(link))
continue;
urlQueue.add(link);
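// Note: contains() only sees URLs still waiting in the queue; once
// getHead() removes one, the same URL can be re-added and re-crawled,
// so the queue can grow without bound. A separate set of visited URLs
// would prevent both problems.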
}
}
/**
* Main crawl loop
*/
public void search() throws Exception {
if (urlQueue == null) {
System.out.println("urlQueue is null!");
return;
}
String sURL = null;
URL url = null;
String pageContents = null;
while (urlQueue.size() > 0) {
// take the URL at the head of the queue
sURL = urlQueue.getHead();
url = makeURL(sURL);
if (url == null)
continue;
pageContents = downloadPage(url);
if (pageContents != null && pageContents.length() > 0) {
writeFile(url.toString(), pageContents, "GBK");
retrieveLinks(url, pageContents);
}
}
}
/**
* For testing
* @param args
*/
public static void main(String[] args) throws Exception {
MutexList urlQueue = new MutexList();
SearchOneWeb sow = new SearchOneWeb("news.163.com", urlQueue);
sleep(10000);
SearchOneWeb sow1 = new SearchOneWeb(sow);
SearchOneWeb sow2 = new SearchOneWeb(sow);
}
}
Still dying here; the problem persists.
Everyone, please help me out. Thank you!