自己试着写了一个从网页抓取文本的代码,运行到一段时间之后中断了,出现了以下的错误信息:
Exception in thread "main" java.lang.OutOfMemoryError: Java heap space
at java.util.Arrays.copyOfRange(Arrays.java:3209)
at java.lang.String.<init>(String.java:216)
at java.lang.StringBuilder.toString(StringBuilder.java:430)
at sina.GetWebcode(sina.java:45)
at sina.main(sina.java:282)
代码如下:
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.Hashtable;import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import java.util.Hashtable;public class sina
{
public static ArrayList<String> NewUrls = new ArrayList<String>();
public static Hashtable<String, Integer> deepUrls = new Hashtable<String, Integer>();// 存储所有URL深度
public static String Nowurl;
public static ArrayList<String> BlogUrls = new ArrayList<String>();
public static int Maxdeepth = 10;
public static int WebIndex = 0;
public static String fPath = "F:/JAVA代码存储/sina-travel/data";
public static int num = 0;
static String GetWebcode(String Nowurl) //通过url得到网页源码
{
String Webcode = "";
try
{
URL nowurl;
nowurl=new URL(Nowurl);
URLConnection Conn;
Conn=nowurl.openConnection();
Conn.setDoInput(true);
InputStreamReader read = new InputStreamReader(Conn.getInputStream());
BufferedReader reader=new BufferedReader(read);
String str;
while((str=reader.readLine())!=null){
str=str+"\n";
Webcode = Webcode+str;
}
reader.close();
read.close();
}catch(IOException e){}
return Webcode.toString();
}
static void DrawUrl(String Precode)//PreCode为网页源码
{
if(deepUrls.get(Nowurl)>Maxdeepth)
{
return;
}
String link = "";
String temple = "";
String reg1 = "tp://travel.sina";
int index0 = Precode.indexOf(reg1);
//String reg2 = "";
int index1 = -1;
int index2 = -1;
int index3 = -1;//用于表示链接之前的单引号
int index4 = -1;//用于表示链接之后的单引号
if(index0>0)
{
temple = Precode.substring(0, index0-1);
index1 = temple.lastIndexOf('"');
index3 = temple.lastIndexOf("'");
index2 = Precode.indexOf('"', index0);
index4 = Precode.indexOf("'", index0);
if(index3>index1&&index4<index2)
{
link = Precode.substring(index3+1, index4);
if(!NewUrls.contains(link))
NewUrls.add(link);
deepUrls.put(link,deepUrls.get(Nowurl)+1);
}
else
{
link = Precode.substring(index1+1, index2);
if(!NewUrls.contains(link))
NewUrls.add(link);
deepUrls.put(link,deepUrls.get(Nowurl)+1);
}
}
index0 = Precode.indexOf(reg1,index2+1);
while(index0>0)
{
temple = Precode.substring(index2+1, index0-1);
index1 = index2+1+temple.lastIndexOf('"');
index3 = index2+1+temple.lastIndexOf("'");
index2 = Precode.indexOf('"', index0);
index4 = Precode.indexOf("'",index0);
if(index3>index1&&index4<index2)
{
link = Precode.substring(index3+1, index4);
if(!NewUrls.contains(link))
NewUrls.add(link);
deepUrls.put(link,deepUrls.get(Nowurl)+1);
}
else
{
link = Precode.substring(index1+1, index2);
if(!NewUrls.contains(link))
NewUrls.add(link);
deepUrls.put(link,deepUrls.get(Nowurl)+1);
}
index0 = Precode.indexOf(reg1,index2+1);
}
//////////////////////////
/////抓取相关博客部分
link = "";
temple = "";
reg1 = "ttp://blog.sina";
index0 = Precode.indexOf(reg1);
index1 = -1;
index2 = -1;
if(index0>0)
{
temple = Precode.substring(0, index0-1);
index1 = temple.lastIndexOf('"');
index2 = Precode.indexOf('"', index0);
if(index1>0&&index2>index1)
{
link = Precode.substring(index1+1, index2);
if(!NewUrls.contains(link))
BlogUrls.add(link);
}
}
index0 = Precode.indexOf(reg1,index2+1);
while(index0>0)
{
temple = Precode.substring(index2+1, index0-1);
index1 = index2+1+temple.lastIndexOf('"');
index2 = Precode.indexOf('"', index0);
if(index1>0&&index2>index1)
{
link = Precode.substring(index1+1, index2);
if(!BlogUrls.contains(link))
BlogUrls.add(link);
}
index0 = Precode.indexOf(reg1,index2+1);
}
}
static String GetNewUrl(ArrayList<String> al)
{
String tmpAUrl = al.get(0);
return tmpAUrl;
}
static void removeurl(ArrayList<String> al)
{
al.remove(0);
}
static String Drawtext1(String reg1,String reg2,String PreCode) //只抓取两个标签之间内容并且去除空格
{
String temple = "";
//String result = "";
int index1 = -1;
int index2 = -1;
int length1 = reg1.length();
//int length2 = reg2.length();
index1 = PreCode.indexOf(reg1);
index2 = PreCode.indexOf(reg2);
if(index1<0||index2<0)
return "";
temple = PreCode.substring(index1+length1, index2);
//String lastreg = "<[^>]+>";
//temple = temple.replaceAll(lastreg, "");
String kongge = "\\s*";
temple = temple.replaceAll(kongge, "");
return temple;
}
static String Drawtext2(String reg1,String reg2,String PreCode)//在Drawtext1的基础上抓起<p><\p>之间内容,然后去除其他标签
{
String endtext = "";
String result = "";
endtext = Drawtext1(reg1,reg2,PreCode);
String reg3 = "<p>";
String reg4 = "</p>";
int dex3 = -1;
int dex4 = -1;
dex3 = endtext.indexOf(reg3);
while(dex3>0)
{
dex4 = endtext.indexOf(reg4,dex3+3);
if(dex4>dex3)
result = result+endtext.substring(dex3, dex4);
else break;
dex3 = endtext.indexOf(reg3, dex4+4);
}
String lastreg = "<[^>]+>";
result = result.replaceAll(lastreg, "");
return result;
}
static void save(String text) //向文件夹中存入文本
{
try
{
File file=new File(fPath,WebIndex+".txt");
file.createNewFile();
BufferedWriter bw=new BufferedWriter(new FileWriter(file));
bw.write(text);
bw.close();
}catch(IOException e){}
WebIndex++;
}
static String deal01(String input)
{
//String output = "";
String pic = "【图\\d+】";
String pic2 = "(该图取自网络)";
input = input.replaceAll(pic, "");
input = input.replaceAll(pic2, "");
//input = input.replaceAll("[\\pP‘’“”,。·《》=]", " ");
return input;
}
static void DrawTCode(String PreCode) //网页源码为参数,抓取旅游内容
{
String result = "";
result = result+Drawtext1("<title>","</title>",PreCode);
result = result+'\n';
result = result+Nowurl+'\n';
result = result+Drawtext2("<!-- 正文页概述信息 begin -->","<!-- 正文页概述信息 end -->",PreCode);
result = result+Drawtext2("<!-- 正文部分 begin -->","<!-- 正文部分 end -->",PreCode);
result = result+Drawtext2("<!-- 正文内容 begin -->","<!-- 正文内容 end -->",PreCode);
//添加标签
if(result.length()<10)
return;
save(result);
}
static void DrawBCode(String PreCode) //网页源码为参数,抓取博客内容
{
String result = "";
result = result+Drawtext1("<title>","</title>",PreCode);
result = result+'\n';
result = result+Drawtext1("<!-- 正文开始 -->","<!-- 正文结束 -->",PreCode);
result = deal01(result);
save(result);
}
static boolean isshtml(String url)
{
int lastd = url.lastIndexOf('.');
int size = url.length();
String end = url.substring(lastd+1, size);
if(end.compareTo("shtml")==0)
return true;
else return false;
}
public static void main(String[] args)
{
Nowurl = "http://travel.sina.com.cn/";
deepUrls.put(Nowurl, 0);
DrawUrl(GetWebcode(Nowurl));
while(NewUrls.size()>0)
{
Nowurl = GetNewUrl(NewUrls);
DrawUrl(GetWebcode(Nowurl));
if(isshtml(Nowurl)==true)
{
System.out.println(Nowurl);
DrawTCode(GetWebcode(Nowurl));
removeurl(NewUrls);
}
else
{
removeurl(NewUrls);
}
System.out.println(num);
num++;
}
while(BlogUrls.size()>0)
{
DrawBCode(GetWebcode(GetNewUrl(BlogUrls)));
}
}
}希望哪位高手能够帮我解决一下这个问题~谢谢!
Exception in thread "main" java.lang.OutOfMemoryError: Java heap space
at java.util.Arrays.copyOfRange(Arrays.java:3209)
at java.lang.String.<init>(String.java:216)
at java.lang.StringBuilder.toString(StringBuilder.java:430)
at sina.GetWebcode(sina.java:45)
at sina.main(sina.java:282)
代码如下:
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.Hashtable;import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import java.util.Hashtable;public class sina
{
public static ArrayList<String> NewUrls = new ArrayList<String>();
public static Hashtable<String, Integer> deepUrls = new Hashtable<String, Integer>();// 存储所有URL深度
public static String Nowurl;
public static ArrayList<String> BlogUrls = new ArrayList<String>();
public static int Maxdeepth = 10;
public static int WebIndex = 0;
public static String fPath = "F:/JAVA代码存储/sina-travel/data";
public static int num = 0;
static String GetWebcode(String Nowurl) //通过url得到网页源码
{
String Webcode = "";
try
{
URL nowurl;
nowurl=new URL(Nowurl);
URLConnection Conn;
Conn=nowurl.openConnection();
Conn.setDoInput(true);
InputStreamReader read = new InputStreamReader(Conn.getInputStream());
BufferedReader reader=new BufferedReader(read);
String str;
while((str=reader.readLine())!=null){
str=str+"\n";
Webcode = Webcode+str;
}
reader.close();
read.close();
}catch(IOException e){}
return Webcode.toString();
}
static void DrawUrl(String Precode)//PreCode为网页源码
{
if(deepUrls.get(Nowurl)>Maxdeepth)
{
return;
}
String link = "";
String temple = "";
String reg1 = "tp://travel.sina";
int index0 = Precode.indexOf(reg1);
//String reg2 = "";
int index1 = -1;
int index2 = -1;
int index3 = -1;//用于表示链接之前的单引号
int index4 = -1;//用于表示链接之后的单引号
if(index0>0)
{
temple = Precode.substring(0, index0-1);
index1 = temple.lastIndexOf('"');
index3 = temple.lastIndexOf("'");
index2 = Precode.indexOf('"', index0);
index4 = Precode.indexOf("'", index0);
if(index3>index1&&index4<index2)
{
link = Precode.substring(index3+1, index4);
if(!NewUrls.contains(link))
NewUrls.add(link);
deepUrls.put(link,deepUrls.get(Nowurl)+1);
}
else
{
link = Precode.substring(index1+1, index2);
if(!NewUrls.contains(link))
NewUrls.add(link);
deepUrls.put(link,deepUrls.get(Nowurl)+1);
}
}
index0 = Precode.indexOf(reg1,index2+1);
while(index0>0)
{
temple = Precode.substring(index2+1, index0-1);
index1 = index2+1+temple.lastIndexOf('"');
index3 = index2+1+temple.lastIndexOf("'");
index2 = Precode.indexOf('"', index0);
index4 = Precode.indexOf("'",index0);
if(index3>index1&&index4<index2)
{
link = Precode.substring(index3+1, index4);
if(!NewUrls.contains(link))
NewUrls.add(link);
deepUrls.put(link,deepUrls.get(Nowurl)+1);
}
else
{
link = Precode.substring(index1+1, index2);
if(!NewUrls.contains(link))
NewUrls.add(link);
deepUrls.put(link,deepUrls.get(Nowurl)+1);
}
index0 = Precode.indexOf(reg1,index2+1);
}
//////////////////////////
/////抓取相关博客部分
link = "";
temple = "";
reg1 = "ttp://blog.sina";
index0 = Precode.indexOf(reg1);
index1 = -1;
index2 = -1;
if(index0>0)
{
temple = Precode.substring(0, index0-1);
index1 = temple.lastIndexOf('"');
index2 = Precode.indexOf('"', index0);
if(index1>0&&index2>index1)
{
link = Precode.substring(index1+1, index2);
if(!NewUrls.contains(link))
BlogUrls.add(link);
}
}
index0 = Precode.indexOf(reg1,index2+1);
while(index0>0)
{
temple = Precode.substring(index2+1, index0-1);
index1 = index2+1+temple.lastIndexOf('"');
index2 = Precode.indexOf('"', index0);
if(index1>0&&index2>index1)
{
link = Precode.substring(index1+1, index2);
if(!BlogUrls.contains(link))
BlogUrls.add(link);
}
index0 = Precode.indexOf(reg1,index2+1);
}
}
static String GetNewUrl(ArrayList<String> al)
{
String tmpAUrl = al.get(0);
return tmpAUrl;
}
static void removeurl(ArrayList<String> al)
{
al.remove(0);
}
static String Drawtext1(String reg1,String reg2,String PreCode) //只抓取两个标签之间内容并且去除空格
{
String temple = "";
//String result = "";
int index1 = -1;
int index2 = -1;
int length1 = reg1.length();
//int length2 = reg2.length();
index1 = PreCode.indexOf(reg1);
index2 = PreCode.indexOf(reg2);
if(index1<0||index2<0)
return "";
temple = PreCode.substring(index1+length1, index2);
//String lastreg = "<[^>]+>";
//temple = temple.replaceAll(lastreg, "");
String kongge = "\\s*";
temple = temple.replaceAll(kongge, "");
return temple;
}
static String Drawtext2(String reg1,String reg2,String PreCode)//在Drawtext1的基础上抓起<p><\p>之间内容,然后去除其他标签
{
String endtext = "";
String result = "";
endtext = Drawtext1(reg1,reg2,PreCode);
String reg3 = "<p>";
String reg4 = "</p>";
int dex3 = -1;
int dex4 = -1;
dex3 = endtext.indexOf(reg3);
while(dex3>0)
{
dex4 = endtext.indexOf(reg4,dex3+3);
if(dex4>dex3)
result = result+endtext.substring(dex3, dex4);
else break;
dex3 = endtext.indexOf(reg3, dex4+4);
}
String lastreg = "<[^>]+>";
result = result.replaceAll(lastreg, "");
return result;
}
static void save(String text) //向文件夹中存入文本
{
try
{
File file=new File(fPath,WebIndex+".txt");
file.createNewFile();
BufferedWriter bw=new BufferedWriter(new FileWriter(file));
bw.write(text);
bw.close();
}catch(IOException e){}
WebIndex++;
}
static String deal01(String input)
{
//String output = "";
String pic = "【图\\d+】";
String pic2 = "(该图取自网络)";
input = input.replaceAll(pic, "");
input = input.replaceAll(pic2, "");
//input = input.replaceAll("[\\pP‘’“”,。·《》=]", " ");
return input;
}
static void DrawTCode(String PreCode) //网页源码为参数,抓取旅游内容
{
String result = "";
result = result+Drawtext1("<title>","</title>",PreCode);
result = result+'\n';
result = result+Nowurl+'\n';
result = result+Drawtext2("<!-- 正文页概述信息 begin -->","<!-- 正文页概述信息 end -->",PreCode);
result = result+Drawtext2("<!-- 正文部分 begin -->","<!-- 正文部分 end -->",PreCode);
result = result+Drawtext2("<!-- 正文内容 begin -->","<!-- 正文内容 end -->",PreCode);
//添加标签
if(result.length()<10)
return;
save(result);
}
static void DrawBCode(String PreCode) //网页源码为参数,抓取博客内容
{
String result = "";
result = result+Drawtext1("<title>","</title>",PreCode);
result = result+'\n';
result = result+Drawtext1("<!-- 正文开始 -->","<!-- 正文结束 -->",PreCode);
result = deal01(result);
save(result);
}
static boolean isshtml(String url)
{
int lastd = url.lastIndexOf('.');
int size = url.length();
String end = url.substring(lastd+1, size);
if(end.compareTo("shtml")==0)
return true;
else return false;
}
public static void main(String[] args)
{
Nowurl = "http://travel.sina.com.cn/";
deepUrls.put(Nowurl, 0);
DrawUrl(GetWebcode(Nowurl));
while(NewUrls.size()>0)
{
Nowurl = GetNewUrl(NewUrls);
DrawUrl(GetWebcode(Nowurl));
if(isshtml(Nowurl)==true)
{
System.out.println(Nowurl);
DrawTCode(GetWebcode(Nowurl));
removeurl(NewUrls);
}
else
{
removeurl(NewUrls);
}
System.out.println(num);
num++;
}
while(BlogUrls.size()>0)
{
DrawBCode(GetWebcode(GetNewUrl(BlogUrls)));
}
}
}希望哪位高手能够帮我解决一下这个问题~谢谢!
at java.util.Arrays.copyOfRange(Arrays.java:3209)
at java.lang.String.<init>(String.java:216)
at java.lang.StringBuilder.toString(StringBuilder.java:430)
at sina.GetWebcode(sina.java:45)
at sina.main(sina.java:282)
这个异常说明创建了太多的栈对象,栈内存不够用了。
at sina.GetWebcode(sina.java:45)从这一句就可以找到出现问题的代码在哪 String Webcode = "";
................... while((str=reader.readLine())!=null){
str=str+"\n";
Webcode = Webcode+str;
}
如果是太多的字符串连接起来,不要使用String类型,而应该使用StringBuilder或者StringBuffer。
可以修改成下面的样子 StringBuilder Webcode = new StringBuilder();
................... while((str=reader.readLine())!=null){
Webcode.append(str).append("\n");
}
内存用不溢出 参数
//堆内存溢出问题。堆内存存储的是数组和对象,凡是new建立的都在堆中。你加上System.gc();试试。
把这些编译参数加上,把jvm运行时的堆栈调大。3.System.gc()调用并不能立刻解决问题。new出的对象不用时要及时赋值为null。
这种一般都是代码问题。。