小弟正在做一个小说采集系统 做到文件采集保存时发现 文章内容页采集卡死问题 具体问题为:内容页采集 刚开始时非常流畅 并且能够顺利保存到txt 但是采集到后面(一般到第二十章)时页面非常卡 具体代码如下:
String Url=caijiInfo_YueduURL.replace("index.html","");
java.io.InputStream m_urlStream;
java.net.URL m_url = new java.net.URL(Url);
java.net.HttpURLConnection m_connection = (java.net.HttpURLConnection) m_url.openConnection();
m_connection.connect();
m_urlStream = m_connection.getInputStream();
java.io.BufferedReader m_reader = new java.io.BufferedReader(new java.io.InputStreamReader(m_urlStream));
while ((sCurrentLine = m_reader.readLine()) != null)
{
sTotalString+=sCurrentLine;
}
//采集章节名称
String caijiDirectory_Name_Ex =caijDirectory_Name_Start+"(.+?)"+caijDirectory_Name_End;
Pattern Directory_Namep = Pattern.compile(caijiDirectory_Name_Ex, Pattern.CASE_INSENSITIVE);
Matcher Directory_Namem = Directory_Namep.matcher(sTotalString);
String caijDirectory_Url_Ex =caijDirectory_Url_Start+"([^\"]*)"+caijDirectory_Url_End;
Pattern Directory_Urlp = Pattern.compile(caijDirectory_Url_Ex, Pattern.CASE_INSENSITIVE);
Matcher Directory_Urlm = Directory_Urlp.matcher(sTotalString);
sTotalString=null;
out.print("开始采集内容页:<br>");
while(Directory_Namem.find()&&Directory_Urlm.find()){
caijDirectory_Name=Directory_Namem.group(0);
caijDirectory_Name=GuolvHtmlBiaoqian.Html2Text(caijDirectory_Name);
caijDirectory_Url=Directory_Urlm.group(0);
caijDirectory_Url=caijDirectory_Url.replace(caijDirectory_Url_Start,"");
caijDirectory_Url=caijDirectory_Url.replace(caijDirectory_Url_End,"");
//ht.put(caijDirectory_Url,caijDirectory_Name);
//list.add(caijDirectory_Url);
if(null!=caijDirectory_Url){
java.io.InputStream n_urlStream;
java.net.URL n_url = new java.net.URL(Url+caijDirectory_Url+".html");
java.net.HttpURLConnection n_connection = (java.net.HttpURLConnection) n_url.openConnection();
n_connection.connect();
n_urlStream = n_connection.getInputStream();
java.io.BufferedReader n_reader = new java.io.BufferedReader(new java.io.InputStreamReader(n_urlStream));
while ((sCurrentLine = n_reader.readLine()) != null)
{
sTotalString+=sCurrentLine;
}
String caijiContentEx =caijiContent_Start+"(.+?)"+caijiContent_End;
Pattern Contentp = Pattern.compile(caijiContentEx, Pattern.CASE_INSENSITIVE);
Matcher Contentm = Contentp.matcher(sTotalString);
sTotalString=null;
while(Contentm.find()){
caijiContent=Contentm.group(0);
caijiContent=caijiContent.replace(caijiContent_Start,"");
caijiContent=caijiContent.replace(caijiContent_End,"");
textCount=textCount+1; //采集内容保存到txt
menu.Savetxt(wenjian,"\r\n "+caijDirectory_Name+":\r\n "+caijiContent);
//就是上面的代码卡死问题 可能是文件大了读写速度慢的原因
out.print(textCount+"."+caijDirectory_Name+" ");
out.flush();
}
n_urlStream.close();
n_reader.close();
}
m_urlStream.close();
m_reader.close();
sTotalString=null;
}
String Url=caijiInfo_YueduURL.replace("index.html","");
java.io.InputStream m_urlStream;
java.net.URL m_url = new java.net.URL(Url);
java.net.HttpURLConnection m_connection = (java.net.HttpURLConnection) m_url.openConnection();
m_connection.connect();
m_urlStream = m_connection.getInputStream();
java.io.BufferedReader m_reader = new java.io.BufferedReader(new java.io.InputStreamReader(m_urlStream));
while ((sCurrentLine = m_reader.readLine()) != null)
{
sTotalString+=sCurrentLine;
}
//采集章节名称
String caijiDirectory_Name_Ex =caijDirectory_Name_Start+"(.+?)"+caijDirectory_Name_End;
Pattern Directory_Namep = Pattern.compile(caijiDirectory_Name_Ex, Pattern.CASE_INSENSITIVE);
Matcher Directory_Namem = Directory_Namep.matcher(sTotalString);
String caijDirectory_Url_Ex =caijDirectory_Url_Start+"([^\"]*)"+caijDirectory_Url_End;
Pattern Directory_Urlp = Pattern.compile(caijDirectory_Url_Ex, Pattern.CASE_INSENSITIVE);
Matcher Directory_Urlm = Directory_Urlp.matcher(sTotalString);
sTotalString=null;
out.print("开始采集内容页:<br>");
while(Directory_Namem.find()&&Directory_Urlm.find()){
caijDirectory_Name=Directory_Namem.group(0);
caijDirectory_Name=GuolvHtmlBiaoqian.Html2Text(caijDirectory_Name);
caijDirectory_Url=Directory_Urlm.group(0);
caijDirectory_Url=caijDirectory_Url.replace(caijDirectory_Url_Start,"");
caijDirectory_Url=caijDirectory_Url.replace(caijDirectory_Url_End,"");
//ht.put(caijDirectory_Url,caijDirectory_Name);
//list.add(caijDirectory_Url);
if(null!=caijDirectory_Url){
java.io.InputStream n_urlStream;
java.net.URL n_url = new java.net.URL(Url+caijDirectory_Url+".html");
java.net.HttpURLConnection n_connection = (java.net.HttpURLConnection) n_url.openConnection();
n_connection.connect();
n_urlStream = n_connection.getInputStream();
java.io.BufferedReader n_reader = new java.io.BufferedReader(new java.io.InputStreamReader(n_urlStream));
while ((sCurrentLine = n_reader.readLine()) != null)
{
sTotalString+=sCurrentLine;
}
String caijiContentEx =caijiContent_Start+"(.+?)"+caijiContent_End;
Pattern Contentp = Pattern.compile(caijiContentEx, Pattern.CASE_INSENSITIVE);
Matcher Contentm = Contentp.matcher(sTotalString);
sTotalString=null;
while(Contentm.find()){
caijiContent=Contentm.group(0);
caijiContent=caijiContent.replace(caijiContent_Start,"");
caijiContent=caijiContent.replace(caijiContent_End,"");
textCount=textCount+1; //采集内容保存到txt
menu.Savetxt(wenjian,"\r\n "+caijDirectory_Name+":\r\n "+caijiContent);
//就是上面的代码卡死问题 可能是文件大了读写速度慢的原因
out.print(textCount+"."+caijDirectory_Name+" ");
out.flush();
}
n_urlStream.close();
n_reader.close();
}
m_urlStream.close();
m_reader.close();
sTotalString=null;
}
public static void Savetxt(String file, String txt) {
String s = new String();
String s1 = new String();
txt=txt.replace("<br />", "\r\n");
txt=txt.replace(" ", " ");
txt=txt.replace(" ", " ");
try {
File f = new File(file);
if (f.exists()) {
} else {
if (f.createNewFile()) {
}
}
BufferedReader input = new BufferedReader(new FileReader(f)); while ((s = input.readLine()) != null) {
s1 += s + "\r\n";
}
input.close();
s1 += txt; BufferedWriter output = new BufferedWriter(new FileWriter(f));
output.write(s1);
output.close();
} catch (Exception e) {
e.printStackTrace();
}
}
1.我是让你在循环外面定义,在循环内赋值。比如:
String str = null;
while(it.hasNext()){
str = it.next();
}
这样的话,开销会比较小。
3.jsp是多线程,但不代表你的pg不能再产生多个线程。jsp的多线程对你来说,是透明的。