我正在做一个网络蜘蛛程序,但是当遇到一个特殊网页时,由于响应时间太久而无法继续进行,我的思路是这样,通过监听这个线程的运行时间,当时间超过限制的时候,就检索其他网页,不知道怎么解决,我的代码是这样的:
package spider;
import java.net.*;
import java.util.*;
import java.io.*;
import javax.swing.text.html.parser.*;
import javax.swing.text.html.HTMLEditorKit.*;
import javax.swing.text.html.*;
import javax.swing.text.*;
/**
* <p>Title: </p>
*
* <p>Description: </p>
*
* <p>Copyright: Copyright (c) 2006</p>
*
* <p>Company: </p>
*
* @author not attributable
* @version 1.0
*/
public class SpiderWorker extends Thread { protected UrlTreeNode target;
protected WebSpider owner;
protected boolean busy;
private UrlTreeNode node;
Vector JavaScriptLink=new Vector();
Vector realLink=new Vector();
private connData db;
private RamWorkloadStroable workLoad;
private int threadNo;
private TempResource temp;
public SpiderWorker(WebSpider owner,UrlTreeNode node,RamWorkloadStroable w)
{
this.owner=owner;
this.node=node;
this.workLoad=w; }
public int getThreadNo()
{
return this.threadNo;
}
public UrlTreeNode getTarget()
{
return this.target;
} public boolean isBusy()
{
return this.busy;
}
public void setThreadNO(int number)
{
this.threadNo=number;
} public void run()
{
System.out.println("线程"+this.threadNo+"启动!");
for(;;)
{
target=owner.getWorkload();
System.out.println("线程"+this.threadNo+"正在运行!");
if(target==null&&this.workLoad.getRunningList().size()==0)
{
System.out.println("target=null,线程将中止运作。");
return;
}
owner.getWorkDone().workerBegin();
processWorkload();
owner.getWorkDone().workerEnd();
} } public void processWorkload()
{
try{
busy = true;
System.out.println("线程"+this.threadNo+"正在处理网页:"+target.getUrl());
isStop timer=new isStop(this);
timer.start();
URL url = new URL(target.getUrl());
URLConnection urlc = url.openConnection();
urlc.connect();
InputStream is = urlc.getInputStream();
InputStream in = url.openStream();
InputStreamReader isr = new InputStreamReader(in);
Parser cb = new Parser(target, this.owner);
ParserDelegator pd = new ParserDelegator();
pd.parse(isr, cb, true);
this.owner.processPage(target);
LinkWorker linkworker = new LinkWorker(is, this.owner, cb, workLoad);
linkworker.getLink();
linkworker.getJavaScript();
Vector tempLink=new Vector();
tempLink=linkworker.getLastLink();
for(int i=0;i<tempLink.size();i++)
{
this.owner.foundInternalLink((UrlTreeNode)tempLink.elementAt(i));
}
isr.close(); owner.completePage(this.target);
System.out.println("网页:"+this.target.getUrl()+"已经由线程"+this.threadNo+"完成!"); }
catch (MalformedURLException ex)
{ }
catch (IOException e)
{ }
finally {
busy = false;
} } public void init(UrlTreeNode node)
{
}
public RamWorkloadStroable getWorkLoad()
{
return this.workLoad;
}
class isStop
{
private final Timer timer = new Timer();
private final int minutes=1;
private SpiderWorker spiderWorker;
public isStop(SpiderWorker worker)
{
this.spiderWorker=worker;
}
public void start()
{
timer.schedule(new TimerTask() {
public void run()
{
System.out.print("线程"+spiderWorker.getThreadNo()+"已经延时!");
spiderWorker.interrupt();
spiderWorker.getWorkLoad().completeWorkload(spiderWorker.getTarget());
timer.cancel(); } }, minutes * 10 * 1000
);
}
public void end()
{
timer.cancel();
} }
}
但是这样做问题还是没有解决,碰到那些响应时间很长的网页还是卡住,请教高手怎么解决,不甚感激!
package spider;
import java.net.*;
import java.util.*;
import java.io.*;
import javax.swing.text.html.parser.*;
import javax.swing.text.html.HTMLEditorKit.*;
import javax.swing.text.html.*;
import javax.swing.text.*;
/**
* <p>Title: </p>
*
* <p>Description: </p>
*
* <p>Copyright: Copyright (c) 2006</p>
*
* <p>Company: </p>
*
* @author not attributable
* @version 1.0
*/
public class SpiderWorker extends Thread { protected UrlTreeNode target;
protected WebSpider owner;
protected boolean busy;
private UrlTreeNode node;
Vector JavaScriptLink=new Vector();
Vector realLink=new Vector();
private connData db;
private RamWorkloadStroable workLoad;
private int threadNo;
private TempResource temp;
public SpiderWorker(WebSpider owner,UrlTreeNode node,RamWorkloadStroable w)
{
this.owner=owner;
this.node=node;
this.workLoad=w; }
public int getThreadNo()
{
return this.threadNo;
}
public UrlTreeNode getTarget()
{
return this.target;
} public boolean isBusy()
{
return this.busy;
}
public void setThreadNO(int number)
{
this.threadNo=number;
} public void run()
{
System.out.println("线程"+this.threadNo+"启动!");
for(;;)
{
target=owner.getWorkload();
System.out.println("线程"+this.threadNo+"正在运行!");
if(target==null&&this.workLoad.getRunningList().size()==0)
{
System.out.println("target=null,线程将中止运作。");
return;
}
owner.getWorkDone().workerBegin();
processWorkload();
owner.getWorkDone().workerEnd();
} } public void processWorkload()
{
try{
busy = true;
System.out.println("线程"+this.threadNo+"正在处理网页:"+target.getUrl());
isStop timer=new isStop(this);
timer.start();
URL url = new URL(target.getUrl());
URLConnection urlc = url.openConnection();
urlc.connect();
InputStream is = urlc.getInputStream();
InputStream in = url.openStream();
InputStreamReader isr = new InputStreamReader(in);
Parser cb = new Parser(target, this.owner);
ParserDelegator pd = new ParserDelegator();
pd.parse(isr, cb, true);
this.owner.processPage(target);
LinkWorker linkworker = new LinkWorker(is, this.owner, cb, workLoad);
linkworker.getLink();
linkworker.getJavaScript();
Vector tempLink=new Vector();
tempLink=linkworker.getLastLink();
for(int i=0;i<tempLink.size();i++)
{
this.owner.foundInternalLink((UrlTreeNode)tempLink.elementAt(i));
}
isr.close(); owner.completePage(this.target);
System.out.println("网页:"+this.target.getUrl()+"已经由线程"+this.threadNo+"完成!"); }
catch (MalformedURLException ex)
{ }
catch (IOException e)
{ }
finally {
busy = false;
} } public void init(UrlTreeNode node)
{
}
public RamWorkloadStroable getWorkLoad()
{
return this.workLoad;
}
class isStop
{
private final Timer timer = new Timer();
private final int minutes=1;
private SpiderWorker spiderWorker;
public isStop(SpiderWorker worker)
{
this.spiderWorker=worker;
}
public void start()
{
timer.schedule(new TimerTask() {
public void run()
{
System.out.print("线程"+spiderWorker.getThreadNo()+"已经延时!");
spiderWorker.interrupt();
spiderWorker.getWorkLoad().completeWorkload(spiderWorker.getTarget());
timer.cancel(); } }, minutes * 10 * 1000
);
}
public void end()
{
timer.cancel();
} }
}
但是这样做问题还是没有解决,碰到那些响应时间很长的网页还是卡住,请教高手怎么解决,不甚感激!
我不知道你的
pd.parse(isr, cb, true);
这个方法中是怎么使用isr的。如果是直接使用,那么就是你错误理解InputStreamReader类的用法了。不过应该不会那么做。不过接口方法这么设计不是很好。
另外,最好使用BufferedInputStream或者BufferedReader,如果你还没有这么做的话。2)URLConnection类有一个方法
setConnectTimeout(int timeout)
设定一个值,如果超时可以自动断开。
不知道能不能解决你的问题,你试一下吧。