My code looks like this:

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

static ArrayList<String> goodlist = new ArrayList<String>();
static ArrayList<String> badlist = new ArrayList<String>(); // used below but missing from my original paste

// Compile the pattern once instead of once per input line.
// Note: because of the greedy leading .*, it captures at most one href per line (the last one).
static Pattern p = Pattern.compile("(.*href=?[\"])(.*?)([\"].*)");

public static void urlList(String url) throws Exception {
    String link = null;
    URL myurl = new URL(url);
    if (checkLink(myurl)) {
        BufferedReader in = new BufferedReader(new InputStreamReader(myurl.openStream()));
        String inputLine;
        while ((inputLine = in.readLine()) != null) {
            Matcher m = p.matcher(inputLine);
            while (m.find()) {
                if (!m.group(2).startsWith("http")) {
                    // relative link: glue it onto the page URL
                    link = url + "/" + m.group(2);
                } else {
                    link = m.group(2);
                }
                // normalize .../index.html to the bare directory URL before the
                // dedup check; the original stripped it afterwards and so added
                // the same link twice
                if (link.contains("/index.html")) {
                    link = link.replaceAll("/index.html", "");
                }
                if (!goodlist.contains(link)) {
                    goodlist.add(link);
                }
            }
        }
        in.close();
    } else {
        badlist.add(url); // the original added link here, which is still null at this point
    }
}

Calling it with String str = "http://www.yahoo.co.uk"; pl.urlList(str); fills goodlist with all the subpages found at that URL.
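For reference, a minimal driver for the fragment above might look like the sketch below. The checkLink stand-in is invented here just to make the fragment self-contained; the real helper is not shown in the post.

// hypothetical stand-in for the real checkLink helper, which the post does not show
static boolean checkLink(URL u) {
    return u.getProtocol().startsWith("http");
}

public static void main(String[] args) throws Exception {
    urlList("http://www.yahoo.co.uk");
    for (int i = 0; i < goodlist.size(); i++) {
        System.out.println(goodlist.get(i)); // every link found on the start page
    }
}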
My problem is that each subpage contains subpages of its own, so to find them all I presumably need recursion. My idea was to add the following right after in.close() in the code above:

in.close();
// System.out.println(goodlist.size());
for (int i = 0; i < goodlist.size(); i++) {
    System.out.println(goodlist.get(i));
    urlList(goodlist.get(i)); // recurse into each page that has been found
}

But that does not seem to work: it fails with an error at runtime. Could someone take a look and tell me what is wrong with this recursion? Thanks.
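A likely reason the plain recursive version dies at runtime: every recursive call walks the whole of goodlist from index 0 and recurses into pages that were already fetched, so the same URLs are downloaded over and over until the stack overflows; and new URL(...) throws MalformedURLException the moment the regex captures an href such as javascript:... or a fragment that the string concatenation turns into garbage. Below is a minimal sketch of a recursion with both problems guarded, reusing goodlist, badlist, and urlList from above; the visited set, the try/catch, and the name crawlRecursive are additions for illustration, not part of the original post.

import java.util.ArrayList;
import java.util.HashSet;

static HashSet<String> visited = new HashSet<String>(); // pages already fetched

public static void crawlRecursive(String url) {
    if (!visited.add(url)) {
        return; // add() returns false when the URL was seen before
    }
    try {
        urlList(url); // appends this page's links to goodlist
    } catch (Exception e) {
        badlist.add(url); // malformed or unreachable URL: record it and move on
        return;
    }
    // snapshot the list, because urlList keeps growing goodlist while we recurse
    ArrayList<String> snapshot = new ArrayList<String>(goodlist);
    for (int i = 0; i < snapshot.size(); i++) {
        crawlRecursive(snapshot.get(i));
    }
}

Even with the guard, a deep site can still blow the stack, which is why the loop-based rewrite below is the safer shape.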
So I reworked it to drive the crawl with a loop over goodlist (which keeps growing as links are found) instead of recursing:

public static void crawl(String url) throws Exception { // enclosing signature was missing from the paste; the name is a guess
    goodlist.add(url); // seed the list with the start URL
    for (int i = 0; i < goodlist.size(); i++) {
        // goodlist grows while we walk it; the index loop picks up the new entries
        extractUrls(goodlist.get(i)); // cast no longer needed with ArrayList<String>
    }
}

public static void extractUrls(String url) throws Exception {
    String link = null;
    URL myurl = new URL(url);
    if (checkLink(myurl)) {
        BufferedReader in = new BufferedReader(new InputStreamReader(myurl.openStream()));
        String inputLine;
        while ((inputLine = in.readLine()) != null) {
            Matcher m = p.matcher(inputLine); // same pattern as in urlList above
            while (m.find()) {
                if (!m.group(2).startsWith("http")) {
                    link = url + "/" + m.group(2);
                } else {
                    link = m.group(2);
                }
                if (link.contains("/index.html")) {
                    link = link.replaceAll("/index.html", "");
                }
                if (!goodlist.contains(link)) {
                    goodlist.add(link);
                }
            }
        }
        in.close();
    } else {
        badlist.add(url); // again: link would still be null here
    }
}
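One detail worth noting about the driver: the index-based loop is what makes modifying goodlist during the walk legal. A for-each (or explicit Iterator) over an ArrayList is fail-fast and throws ConcurrentModificationException if the list is structurally modified mid-iteration, while re-reading size() on every pass simply picks up the appended entries. A tiny illustration (not from the post):

ArrayList<String> list = new ArrayList<String>();
list.add("a");
for (int i = 0; i < list.size(); i++) {
    if (list.size() < 3) {
        list.add(list.get(i) + "x"); // appending while looping is fine with an index
    }
}
// for (String s : list) { list.add(s); } // this form would throw ConcurrentModificationException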
This goodlist is simply the list of all the hyperlinks that have been found. It looks fine to me; could an expert help me analyze it? And what sort of things should I be paying attention to in a program like this?
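A few things that commonly bite this kind of crawler, none of them shown in the post: goodlist.contains(...) is a linear scan, so deduplication cost grows quadratically with the number of links; one malformed href aborts the whole run because new URL(...) throws out of the loop; and without a host check the crawl happily wanders off to every external site it links to. A hardened variant of the driver, sketched under those assumptions (crawlSameHost, the seen set, and the cap are inventions for illustration):

import java.net.URL;
import java.util.ArrayList;
import java.util.HashSet;

public static void crawlSameHost(String start) throws Exception {
    String host = new URL(start).getHost(); // stay on the starting site
    HashSet<String> seen = new HashSet<String>(); // O(1) dedup instead of goodlist.contains
    ArrayList<String> worklist = new ArrayList<String>();
    worklist.add(start);
    seen.add(start);
    for (int i = 0; i < worklist.size() && i < 1000; i++) { // hard cap so a huge site cannot run forever
        String url = worklist.get(i);
        int before = goodlist.size();
        try {
            extractUrls(url); // appends newly found links to goodlist
        } catch (Exception e) {
            badlist.add(url); // one bad page should not stop the crawl
            continue;
        }
        for (int j = before; j < goodlist.size(); j++) { // only the links this page added
            String link = goodlist.get(j);
            if (!seen.add(link)) {
                continue; // already queued or visited
            }
            try {
                if (host.equals(new URL(link).getHost())) {
                    worklist.add(link); // follow links on the same host only
                }
            } catch (Exception malformed) {
                badlist.add(link); // e.g. javascript: or a broken relative path
            }
        }
    }
}

Beyond that, a real crawler should also respect robots.txt and rate-limit its requests rather than hammering the server in a tight loop.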