接着上面的
// the URL as searched (we want this one way or the other)
vectorToSearch.removeElementAt(0);
vectorSearched.addElement(strURL); // can only search http: protocol URLs
if (url.getProtocol().compareTo("http") != 0)
break; // test to make sure it is before searching
if (!robotSafe(url))
break; try {
// try opening the URL
URLConnection urlConnection = url.openConnection(); urlConnection.setAllowUserInteraction(false); InputStream urlStream = url.openStream();
String type
= urlConnection.guessContentTypeFromStream(urlStream);
if (type == null)
break;
if (type.compareTo("text/html") != 0)
break; // search the input stream for links
// first, read in the entire URL
byte b[] = new byte[1000];
int numRead = urlStream.read(b);
String content = new String(b, 0, numRead);
while (numRead != -1) {
if (Thread.currentThread() != searchThread)
break;
numRead = urlStream.read(b);
if (numRead != -1) {
String newContent = new String(b, 0, numRead);
content += newContent;
}
}
urlStream.close(); if (Thread.currentThread() != searchThread)
break; String lowerCaseContent = content.toLowerCase(); int index = 0;
while ((index = lowerCaseContent.indexOf("<a", index)) != -1)
{
if ((index = lowerCaseContent.indexOf("href", index)) == -1)
break;
if ((index = lowerCaseContent.indexOf("=", index)) == -1)
break;
if (Thread.currentThread() != searchThread)
break; index++;
String remaining = content.substring(index); StringTokenizer st
= new StringTokenizer(remaining, "\t\n\r\">#");
String strLink = st.nextToken(); URL urlLink;
try {
urlLink = new URL(url, strLink);
strLink = urlLink.toString();
} catch (MalformedURLException e) {
setStatus("ERROR: bad URL " + strLink);
continue;
} // only look at http links
if (urlLink.getProtocol().compareTo("http") != 0)
break; if (Thread.currentThread() != searchThread)
break; try {
// try opening the URL
URLConnection urlLinkConnection
= urlLink.openConnection();
urlLinkConnection.setAllowUserInteraction(false);
InputStream linkStream = urlLink.openStream();
String strType
= urlLinkConnection.guessContentTypeFromStream(linkStream);
linkStream.close(); // if another page, add to the end of search list
if (strType == null)
break;
if (strType.compareTo("text/html") == 0) {
// check to see if this URL has already been
// searched or is going to be searched
if ((!vectorSearched.contains(strLink))
&& (!vectorToSearch.contains(strLink))) { // test to make sure it is robot-safe!
if (robotSafe(urlLink))
vectorToSearch.addElement(strLink);
}
} // if the proper type, add it to the results list
// unless we have already seen it
if (strType.compareTo(strTargetType) == 0) {
if (vectorMatches.contains(strLink) == false) {
listMatches.add(strLink);
vectorMatches.addElement(strLink);
numberFound++;
if (numberFound >= SEARCH_LIMIT)
break;
}
}
} catch (IOException e) {
setStatus("ERROR: couldn't open URL " + strLink);
continue;
}
}
} catch (IOException e) {
setStatus("ERROR: couldn't open URL " + strURL);
break;
} numberSearched++;
if (numberSearched >= SEARCH_LIMIT)
break;
} if (numberSearched >= SEARCH_LIMIT || numberFound >= SEARCH_LIMIT)
setStatus("reached search limit of " + SEARCH_LIMIT);
else
setStatus("done");
searchThread = null;
// searchThread.stop();
}

/**
 * Displays a status message by setting the text of {@code labelStatus}.
 *
 * @param status the message to display
 */
void setStatus(String status) {
    labelStatus.setText(status);
}

/**
 * Dispatches on the event's action command: {@code SEARCH} launches the
 * search thread, {@code STOP} calls {@code stop()}. Any other command
 * (including a {@code null} one) is ignored.
 *
 * @param event the action event to handle
 */
public void actionPerformed(ActionEvent event) {
    // Constant-first equals() is null-safe, unlike command.compareTo(...).
    String command = event.getActionCommand();

    if (SEARCH.equals(command)) {
        setStatus("searching...");

        // Work on a local copy: the field is reset to null elsewhere
        // (e.g. at the end of the search loop), so re-reading it after
        // the null check could yield null.
        Thread worker = searchThread;
        if (worker == null) {
            worker = new Thread(this);
            searchThread = worker;
        }

        // A Thread may be started only once; calling start() again on a
        // running search throws IllegalThreadStateException. Only launch
        // a thread that has never been started.
        if (worker.getState() == Thread.State.NEW) {
            worker.start();
        }
    } else if (STOP.equals(command)) {
        stop();
    }
}
/**
 * Runs the crawler as a stand-alone application: hosts a {@code WebCrawler}
 * in the center of a {@code Frame} titled "WebFrame", configures the HTTP
 * proxy system properties, then initializes and starts the applet and
 * shows the window.
 *
 * @param argv command-line arguments (unused)
 */
public static void main (String argv[])
{
    Frame f = new Frame("WebFrame");
    WebCrawler applet = new WebCrawler();
    f.add("Center", applet);

    // Behind a firewall set your proxy and port here!
    // Set the keys directly on the live system properties. Replacing the
    // whole table with an empty Properties that only chains to defaults
    // hides every system property from get()/keySet()/entrySet() callers.
    System.setProperty("http.proxySet", "true");
    System.setProperty("http.proxyHost", "webcache-cup");
    System.setProperty("http.proxyPort", "8080");

    applet.init();
    applet.start();
    f.pack();
    // setVisible(true) replaces the deprecated show().
    f.setVisible(true);
}
}
// the URL as searched (we want this one way or the other)
vectorToSearch.removeElementAt(0);
vectorSearched.addElement(strURL); // can only search http: protocol URLs
if (url.getProtocol().compareTo("http") != 0)
break; // test to make sure it is before searching
if (!robotSafe(url))
break; try {
// try opening the URL
URLConnection urlConnection = url.openConnection(); urlConnection.setAllowUserInteraction(false); InputStream urlStream = url.openStream();
String type
= urlConnection.guessContentTypeFromStream(urlStream);
if (type == null)
break;
if (type.compareTo("text/html") != 0)
break; // search the input stream for links
// first, read in the entire URL
byte b[] = new byte[1000];
int numRead = urlStream.read(b);
String content = new String(b, 0, numRead);
while (numRead != -1) {
if (Thread.currentThread() != searchThread)
break;
numRead = urlStream.read(b);
if (numRead != -1) {
String newContent = new String(b, 0, numRead);
content += newContent;
}
}
urlStream.close(); if (Thread.currentThread() != searchThread)
break; String lowerCaseContent = content.toLowerCase(); int index = 0;
while ((index = lowerCaseContent.indexOf("<a", index)) != -1)
{
if ((index = lowerCaseContent.indexOf("href", index)) == -1)
break;
if ((index = lowerCaseContent.indexOf("=", index)) == -1)
break;
if (Thread.currentThread() != searchThread)
break; index++;
String remaining = content.substring(index); StringTokenizer st
= new StringTokenizer(remaining, "\t\n\r\">#");
String strLink = st.nextToken(); URL urlLink;
try {
urlLink = new URL(url, strLink);
strLink = urlLink.toString();
} catch (MalformedURLException e) {
setStatus("ERROR: bad URL " + strLink);
continue;
} // only look at http links
if (urlLink.getProtocol().compareTo("http") != 0)
break; if (Thread.currentThread() != searchThread)
break; try {
// try opening the URL
URLConnection urlLinkConnection
= urlLink.openConnection();
urlLinkConnection.setAllowUserInteraction(false);
InputStream linkStream = urlLink.openStream();
String strType
= urlLinkConnection.guessContentTypeFromStream(linkStream);
linkStream.close(); // if another page, add to the end of search list
if (strType == null)
break;
if (strType.compareTo("text/html") == 0) {
// check to see if this URL has already been
// searched or is going to be searched
if ((!vectorSearched.contains(strLink))
&& (!vectorToSearch.contains(strLink))) { // test to make sure it is robot-safe!
if (robotSafe(urlLink))
vectorToSearch.addElement(strLink);
}
} // if the proper type, add it to the results list
// unless we have already seen it
if (strType.compareTo(strTargetType) == 0) {
if (vectorMatches.contains(strLink) == false) {
listMatches.add(strLink);
vectorMatches.addElement(strLink);
numberFound++;
if (numberFound >= SEARCH_LIMIT)
break;
}
}
} catch (IOException e) {
setStatus("ERROR: couldn't open URL " + strLink);
continue;
}
}
} catch (IOException e) {
setStatus("ERROR: couldn't open URL " + strURL);
break;
} numberSearched++;
if (numberSearched >= SEARCH_LIMIT)
break;
} if (numberSearched >= SEARCH_LIMIT || numberFound >= SEARCH_LIMIT)
setStatus("reached search limit of " + SEARCH_LIMIT);
else
setStatus("done");
searchThread = null;
// searchThread.stop();
}

/**
 * Displays a status message by setting the text of {@code labelStatus}.
 *
 * @param status the message to display
 */
void setStatus(String status) {
    labelStatus.setText(status);
}

/**
 * Dispatches on the event's action command: {@code SEARCH} launches the
 * search thread, {@code STOP} calls {@code stop()}. Any other command
 * (including a {@code null} one) is ignored.
 *
 * @param event the action event to handle
 */
public void actionPerformed(ActionEvent event) {
    // Constant-first equals() is null-safe, unlike command.compareTo(...).
    String command = event.getActionCommand();

    if (SEARCH.equals(command)) {
        setStatus("searching...");

        // Work on a local copy: the field is reset to null elsewhere
        // (e.g. at the end of the search loop), so re-reading it after
        // the null check could yield null.
        Thread worker = searchThread;
        if (worker == null) {
            worker = new Thread(this);
            searchThread = worker;
        }

        // A Thread may be started only once; calling start() again on a
        // running search throws IllegalThreadStateException. Only launch
        // a thread that has never been started.
        if (worker.getState() == Thread.State.NEW) {
            worker.start();
        }
    } else if (STOP.equals(command)) {
        stop();
    }
}
/**
 * Runs the crawler as a stand-alone application: hosts a {@code WebCrawler}
 * in the center of a {@code Frame} titled "WebFrame", configures the HTTP
 * proxy system properties, then initializes and starts the applet and
 * shows the window.
 *
 * @param argv command-line arguments (unused)
 */
public static void main (String argv[])
{
    Frame f = new Frame("WebFrame");
    WebCrawler applet = new WebCrawler();
    f.add("Center", applet);

    // Behind a firewall set your proxy and port here!
    // Set the keys directly on the live system properties. Replacing the
    // whole table with an empty Properties that only chains to defaults
    // hides every system property from get()/keySet()/entrySet() callers.
    System.setProperty("http.proxySet", "true");
    System.setProperty("http.proxyHost", "webcache-cup");
    System.setProperty("http.proxyPort", "8080");

    applet.init();
    applet.start();
    f.pack();
    // setVisible(true) replaces the deprecated show().
    f.setVisible(true);
}
}
vectorToSearch.removeElementAt(0);
vectorSearched.addElement(strURL); // can only search http: protocol URLs
if (url.getProtocol().compareTo("http") != 0)
break; // test to make sure it is before searching
if (!robotSafe(url))
break; try {
// try opening the URL
URLConnection urlConnection = url.openConnection(); urlConnection.setAllowUserInteraction(false); InputStream urlStream = url.openStream();
String type
= urlConnection.guessContentTypeFromStream(urlStream);
if (type == null)
break;
if (type.compareTo("text/html") != 0)
break; // search the input stream for links
// first, read in the entire URL
byte b[] = new byte[1000];
int numRead = urlStream.read(b);
String content = new String(b, 0, numRead);
while (numRead != -1) {
if (Thread.currentThread() != searchThread)
break;
numRead = urlStream.read(b);
if (numRead != -1) {
String newContent = new String(b, 0, numRead);
content += newContent;
}
}
urlStream.close(); if (Thread.currentThread() != searchThread)
break; String lowerCaseContent = content.toLowerCase(); int index = 0;
while ((index = lowerCaseContent.indexOf("<a", index)) != -1)
{
if ((index = lowerCaseContent.indexOf("href", index)) == -1)
break;
if ((index = lowerCaseContent.indexOf("=", index)) == -1)
break;
if (Thread.currentThread() != searchThread)
break; index++;
String remaining = content.substring(index); StringTokenizer st
= new StringTokenizer(remaining, "\t\n\r\">#");
String strLink = st.nextToken(); URL urlLink;
try {
urlLink = new URL(url, strLink);
strLink = urlLink.toString();
} catch (MalformedURLException e) {
setStatus("ERROR: bad URL " + strLink);
continue;
} // only look at http links
if (urlLink.getProtocol().compareTo("http") != 0)
break; if (Thread.currentThread() != searchThread)
break; try {
// try opening the URL
URLConnection urlLinkConnection
= urlLink.openConnection();
urlLinkConnection.setAllowUserInteraction(false);
InputStream linkStream = urlLink.openStream();
String strType
= urlLinkConnection.guessContentTypeFromStream(linkStream);
linkStream.close(); // if another page, add to the end of search list
if (strType == null)
break;
if (strType.compareTo("text/html") == 0) {
// check to see if this URL has already been
// searched or is going to be searched
if ((!vectorSearched.contains(strLink))
&& (!vectorToSearch.contains(strLink))) { // test to make sure it is robot-safe!
if (robotSafe(urlLink))
vectorToSearch.addElement(strLink);
}
} // if the proper type, add it to the results list
// unless we have already seen it
if (strType.compareTo(strTargetType) == 0) {
if (vectorMatches.contains(strLink) == false) {
listMatches.add(strLink);
vectorMatches.addElement(strLink);
numberFound++;
if (numberFound >= SEARCH_LIMIT)
break;
}
}
} catch (IOException e) {
setStatus("ERROR: couldn't open URL " + strLink);
continue;
}
}
} catch (IOException e) {
setStatus("ERROR: couldn't open URL " + strURL);
break;
} numberSearched++;
if (numberSearched >= SEARCH_LIMIT)
break;
} if (numberSearched >= SEARCH_LIMIT || numberFound >= SEARCH_LIMIT)
setStatus("reached search limit of " + SEARCH_LIMIT);
else
setStatus("done");
searchThread = null;
// searchThread.stop();
}

/**
 * Displays a status message by setting the text of {@code labelStatus}.
 *
 * @param status the message to display
 */
void setStatus(String status) {
    labelStatus.setText(status);
}

/**
 * Dispatches on the event's action command: {@code SEARCH} launches the
 * search thread, {@code STOP} calls {@code stop()}. Any other command
 * (including a {@code null} one) is ignored.
 *
 * @param event the action event to handle
 */
public void actionPerformed(ActionEvent event) {
    // Constant-first equals() is null-safe, unlike command.compareTo(...).
    String command = event.getActionCommand();

    if (SEARCH.equals(command)) {
        setStatus("searching...");

        // Work on a local copy: the field is reset to null elsewhere
        // (e.g. at the end of the search loop), so re-reading it after
        // the null check could yield null.
        Thread worker = searchThread;
        if (worker == null) {
            worker = new Thread(this);
            searchThread = worker;
        }

        // A Thread may be started only once; calling start() again on a
        // running search throws IllegalThreadStateException. Only launch
        // a thread that has never been started.
        if (worker.getState() == Thread.State.NEW) {
            worker.start();
        }
    } else if (STOP.equals(command)) {
        stop();
    }
}
/**
 * Runs the crawler as a stand-alone application: hosts a {@code WebCrawler}
 * in the center of a {@code Frame} titled "WebFrame", configures the HTTP
 * proxy system properties, then initializes and starts the applet and
 * shows the window.
 *
 * @param argv command-line arguments (unused)
 */
public static void main (String argv[])
{
    Frame f = new Frame("WebFrame");
    WebCrawler applet = new WebCrawler();
    f.add("Center", applet);

    // Behind a firewall set your proxy and port here!
    // Set the keys directly on the live system properties. Replacing the
    // whole table with an empty Properties that only chains to defaults
    // hides every system property from get()/keySet()/entrySet() callers.
    System.setProperty("http.proxySet", "true");
    System.setProperty("http.proxyHost", "webcache-cup");
    System.setProperty("http.proxyPort", "8080");

    applet.init();
    applet.start();
    f.pack();
    // setVisible(true) replaces the deprecated show().
    f.setVisible(true);
}
}
http://www.csdn.net/expert/topic/701/701403.xml?temp=.7961542