我需要传入一个URL,然后打开该URL,从返回的页面中提取出所有的链接地址。代码如下:
import java.io.*;
import java.net.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;public class URLTest {
/**
 * Opens the given URL and prints the href attribute of anchors found inside tables.
 *
 * The response is decoded as gb2312 and examined one line at a time. On each line only
 * the first {@code <table ...>...</table>} match is taken, then the first
 * {@code <a ...>...</a>} inside it, then the first {@code href="..."} inside that anchor.
 * The printed value is the whole attribute text, including the {@code href=} prefix and quotes.
 *
 * A MalformedURLException prints a fixed message; any other exception is silently ignored.
 * NOTE(review): none of the streams opened here is ever closed.
 *
 * @param u address of the page to read
 */
public static void getUrl(String u){
//a URL instance
URL url ;
//input stream
InputStream is ;
InputStreamReader isr ;
//buffered reader
BufferedReader br ;
String str ;
try{
System.out.println("READING URL:"+u);
//instantiate the URL
url = new URL(u);
is = url.openStream() ;
isr = new InputStreamReader(is,"gb2312") ;
br = new BufferedReader(isr) ;
// NOTE(review): read() consumes one character from the stream before every readLine(),
// so the character it returns is never part of the text that gets matched.
while(br.read()!= -1){
str = br.readLine();
//only proceed if a line was actually read
if(str!=null){
//define the regex
String zhengzhe = "<table.*?/table>" ;
//match case-insensitively
Pattern patt = Pattern.compile(zhengzhe, Pattern.CASE_INSENSITIVE);
//apply the regex to the current line only
Matcher macher = patt.matcher(str);
//if there is a match (only the first one on the line is used)
if(macher.find()){
int start = macher.start() ;
int end = macher.end() ;
//cut out the table
String content = str.substring(start, end);
//then look for an anchor inside the table (first one only)
Matcher macher2 = Pattern.compile("<a.*?/a>", Pattern.CASE_INSENSITIVE).matcher(content);
if(macher2.find()){
int start1 = macher2.start() ;
int end1 = macher2.end() ;
String a_tag = content.substring(start1,end1) ;
//finally pull the double-quoted href attribute out of the anchor
Matcher macher3 = Pattern.compile("href=\".*?\"", Pattern.CASE_INSENSITIVE).matcher(a_tag);
if(macher3.find()){
String href = a_tag.substring(macher3.start(),macher3.end()) ;
System.out.println("href is :-------------->"+href);
}
}
}
}
}
}catch(MalformedURLException e){
System.out.println("出错啦!");
}catch(Exception e){
// NOTE(review): every other exception (I/O errors included) is discarded here without any output.
}
}
/**
 * Entry point: runs getUrl against a fixed Baidu search URL.
 *
 * @param args not used
 */
public static void main(String[] args) {
// TODO Auto-generated method stub
//test code
URLTest.getUrl("http://www.baidu.com/s?wd=china") ;
}}
如果输入的是百度的地址,就可以得到正确的结果:
href is :-------------->href="/"
href is :-------------->href="http://www.china.com/"
href is :-------------->href="http://baike.baidu.com/view/4806.htm"
href is :-------------->href="http://www.china.com.cn/"
href is :-------------->href="http://military.china.com/"
href is :-------------->href="http://www.china.com/zh_cn/"
href is :-------------->href="http://open.baidu.com/"
href is :-------------->href="http://news.china.com/"
href is :-------------->href="http://www.chinadaily.com.cn/"
href is :-------------->href="http://www.microsoft.com/china"
href is :-------------->href="s?wd=china%20daily&rsp=0&oq=china&f=1"
href is :-------------->href="/s?wd=china&tn=baidufir"
但是输入谷歌的地址(http://www.google.com.hk/search?q=china),就得不到结果,请高手们瞧瞧是怎么回事呢?
import java.io.*;
import java.net.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;public class URLTest {
/**
 * Opens the given URL and prints the href attribute of anchors found inside tables.
 *
 * The response is decoded as gb2312 and examined one line at a time. On each line only
 * the first {@code <table ...>...</table>} match is taken, then the first
 * {@code <a ...>...</a>} inside it, then the first {@code href="..."} inside that anchor.
 * The printed value is the whole attribute text, including the {@code href=} prefix and quotes.
 *
 * A MalformedURLException prints a fixed message; any other exception is silently ignored.
 * NOTE(review): none of the streams opened here is ever closed.
 *
 * @param u address of the page to read
 */
public static void getUrl(String u){
//a URL instance
URL url ;
//input stream
InputStream is ;
InputStreamReader isr ;
//buffered reader
BufferedReader br ;
String str ;
try{
System.out.println("READING URL:"+u);
//instantiate the URL
url = new URL(u);
is = url.openStream() ;
isr = new InputStreamReader(is,"gb2312") ;
br = new BufferedReader(isr) ;
// NOTE(review): read() consumes one character from the stream before every readLine(),
// so the character it returns is never part of the text that gets matched.
while(br.read()!= -1){
str = br.readLine();
//only proceed if a line was actually read
if(str!=null){
//define the regex
String zhengzhe = "<table.*?/table>" ;
//match case-insensitively
Pattern patt = Pattern.compile(zhengzhe, Pattern.CASE_INSENSITIVE);
//apply the regex to the current line only
Matcher macher = patt.matcher(str);
//if there is a match (only the first one on the line is used)
if(macher.find()){
int start = macher.start() ;
int end = macher.end() ;
//cut out the table
String content = str.substring(start, end);
//then look for an anchor inside the table (first one only)
Matcher macher2 = Pattern.compile("<a.*?/a>", Pattern.CASE_INSENSITIVE).matcher(content);
if(macher2.find()){
int start1 = macher2.start() ;
int end1 = macher2.end() ;
String a_tag = content.substring(start1,end1) ;
//finally pull the double-quoted href attribute out of the anchor
Matcher macher3 = Pattern.compile("href=\".*?\"", Pattern.CASE_INSENSITIVE).matcher(a_tag);
if(macher3.find()){
String href = a_tag.substring(macher3.start(),macher3.end()) ;
System.out.println("href is :-------------->"+href);
}
}
}
}
}
}catch(MalformedURLException e){
System.out.println("出错啦!");
}catch(Exception e){
// NOTE(review): every other exception (I/O errors included) is discarded here without any output.
}
}
/**
 * Entry point: runs getUrl against a fixed Baidu search URL.
 *
 * @param args not used
 */
public static void main(String[] args) {
// TODO Auto-generated method stub
//test code
URLTest.getUrl("http://www.baidu.com/s?wd=china") ;
}}
如果输入的是百度的地址,就可以得到正确的结果:
href is :-------------->href="/"
href is :-------------->href="http://www.china.com/"
href is :-------------->href="http://baike.baidu.com/view/4806.htm"
href is :-------------->href="http://www.china.com.cn/"
href is :-------------->href="http://military.china.com/"
href is :-------------->href="http://www.china.com/zh_cn/"
href is :-------------->href="http://open.baidu.com/"
href is :-------------->href="http://news.china.com/"
href is :-------------->href="http://www.chinadaily.com.cn/"
href is :-------------->href="http://www.microsoft.com/china"
href is :-------------->href="s?wd=china%20daily&rsp=0&oq=china&f=1"
href is :-------------->href="/s?wd=china&tn=baidufir"
但是输入谷歌的地址(http://www.google.com.hk/search?q=china),就得不到结果,请高手们瞧瞧是怎么回事呢?
2.你的读取方式很奇怪,
while(br.read()!= -1){
str = br.readLine();
这样每次调用 readLine() 之前都会先被 read() 读走一个字符,这个字符就被跳过了。
建议你整个网页完全读进一个String里面,然后再写对应的正则表达式。例如:
// Decode the stream as UTF-8 and wrap it in a buffered reader.
BufferedReader br = new BufferedReader(new InputStreamReader(is,"UTF-8"));
// Reusable buffer of 4096 chars.
char[] buff = new char[4 * 1024];
StringBuilder sb = new StringBuilder();
int i = 0;
// read(buff) returns how many chars were stored, or -1 at end of stream.
while ((i = br.read(buff)) != -1) {
// Append only the i chars filled by this read, not the whole buffer.
sb.append(buff, 0, i);
}
// Everything read from the stream, as one String.
return sb.toString();
顺带一提,Google 的页面是 UTF-8 编码,而你的代码是按 gb2312 解码的。
3.提取链接的话为什么要截取Table?
临时写的,测试了一下,的确有结果输出,但结果是否完整未经严格验证。我自己也有东西要捣鼓,你就参考参考吧:
package hitsukiTest;
import java.io.*;
import java.net.*;
import java.util.regex.Matcher;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;

/**
 * Downloads a web page and prints the {@code href} attribute of every anchor tag in it.
 */
public class URLTest {

    /** User-Agent header sent with the request; presumably needed because the site rejects Java's default one. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows; U; Windows NT 5.2; zh-CN; rv:1.9.2.3) Gecko/20100401 Firefox/3.6.3";

    /**
     * Matches an opening anchor tag up to and including its href attribute; group 1 is the
     * attribute text itself (name, equals sign and quoted value).
     *
     * The tag name must be exactly "a" (so abbr, area, etc. are not picked up), the scan never
     * leaves the tag (it stops at the first closing angle bracket), attributes may be spread
     * over several lines, case is ignored, and the value may be empty or single-quoted.
     */
    private static final Pattern ANCHOR_HREF = Pattern.compile(
            "<a\\b[^>]*?\\s(href\\s*=\\s*(?:\"[^\"]*\"|'[^']*'))",
            Pattern.CASE_INSENSITIVE);

    /**
     * Fetches the page at {@code u}, decodes it as UTF-8 and prints one line per anchor href found.
     *
     * Any failure (bad URL, non-HTTP URL, I/O error) is reported via a stack trace and the
     * method returns normally. The reader and the connection are always released.
     *
     * @param u address of the page to read; must be an http/https URL
     */
    public static void getUrl(String u) {
        HttpURLConnection conn = null;
        try {
            URL url = new URL(u);
            conn = (HttpURLConnection) url.openConnection();
            conn.setRequestProperty("User-Agent", USER_AGENT);
            conn.connect();
            String page = readAll(conn.getInputStream());
            for (String href : extractHrefs(page)) {
                System.out.println("href is :-------------->" + href);
            }
        } catch (Exception ex) {
            // Top-level boundary of this demo: report and carry on, as before.
            ex.printStackTrace();
        } finally {
            if (conn != null) {
                conn.disconnect();
            }
        }
    }

    /**
     * Collects the href attribute text of every anchor tag in {@code page}, in document order.
     *
     * @param page HTML text to scan; {@code null} is treated as empty
     * @return the matched attributes, e.g. {@code href="/x"}; never {@code null}
     */
    static List<String> extractHrefs(String page) {
        List<String> hrefs = new ArrayList<String>();
        if (page == null) {
            return hrefs;
        }
        Matcher m = ANCHOR_HREF.matcher(page);
        while (m.find()) {
            hrefs.add(m.group(1));
        }
        return hrefs;
    }

    /** Reads the whole stream as UTF-8 text and closes it, even when reading fails. */
    private static String readAll(InputStream in) throws IOException {
        BufferedReader br = new BufferedReader(new InputStreamReader(in, "UTF-8"));
        try {
            char[] buff = new char[4 * 1024];
            StringBuilder sb = new StringBuilder();
            int n;
            while ((n = br.read(buff)) != -1) {
                // Append only the chars filled by this read.
                sb.append(buff, 0, n);
            }
            return sb.toString();
        } finally {
            br.close();
        }
    }

    /**
     * Entry point: runs getUrl against a fixed Google search URL.
     *
     * @param args not used
     */
    public static void main(String[] args) {
        // test code
        URLTest.getUrl("http://www.google.com.hk/search?q=china");
    }
}