正则表达式获取网页中链接和内容

我想用正则表达式获取网页中  链接  和   内容<img src="http://www.baidu.com/icon.png" /><a href="http://guide.sina.cn/?pos=1&vt=1">导航</a><a href="http://sina.cn/nc.php?pos=1&vt=1">新闻</a><a href="http://mil.sina.cn/?pos=1&vt=1">军事</a><a href="http://weibo.cn/?gotoreg=1&from=index&s2w=index&wm=ig_0001_index&pos=1&vt=1">微博</a><a href="http://finance.sina.cn/?sa=t60d13v512&pos=1&vt=1">股票</a><br/>
我想得到内容中的href链接和内容，例如：http://guide.sina.cn/?pos=1&vt=1      导航
http://sina.cn/nc.php?pos=1&vt=1     新闻
...
http://finance.sina.cn/?sa=t60d13v512&pos=1&vt=1   股票

解决方案 »

免费领取超大流量手机卡，每月29元包185G流量+100分钟通话, 中国电信官方发货

第一个方法是读取数据 /**
* 文件中读取目标文件
* @return
* @author wangjikuan
*/
private static StringBuffer getSb(){
    // Reads the hard-coded file c:/xx.txt, decoded as GBK, and returns its text with all
    // lines joined together: readLine() drops the line terminators and none are re-added.
    // On an I/O failure the stack trace is printed and whatever was read so far is returned.
    StringBuffer sb = new StringBuffer();
    File f = new File("c:/xx.txt");
    // Keep a handle on the raw stream so it can be released even when wrapping it in a
    // reader fails (for example if the charset is not supported).
    FileInputStream in = null;
    try {
        in = new FileInputStream(f);
        BufferedReader reader = new BufferedReader(new InputStreamReader(in, "gbk"));
        String s;
        while ((s = reader.readLine()) != null) {
            sb.append(s);
        }
    } catch (FileNotFoundException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // Closing the underlying stream releases the file handle that was previously leaked.
        if (in != null) {
            try {
                in.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
    return sb;
}

/**
 * Finds every anchor element in the given HTML text and prints, for each one, its href
 * value immediately followed by its inner content and a line break.
 * If an anchor has no double-quoted href, only its inner content is printed.
 *
 * @param sb HTML text to scan
 * @author wangjikuan
 */
private static void parse(StringBuffer sb){
    // The word boundary keeps tags that merely start with "a" (abbr, address, ...) from
    // being taken as the start of an anchor.
    Pattern anchorPattern = Pattern.compile("<a\\b.*?</a>");
    // Stop at the quote that closes the href value; a greedy ".*" here would also swallow
    // any attributes that follow href inside the opening tag.
    Pattern hrefPattern = Pattern.compile("(?<=href=\")[^\"]*(?=\")");
    // Everything between the first '>' and the last '<' of the matched anchor.
    Pattern textPattern = Pattern.compile("(?<=>).*(?=<)");

    Matcher anchors = anchorPattern.matcher(sb.toString());
    while (anchors.find()) {
        String child = anchors.group();

        Matcher href = hrefPattern.matcher(child);
        if (href.find()) {
            System.out.print(href.group());
        }

        Matcher text = textPattern.matcher(child);
        if (text.find()) {
            System.out.println(text.group());
        }
    }
}

public static void main(String[] args) {
    // Entry point: fetch the text via getSb(), then hand it to parse().
    StringBuffer pageText = getSb();
    parse(pageText);
}
public static void main(String args[]) {
    // Demo: pull each href value and the text that follows it out of a fixed HTML
    // fragment with a single two-group regular expression.
    String str = "<img src=\"http://www.baidu.com/icon.png\" /><a href=\"http://guide.sina.cn/?pos=1&vt=1\">导航</a><a href=\"http://sina.cn/nc.php?pos=1&vt=1\">新闻</a><a href=\"http://mil.sina.cn/?pos=1&vt=1\">军事</a><a href=\"http://weibo.cn/?gotoreg=1&from=index&s2w=index&wm=ig_0001_index&pos=1&vt=1\">微博</a><a href=\"http://finance.sina.cn/?sa=t60d13v512&pos=1&vt=1\">股票</a><br/>";
    // Group 1 is the href value, group 2 the text up to the next '<'.
    String regex = "href=\"(.*?)\">(.*?)<";
    Pattern p = Pattern.compile(regex);
    Matcher m = p.matcher(str);
    while (m.find()) {
        System.out.println(m.group(1));
        System.out.println(m.group(2));
        System.out.println("-------------");
    }
}

/**
 * Rearranges the characters of {@code str} around position {@code pos}.
 * For {@code 0 <= pos < str.length()} the result is the characters before {@code pos}
 * in reverse order, then the character at {@code pos}, then the characters after
 * {@code pos} in reverse order; e.g. {@code converse("abcde", 2)} yields {@code "baced"}.
 * {@code pos} is not validated: values outside that range are passed through the same
 * index arithmetic and may make {@code charAt} throw.
 *
 * @param str the text to rearrange
 * @param pos index of the character that stays in place
 * @return the rearranged text
 */
private static String converse(String str, int pos) {
    // A StringBuilder avoids re-copying the partial result on every iteration, which
    // repeated String concatenation does.
    StringBuilder result = new StringBuilder(str.length());
    for (int i = 0; i < str.length(); i++) {
        int index;
        if (i > pos) {
            // Positions after pos read the tail back to front.
            index = str.length() - i + pos;
        } else if (i < pos) {
            // Positions before pos read the head back to front.
            index = pos - i - 1;
        } else {
            index = pos;
        }
        result.append(str.charAt(index));
    }
    return result.toString();
}