/**
* <p>Title: </p>
*
* <p>Description: </p>
*
* <p>Copyright: Copyright (c) 2012</p>
*
* <p>Company: </p>
*
* @author not attributable
* @version 1.0
*/
import java.io.*;
import java.net.*;
import java.util.*;
import java.util.regex.*;
import java.util.zip.*;
import com.tcsos.util.Regexer;public class test {
/**
 * Finds every match of {@code regex} in {@code original} and collects the
 * capturing groups of each match.
 *
 * <p>The pattern is compiled with {@link Pattern#CASE_INSENSITIVE}. Each match
 * contributes one array holding groups 1..groupCount in order; group 0 (the
 * whole match) is not included. A group that did not participate in a match is
 * stored as {@code null} in its slot.
 *
 * @param original text to search; {@code null} yields an empty result
 * @param regex    regular expression; {@code null} yields an empty result
 * @return one {@code String[]} per match, in match order; empty when either
 *         argument is {@code null}, when nothing matches, or when the pattern
 *         declares no capturing groups
 * @throws java.util.regex.PatternSyntaxException if {@code regex} is not a valid pattern
 */
public static ArrayDeque<String[]> regexAllGroups(String original, String regex) {
    ArrayDeque<String[]> results = new ArrayDeque<String[]>();
    if (original == null || regex == null) {
        return results;
    }

    Matcher matcher = Pattern.compile(regex, Pattern.CASE_INSENSITIVE).matcher(original);

    // groupCount() is a property of the pattern, not of an individual match, so it
    // is read once. With no capturing groups there is nothing to collect, and
    // returning here avoids scanning the input for matches that would be discarded.
    int groupCount = matcher.groupCount();
    if (groupCount < 1) {
        return results;
    }

    // The original author reported find() stalling here for a pattern built from
    // many greedy ".*" segments; the cost of find() is determined by the pattern.
    while (matcher.find()) {
        String[] groups = new String[groupCount];
        for (int i = 1; i <= groupCount; i++) {
            // group(i) is null when the group did not take part in this match
            // (e.g. the unused side of an alternation); keep the null instead of
            // wrapping it in new String(...), which would throw.
            groups[i - 1] = matcher.group(i);
        }
        results.add(groups);
    }
    return results;
}
// Fetch the source of a web page
/**
 * Fetches {@code strURL} with an HTTP GET and returns the response body decoded
 * as UTF-8.
 *
 * <p>The request uses 3-second connect and read timeouts, disables caching,
 * follows redirects and advertises {@code gzip,deflate}; the body is
 * decompressed according to the response's Content-Encoding header before it
 * is read through {@link #inputStreamToByte(InputStream)}.
 *
 * <p>This is a best-effort helper: any failure (malformed URL, non-HTTP URL,
 * connection or read error, error status that makes the body unavailable)
 * results in {@code null} rather than an exception.
 *
 * <p>NOTE(review): the body is always decoded as UTF-8; the charset declared
 * in the response's Content-Type header is not consulted.
 *
 * @param strURL absolute URL to fetch
 * @return the response body as text, or {@code null} if it could not be fetched
 */
public static String getUrlHtml(String strURL) {
    HttpURLConnection connection = null;
    InputStream in = null;
    try {
        connection = (HttpURLConnection) new URL(strURL).openConnection();
        connection.setConnectTimeout(3000);
        connection.setReadTimeout(3000);
        connection.setRequestProperty("Accept-Encoding", "gzip,deflate");
        connection.setRequestProperty("Accept", "*/*");
        connection.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)");
        connection.setRequestProperty("Connection", "close");
        connection.setRequestMethod("GET");
        // setFollowRedirects is static (it changes the default for all connections);
        // it is kept for backward compatibility but invoked on the class, not the instance.
        HttpURLConnection.setFollowRedirects(true);
        connection.setUseCaches(false);
        connection.setInstanceFollowRedirects(true);

        // Decide whether the body is gzip- or deflate-compressed.
        // Locale.ROOT keeps the comparison independent of the default locale.
        String contentEncoding = connection.getContentEncoding();
        String encoding = contentEncoding == null ? "" : contentEncoding.toLowerCase(Locale.ROOT);

        // Track the raw stream first so the finally block can close it even if
        // constructing the decompressing wrapper fails.
        InputStream raw = connection.getInputStream();
        in = raw;
        if (encoding.indexOf("deflate") != -1) {
            // deflate is checked first: when both tokens appear it takes precedence
            in = new InflaterInputStream(raw);
        } else if (encoding.indexOf("gzip") != -1) {
            in = new GZIPInputStream(raw);
        }

        byte[] bytes = inputStreamToByte(in);
        return bytes == null ? null : new String(bytes, "utf-8");
    } catch (Exception e) {
        // Deliberate best-effort contract: callers receive null on any failure.
        return null;
    } finally {
        if (in != null) {
            try {
                in.close();
            } catch (IOException ignored) {
                // nothing useful can be done if closing fails
            }
        }
        if (connection != null) {
            connection.disconnect();
        }
    }
}
/**
 * Reads {@code in} to end-of-stream and returns everything read as a byte array.
 *
 * <p>The stream is always closed before this method returns, whether or not
 * reading succeeded.
 *
 * @param in stream to drain; may be {@code null}
 * @return the bytes read (possibly empty), or {@code null} if {@code in} is
 *         {@code null} or reading failed (the failure's stack trace is printed)
 */
public static byte[] inputStreamToByte(InputStream in) {
    if (in == null) {
        return null;
    }
    ByteArrayOutputStream collected = new ByteArrayOutputStream();
    // Read in chunks rather than one byte per call.
    byte[] chunk = new byte[8192];
    try {
        int count;
        while ((count = in.read(chunk)) != -1) {
            collected.write(chunk, 0, count);
        }
        return collected.toByteArray();
    } catch (Exception e) {
        // Broad catch kept so that every read failure maps to a null result.
        e.printStackTrace();
        return null;
    } finally {
        // Close on every path so a failed read does not leak the stream.
        try {
            in.close();
        } catch (IOException ignored) {
            // a close failure does not invalidate data that was already read
        }
    }
}
/**
 * Demo entry point: downloads a Baidu search-result page with getUrlHtml,
 * applies the result-table regex to it via regexAllGroups, and prints how many
 * matches were collected.
 *
 * NOTE(review): the pattern enables (?s), so '.' also matches line
 * terminators, and most of its ".*" segments are greedy. That is the likely
 * cause of the slow run and the single match reported in the comment below —
 * confirm by switching to reluctant (".*?") or negated-class ("[^>]*") forms.
 */
public static void main(String[] args) {
// Regular expression
String regex = "(?s)<table cellpadding=\"0\" cellspacing=\"0\" class=\"result\" id=\"\\d+\".*><tr><td class=f>.*<h3 class=\"t\"><a.*href=\"(.*?)\".*target=\"_blank\".*>(.*?)</a>(.*?)<br>.*<span class=\"g\">(.*?)</span>";
// Fetch the HTML source of the page at this URL
String html = getUrlHtml("http://www.baidu.com/s?wd=火车票&pn=0&rn=100&usm=1"); // This stalls for a long time and yields only one value; several days of debugging have not resolved it.
ArrayDeque<String[]> Q = regexAllGroups(html, regex);
System.out.println(Q.size()); }}请问怎么处理,谢谢
<table cellpadding=\"0\" cellspacing=\"0\" class=\"result\" id=\"\\d+\".*>
我想你是想匹配"table"这个标签,建议改成非贪婪的:
<table cellpadding=\"0\" cellspacing=\"0\" class=\"result\" id=\"\\d+\".*?>
或者
<table cellpadding=\"0\" cellspacing=\"0\" class=\"result\" id=\"\\d+\"[^>]?>
更正:上面第二种写法中的 [^>]? 最多只能匹配一个字符,无法跳过标签内其余的属性,应把
<table cellpadding=\"0\" cellspacing=\"0\" class=\"result\" id=\"\\d+\"[^>]?>
改为
<table cellpadding=\"0\" cellspacing=\"0\" class=\"result\" id=\"\\d+\"[^>]*>