通过 HttpWebRequest 抓取页面,再使用正则表达式提取并格式化数据:
// Download the page at `url` and decode the response body into `html` using `encoding`.
// `url` and `encoding` are not defined in this snippet; they must come from the surrounding code.
System.Net.HttpWebRequest request = (System.Net.HttpWebRequest)System.Net.WebRequest.Create(url);
// Send an Internet Explorer 8 style User-Agent header with the request.
request.UserAgent = "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.04506.648; .NET CLR 3.5.21022)";
string html;
// The using blocks release the response, its stream and the reader even when
// reading throws, so a failed download does not leave the connection open.
using (System.Net.WebResponse response = request.GetResponse())
using (System.IO.Stream resStream = response.GetResponseStream())
using (System.IO.StreamReader sr = new System.IO.StreamReader(resStream, encoding))
{
    html = sr.ReadToEnd();
}
// Alternative download: fetch the raw bytes of `PageUrl` with WebClient and decode them into `Content`.
// `PageUrl` is not defined in this snippet; it must come from the surrounding code.
Byte[] pageData;
// Dispose the WebClient as soon as the download finishes (or fails).
using (System.Net.WebClient wc = new System.Net.WebClient())
{
    // Authenticate with the default credentials from the credential cache.
    wc.Credentials = System.Net.CredentialCache.DefaultCredentials;
    pageData = wc.DownloadData(PageUrl);
}
// NOTE(review): Encoding.Default depends on the machine/runtime; confirm it matches the page's real charset.
string Content = System.Text.Encoding.Default.GetString(pageData);
// Download the page at `url` and decode the response body into `html` using `encoding`.
// `url` and `encoding` are not defined in this snippet; they must come from the surrounding code.
// NOTE(review): this repeats the HttpWebRequest snippet that appears earlier in the file;
// if both are placed in the same scope the repeated declarations will not compile.
System.Net.HttpWebRequest request = (System.Net.HttpWebRequest)System.Net.WebRequest.Create(url);
// Send an Internet Explorer 8 style User-Agent header with the request.
request.UserAgent = "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.04506.648; .NET CLR 3.5.21022)";
string html;
// The using blocks release the response, its stream and the reader even when
// reading throws, so a failed download does not leave the connection open.
using (System.Net.WebResponse response = request.GetResponse())
using (System.IO.Stream resStream = response.GetResponseStream())
using (System.IO.StreamReader sr = new System.IO.StreamReader(resStream, encoding))
{
    html = sr.ReadToEnd();
}
// Alternative download: fetch the raw bytes of `PageUrl` with WebClient and decode them into `Content`.
// `PageUrl` is not defined in this snippet; it must come from the surrounding code.
// NOTE(review): this repeats the WebClient snippet that appears earlier in the file;
// if both are placed in the same scope the repeated declarations will not compile.
Byte[] pageData;
// Dispose the WebClient as soon as the download finishes (or fails).
using (System.Net.WebClient wc = new System.Net.WebClient())
{
    // Authenticate with the default credentials from the credential cache.
    wc.Credentials = System.Net.CredentialCache.DefaultCredentials;
    pageData = wc.DownloadData(PageUrl);
}
// NOTE(review): Encoding.Default depends on the machine/runtime; confirm it matches the page's real charset.
string Content = System.Text.Encoding.Default.GetString(pageData);
// Pattern for one table row (<tr bgcolor= ...> ... </tr>) of the downloaded page.
// It captures five named groups: dm and jhs are digit runs, xx, dz and dh are cell texts.
// NOTE(review): presumably dm = code, xx = school, jhs = plan count, dz = address,
// dh = phone (pinyin abbreviations) -- confirm against the page.
string s=@"<tr bgcolor= [^>]+>\s+<td align=""center"">\s+<a href=""[^""]+"" target=""_blank"">(?<dm>\d+)</a></td>\s+<td align=""center""> <a href=""[^""]+"" target=""_blank"">(?<xx>[^<]+)</a></td>\s+<td align=""center""> (?<jhs>\d+)</td>\s+<td align=""left"">(?<dz>[^<]+)</td>\s+<td align=""left"">(?<dh>[^<]+)</td>\s+<td align=""center""><a href=""[^""]+""><font color=""green"">查询专业<font color=""""></font></a></td>\s+</tr>";
// `html` is the page text produced by the download snippet.
// Fixed: the method is Regex.Matches (the original called a non-existent "Matchs").
MatchCollection mc = Regex.Matches(html, s);
// Print the five captured fields of every matched row, one value per line.
// Fixed: each WriteLine call was missing its closing parenthesis.
foreach (Match m in mc)
{
    Console.WriteLine(m.Groups["dm"].Value);
    Console.WriteLine(m.Groups["xx"].Value);
    Console.WriteLine(m.Groups["jhs"].Value);
    Console.WriteLine(m.Groups["dz"].Value);
    Console.WriteLine(m.Groups["dh"].Value);
}
{
    // Parse the page saved in test.html and print one comma-separated line per table row.
    string html = File.ReadAllText("test.html");

    // Options (?isn): ignore case, let '.' match newlines, capture named groups only.
    // One match per <tr bgcolor= #FFFFFF> row.
    MatchCollection sections = Regex.Matches(html, @"(?isn)<tr bgcolor= #FFFFFF>.+?<a href=""(?<id_url>[^""]+)[^>]+>(?<id>\d+)([^>]+>){3}.+?<a href=""(?<school_url>[^""]+)[^>]*>(?<school>[^<]+)([^>]+>){3}\D*(?<plan>\d+)([^>]+>){2}\W*(?<address>[^<]+)([^>]+>){2}\W+(?<tel>[\d-]*).+?<a href=""(?<search>[^""]+)");

    // Copy the named groups of every match into an HtmlResult.
    List<HtmlResult> result = new List<HtmlResult>();
    foreach (Match section in sections)
    {
        HtmlResult item = new HtmlResult();
        item.ID = section.Groups["id"].Value;
        item.IdUrl = section.Groups["id_url"].Value;
        item.School = section.Groups["school"].Value;
        item.SchoolUrl = section.Groups["school_url"].Value;
        item.PlanCount = section.Groups["plan"].Value;
        item.Address = section.Groups["address"].Value;
        item.Tel = section.Groups["tel"].Value;
        item.SearchUrl = section.Groups["search"].Value;
        result.Add(item);
    }

    // `result` now holds the parsed rows.
    // Fixed: this foreach header had been merged into the preceding line comment,
    // which left the loop body as a bare block using an undeclared `item`.
    foreach (HtmlResult item in result)
    {
        Console.WriteLine(item.ToString());
    }

    // Wait for a key press before continuing.
    Console.ReadKey();
}

/// <summary>
/// One parsed table row: the values of the named groups captured by the row pattern.
/// All values are kept as the raw strings taken from the page.
/// </summary>
public class HtmlResult
{
    /// <summary>Value of the "id" group (a run of digits).</summary>
    public string ID { get; set; }

    /// <summary>Value of the "id_url" group (href of the link around the id).</summary>
    public string IdUrl { get; set; }

    /// <summary>Value of the "school" group (link text).</summary>
    public string School { get; set; }

    /// <summary>Value of the "school_url" group (href of the school link).</summary>
    public string SchoolUrl { get; set; }

    /// <summary>Value of the "plan" group (a run of digits, kept as text).</summary>
    public string PlanCount { get; set; }

    /// <summary>Value of the "address" group.</summary>
    public string Address { get; set; }

    /// <summary>Value of the "tel" group (digits and hyphens).</summary>
    public string Tel { get; set; }

    /// <summary>Value of the "search" group (href of the last link in the row).</summary>
    public string SearchUrl { get; set; }

    /// <summary>
    /// Joins the eight values with commas, in the order
    /// ID, IdUrl, School, SchoolUrl, PlanCount, Address, Tel, SearchUrl.
    /// A null value is written as an empty field.
    /// </summary>
    /// <returns>The comma-separated line.</returns>
    public override string ToString()
    {
        string[] fields = new string[] { ID, IdUrl, School, SchoolUrl, PlanCount, Address, Tel, SearchUrl };
        return string.Join(",", fields);
    }
}
完整例子
/// <summary>
/// Complete example: downloads the listing page, extracts every
/// &lt;tr bgcolor= #FFFFFF&gt; row with a regular expression and prints each row
/// as one comma-separated line.
/// </summary>
private static void TestRegex01()
{
    // Download the page bytes and decode them as UTF-8.
    // For offline testing the text can be read with File.ReadAllText("test.html") instead.
    string html;
    // Dispose the WebClient as soon as the download finishes (or fails).
    using (WebClient client = new WebClient())
    {
        html = Encoding.UTF8.GetString(client.DownloadData("http://zxks.jseea.cn:8081/czweb/school/zsjhcx.jsp"));
    }

    // Options (?isn): ignore case, let '.' match newlines, capture named groups only.
    MatchCollection sections = Regex.Matches(html, @"(?isn)<tr bgcolor= #FFFFFF>.+?<a href=""(?<id_url>[^""]+)[^>]+>(?<id>\d+)([^>]+>){3}.+?<a href=""(?<school_url>[^""]+)[^>]*>(?<school>[^<]+)([^>]+>){3}\D*(?<plan>\d+)([^>]+>){2}\W*(?<address>[^<]+)([^>]+>){2}\W+(?<tel>[\d-]*).+?<a href=""(?<search>[^""]+)");

    // Copy the named groups of every match into an HtmlResult.
    List<HtmlResult> result = new List<HtmlResult>();
    foreach (Match section in sections)
    {
        HtmlResult item = new HtmlResult();
        item.ID = section.Groups["id"].Value;
        item.IdUrl = section.Groups["id_url"].Value;
        item.School = section.Groups["school"].Value;
        item.SchoolUrl = section.Groups["school_url"].Value;
        item.PlanCount = section.Groups["plan"].Value;
        item.Address = section.Groups["address"].Value;
        item.Tel = section.Groups["tel"].Value;
        item.SearchUrl = section.Groups["search"].Value;
        result.Add(item);
    }

    // `result` now holds the parsed rows.
    // Fixed: this foreach header had been merged into the preceding line comment,
    // which left the loop body as a bare block using an undeclared `item`.
    foreach (HtmlResult item in result)
    {
        Console.WriteLine(item.ToString());
    }
}

/// <summary>
/// One parsed table row: the values of the named groups captured by the row pattern.
/// All values are kept as the raw strings taken from the page.
/// </summary>
public class HtmlResult
{
    /// <summary>Value of the "id" group (a run of digits).</summary>
    public string ID { get; set; }

    /// <summary>Value of the "id_url" group (href of the link around the id).</summary>
    public string IdUrl { get; set; }

    /// <summary>Value of the "school" group (link text).</summary>
    public string School { get; set; }

    /// <summary>Value of the "school_url" group (href of the school link).</summary>
    public string SchoolUrl { get; set; }

    /// <summary>Value of the "plan" group (a run of digits, kept as text).</summary>
    public string PlanCount { get; set; }

    /// <summary>Value of the "address" group.</summary>
    public string Address { get; set; }

    /// <summary>Value of the "tel" group (digits and hyphens).</summary>
    public string Tel { get; set; }

    /// <summary>Value of the "search" group (href of the last link in the row).</summary>
    public string SearchUrl { get; set; }

    /// <summary>
    /// Joins the eight values with commas, in the order
    /// ID, IdUrl, School, SchoolUrl, PlanCount, Address, Tel, SearchUrl.
    /// A null value is written as an empty field.
    /// </summary>
    /// <returns>The comma-separated line.</returns>
    public override string ToString()
    {
        string[] fields = new string[] { ID, IdUrl, School, SchoolUrl, PlanCount, Address, Tel, SearchUrl };
        return string.Join(",", fields);
    }
}
如果你能读懂下面的内容,那你实现所要的功能就非常简单了...var pars='';
$("tbody tr").each(function(i, tr) {//获取tbody下的所有tr
Catalog = $(tr).find("td").eq(0).text();//获取tr下面的第0个td的值,下面类似
size = $(tr).find("td").eq(1).text();
price = $(tr).find("td").eq(2).text(); if (Catalog != '' && Catalog != 'custom') {
pars = pars + "subCat=" + Catalog + "&size=" + size + "&price="+ price+"|";
}
}); try {
if (pars == '')
return;
//下面是通过json的方式进异步操作,如:操作数据库之类的
$.getJSON('../../Handler.ashx', { param: pars, CatNo: getRequest("Catalog") },
function(json) {
window.open('', '_top');
window.top.close(); });
}
catch (e) {
alert(e.Message);
}