html页面内容,我想匹配 <li><a href="http://news.sina.com.cn/c/2013-06-26/152627504374.shtml" target="_blank">原重庆大学校长林建华任浙江大学校长</a></li>这里面的URL地址,它有两个特点:
1、.shtml 结尾;
2、最后的.shtml之前的名称是数字,如上面:152627504374正则
1、.shtml 结尾;
2、最后的.shtml之前的名称是数字,如上面:152627504374正则
string pattern=@"(?is)<li>\s*<a\s*href=([""']?)(?<href>[^""']*?/(?<num>\d+)\.shtml)\1[^>]*?>.*?</a>\s*</li>";
取 Groups["href"].Value 和 Groups["num"].Value。
会匹配不到?
"(?<=href=\")[^\s\"]+?(?=\")"
// Look-around pattern: matches a run of characters that are neither whitespace nor a
// double quote, sitting directly after href=" and directly before the next double quote.
// Nothing in it restricts the match to URLs that end in digits followed by .shtml.
string ass ="(?<=href=\")[^\\s\"]+?(?=\")";
// Text of the first match that detailrg finds in htmldata (Match.Value is an empty string
// when nothing matches).
// NOTE(review): neither detailrg nor htmldata is declared above this line, and `ass` is not
// used here; confirm how detailrg is constructed for this fragment.
string a= detailrg.Match(htmldata).Value;
// Sample markup: a single list item wrapping the anchor whose href is to be extracted.
string htmldata = @"<li><a href=""http://news.sina.com.cn/c/2013-06-26/152627504374.shtml"" target=""_blank"">原重庆大学校长林建华任浙江大学校长</a></li>";

// Anchor pattern, piece by piece:
//   (['""]?)        optional opening quote, unnamed group 1, re-matched by \1 as the closing quote
//   (?<Url>...)     the href value; it must end in "/<digits>.shtml"
//   (\d+?)          the digits in front of ".shtml", unnamed group 2
//   (?<Text>...)    the anchor text, which may not contain '<' or '>'
string anchorPattern = @"(?i)<a[^>]*?href=(['""]?)(?<Url>[^'""]*?/(\d+?)\.shtml)\1[^>]*?>(?<Text>[^<>]*?)</a>";
RegexOptions anchorOptions = RegexOptions.Multiline | RegexOptions.Singleline;
Regex detailrg = new Regex(anchorPattern, anchorOptions);

// Only the first match is inspected. .NET numbers unnamed groups before named ones,
// so index 2 is the digit group even though (?<Url>...) opens earlier in the pattern.
GroupCollection captured = detailrg.Match(htmldata).Groups;
string url = captured["Url"].Value;   // http://news.sina.com.cn/c/2013-06-26/152627504374.shtml
string Num = captured[2].Value;       // 152627504374
string text = captured["Text"].Value; // 原重庆大学校长林建华任浙江大学校长
// Download the page at `url` and collect every anchor href that ends in "/<digits>.shtml".
// `url` is not declared in this snippet; it has to be in scope already.
WebRequest pageRequest = WebRequest.Create(url);

// GetResponse blocks until the response arrives; both the response and its stream are
// released when the stacked using statements exit.
using (WebResponse pageResponse = pageRequest.GetResponse())
using (Stream ReceiveStream = pageResponse.GetResponseStream())
{
    // Read the whole body into a string. The reader is not disposed on its own;
    // the enclosing using closes the stream underneath it.
    StreamReader bodyReader = new StreamReader(ReceiveStream);
    string pageHtml = bodyReader.ReadToEnd();

    // Same anchor pattern as for the single-item sample: the href is captured in "Url".
    MatchCollection anchorHits = Regex.Matches(pageHtml, @"(?i)<a[^>]*?href=(['""]?)(?<Url>[^'""]*?/(\d+?)\.shtml)\1[^>]*?>(?<Text>[^<>]*?)</a>");
    var links = anchorHits
        .OfType<Match>()
        .Select(hit => hit.Groups["Url"].Value)
        .ToList();

    /*
     * Sample content of the resulting list (repeated links are kept):
     * [0] "http://news.sina.com.cn/c/2013-06-26/150027504392.shtml" string
     * [1] "http://news.sina.com.cn/c/2013-06-26/150027504392.shtml" string
     * [2] "http://news.sina.com.cn/c/2013-06-26/180027505390.shtml" string
     * [3] "http://news.sina.com.cn/c/2013-06-27/023927507832.shtml" string
     * .
     * .
     * .
     * .
     */
}