<a href=\"/g/s?sid=AY8M6wwMGACapz1QnW4_dx8n&aid=news_ss&id=news_20110418001122&pos=news2&icfa=news_01\">广东遭大风冰雹已致18人死亡</a><br/><a href=\"/g/s?sid=AY8M6wwMGACapz1QnW4_dx8n&aid=news_ss&id=news_20110418000838&pos=news2&icfa=news_01\">宁波发现400年前重要水利设施</a><br/><a href=\"/g/s?sid=AY8M6wwMGACapz1QnW4_dx8n&aid=news_ss&id=news_20110418000615&pos=news2&icfa=news_01\">3月全国仅2城市房价同比下跌</a><br/><a href=\"/g/s?sid=AY8M6wwMGACapz1QnW4_dx8n&aid=news_ss&id=news_20110418000405&pos=news2&icfa=news_01\">洋奶粉获利惊人价格说涨就涨</a><br/><a href=\"/g/s?sid=AY8M6wwMGACapz1QnW4_dx8n&aid=news_ss&id=news_20110418000305&pos=news2&icfa=news_01\">媒体称药家鑫算不上"富二代"</a><br/><a href=\"/g/s?sid=AY8M6wwMGACapz1QnW4_dx8n&aid=news_ss&id=news_20110418000265&pos=news2&icfa=news_01\">邢丹遗体火化后将与丛飞合葬</a><br/><a href=\"/g/s?sid=AY8M6wwMGACapz1QnW4_dx8n&aid=news_ss&id=news_20110418000339&pos=news2&icfa=news_01\">中科院科学家研制出戒烟
产品</a><br/><a href=\"/g/s?sid=AY8M6wwMGACapz1QnW4_dx8n&aid=news_ss&id=news_20110418000217&pos=news2&icfa=news_01\">北方沙尘天气今结束气温回升</a>
我现在要把这几条用正则匹配出来。因为整个页面的超链接太多,但我只要这八个。这是抓取QQ新闻的页面。
有会的吗?
产品</a><br/><a href=\"/g/s?sid=AY8M6wwMGACapz1QnW4_dx8n&aid=news_ss&id=news_20110418000217&pos=news2&icfa=news_01\">北方沙尘天气今结束气温回升</a>
我现在要把这几条用正则匹配出来。因为整个页面的超链接太多,但我只要这八个。这是抓取QQ新闻的页面。
有会的吗?
// Match a run of eight <a ...>text</a> elements in a single pass. The named
// group "key" sits inside the repeated group, so every repetition adds one
// entry to its capture list.
Regex anchorRun = new Regex("(?i)(?:(?<key><a[^>]*>[^><]+</a>)(?!<a).*){8}");
CaptureCollection anchors = anchorRun.Match(str).Groups["key"].Captures;

// Print each captured anchor element; nothing is printed when the match fails
// because the capture list is then empty.
for (int idx = 0; idx < anchors.Count; idx++)
{
    Console.WriteLine(anchors[idx]);
}
// Sample markup: eight news links separated by <br/>. The quotes around
// 富二代 are escaped so that they stay inside the string literal.
string links = "<a href=\"/g/s?sid=AY8M6wwMGACapz1QnW4_dx8n&aid=news_ss&id=news_20110418001122&pos=news2&icfa=news_01\">广东遭大风冰雹已致18人死亡</a><br/><a href=\"/g/s?sid=AY8M6wwMGACapz1QnW4_dx8n&aid=news_ss&id=news_20110418000838&pos=news2&icfa=news_01\">宁波发现400年前重要水利设施</a><br/><a href=\"/g/s?sid=AY8M6wwMGACapz1QnW4_dx8n&aid=news_ss&id=news_20110418000615&pos=news2&icfa=news_01\">3月全国仅2城市房价同比下跌</a><br/><a href=\"/g/s?sid=AY8M6wwMGACapz1QnW4_dx8n&aid=news_ss&id=news_20110418000405&pos=news2&icfa=news_01\">洋奶粉获利惊人价格说涨就涨</a><br/><a href=\"/g/s?sid=AY8M6wwMGACapz1QnW4_dx8n&aid=news_ss&id=news_20110418000305&pos=news2&icfa=news_01\">媒体称药家鑫算不上\"富二代\"</a><br/><a href=\"/g/s?sid=AY8M6wwMGACapz1QnW4_dx8n&aid=news_ss&id=news_20110418000265&pos=news2&icfa=news_01\">邢丹遗体火化后将与丛飞合葬</a><br/><a href=\"/g/s?sid=AY8M6wwMGACapz1QnW4_dx8n&aid=news_ss&id=news_20110418000339&pos=news2&icfa=news_01\">中科院科学家研制出戒烟产品</a><br/><a href=\"/g/s?sid=AY8M6wwMGACapz1QnW4_dx8n&aid=news_ss&id=news_20110418000217&pos=news2&icfa=news_01\">北方沙尘天气今结束气温回升</a>";

// Forum remark that accompanied this snippet:
// "/g/s\\?sid=AY8M6wwMGACapz1QnW4_dx8n" appears to be the distinguishing signature of these links?
//
// Pattern layout:
//   (?<=href=")     lookbehind: the link must directly follow href="
//   (?<link>...)    the URL, starting with the fixed /g/s?sid=... prefix, up to the closing quote
//   (?:">)          end of the opening tag (not captured)
//   (?<text>[^<>]*) the link text up to the next tag
// The lookbehind's opening "(" and the ")" closing the link group were missing,
// which made the Regex constructor throw.
new Regex("(?i)(?<=href=\")(?<link>/g/s\\?sid=AY8M6wwMGACapz1QnW4_dx8n[^\"]+)(?:\">)(?<text>[^<>]*)").Matches(links).Cast<Match>().ToList().ForEach(m =>
    Response.Write(string.Format("{0} {1}<br/>", m.Groups["link"].Value, m.Groups["text"].Value))
);
// Match a run of eight <a href="url">text</a> elements. The named groups "url"
// and "value" sit inside the repeated group, so each repetition adds one
// capture to both of them.
Regex reg = new Regex(@"(?i)(?:<a\s*href=\""?(?<url>[^\""]+)"">(?<value>[^><]+)</a>(?!<a).*){8}");
Match m = reg.Match(str);
CaptureCollection urls = m.Groups["url"].Captures;
CaptureCollection values = m.Groups["value"].Captures;

// When the input does not contain eight such links the match fails and both
// capture lists are empty. Bounding the loop by their counts avoids the
// ArgumentOutOfRangeException that indexing a fixed 0..7 range would raise.
for (int i = 0; i < urls.Count && i < values.Count; i++)
{
    Console.WriteLine("{0} : {1}", urls[i], values[i]);
}
/* I don't know what distinguishes the 8 links you want from the other links,
   so this simply takes 8 links whose href starts with
   /g/s?sid=AY8M6wwMGACapz1QnW4_dx8n
*/
// Sample input. It holds twelve links (the last four repeat earlier entries),
// i.e. more than the eight that are wanted. The quotes around 富二代 are
// escaped so that they stay inside the string literal.
string str = "<a href=\"/g/s?sid=AY8M6wwMGACapz1QnW4_dx8n&aid=news_ss&id=news_20110418001122&pos=news2&icfa=news_01\">广东遭大风冰雹已致18人死亡</a><br/>"
+ "<a href=\"/g/s?sid=AY8M6wwMGACapz1QnW4_dx8n&aid=news_ss&id=news_20110418000838&pos=news2&icfa=news_01\">宁波发现400年前重要水利设施</a><br/>"
+ "<a href=\"/g/s?sid=AY8M6wwMGACapz1QnW4_dx8n&aid=news_ss&id=news_20110418000615&pos=news2&icfa=news_01\">3月全国仅2城市房价同比下跌</a><br/>"
+ "<a href=\"/g/s?sid=AY8M6wwMGACapz1QnW4_dx8n&aid=news_ss&id=news_20110418000405&pos=news2&icfa=news_01\">洋奶粉获利惊人价格说涨就涨</a><br/>"
+ "<a href=\"/g/s?sid=AY8M6wwMGACapz1QnW4_dx8n&aid=news_ss&id=news_20110418000305&pos=news2&icfa=news_01\">媒体称药家鑫算不上\"富二代\"</a>"
+ "<br/><a href=\"/g/s?sid=AY8M6wwMGACapz1QnW4_dx8n&aid=news_ss&id=news_20110418000265&pos=news2&icfa=news_01\">邢丹遗体火化后将与丛飞合葬</a><br/>"
+ "<a href=\"/g/s?sid=AY8M6wwMGACapz1QnW4_dx8n&aid=news_ss&id=news_20110418000339&pos=news2&icfa=news_01\">中科院科学家研制出戒烟产品</a><br/>"
+ "<a href=\"/g/s?sid=AY8M6wwMGACapz1QnW4_dx8n&aid=news_ss&id=news_20110418000217&pos=news2&icfa=news_01\">北方沙尘天气今结束气温回升</a>"
+ "<a href=\"/g/s?sid=AY8M6wwMGACapz1QnW4_dx8n&aid=news_ss&id=news_20110418000217&pos=news2&icfa=news_01\">北方沙尘天气今结束气温回升</a>"
+ "<a href=\"/g/s?sid=AY8M6wwMGACapz1QnW4_dx8n&aid=news_ss&id=news_20110418000339&pos=news2&icfa=news_01\">中科院科学家研制出戒烟产品</a><br/>"
+ "<a href=\"/g/s?sid=AY8M6wwMGACapz1QnW4_dx8n&aid=news_ss&id=news_20110418000217&pos=news2&icfa=news_01\">北方沙尘天气今结束气温回升</a>"
+ "<a href=\"/g/s?sid=AY8M6wwMGACapz1QnW4_dx8n&aid=news_ss&id=news_20110418000217&pos=news2&icfa=news_01\">北方沙尘天气今结束气温回升</a>";
// Finds <a> tags whose href (optionally wrapped in ', " or preceded by
// whitespace) starts with the fixed /g/s?sid=... prefix. "href" captures the
// URL; "content" captures the text that follows the tag up to the next
// "<a" or "</a".
Regex linkPattern = new Regex(@"(?is)<a[^>]*?href=(['""\s]?)(?<href>\s*/g/s\?sid=AY8M6wwMGACapz1QnW4_dx8n[^'""\s]+)\1[^>]*?>(?<content>((?!</?a).)*)");
int seen = 0;
foreach (Match hit in linkPattern.Matches(str))
{
    seen += 1;
    if (seen > 8)
    {
        // Only the first eight links are wanted; this leaves the enclosing
        // method as soon as a ninth match shows up.
        return;
    }

    string line = "HREF:" + hit.Groups["href"].Value + " Content:" + hit.Groups["content"].Value + "<br/>";
    Response.Write(line);
}
/*
输出
HREF:/g/s?sid=AY8M6wwMGACapz1QnW4_dx8n&aid=news_ss&id=news_20110418001122&pos=news2&icfa=news_01 Content:广东遭大风冰雹已致18人死亡
HREF:/g/s?sid=AY8M6wwMGACapz1QnW4_dx8n&aid=news_ss&id=news_20110418000838&pos=news2&icfa=news_01 Content:宁波发现400年前重要水利设施
HREF:/g/s?sid=AY8M6wwMGACapz1QnW4_dx8n&aid=news_ss&id=news_20110418000615&pos=news2&icfa=news_01 Content:3月全国仅2城市房价同比下跌
HREF:/g/s?sid=AY8M6wwMGACapz1QnW4_dx8n&aid=news_ss&id=news_20110418000405&pos=news2&icfa=news_01 Content:洋奶粉获利惊人价格说涨就涨
HREF:/g/s?sid=AY8M6wwMGACapz1QnW4_dx8n&aid=news_ss&id=news_20110418000305&pos=news2&icfa=news_01 Content:媒体称药家鑫算不上"富二代"
HREF:/g/s?sid=AY8M6wwMGACapz1QnW4_dx8n&aid=news_ss&id=news_20110418000265&pos=news2&icfa=news_01 Content:邢丹遗体火化后将与丛飞合葬
HREF:/g/s?sid=AY8M6wwMGACapz1QnW4_dx8n&aid=news_ss&id=news_20110418000339&pos=news2&icfa=news_01 Content:中科院科学家研制出戒烟产品
HREF:/g/s?sid=AY8M6wwMGACapz1QnW4_dx8n&aid=news_ss&id=news_20110418000217&pos=news2&icfa=news_01 Content:北方沙尘天气今结束气温回升
*/
七楼:如果中间这个 sid 后面的参数是变化的呢?