但是那样会整个页面中<a></a>之间的都提取出来,不过,我只要提取格式为这种的就可以, <a href="/studentinfo/XXXXXXXXXXX.html" Target="_blank">文本</a> (就是一定要为html结尾的,且前面的开头一定要为=/studentinfo/这种的<a></a>之间的值) [/Quote]
// Two sample anchors; both hrefs start with /studentinfo/ and end in .html.
string str = @"<a href=""/studentinfo/12d12345-3456-7a9a-12e4-53fbd.html"" target=""_blank"">张三</a> <a href=""/studentinfo/XXXXXXXXXXX.html"" Target=""_blank"">文本</a>";
// Capture the link text only for anchors whose quoted href starts with
// /studentinfo/ and ends in .html.
//  - The href value is restricted to characters other than quotes and '>', so it
//    cannot run past its own closing quote into a later tag (the previous .*? could,
//    which let a non-.html studentinfo link borrow ".html" from a following anchor).
//  - \b after "a" keeps tags such as <abbr> from being treated as <a>.
//  - IgnoreCase because HTML tag and attribute names are case-insensitive.
Regex reg = new Regex(
    @"<a\b[^>]*?\shref=['""]/studentinfo/[^'"">]*\.html['""][^>]*>(?<text>(?:(?!</?a\b).)*)",
    RegexOptions.IgnoreCase);
foreach (Match m in reg.Matches(str))
{
    // The "text" group holds everything between the opening tag and the next <a or </a.
    Console.WriteLine(m.Groups["text"].Value);
}
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;

namespace ConsoleApplication13
{
    /// <summary>
    /// Console demo that extracts the link text of anchors pointing at
    /// /studentinfo/*.html pages.
    /// </summary>
    class Program
    {
        // Matches <a ... href="/studentinfo/....html" ...>text</a>.
        //  - The href value excludes quotes and '>' so it cannot spill into a later tag.
        //  - The text group is lazy, so each anchor ends at its own </a> instead of the
        //    last </a> in the input.
        //  - No ^ or $ anchors are used, so RegexOptions.Multiline is not needed.
        private static readonly Regex StudentLinkRegex = new Regex(
            @"<a\b[^>]*?\shref=['""]/studentinfo/[^'"">]*\.html['""][^>]*>(?<text>[\s\S]*?)</a>",
            RegexOptions.IgnoreCase);

        /// <summary>
        /// Runs the extraction against a single sample anchor and prints each link text.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            string tempStr = "<a href=\"/studentinfo/12d12345-3456-7a9a-12e4-53fbd.html\" target=\"_blank\">张三</a>";
            foreach (string text in ExtractStudentLinkTexts(tempStr))
            {
                Console.WriteLine(text); // prints 张三 for the sample above
            }
        }

        /// <summary>
        /// Returns the inner text of every anchor in <paramref name="html"/> whose quoted
        /// href starts with /studentinfo/ and ends in .html, in document order.
        /// </summary>
        /// <param name="html">Markup to scan; null or empty yields an empty list.</param>
        /// <returns>The captured link texts (possibly empty, never null).</returns>
        internal static List<string> ExtractStudentLinkTexts(string html)
        {
            List<string> texts = new List<string>();
            if (string.IsNullOrEmpty(html))
            {
                return texts;
            }

            foreach (Match m in StudentLinkRegex.Matches(html))
            {
                texts.Add(m.Groups["text"].Value);
            }

            return texts;
        }
    }
}
// Group 1 is the link text of an <a> whose quoted href starts with /studentinfo/
// and ends in .html. Negated classes keep the match inside one tag and the lazy
// (.+?) stops at the first </a>, so several anchors on one line match separately.
/<a\s[^>]*href=["']\/studentinfo\/[^"'>]*\.html["'][^>]*>(.+?)<\/a>/
采集吗? <a href="/studentinfo/12d12345-3456-7a9a-12e4-53fbd.html" target="_blank">张三</a>
// Finds anchors written as <a href=... target="_blank">text</a> in pageList.
// Group 1 = href value WITHOUT surrounding quotes, group 2 = link text.
// Inside a verbatim string a double quote must be doubled (""); the undoubled
// quotes around _blank ended the literal early and did not compile.
Match match = new Regex(@"<a href=[""']?([^""'\s>]*)[""']? target=""_blank"">([\s\S]*?)</a>", RegexOptions.IgnoreCase).Match(pageList);
if (!match.Success)
{
    // NOTE(review): `break` implies an enclosing loop that is not visible here - confirm.
    break;
}
while (match.Success)
{
    // Group 1 no longer includes the quotes, so the concatenated URL is well-formed.
    string url = "http://www.baidu.com" + match.Groups[1].Value;
    string name = match.Groups[2].Value;
    Console.WriteLine(name);
    GetInfoPath(url, name);
    match = match.NextMatch();
}
// Sample markup containing a single anchor.
string tempStr = "<a href=\"/studentinfo/12d12345-3456-7a9a-12e4-53fbd.html\" target=\"_blank\">张三</a>";
// Named groups: "href" is the attribute value (double-quoted, single-quoted or
// unquoted form); "text" is everything up to the nearest closing </a>.
string pattern = @"<a[^>]*href=(""(?<href>[^""]*)""|'(?<href>[^']*)'|(?<href>[^\s>]*))[^>]*>(?<text>[\s\S]*?)</a>";
MatchCollection mc = Regex.Matches(tempStr, pattern, RegexOptions.Multiline | RegexOptions.IgnoreCase);
foreach (Match anchor in mc)
{
    // The complete matched tag, from <a through </a>.
    string wholeTag = anchor.Value;
    // The href attribute value.
    string link = anchor.Groups["href"].Value;
    // The link text; 张三 for the sample above.
    string innerText = anchor.Groups["text"].Value;
}
提问要求不明确是要匹配还是提取
// Matches a whole anchor written as <a href="..." target="_blank">...</a>, with
// optional whitespace between the parts; both the href value and the link text
// are matched lazily.
const string anchorPattern = "<a\\s*href=\".*?\"\\s*target=\"_blank\"\\s*>.*?</a>";
Regex re = new Regex(anchorPattern, RegexOptions.None);
// Prints the text of the first <a> element in str. The lookbehind places the
// match right after an opening <a ...> tag, and the body stops before the next
// <a or </a. The \b after "a" keeps tags that merely start with that letter
// (e.g. <abbr>, <article>) from counting as an anchor or ending the text early.
Regex reg = new Regex(@"(?<=<a\b[^>]*?>)(?:(?!</?a\b).)*");
Console.WriteLine(reg.Match(str).Value);
2楼的不就能用!~
[/Quote]但是那样会把整个页面中所有<a></a>之间的内容都提取出来,而我只需要提取下面这种格式的链接文本:
<a href="/studentinfo/XXXXXXXXXXX.html" Target="_blank">文本</a>
(也就是说,href 必须以 /studentinfo/ 开头、以 .html 结尾,只提取这类<a></a>之间的值)
但是那样会把整个页面中所有<a></a>之间的内容都提取出来,而我只需要提取下面这种格式的链接文本:
<a href="/studentinfo/XXXXXXXXXXX.html" Target="_blank">文本</a>
(也就是说,href 必须以 /studentinfo/ 开头、以 .html 结尾,只提取这类<a></a>之间的值)
[/Quote]
// Two sample anchors on separate lines; both hrefs start with /studentinfo/ and end in .html.
string str = @"<a href=""/studentinfo/12d12345-3456-7a9a-12e4-53fbd.html"" target=""_blank"">张三</a>
<a href=""/studentinfo/XXXXXXXXXXX.html"" Target=""_blank"">文本</a>";
// Capture the link text only for anchors whose quoted href starts with
// /studentinfo/ and ends in .html.
//  - The href value is restricted to characters other than quotes and '>', so it
//    cannot run past its own closing quote into a later tag (the previous .*? could,
//    which let a non-.html studentinfo link borrow ".html" from a following anchor).
//  - \b after "a" keeps tags such as <abbr> from being treated as <a>.
//  - IgnoreCase because HTML tag and attribute names are case-insensitive.
Regex reg = new Regex(
    @"<a\b[^>]*?\shref=['""]/studentinfo/[^'"">]*\.html['""][^>]*>(?<text>(?:(?!</?a\b).)*)",
    RegexOptions.IgnoreCase);
foreach (Match m in reg.Matches(str))
{
    // The "text" group holds everything between the opening tag and the next <a or </a.
    Console.WriteLine(m.Groups["text"].Value);
}
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;

namespace ConsoleApplication13
{
    /// <summary>
    /// Console demo that extracts the link text of anchors pointing at
    /// /studentinfo/*.html pages.
    /// </summary>
    class Program
    {
        // Matches <a ... href="/studentinfo/....html" ...>text</a>.
        //  - The href value excludes quotes and '>' so it cannot spill into a later tag.
        //  - The text group is lazy, so each anchor ends at its own </a> instead of the
        //    last </a> in the input.
        //  - No ^ or $ anchors are used, so RegexOptions.Multiline is not needed.
        private static readonly Regex StudentLinkRegex = new Regex(
            @"<a\b[^>]*?\shref=['""]/studentinfo/[^'"">]*\.html['""][^>]*>(?<text>[\s\S]*?)</a>",
            RegexOptions.IgnoreCase);

        /// <summary>
        /// Runs the extraction against a single sample anchor and prints each link text.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            string tempStr = "<a href=\"/studentinfo/12d12345-3456-7a9a-12e4-53fbd.html\" target=\"_blank\">张三</a>";
            foreach (string text in ExtractStudentLinkTexts(tempStr))
            {
                Console.WriteLine(text); // prints 张三 for the sample above
            }
        }

        /// <summary>
        /// Returns the inner text of every anchor in <paramref name="html"/> whose quoted
        /// href starts with /studentinfo/ and ends in .html, in document order.
        /// </summary>
        /// <param name="html">Markup to scan; null or empty yields an empty list.</param>
        /// <returns>The captured link texts (possibly empty, never null).</returns>
        internal static List<string> ExtractStudentLinkTexts(string html)
        {
            List<string> texts = new List<string>();
            if (string.IsNullOrEmpty(html))
            {
                return texts;
            }

            foreach (Match m in StudentLinkRegex.Matches(html))
            {
                texts.Add(m.Groups["text"].Value);
            }

            return texts;
        }
    }
}
<a href="/studentinfo/12d12345-3456-7a9a-12e4-53fbd.html" target="_blank">张三</a>
// Finds anchors written as <a href=... target="_blank">text</a> in pageList.
// Group 1 = href value WITHOUT surrounding quotes, group 2 = link text.
// Inside a verbatim string a double quote must be doubled (""); the undoubled
// quotes around _blank ended the literal early and did not compile.
Match match = new Regex(@"<a href=[""']?([^""'\s>]*)[""']? target=""_blank"">([\s\S]*?)</a>", RegexOptions.IgnoreCase).Match(pageList);
if (!match.Success)
{
    // NOTE(review): `break` implies an enclosing loop that is not visible here - confirm.
    break;
}
while (match.Success)
{
    // Group 1 no longer includes the quotes, so the concatenated URL is well-formed.
    string url = "http://www.baidu.com" + match.Groups[1].Value;
    string name = match.Groups[2].Value;
    Console.WriteLine(name);
    GetInfoPath(url, name);
    match = match.NextMatch();
}