取出一个页面中关于keywords和description的meta主要有以下几种情况
<meta name="keywords" content="123" />
<meta name="description" content="321" /><meta content="123" name="keywords" />
<meta content="321" name="description" /><meta id="fdsafds" name="keywords" content="123" />
<meta id="fdsafdsa" name="description" content="321" />以上几种情况都存在
1、没有双引号,也没有单引号
2、只有单引号
3、存在空格的地方都有可能是多个空格
4、大小写问题
5、可能会存在干扰的问题,如同时存在
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<meta http-equiv="Content-Language" content="zh-CN" />
<meta name="author" content="Csdn" />
<meta name="keywords" content="123" />
<meta name="description" content="321" /><meta content="123" name="keywords" />
<meta content="321" name="description" /><meta id="fdsafds" name="keywords" content="123" />
<meta id="fdsafdsa" name="description" content="321" />以上几种情况都存在
1、没有双引号,也没有单引号
2、只有单引号
3、存在空格的地方都有可能是多个空格
4、大小写问题
5、可能会存在干扰的问题,如同时存在
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<meta http-equiv="Content-Language" content="zh-CN" />
<meta name="author" content="Csdn" />
<meta name=description content=321 /><meta name='keywords' content='123' />
<meta name='description' content='321' />
<meta name='keywords' content='123' />
<meta name='description' content='321' />
<meta name=description content=""321"" />
<meta content=""123"" name=keywords />
<meta content=""321"" name=""description"" />
<meta id=""fdsafds"" name=""keywords"" content=""123"" />
<meta id=""fdsafdsa"" name=description content=""321"" />";
// Matches every complete <meta ...> tag whose name attribute is "keywords" or "description".
//  - The name value may be double-quoted, single-quoted or unquoted; the named group "quote"
//    forces the closing quote to be the same character as the opening one.
//  - [^>] keeps the whole match inside a single tag, so neighbouring tags
//    (http-equiv, author, ...) cannot interfere, and attribute order does not matter.
//  - [^>]* (not [^>]+) after the name also accepts a tag in which name is the last attribute.
//  - RegexOptions.IgnoreCase accepts <META NAME="Keywords" ...> as well.
// Group 1 still captures the matched name (keywords or description).
Regex reg = new Regex(
    @"<meta\b[^>]*?\sname\s*=\s*(?<quote>[""']?)(keywords|description)\k<quote>(?=[\s/>])[^>]*>",
    RegexOptions.IgnoreCase);
MatchCollection mc = reg.Matches(str);
foreach (Match m in mc)
{
    // m.Value is the full text of the matched tag.
    Console.WriteLine(m.Value);
}
/*
Output reported for the sample input:
<meta name="keywords" content="123" />
<meta name=description content="321" />
<meta content="123" name=keywords />
<meta content="321" name="description" />
<meta id="fdsafds" name="keywords" content="123" />
<meta id="fdsafdsa" name=description content="321" />
*/
// Append the text of every match of reg in yourStr to the rich text box, one match per line.
foreach (Match match in reg.Matches(yourStr))
{
    richTextBox2.Text += string.Concat(match.Value, "\n");
}
对于 <meta name="keywords" content="123" />,我想得到 123。可以用两个正则表达式,分别取 keywords 和 description:
/// <summary>
/// Demo: prints the content value of every keywords meta tag, then of every
/// description meta tag, found in a fixed sample HTML fragment.
/// </summary>
/// <remarks>
/// Only double-quoted attribute values are recognised by these patterns.
/// The two lists are printed independently; entries are in document order.
/// </remarks>
private static void TestRegex11()
{
    string html = @"<meta name=""keywords"" content=""123"" />
<meta name=""description"" content=""321"" /><meta content=""123"" name=""keywords"" />
<meta content=""321"" name=""description"" /><meta id=""fdsafds"" name=""keywords"" content=""123"" />
<meta id=""fdsafdsa"" name=""description"" content=""321"" />";

    // The look-behind positions the match right after content=" of a <meta tag, and the
    // nested look-ahead demands name="..." somewhere in that same tag.
    // [^>] is used instead of '.' on purpose: '.' may run across a tag's closing '>', so a
    // description tag followed on the same line by a keywords tag would also report the
    // keywords content as a description.
    MatchCollection keywords = Regex.Matches(
        html, @"(?<=<meta(?=[^>]+?name=""keywords"")[^>]+?content="")[^""]+");
    MatchCollection descriptions = Regex.Matches(
        html, @"(?<=<meta(?=[^>]+?name=""description"")[^>]+?content="")[^""]+");

    Console.WriteLine("keywords");
    foreach (Match key in keywords)
    {
        Console.WriteLine(key.Value);
    }

    Console.WriteLine("descriptions");
    foreach (Match description in descriptions)
    {
        Console.WriteLine(description.Value);
    }
}
没给你配对顺序。你可以自己把 keywords 和 description 配对起来,顺序应该是一样的。
{
    // Demo: print the content value of every keywords meta tag, then of every description
    // meta tag, in a sample HTML fragment. Attribute values may be double-quoted,
    // single-quoted or unquoted, and attribute order is free.
    string html = @"<meta name=""keywords"" content=""123"" />
<meta name=""description"" content=""321"" /><meta content=""123"" name=""keywords"" />
<meta content=""321"" name=""description"" /><meta id=""fdsafds"" name=""keywords"" content=""123"" />
<meta id=""fdsafdsa"" name=""description"" content=""321"" />";

    // Shared tail: finds the content attribute inside the current tag ([^>] never leaves the
    // tag) and captures its value in the named group "value". Each quoting style has its own
    // alternative, so a quoted value may contain whitespace and the other kind of quote.
    const string contentValue =
        @"[^>]*?\scontent\s*=\s*(?:""(?<value>[^""]*)""|'(?<value>[^']*)'|(?<value>[^\s""'>]+))";

    // The look-ahead checks the name attribute anywhere in the same tag; \1 forces the closing
    // quote of the name to equal the opening one (or both to be absent).
    MatchCollection keywords = Regex.Matches(
        html,
        @"<meta\b(?=[^>]*?\sname\s*=\s*(['""]?)keywords\1[\s/>])" + contentValue,
        RegexOptions.IgnoreCase);
    MatchCollection descriptions = Regex.Matches(
        html,
        @"<meta\b(?=[^>]*?\sname\s*=\s*(['""]?)description\1[\s/>])" + contentValue,
        RegexOptions.IgnoreCase);

    Console.WriteLine("keywords");
    foreach (Match key in keywords)
    {
        Console.WriteLine(key.Groups["value"].Value);
    }

    Console.WriteLine("descriptions");
    foreach (Match description in descriptions)
    {
        Console.WriteLine(description.Groups["value"].Value);
    }
}
@"(?is)(?<=<meta(?=((?!name).)*name=['""]?(keywords|description)[""'\s>]?)((?!content).)*content=[""']?)[^'""\s>]+"
正常情况下都可以,但当我把字符串改成下面这样就不行了:
string html = @"<meta name=""keywords"" content=""123"" />
<meta name=""description"" content=""321"" /><meta content='123hhhhhh' name='keywords' />
<meta content=""321ggggggg"" name=""description"" /><meta id=fdsafds name=keywords content=123 />
<meta id=""fdsafdsa"" name=""description"" content=""321"" />";
{
    // Demo: print the content value of every keywords meta tag, then of every description
    // meta tag, in a sample that mixes double-quoted, single-quoted and unquoted attributes.
    string html = @"<meta name=""keywords"" content=""123"" />
<meta name=""description"" content=""321"" /><meta content='123hhhhhh' name='keywords' />
<meta content=""321ggggggg"" name=""description"" /><meta id=fdsafds name=keywords content=123 />
<meta id=""fdsafdsa"" name=""description"" content=""321"" />";

    // Shared tail: finds the content attribute inside the current tag ([^>] never leaves the
    // tag) and captures its value in the named group "value". Each quoting style has its own
    // alternative, so the closing quote always matches the opening quote of content itself.
    const string contentValue =
        @"[^>]*?\scontent\s*=\s*(?:""(?<value>[^""]*)""|'(?<value>[^']*)'|(?<value>[^\s""'>]+))";

    // The look-ahead checks the name attribute anywhere in the same tag; \1 forces the closing
    // quote of the name to equal the opening one (or both to be absent).
    MatchCollection keywords = Regex.Matches(
        html,
        @"<meta\b(?=[^>]*?\sname\s*=\s*(['""]?)keywords\1[\s/>])" + contentValue,
        RegexOptions.IgnoreCase);
    // This pattern must look for "description"; searching for "keywords" here would print
    // the keywords values a second time under the descriptions heading.
    MatchCollection descriptions = Regex.Matches(
        html,
        @"<meta\b(?=[^>]*?\sname\s*=\s*(['""]?)description\1[\s/>])" + contentValue,
        RegexOptions.IgnoreCase);

    Console.WriteLine("keywords");
    foreach (Match key in keywords)
    {
        Console.WriteLine(key.Groups["value"].Value);
    }

    Console.WriteLine("descriptions");
    foreach (Match description in descriptions)
    {
        Console.WriteLine(description.Groups["value"].Value);
    }
}