// Sample markup: a <div> holding leading text followed by a single <span>.
string html = "<div>hhh<span>kkkk</span></div>";

// (?is) turns on case-insensitive and single-line matching.
// text1 captures the text directly inside the <div>; text2 captures the text inside the <span>.
const string tagPattern = "(?is)<div[^>]*>(?<text1>[^<]*)<span[^>]*>(?<text2>[^<]*)</span></div>";

Match match = Regex.Match(html, tagPattern);
if (match.Success)
{
    // Print the two captured text runs in document order, one per line.
    foreach (string groupName in new[] { "text1", "text2" })
    {
        Console.WriteLine(match.Groups[groupName].Value);
    }
}
// Matches a <div> that wraps leading text plus one <span>; the named groups
// text1 and text2 capture the two text runs.
string divSpanPattern = "(?is)<div[^>]*>(?<text1>[^<]*)<span[^>]*>(?<text2>[^<]*)</span></div>";
Regex extractor = new Regex(divSpanPattern);

// html is the sample markup string declared before this snippet.
Match hit = extractor.Match(html);
if (hit.Success)
{
    GroupCollection captured = hit.Groups;
    Console.WriteLine(captured["text1"].Value);
    Console.WriteLine(captured["text2"].Value);
}
<!-- Page-header markup: a login-info script followed by a quick-menu of links; presumably the sample input for the text-extraction question. The excerpt is cut short: li.mytaobao, ul.quick-menu, #site-nav and #header are never closed here. -->
<div id="header">
<div id="site-nav">
<p class="login-info">
<script>TB.Header.writeLoginInfo({"memberServer": "http://member1.taobao.com", "loginServer": "https://login.taobao.com", redirectUrl : "", logoutUrl : ""});</script>
</p>
<ul class="quick-menu">
<li class="home"><a href="http://www.taobao.com/">淘宝网首页</a></li>
<li><a href="http://list.taobao.com/browse/cat-0.htm" target="_top">我要买</a></li>
<li class="mytaobao menu-item">
<div class="menu">
<a class="menu-hd" href="http://i.taobao.com/my_taobao.htm" target="_top">我的淘宝<b></b></a>
<div class="menu-bd">
<div class="menu-bd-panel">
<div>
<a href="http://trade.taobao.com/trade/itemlist/list_bought_items.htm" target="_top">已买到的宝贝</a><br>
<a href="http://trade.taobao.com/trade/itemlist/list_sold_items.htm" target="_top">已卖出的宝贝</a>
</div>
</div>
<s class="r"></s><s class="rt"></s><s class="lt"></s><s class="b"></s><s class="b b2"></s><s class="rb"></s><s class="lb"></s>
</div>
</div>
需求:提取出所有的文本,并获取每段文字在整段 HTML 中的位置。
/// <summary>
/// Prints each text fragment found between the tags of a sample HTML string, one per line.
/// </summary>
/// <remarks>
/// Regex.Split yields an empty string wherever a tag sits at the start or end of the
/// input or two tags are adjacent, so empty fragments are skipped instead of being
/// printed as blank lines.
/// </remarks>
private static void TestRegex34()
{
    string yourStr = @"<div>hhh<span>kkkk</span></div>";

    // Every "<...>" tag acts as a delimiter; whatever remains between delimiters is text.
    string[] items = Regex.Split(yourStr, @"<[^<>]*>");
    foreach (string s in items)
    {
        if (s.Length == 0)
        {
            continue;
        }

        Console.WriteLine(s);
    }
}
// Alternative approach: match the text nodes directly, so each match also carries its position.
string yourStr = @"<div>hhh<span>kkkk</span></div>";

// A text node is any run of non-bracket characters that starts right after the '>' closing a tag.
// Anchoring on '>' alone (rather than on an opening <div>/<span>) also picks up text that follows
// a closing tag or sits inside any other element, e.g. "gtr" in "<span>def</span>gtr".
string textPattern = @"(?<=>)[^<>]+";
MatchCollection mc = Regex.Matches(yourStr, textPattern);
foreach (Match m in mc)
{
    // Skip runs that are only whitespace (indentation or newlines between tags).
    if (string.IsNullOrWhiteSpace(m.Value))
    {
        continue;
    }

    // m.Index is the zero-based offset of the text within the whole HTML string.
    Console.WriteLine("位置:" + m.Index.ToString());
    Console.WriteLine(m.Value);
}
// Result:
位置:5
hhh
位置:14
kkkk
就是比如有一段 HTML:"<div class=''>abc<span>def</span>gtr</div><div style="">wsdl</div><a href=""/><img src=""/>……"
然后获取 HTML 标签中的文字(abc、def、gtr),并得到它们各自在整个 HTML 中的位置。
就是想做自己的个人博客,如sina blog那种的,每篇文章有自己的摘要,但还是保留原来的样式如:
正则容错性差