【【DataTable每次增加为什么会覆盖之前的数据?求助!!!】】 本帖最后由 kayoo 于 2009-08-29 18:40:08 编辑 解决方案 » 免费领取超大流量手机卡,每月29元包185G流量+100分钟通话, 中国电信官方发货 PS:每次单击按钮获取的XML都是不同的 还有一个小问题,就是如何才能实现HTML的解析,我从网上查到设置HtmlEncode="false" 可是我是动态创建的 如何添加 if(!isPostBack){}使用mshtml和正则解析html或XPath string tag = @"(?:[\w-:]+)"; string attribute = @"(?:[\w-:]+)(?:=(?:[^\s\>\<]*|\""[\s\S]*?\""|\'[\s\S]*?\'))?"; string name = @"(?:[\w-:]+)"; string xmlDirective = @"(?:\<!" +name + @"(?:\s+" +argument + @")*\s*\>)"; string xmlCData = @"(?:\<!\[CDATA\[(?:[\s\S]*?)\]\]\>)"; 参考 using System; 2using System.Collections.Generic; 3using System.Text; 4using System.Text.RegularExpressions; 5using MIL.Html; 6 7namespace Yuanso.Sitework.Crawler 8{ 9 public class HtmlUtil 10 { 11 /**//// <summary> 12 /// Written: [CHINA] Zhang Liu 13 /// Date: 1,Jun,2006 14 /// Version: 1.0 15 /// Support: MYBASK <see cref="http://www.mybask.net"/> 16 /// Looking for latest version or similar implementation of this function, please visit: <seealso cref="http://www.mybask.net"/> 17 /// Summary: 18 /// Picking up text content from a html document. This function will remove: 19 /// 1. <%=%> 20 /// 2. script 21 /// 3. style 22 /// 4. html tags 23 /// 6. and others 24 /// 7. html comments 25 /// After all above removed, \r\n will be replaced by an empty character. 26 /// </summary> 27 /// <param name="strHtml">string:Waiting for striping html,javascript, style elements</param> 28 /// <returns>string: Stripped text</returns> 29 public static string ExtractContent(string strHtml) 30 { 31 //All the regular expression for matching html, javascript, style elements and others. 32 string[] aryRegex ={@"<%=[\w\W]*?%>", @"<script[\w\W]*?</script>", @"<style[\w\W]*?</style>", @"<[/]?[\w\W]*?>", @"([\r\n])[\s]+", 33 @"&(nbsp|#160);", @"&(iexcl|#161);", @"&(cent|#162);", @"&(pound|#163);", @"&(copy|#169);", 34 @"&#(\d+);", @"-->", @"<!--.*\n"}; 35 //Corresponding replacment to the regular expressions. 
36 //string[] aryReplacment = { "", "", "", "", "", " ", "\xa1", "\xa2", "\xa3", "\xa9", "", "\r\n", "" }; 37 string[] aryReplacment = { "", "", "", "", "", " ", "", "", "", "", "", "", "" }; 38 string strStripped = strHtml; 39 //Loop to replacing. 40 for (int i = 0; i < aryRegex.Length; i++) 41 { 42 Regex regex = new Regex(aryRegex[i], RegexOptions.IgnoreCase); 43 strStripped = regex.Replace(strStripped, aryReplacment[i]); 44 } 45 //Replace "\r\n" to an empty character. 46 strStripped.Replace("\r\n", ""); 47 strStripped.Replace("\t", ""); 48 //Return stripped string. 49 return strStripped; 50 } 51 public static string ExtractTitle(string strHtml) 52 { 53 54 string title; 55 //string titleResult; 56 Match m; 57 string titlePatern = @"<title[^>]*?>.*?</title>"; 58 Regex regex = new Regex(titlePatern, RegexOptions.IgnoreCase); 59 m = regex.Match(strHtml); 60 if (m.Success) 61 { 62 title = m.Value.ToString(); 63 title = title.Replace("<title>", ""); 64 title = title.Replace("</title>", ""); 65 } 66 else title = "无标题"; 67 68 return title; 69 } 70 /**//// <summary> 71 /// 此私有方法从一段HTML文本中提取出一定字数的纯文本 72 /// </summary> 73 /// <param name="instr">HTML代码</param> 74 /// <param name="firstN">提取从头数多少个字</param> 75 /// <param name="withLink">是否要链接里面的字</param> 76 /// <returns>纯文本</returns> 77 public static string getFirstNchar(string instr, int firstN, bool withLink) 78 { 79 string strStripped; 80 strStripped = instr.Clone() as string; 81 strStripped = new Regex(@"(?m)<script[^>]*>(\w|\W)*?</script[^>]*>", RegexOptions.Multiline | RegexOptions.IgnoreCase).Replace(strStripped, ""); 82 strStripped = new Regex(@"(?m)<style[^>]*>(\w|\W)*?</style[^>]*>", RegexOptions.Multiline | RegexOptions.IgnoreCase).Replace(strStripped, ""); 83 strStripped = new Regex(@"(?m)<select[^>]*>(\w|\W)*?</select[^>]*>", RegexOptions.Multiline | RegexOptions.IgnoreCase).Replace(strStripped, ""); 84 if (!withLink) strStripped = new Regex(@"(?m)<a[^>]*>(\w|\W)*?</a[^>]*>", RegexOptions.Multiline | 
RegexOptions.IgnoreCase).Replace(strStripped, ""); 85 Regex objReg = new System.Text.RegularExpressions.Regex("(<[^>]+?>)| ", RegexOptions.Multiline | RegexOptions.IgnoreCase); 86 strStripped = objReg.Replace(strStripped, ""); 87 Regex objReg2 = new System.Text.RegularExpressions.Regex("(\\s)+", RegexOptions.Multiline | RegexOptions.IgnoreCase); 88 strStripped = objReg2.Replace(strStripped, " "); 89 //return strStripped.Length > firstN ? strStripped.Substring(0, firstN) : strStripped; 90 return strStripped; 91 } 92 93 public static string getTitle(string strHtml) 94 { 95 string title=""; 96 Regex reg = new Regex(@"(?m)<title[^>]*>(?<title>(?:\w|\W)*?)</title[^>]*>", RegexOptions.Multiline | RegexOptions.IgnoreCase); 97 Match mc = reg.Match(strHtml); 98 if (mc.Success) 99 title = mc.Groups["title"].Value.Trim();100101 return title;102 }103 }104 public class Htmlpage105 {106 public static string GetTitle(string strHtml)107 {108 MIL.Html.HtmlDocument documnet;109 HtmlParser parser = new HtmlDomainTreeParser();110 documnet = parser.Parse(strHtml);111 StringBuilder text = new StringBuilder("");112 foreach (HtmlNode node in documnet.Nodes.FindAllText(true))113 {114115 HtmlText textNode;116 textNode = (HtmlText)node;117 if (!textNode.Text.Contains("\r") && !textNode.Text.Contains("\n"))118 {119 text.Append(textNode.Text);120 break;121 }122 123 }124 return text.ToString();125126 }127 public static string GetContent(string strHtml)128 {129 MIL.Html.HtmlDocument documnet;130 HtmlParser parser = new HtmlDomainTreeParser();131 documnet = parser.Parse(strHtml);132 StringBuilder text = new StringBuilder();133 foreach (HtmlNode node in documnet.Nodes.FindAllText(true))134 {135136 HtmlText textNode;137 textNode = (HtmlText)node;138 if (textNode.Text.Contains("\r") || textNode.Text.Contains("\n"))139 continue;140 else text.Append(textNode.Text);141142 }143 return text.ToString();144145 }146 }147148}149 求下面代码能在火狐浏览器上运行 带自定义模板的后台管理系统 在线等,如何实现动态的资源帮定。 如何解决数据大量访问量的问题 不知道有没有人用ASP.NET 
WEB MATRIX来开发程序的,对中文的问题你们是怎么解决的 请问对在asp:table 中,如何其居中设计呢,因为HorizontalAlign=center无效 乱码问题~ 小弟做了一个网站,进去载入图片特别慢,哪位大神能帮忙看一下 关于asp.net Ilist集合的问题? 到底那里出错了? 请问用linq的话怎么做查询拼接 请问哪里有免费的彩票开奖查询接口?
{
}
使用mshtml和正则解析html
或XPath
// Building blocks for a regex-based HTML/XML tokenizer. Each string is a
// non-capturing group so the fragments can be concatenated into larger patterns.

// A tag name: one or more characters from the class [\w-:].
// NOTE(review): presumably meant as "word characters, '-' and ':'" — confirm how
// the regex engine parses a hyphen placed between \w and ':'.
string tag = @"(?:[\w-:]+)";
// An attribute: a name, optionally followed by '=' and a value that is either
// unquoted (no whitespace or angle brackets), double-quoted, or single-quoted.
string attribute = @"(?:[\w-:]+)(?:=(?:[^\s\>\<]*|\""[\s\S]*?\""|\'[\s\S]*?\'))?";
// A generic name; same pattern as `tag`.
string name = @"(?:[\w-:]+)";
// A directive of the form "<!" name arg* ">" with whitespace-separated arguments.
// NOTE(review): `argument` is not declared anywhere in this snippet — its
// declaration appears to be missing from the post; confirm against the original source.
string xmlDirective = @"(?:\<!" +name + @"(?:\s+" +argument + @")*\s*\>)";
// A CDATA section: "<![CDATA[" ... "]]>", with the content matched lazily.
string xmlCData = @"(?:\<!\[CDATA\[(?:[\s\S]*?)\]\]\>)";
参考
2using System.Collections.Generic;
3using System.Text;
4using System.Text.RegularExpressions;
5using MIL.Html;
6
7namespace Yuanso.Sitework.Crawler
8{
/// <summary>
/// Regex-based helpers for extracting plain text and the page title from raw HTML markup.
/// All patterns are case-insensitive and are built once, then reused across calls.
/// </summary>
public class HtmlUtil
{
    /// <summary>Placeholder returned by <see cref="ExtractTitle"/> when no title element is found.</summary>
    private const string MissingTitle = "无标题";

    /// <summary>
    /// Patterns applied, in this order, by <see cref="ExtractContent"/>. Order matters:
    /// script/style bodies and whole comments must go before the generic tag pattern,
    /// otherwise a '&gt;' inside them ends the "tag" early and leaks the rest as text.
    /// </summary>
    private static readonly Regex[] ContentPatterns =
    {
        new Regex(@"<%=[\w\W]*?%>", RegexOptions.IgnoreCase),            // server-side expressions
        new Regex(@"<script[\w\W]*?</script>", RegexOptions.IgnoreCase), // script blocks
        new Regex(@"<style[\w\W]*?</style>", RegexOptions.IgnoreCase),   // style blocks
        new Regex(@"<!--[\w\W]*?-->", RegexOptions.IgnoreCase),          // complete comments
        new Regex(@"<[/]?[\w\W]*?>", RegexOptions.IgnoreCase),           // any remaining tag
        new Regex(@"([\r\n])[\s]+", RegexOptions.IgnoreCase),            // line break + following whitespace
        new Regex(@"&(nbsp|#160);", RegexOptions.IgnoreCase),            // must precede the generic &#NNN; rule
        new Regex(@"&(iexcl|#161);", RegexOptions.IgnoreCase),
        new Regex(@"&(cent|#162);", RegexOptions.IgnoreCase),
        new Regex(@"&(pound|#163);", RegexOptions.IgnoreCase),
        new Regex(@"&(copy|#169);", RegexOptions.IgnoreCase),
        new Regex(@"&#(\d+);", RegexOptions.IgnoreCase),                 // any other numeric entity
        new Regex(@"-->", RegexOptions.IgnoreCase),                      // stray comment terminator
        new Regex(@"<!--.*\n", RegexOptions.IgnoreCase)                  // unterminated comment opener
    };

    /// <summary>Replacement for the pattern at the same index in <see cref="ContentPatterns"/>.</summary>
    private static readonly string[] ContentReplacements =
    {
        "", "", "", "", "", "", " ", "", "", "", "", "", "", ""
    };

    /// <summary>
    /// Matches a title element and captures its inner text in the group named "title".
    /// [\s\S] is used instead of '.' so titles spanning several lines are matched.
    /// </summary>
    private static readonly Regex TitlePattern =
        new Regex(@"<title\b[^>]*>(?<title>[\s\S]*?)</title[^>]*>", RegexOptions.IgnoreCase);

    private static readonly Regex ScriptBlock =
        new Regex(@"<script\b[^>]*>[\s\S]*?</script[^>]*>", RegexOptions.IgnoreCase);

    private static readonly Regex StyleBlock =
        new Regex(@"<style\b[^>]*>[\s\S]*?</style[^>]*>", RegexOptions.IgnoreCase);

    private static readonly Regex SelectBlock =
        new Regex(@"<select\b[^>]*>[\s\S]*?</select[^>]*>", RegexOptions.IgnoreCase);

    /// <summary>Anchor elements only; \b keeps abbr, article, etc. from matching.</summary>
    private static readonly Regex AnchorBlock =
        new Regex(@"<a\b[^>]*>[\s\S]*?</a\b[^>]*>", RegexOptions.IgnoreCase);

    /// <summary>Any tag, the nbsp entity, or a literal non-breaking space (U+00A0).</summary>
    private static readonly Regex TagOrNbsp =
        new Regex(@"<[^>]+?>|&nbsp;|\u00A0", RegexOptions.IgnoreCase);

    private static readonly Regex WhitespaceRun = new Regex(@"\s+");

    /// <summary>
    /// Strips server-side expressions, script and style blocks, comments, tags and a
    /// handful of character entities from an HTML document, then removes remaining
    /// CR-LF pairs and tab characters.
    /// </summary>
    /// <remarks>
    /// Based on the routine credited in the original listing to Zhang Liu
    /// (MYBASK, http://www.mybask.net, version 1.0, 1 Jun 2006).
    /// </remarks>
    /// <param name="strHtml">HTML markup to strip; null is treated as empty.</param>
    /// <returns>The stripped text, or an empty string for null or empty input.</returns>
    public static string ExtractContent(string strHtml)
    {
        if (string.IsNullOrEmpty(strHtml))
        {
            return string.Empty;
        }

        string stripped = strHtml;
        for (int i = 0; i < ContentPatterns.Length; i++)
        {
            stripped = ContentPatterns[i].Replace(stripped, ContentReplacements[i]);
        }

        // string.Replace returns a new string; the result has to be assigned,
        // otherwise the call has no effect.
        stripped = stripped.Replace("\r\n", "");
        stripped = stripped.Replace("\t", "");
        return stripped;
    }

    /// <summary>
    /// Returns the inner text of the first title element, untrimmed.
    /// </summary>
    /// <param name="strHtml">HTML markup to search; null is treated as empty.</param>
    /// <returns>
    /// The title text exactly as it appears between the tags, or the placeholder
    /// "无标题" when the markup contains no title element.
    /// </returns>
    public static string ExtractTitle(string strHtml)
    {
        if (string.IsNullOrEmpty(strHtml))
        {
            return MissingTitle;
        }

        // Take the captured group rather than string-replacing "<title>": the match
        // is case-insensitive and may carry attributes, so a literal replace would
        // leave the markup in place for inputs such as <TITLE> or <title id="t">.
        Match match = TitlePattern.Match(strHtml);
        return match.Success ? match.Groups["title"].Value : MissingTitle;
    }

    /// <summary>
    /// Reduces an HTML fragment to plain text: script, style and select blocks are
    /// dropped, optionally anchor elements too, then all tags and non-breaking
    /// spaces are removed and every whitespace run is collapsed to one space.
    /// </summary>
    /// <param name="instr">HTML markup; null is treated as empty.</param>
    /// <param name="firstN">
    /// Intended maximum length of the result. It is currently NOT applied: truncation
    /// was disabled in the original code and is left disabled here so existing
    /// callers keep receiving the full text.
    /// </param>
    /// <param name="withLink">
    /// True to keep the text inside anchor elements; false to drop anchors together
    /// with their content.
    /// </param>
    /// <returns>The plain text, or an empty string for null or empty input.</returns>
    public static string getFirstNchar(string instr, int firstN, bool withLink)
    {
        if (string.IsNullOrEmpty(instr))
        {
            return string.Empty;
        }

        string text = ScriptBlock.Replace(instr, "");
        text = StyleBlock.Replace(text, "");
        text = SelectBlock.Replace(text, "");
        if (!withLink)
        {
            text = AnchorBlock.Replace(text, "");
        }

        // Only tags and non-breaking spaces are removed here. Removing ordinary
        // spaces at this step would glue neighbouring words together.
        text = TagOrNbsp.Replace(text, "");
        return WhitespaceRun.Replace(text, " ");
    }

    /// <summary>
    /// Returns the trimmed inner text of the first title element.
    /// </summary>
    /// <param name="strHtml">HTML markup to search; null is treated as empty.</param>
    /// <returns>The trimmed title, or an empty string when there is no title element.</returns>
    public static string getTitle(string strHtml)
    {
        if (string.IsNullOrEmpty(strHtml))
        {
            return string.Empty;
        }

        Match match = TitlePattern.Match(strHtml);
        return match.Success ? match.Groups["title"].Value.Trim() : string.Empty;
    }
}
/// <summary>
/// Text extraction built on the MIL.Html parser: the markup is parsed with
/// HtmlDomainTreeParser and the text nodes returned by FindAllText(true) are inspected.
/// </summary>
public class Htmlpage
{
    /// <summary>
    /// Returns the text of the first text node that contains neither a carriage
    /// return nor a line feed.
    /// </summary>
    /// <param name="strHtml">HTML markup handed to the parser.</param>
    /// <returns>That node's text, or an empty string when no text node qualifies.</returns>
    public static string GetTitle(string strHtml)
    {
        HtmlParser htmlParser = new HtmlDomainTreeParser();
        MIL.Html.HtmlDocument document = htmlParser.Parse(strHtml);

        foreach (HtmlText candidate in document.Nodes.FindAllText(true))
        {
            string value = candidate.Text;
            if (!ContainsLineBreak(value))
            {
                // The first single-line text node is the result; stop scanning.
                return value;
            }
        }

        return "";
    }

    /// <summary>
    /// Concatenates, in document order, the text of every text node that contains
    /// neither a carriage return nor a line feed.
    /// </summary>
    /// <param name="strHtml">HTML markup handed to the parser.</param>
    /// <returns>The concatenated text; empty when no text node qualifies.</returns>
    public static string GetContent(string strHtml)
    {
        HtmlParser htmlParser = new HtmlDomainTreeParser();
        MIL.Html.HtmlDocument document = htmlParser.Parse(strHtml);

        StringBuilder collected = new StringBuilder();
        foreach (HtmlText piece in document.Nodes.FindAllText(true))
        {
            string value = piece.Text;
            if (!ContainsLineBreak(value))
            {
                collected.Append(value);
            }
        }

        return collected.ToString();
    }

    /// <summary>Reports whether the text contains a carriage return or a line feed.</summary>
    private static bool ContainsLineBreak(string text)
    {
        return text.Contains("\r") || text.Contains("\n");
    }
}
147
148}
149