文章采集 思路是怎么样的!? 解决方案 » 免费领取超大流量手机卡,每月29元包185G流量+100分钟通话, 中国电信官方发货 就是去代码请求一个url,然后分析源代码,提取自己感兴趣的,。网上很多代码的 使用HttpWebRequest对象给要采集的页面发送一个请求,然后可以得到源码,自己想取什么就取什么。 抓取页面,定时string param = ""; byte[] bs = Encoding.ASCII.GetBytes(param); HttpWebRequest req = (HttpWebRequest) HttpWebRequest.Create( "" ); req.Method = "POST"; req.ContentType = "application/x-www-form-urlencoded"; req.ContentLength = bs.Length; webclient等 如Regex reg = new Regex(@"(?is)<a[^>]*?href=(['""]?)(?<url>[^'""\s>]+)\1[^>]*>(?<text>(?:(?!</?a\b).)*)</a>"); MatchCollection mc = reg.Matches(""); foreach (Match m in mc) { Console.Write(m.Groups["url"].Value); } /// <summary> /// 获取网页源代码 /// </summary> /// <param name="url"></param> /// <returns></returns> public string GetHtmlEx(string url) { HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url); request.Timeout = 60000; request.UserAgent = userAgent; request.ContentType = contentType; request.CookieContainer = cookie; request.Accept = accept; request.Method = "get"; HttpWebResponse response = (HttpWebResponse)request.GetResponse(); if (response.ContentType.Length == 9) { _encode = "gbk"; } else { _encode = getEncoding(response); } Stream responseStream = response.GetResponseStream(); StreamReader reader = new StreamReader(responseStream, Encoding.GetEncoding(_encode)); String html = reader.ReadToEnd(); response.Close(); return html; } string Htmlstring = GetHtmlEx(url); Htmlstring = Regex.Replace(Htmlstring, "\"", "", RegexOptions.IgnoreCase); Htmlstring = Regex.Replace(Htmlstring, "'", "", RegexOptions.IgnoreCase); string[] arr0 = Get_url_Array(Htmlstring, rulesObj.RRegex_0);//标题 Get_url_Array方法怎么没有啊 ? 怎么取得<div class="title"><h1></h1></div>之内的内容,正则表达式怎么写? 
/// <summary> /// 返回超连接的数组 /// </summary> /// <param name="userInput"></param> /// <param name="WebText"></param> /// <returns></returns> public string[] Get_url_Array(string userInput, string WebText) { MatchCollection mc = Regex.Matches(userInput, WebText); ArrayList Url_List = new ArrayList(); foreach (Match m in mc) { Url_List.Add(m.Value.ToString().Replace(" ", " ")); } return (string[])Url_List.ToArray(typeof(string)); } (?is)<div class="title"><h1>.+?</h1></div> string pattern = @"(?is)<div\s+class=\"title\">(.*?)</div>"; repeater控件一行显示2条数据 js如何动态生成<ul><li>arry[i]</li></li> ASP.NET怎么把图片上传到Access数据库,然后在页面显示? 用了AJAXPANEL,也用了AJAX.为什么页面还是照样很闪 急需:用C#做可视的工作流编辑器.最好有点源码提示.非常感谢 请问:这样的功能怎么实现?? 怎样访问网上邻居里的内容 gridview选中行问题 IIS访问权限 固定某几台可以访问 请问各位高手一个关于bbs的问题 拿分问题,速度! IE6下如何处理图片宽度太大的问题?
// Builds a form-encoded POST request and sends its body.
// NOTE(review): the empty parameter string and empty URL look like placeholders
// to be filled in by the caller - WebRequest.Create("") cannot succeed as written.
string param = "";
// UTF-8 keeps non-ASCII form values intact; ASCII would turn them into '?'.
byte[] bs = Encoding.UTF8.GetBytes(param);
HttpWebRequest req = (HttpWebRequest)WebRequest.Create("");
req.Method = "POST";
req.ContentType = "application/x-www-form-urlencoded";
req.ContentLength = bs.Length;
// The declared ContentLength bytes must be written before the response is requested.
using (Stream body = req.GetRequestStream())
{
    body.Write(bs, 0, bs.Length);
}
也可以使用 WebClient 等类来抓取页面。
// Run the link regex (reg, declared outside this snippet) over the input text
// and print the captured "url" group of every match, back to back.
foreach (Match link in reg.Matches(""))
{
    string href = link.Groups["url"].Value;
    Console.Write(href);
}
/// <summary>
/// Downloads the page at <paramref name="url"/> with an HTTP GET request and
/// returns the response body as text.
/// </summary>
/// <remarks>
/// The request headers are taken from the userAgent, contentType, cookie and
/// accept members. The name of the encoding used to decode the body is stored
/// in _encode as a side effect.
/// </remarks>
/// <param name="url">Address of the page to download.</param>
/// <returns>The response body decoded with the selected encoding.</returns>
public string GetHtmlEx(string url)
{
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
    request.Timeout = 60000; // milliseconds
    request.UserAgent = userAgent;
    request.ContentType = contentType;
    request.CookieContainer = cookie;
    request.Accept = accept;
    request.Method = "GET"; // HTTP method tokens are case-sensitive

    // using blocks release the connection even when decoding or reading throws.
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    {
        // NOTE(review): a 9-character Content-Type presumably means a bare
        // "text/html" with no charset, in which case GBK is assumed. Any other
        // 9-character type (e.g. "image/png") takes the same branch - confirm.
        if (response.ContentType.Length == 9)
        {
            _encode = "gbk";
        }
        else
        {
            _encode = getEncoding(response);
        }

        using (Stream responseStream = response.GetResponseStream())
        using (StreamReader reader = new StreamReader(responseStream, Encoding.GetEncoding(_encode)))
        {
            return reader.ReadToEnd();
        }
    }
}
string Htmlstring = GetHtmlEx(url);
// Remove every double and single quote from the downloaded markup. Plain string
// replacement is enough for literal characters; no regex engine is needed.
// NOTE(review): presumably done so the extraction rules can be written without quotes.
Htmlstring = Htmlstring.Replace("\"", "").Replace("'", "");
// Collect every match of the first extraction rule (the title rule).
string[] arr0 = Get_url_Array(Htmlstring, rulesObj.RRegex_0);
/// <summary>
/// Returns the text of every regular-expression match found in the input.
/// </summary>
/// <param name="userInput">The text to search (despite its name, this is the input, not the pattern).</param>
/// <param name="WebText">The regular-expression pattern to match against <paramref name="userInput"/>.</param>
/// <returns>
/// The matched substrings in order of occurrence, with HTML non-breaking spaces
/// normalised to ordinary spaces; an empty array when nothing matches.
/// </returns>
public string[] Get_url_Array(string userInput, string WebText)
{
    MatchCollection matches = Regex.Matches(userInput, WebText);
    string[] results = new string[matches.Count];
    for (int i = 0; i < matches.Count; i++)
    {
        // The original call replaced a blank with a blank, which does nothing as
        // written; presumably the first literal was a non-breaking space (or the
        // "&nbsp;" entity) that was lost in transit. Both forms are handled explicitly.
        results[i] = matches[i].Value.Replace("&nbsp;", " ").Replace('\u00A0', ' ');
    }
    return results;
}