在网页上抓取数据 怎么才能在网页上抓到自己想要的数据啊!而且还要去掉标签之类的!!求各位大神帮帮忙啊!!给说了详细的流程啊!! 解决方案 » 免费领取超大流量手机卡,每月29元包185G流量+100分钟通话, 中国电信官方发货 抓取博客园首页推荐文章标题: HttpWebRequest httpWebRequest = WebRequest.Create(@"http://www.cnblogs.com") as HttpWebRequest; HttpWebResponse httpWebResponse = httpWebRequest.GetResponse() as HttpWebResponse; Stream stream = httpWebResponse.GetResponseStream(); StreamReader reader = new StreamReader(stream, Encoding.UTF8); string s = reader.ReadToEnd(); reader.Close(); stream.Close(); httpWebResponse.Close(); HtmlDocument htmlDoc = new HtmlDocument(); htmlDoc.LoadHtml(s); HtmlNodeCollection anchors = htmlDoc.DocumentNode.SelectNodes(@"//a[@class='titlelnk']"); foreach (HtmlNode anchor in anchors) Response.Write(anchor.InnerHtml + "<br/>"); Response.End();用到了HtmlAgilityPack这个第三方类库。 网页抓取 使用HttpWebRequest方式对抓取过后的源码,进行正则匹配,得到自己想要的内容。http://blog.csdn.net/taomanman/article/details/6693594 class Program { static void Main(string[] args) { //需要解析的集合 List<string> list = new List<string>(); //已经解析的集合 List<string> listCount = new List<string>(); //list.Add("http://hao.360.cn/"); list.Add("http://www.baidu.com"); ReadHtml(list,listCount); Console.ReadLine(); } /// <summary> /// 读取HTML中的URL /// </summary> /// <param name="list"></param> /// <param name="listCount"></param> public static void ReadHtml(List<string> list, List<string> listCount) { List<string> count = new List<string>(); for (int a = 0; a < list.Count; a++) { //没有解析过该项 if (!listCount.Contains(list[a])) { try { //在已解析过的集合里面添加本条数据 listCount.Add(list[a]); WebRequest req = WebRequest.Create(list[a]); WebResponse result = req.GetResponse(); //得到的流是网页内容 Stream ReceiveStream = result.GetResponseStream(); StreamReader readerOfStream = new StreamReader(ReceiveStream, System.Text.Encoding.GetEncoding("GB2312")); //得到当前URL的源码 string str = readerOfStream.ReadToEnd(); //解析 Regex regex = new Regex(@"http://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?"); foreach (Match mc in regex.Matches(str)) { Regex regexOhter = new Regex(list[a] + 
"|.png|.jpg|.gif|.bmp|.js|.css|.xls|.doc|.pdf|.chw|.exe|.mp3|.mp4|.avi|.swf|.xml"); if (!regexOhter.IsMatch(mc.ToString())) { Console.WriteLine(mc); count.Add(mc.ToString()); } } Console.WriteLine("----------------------解析完一个页面!--------------------"); if (a == list.Count - 1) { //递归调用本方法 ReadHtml(count, listCount); } } catch (System.Exception ex) { } finally { List<string> error = new List<string>(); //如果出错在出错的后面一条URL继续解析 for (int z = a + 1; z < list.Count; z++) { error.Add(list[z]); } //继续解析 ReadHtml(error, listCount); } } } } }这是我写的解析HTML中的超链接通过一个点扩散去找!你要找到自己需要的内容对正则有一定的要求!!! class Program { static void Main(string[] args) { //需要解析的集合 List<string> list = new List<string>(); //已经解析的集合 List<string> listCount = new List<string>(); //list.Add("http://hao.360.cn/"); list.Add("http://www.baidu.com"); ReadHtml(list,listCount); Console.ReadLine(); } /// <summary> /// 读取HTML中的URL /// </summary> /// <param name="list"></param> /// <param name="listCount"></param> public static void ReadHtml(List<string> list, List<string> listCount) { List<string> count = new List<string>(); for (int a = 0; a < list.Count; a++) { //没有解析过该项 if (!listCount.Contains(list[a])) { try { //在已解析过的集合里面添加本条数据 listCount.Add(list[a]); WebRequest req = WebRequest.Create(list[a]); WebResponse result = req.GetResponse(); //得到的流是网页内容 Stream ReceiveStream = result.GetResponseStream(); StreamReader readerOfStream = new StreamReader(ReceiveStream, System.Text.Encoding.GetEncoding("GB2312")); //得到当前URL的源码 string str = readerOfStream.ReadToEnd(); //解析 Regex regex = new Regex(@"http://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?"); foreach (Match mc in regex.Matches(str)) { Regex regexOhter = new Regex(list[a] + "|.png|.jpg|.gif|.bmp|.js|.css|.xls|.doc|.pdf|.chw|.exe|.mp3|.mp4|.avi|.swf|.xml"); if (!regexOhter.IsMatch(mc.ToString())) { Console.WriteLine(mc); count.Add(mc.ToString()); } } Console.WriteLine("----------------------解析完一个页面!--------------------"); if (a == list.Count - 1) { //递归调用本方法 ReadHtml(count, 
listCount); } } catch (System.Exception ex) { } finally { List<string> error = new List<string>(); //如果出错在出错的后面一条URL继续解析 for (int z = a + 1; z < list.Count; z++) { error.Add(list[z]); } //继续解析 ReadHtml(error, listCount); } } } } } WPF DataGrid 隐藏列 C#中的阻滞Socket与C++的阻滞SOCKET哪个速度更快? 如何根据网页链接地址得到网页源码? 关于枚举类型判断的问题 页面传值问题 最后40分,谁要拿去,我受不了了,到现在没搞出来,大家帮帮忙啊! 请问如何用c#抓屏 qinuxman 求问visual studio怎么增加编译参数。 类库和UI相互引用了?依赖倒置? 如何生产单独的DLL文件! 一个非程序问题 问大家一个问题,,我写的一个网页,为什么后面的脚本能运行,而前面的脚本不能运行啊?江湖告急
// Downloads the cnblogs home page and writes the inner HTML of every
// recommended-post title link (<a class="titlelnk">) to the ASP.NET response.
// Requires the third-party HtmlAgilityPack library for HtmlDocument/HtmlNode.

// Direct casts instead of `as`: a wrong type fails right here with
// InvalidCastException rather than later with a NullReferenceException.
HttpWebRequest httpWebRequest = (HttpWebRequest)WebRequest.Create(@"http://www.cnblogs.com");

string s;
// `using` releases the response, stream and reader even when reading throws.
using (HttpWebResponse httpWebResponse = (HttpWebResponse)httpWebRequest.GetResponse())
using (Stream stream = httpWebResponse.GetResponseStream())
using (StreamReader reader = new StreamReader(stream, Encoding.UTF8))
{
    s = reader.ReadToEnd();
}

HtmlDocument htmlDoc = new HtmlDocument();
htmlDoc.LoadHtml(s);

// SelectNodes returns null (not an empty collection) when nothing matches,
// so guard before enumerating.
HtmlNodeCollection anchors = htmlDoc.DocumentNode.SelectNodes(@"//a[@class='titlelnk']");
if (anchors != null)
{
    foreach (HtmlNode anchor in anchors)
    {
        Response.Write(anchor.InnerHtml + "<br/>");
    }
}
Response.End();
用到了HtmlAgilityPack这个第三方类库。
使用HttpWebRequest方式抓取网页源码,然后对抓取到的源码进行正则匹配,得到自己想要的内容。参考:http://blog.csdn.net/taomanman/article/details/6693594
{
// Matches absolute http:// URLs inside the downloaded page source.
private static readonly Regex UrlPattern =
    new Regex(@"http://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?");

// Matches links to static resources that should not be crawled. The dot is
// escaped and the extension must not be followed by another word character,
// so ".js" no longer matches e.g. "www.jsfiddle.net" or "page.jsp".
private static readonly Regex StaticResourcePattern =
    new Regex(@"\.(?:png|jpg|gif|bmp|js|css|xls|doc|pdf|chw|exe|mp3|mp4|avi|swf|xml)(?![\w-])",
              RegexOptions.IgnoreCase);

/// <summary>
/// Entry point: seeds the crawl with one start URL and waits for a key press
/// once the crawl has finished.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    // URLs waiting to be parsed.
    List<string> list = new List<string>();
    // URLs that have already been parsed.
    List<string> listCount = new List<string>();
    //list.Add("http://hao.360.cn/");
    list.Add("http://www.baidu.com");
    ReadHtml(list, listCount);
    Console.ReadLine();
}

/// <summary>
/// Crawls outward from the given URLs: downloads each page, prints every
/// http:// link found in it, and then repeats the process on the links that
/// were discovered, level by level, until no unvisited link is left.
/// </summary>
/// <param name="list">URLs to start from.</param>
/// <param name="listCount">
/// URLs that have already been processed; every URL this method attempts is
/// appended to it (including ones that fail to download) so it is never
/// requested twice.
/// </param>
/// <exception cref="ArgumentNullException">
/// <paramref name="list"/> or <paramref name="listCount"/> is null.
/// </exception>
public static void ReadHtml(List<string> list, List<string> listCount)
{
    if (list == null)
    {
        throw new ArgumentNullException("list");
    }
    if (listCount == null)
    {
        throw new ArgumentNullException("listCount");
    }

    // Iterate level by level instead of recursing: the crawl depth is
    // unbounded, so recursion would eventually overflow the stack.
    List<string> pending = list;
    while (pending.Count > 0)
    {
        List<string> discovered = new List<string>();
        foreach (string url in pending)
        {
            if (listCount.Contains(url))
            {
                continue;
            }

            // Mark as visited before downloading so a failing URL is not retried.
            listCount.Add(url);

            string html = DownloadPage(url);
            if (html == null)
            {
                // Download failed; carry on with the next URL.
                continue;
            }

            foreach (Match link in UrlPattern.Matches(html))
            {
                if (IsCrawlable(link.Value, url))
                {
                    Console.WriteLine(link.Value);
                    discovered.Add(link.Value);
                }
            }
            Console.WriteLine("----------------------解析完一个页面!--------------------");
        }

        // Links found on this level are always followed, even when the last
        // URL of the level failed or had been visited before.
        pending = discovered;
    }
}

// Downloads the page at url decoded as GB2312; returns null when the request fails.
private static string DownloadPage(string url)
{
    try
    {
        WebRequest req = WebRequest.Create(url);
        // `using` releases the connection, stream and reader even on failure.
        using (WebResponse result = req.GetResponse())
        using (Stream receiveStream = result.GetResponseStream())
        using (StreamReader reader = new StreamReader(receiveStream, System.Text.Encoding.GetEncoding("GB2312")))
        {
            return reader.ReadToEnd();
        }
    }
    catch (System.Exception ex)
    {
        // The crawl is best-effort: one bad URL must not stop it, but the
        // failure is reported instead of being silently swallowed.
        Console.Error.WriteLine("Failed to read " + url + ": " + ex.Message);
        return null;
    }
}

// True when candidate neither contains the page's own URL nor points at a static resource.
private static bool IsCrawlable(string candidate, string sourceUrl)
{
    // Plain substring test: the URL is data, not a regex pattern.
    if (candidate.IndexOf(sourceUrl, StringComparison.Ordinal) >= 0)
    {
        return false;
    }
    return !StaticResourcePattern.IsMatch(candidate);
}
}
这是我写的解析HTML中超链接的程序,
它从一个起始网址出发,逐层向外扩散查找链接。
如果要提取自己需要的内容,
需要对正则表达式有一定的掌握。
{
// Matches absolute http:// URLs inside the downloaded page source.
private static readonly Regex UrlPattern =
    new Regex(@"http://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?");

// Matches links to static resources that should not be crawled. The dot is
// escaped and the extension must not be followed by another word character,
// so ".js" no longer matches e.g. "www.jsfiddle.net" or "page.jsp".
private static readonly Regex StaticResourcePattern =
    new Regex(@"\.(?:png|jpg|gif|bmp|js|css|xls|doc|pdf|chw|exe|mp3|mp4|avi|swf|xml)(?![\w-])",
              RegexOptions.IgnoreCase);

/// <summary>
/// Entry point: seeds the crawl with one start URL and waits for a key press
/// once the crawl has finished.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    // URLs waiting to be parsed.
    List<string> list = new List<string>();
    // URLs that have already been parsed.
    List<string> listCount = new List<string>();
    //list.Add("http://hao.360.cn/");
    list.Add("http://www.baidu.com");
    ReadHtml(list, listCount);
    Console.ReadLine();
}

/// <summary>
/// Crawls outward from the given URLs: downloads each page, prints every
/// http:// link found in it, and then repeats the process on the links that
/// were discovered, level by level, until no unvisited link is left.
/// </summary>
/// <param name="list">URLs to start from.</param>
/// <param name="listCount">
/// URLs that have already been processed; every URL this method attempts is
/// appended to it (including ones that fail to download) so it is never
/// requested twice.
/// </param>
/// <exception cref="ArgumentNullException">
/// <paramref name="list"/> or <paramref name="listCount"/> is null.
/// </exception>
public static void ReadHtml(List<string> list, List<string> listCount)
{
    if (list == null)
    {
        throw new ArgumentNullException("list");
    }
    if (listCount == null)
    {
        throw new ArgumentNullException("listCount");
    }

    // Iterate level by level instead of recursing: the crawl depth is
    // unbounded, so recursion would eventually overflow the stack.
    List<string> pending = list;
    while (pending.Count > 0)
    {
        List<string> discovered = new List<string>();
        foreach (string url in pending)
        {
            if (listCount.Contains(url))
            {
                continue;
            }

            // Mark as visited before downloading so a failing URL is not retried.
            listCount.Add(url);

            string html = DownloadPage(url);
            if (html == null)
            {
                // Download failed; carry on with the next URL.
                continue;
            }

            foreach (Match link in UrlPattern.Matches(html))
            {
                if (IsCrawlable(link.Value, url))
                {
                    Console.WriteLine(link.Value);
                    discovered.Add(link.Value);
                }
            }
            Console.WriteLine("----------------------解析完一个页面!--------------------");
        }

        // Links found on this level are always followed, even when the last
        // URL of the level failed or had been visited before.
        pending = discovered;
    }
}

// Downloads the page at url decoded as GB2312; returns null when the request fails.
private static string DownloadPage(string url)
{
    try
    {
        WebRequest req = WebRequest.Create(url);
        // `using` releases the connection, stream and reader even on failure.
        using (WebResponse result = req.GetResponse())
        using (Stream receiveStream = result.GetResponseStream())
        using (StreamReader reader = new StreamReader(receiveStream, System.Text.Encoding.GetEncoding("GB2312")))
        {
            return reader.ReadToEnd();
        }
    }
    catch (System.Exception ex)
    {
        // The crawl is best-effort: one bad URL must not stop it, but the
        // failure is reported instead of being silently swallowed.
        Console.Error.WriteLine("Failed to read " + url + ": " + ex.Message);
        return null;
    }
}

// True when candidate neither contains the page's own URL nor points at a static resource.
private static bool IsCrawlable(string candidate, string sourceUrl)
{
    // Plain substring test: the URL is data, not a regex pattern.
    if (candidate.IndexOf(sourceUrl, StringComparison.Ordinal) >= 0)
    {
        return false;
    }
    return !StaticResourcePattern.IsMatch(candidate);
}
}