高分求一个 抓取网页数据软件 个抓取网页数据软件(网页源码中产品的图片 价格 描述 重量 等等信息) 解决方案 » 免费领取超大流量手机卡,每月29元包185G流量+100分钟通话, 中国电信官方发货 获得html源码,正则分析源码并提取所需要的。会写正则就好办了 <img src="images/lv/M45715.png" alt="LV BOETIE PM Monogram M45715" title=" LV BOETIE PM Monogram M45715 " width="216" height="270" class="listingProductImage" /></a><br /><h3 class="itemTitle"><div id="list_itemTitle" align="center"><a href="http://www.stylebags.net/lv-boetie-pm-monogram-m45715-p-723.html">LV BOETIE PM Monogram M45715</a></div></h3><div id="list_itemPrice"><br />$219.00</div> <br /> </div><br class="clearBoth" /> <br class="clearBoth" /><div id="productsListingBottomNumber" class="navSplitPagesResult back" style="color:#fff; margin-top:24px; margin-bottom:24px"></div><div id="//需要用正则获取http://www.stylebags.net/lv-boetie-pm-monogram-m45715-p-723.html,怎么写 坐等大牛 网页抓取类:using System;using System.Collections.Generic;using System.Linq;using System.Web;using System.Text;using System.Net;using System.IO;using System.Text.RegularExpressions;using System.Collections;using System.IO.Compression;public class webCrawl{ public webCrawl() { } //获取网页字符根据url public static string getHtml(string url) { try { string str = ""; Encoding en = Encoding.GetEncoding(getEncoding(url)); HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url); request.Headers.Set("Pragma", "no-cache"); request.Timeout = 30000; HttpWebResponse response = (HttpWebResponse)request.GetResponse(); if (response.StatusCode == HttpStatusCode.OK && response.ContentLength < 1024 * 1024) { Stream strM = response.GetResponseStream(); StreamReader sr = new StreamReader(strM, en); str = sr.ReadToEnd(); strM.Close(); sr.Close(); } return str; } catch { return String.Empty; } } //获取编码 public static string getEncoding(string url) { HttpWebRequest request = null; HttpWebResponse response = null; StreamReader reader = null; try { request = (HttpWebRequest)WebRequest.Create(url); request.Timeout = 30000; request.AllowAutoRedirect = false; response = 
(HttpWebResponse)request.GetResponse(); if (response.StatusCode == HttpStatusCode.OK && response.ContentLength < 1024 * 1024) { if (response.ContentEncoding != null && response.ContentEncoding.Equals("gzip", StringComparison.InvariantCultureIgnoreCase)) reader = new StreamReader(new GZipStream(response.GetResponseStream(), CompressionMode.Decompress)); else reader = new StreamReader(response.GetResponseStream(), Encoding.ASCII); string html = reader.ReadToEnd(); Regex reg_charset = new Regex(@"charset\b\s*=\s*(?<charset>[^""]*)"); if (reg_charset.IsMatch(html)) { return reg_charset.Match(html).Groups["charset"].Value; } else if (response.CharacterSet != string.Empty) { return response.CharacterSet; } else return Encoding.Default.BodyName; } } catch (Exception ex) { throw new Exception(ex.Message); } finally { if (response != null) { response.Close(); response = null; } if (reader != null) reader.Close(); if (request != null) request = null; } return Encoding.Default.BodyName; } //根据内容--获取标题 public static string getTitle(string url) { string title = string.Empty; string htmlStr = getHtml(url);//获取网页 Match TitleMatch = Regex.Match(htmlStr, "<title>([^<]*)</title>", RegexOptions.IgnoreCase | RegexOptions.Multiline); title = TitleMatch.Groups[1].Value; title = Regex.Replace(title, @"\W", "");//去除空格 return title; } //根据内容--获取描述信息 public static string getDescription(string url) { string htmlStr = getHtml(url); Match Desc = Regex.Match(htmlStr, "<meta name=\"Description\" content=\"([^<]*)\"*>", RegexOptions.IgnoreCase | RegexOptions.Multiline); string mdd = Desc.Groups[1].Value; return Regex.Replace(Desc.Groups[1].Value, @"\W", ""); } //根据内容--获取所有链接 public static List<string> getLink(string htmlStr) { List<string> list = new List<string>(); //用来存放链接 String reg = @"http://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?"; //链接的正则表达式 Regex regex = new Regex(reg, RegexOptions.IgnoreCase); MatchCollection mc = regex.Matches(htmlStr); for (int i = 0; i < mc.Count; i++) //存放匹配的集合 { bool 
hasExist = false; //链接存在与否的标记 String name = mc[i].ToString(); foreach (String one in list) { if (name == one) { hasExist = true; //链接已存在 break; } } if (!hasExist) list.Add(name); //链接不存在,添加 } return list; } //根据内容--取得body内的内容 public static string getBody(string url) { string htmlStr = getHtml(url); string result = string.Empty; Regex regBody = new Regex(@"(?is)<body[^>]*>(?:(?!</?body\b).)*</body>"); Match m = regBody.Match(htmlStr); if (m.Success) { result = parseHtml(m.Value); } return result; } //获取所有图片 public static List<string> getImg(string url) { List<string> list = new List<string>(); string temp = string.Empty; string htmlStr = getHtml(url); MatchCollection matchs = Regex.Matches(htmlStr, @"<(IMG|img)[^>]+>"); //抽取所有图片 for (int i = 0; i < matchs.Count; i++) { list.Add(matchs[i].Value); } return list; } //所有图片路径(如果是相对路径的话,自动设置成绝对路径) public static List<string> getImgPath(string url) { List<string> list = new List<string>(); string htmlStr = getHtml(url); string pat = @"<img\b[^<>]*?\bsrc[\s\t\r\n]*=[\s\t\r\n]*[""']?[\s\t\r\n]*(?<imgUrl>[^\s\t\r\n""'<>]*)[^<>]*?/?[\s\t\r\n]*>"; MatchCollection matches = Regex.Matches(htmlStr, pat, RegexOptions.IgnoreCase | RegexOptions.Multiline); foreach (Match m in matches) { string imgPath = m.Groups["imgUrl"].Value.Trim(); if (Regex.IsMatch(imgPath, @"\w+\.(gif|jpg|bmp|png)$")) //用了2次匹配,去除链接是网页的 只留图片 { if (!imgPath.Contains("http"))//必须包含http 否则无法下载 { imgPath = getUrl(url) + imgPath; } list.Add(imgPath); } } return list; } //下载图片 public void DownloadImg(string fileurl) { if (fileurl.Contains('.'))//url路径必须是绝对路径 例如http://xxx.com/img/logo.jpg { string imgName = DateTime.Now.ToString("yyyyMMddHHmmssffff") + fileurl.Substring(fileurl.LastIndexOf('.')); // 生成图片的名字 string filepath = System.Web.HttpContext.Current.Server.MapPath("") + "/" + imgName; WebClient mywebclient = new WebClient(); mywebclient.DownloadFile(fileurl, filepath); } } //过滤html public static string parseHtml(string html) { string value = Regex.Replace(html, 
"<[^>]*>", string.Empty); value = value.Replace("<", string.Empty); value = value.Replace(">", string.Empty); //return value.Replace(" ", string.Empty); return Regex.Replace(value, @"\s+", ""); } //处理url路径问题 public static string getUrl(string url) { //如果是http://www.xxx.com 返回http://www.xxx.com/ //如果是http://www.xxx.com/art.aspx 返回http://www.xxx.com/ return url = url.Substring(0, url.LastIndexOf('/')) + "/"; }} 谢谢你的回答 这个问题我解决了 现在出现新问题了解决了一起结贴http://topic.csdn.net/u/20111031/15/2c1ae90f-9232-466f-afab-e9296ef8dd94.html?seed=112401756&r=76254699 采哪个站,给你思路1.获取采集列表2.获取当前页的产品信息标题指向的URL。3.进入产品页获取HTML。4.正则分析你要的数据5.执行下一个产品页面数据的获取。6.跳到下一页继续执行2.有不明白的,QQ:444758708 我用了下你写的这个类,为什么有的能获得HTML,有的不能获得呢。比如www.baidu.com能获得,www.taobao.com就获得不了。 C# PPC程序中 split重载方法 需要加命名空间吗 如何把数据从SQL导入到Access 求正则表达式 Backgroundworker 如何解决pintviewdialog只允许预览1000页上限的问题 求教如何在c#中调用java class 的函数阿? c# 几个基本的问题--估计认真学过的可以解答 使用reflector出现"未将对象引用设置到对象的实例",求高人解决。 在线急等!!!!数据库关闭问题,跪求解答!!! 我做了个异步的tcp程序,100分求助!!大虾~小鱼一律欢迎~解决结贴! C#的最小化后再显示 vs2010 不能可视化编辑强类型的dataset?
会写正则就好办了
<img src="images/lv/M45715.png" alt="LV BOETIE PM Monogram M45715" title=" LV BOETIE PM Monogram M45715 " width="216" height="270" class="listingProductImage" />
</a><br /><h3 class="itemTitle"><div id="list_itemTitle" align="center"><a href="http://www.stylebags.net/lv-boetie-pm-monogram-m45715-p-723.html">
LV BOETIE PM Monogram M45715</a></div></h3><div id="list_itemPrice"><br />$219.00</div> <br /> </div>
<br class="clearBoth" />
<br class="clearBoth" />
<div id="productsListingBottomNumber" class="navSplitPagesResult back" style="color:#fff; margin-top:24px; margin-bottom:24px"></div>
<div id="
//需要用正则表达式从上面的 HTML 中获取链接 http://www.stylebags.net/lv-boetie-pm-monogram-m45715-p-723.html ,应该怎么写?坐等大牛
using System;
using System.Collections;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Web;
using System.IO.Compression;

/// <summary>
/// Helpers for downloading a web page and extracting pieces of its HTML
/// (title, meta description, links, body text, images) with regular expressions.
/// </summary>
public class webCrawl
{
    // Timeout applied to every HTTP request, in milliseconds.
    private const int RequestTimeoutMs = 30000;

    // Responses whose declared Content-Length reaches this many bytes are skipped.
    // A response with no declared length (ContentLength == -1) is still read.
    private const long MaxContentLength = 1024 * 1024;

    // Finds a "charset=NAME" declaration; an optional opening quote is skipped so that
    // both content="text/html; charset=gbk" and <meta charset="gbk"> yield "gbk".
    private static readonly Regex CharsetPattern =
        new Regex(@"charset\s*=\s*[""']?\s*(?<charset>[\w-]+)", RegexOptions.IgnoreCase);

    // Absolute http/https URLs. The path character class is kept as originally written.
    private static readonly Regex LinkPattern =
        new Regex(@"https?://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?", RegexOptions.IgnoreCase);

    public webCrawl() { }

    /// <summary>Downloads the page at <paramref name="url"/> and returns it as text.</summary>
    /// <param name="url">Absolute URL of the page.</param>
    /// <returns>
    /// The decoded page, or an empty string when the request fails, the status is not
    /// 200 OK, or the declared Content-Length is 1 MiB or more.
    /// </returns>
    public static string getHtml(string url)
    {
        try
        {
            string headerCharset;
            byte[] page = fetchPage(url, true, out headerCharset);
            if (page == null)
            {
                return String.Empty;
            }

            // The page is downloaded once; the charset is sniffed from the same bytes
            // that are decoded, instead of issuing a second request just for the charset.
            Encoding en = resolveEncoding(detectCharset(page, headerCharset));
            using (StreamReader sr = new StreamReader(new MemoryStream(page), en))
            {
                return sr.ReadToEnd();
            }
        }
        catch (Exception)
        {
            // Best-effort contract relied on by the other helpers: any failure yields "".
            return String.Empty;
        }
    }

    /// <summary>Determines the character set name of the page at <paramref name="url"/>.</summary>
    /// <param name="url">Absolute URL of the page.</param>
    /// <returns>
    /// The charset declared inside the document if there is one, otherwise the charset of
    /// the Content-Type header, otherwise <c>Encoding.Default.BodyName</c>. The default is
    /// also returned when the status is not 200 OK or the declared length is 1 MiB or more.
    /// </returns>
    /// <remarks>
    /// Redirects are followed, matching <see cref="getHtml"/>. Request failures propagate
    /// with their original exception type (for example <c>WebException</c>).
    /// </remarks>
    public static string getEncoding(string url)
    {
        string headerCharset;
        byte[] page = fetchPage(url, false, out headerCharset);
        if (page == null)
        {
            return Encoding.Default.BodyName;
        }
        return detectCharset(page, headerCharset);
    }

    /// <summary>Returns the text of the page's &lt;title&gt; element.</summary>
    /// <param name="url">Absolute URL of the page.</param>
    /// <returns>
    /// The title with every non-word character removed (whitespace and punctuation alike),
    /// or an empty string when there is no title or the page could not be fetched.
    /// </returns>
    public static string getTitle(string url)
    {
        string htmlStr = getHtml(url);
        Match titleMatch = Regex.Match(htmlStr, "<title>([^<]*)</title>", RegexOptions.IgnoreCase | RegexOptions.Multiline);
        return Regex.Replace(titleMatch.Groups[1].Value, @"\W", "");
    }

    /// <summary>Returns the content of the page's meta "Description" tag.</summary>
    /// <param name="url">Absolute URL of the page.</param>
    /// <returns>
    /// The description with every non-word character removed, or an empty string when the
    /// tag is absent or the page could not be fetched.
    /// </returns>
    public static string getDescription(string url)
    {
        string htmlStr = getHtml(url);
        // Capture stops at the closing quote so that attributes following content="..."
        // do not leak into the description.
        Match desc = Regex.Match(htmlStr, "<meta\\s+name=\"Description\"\\s+content=\"([^\"]*)\"", RegexOptions.IgnoreCase);
        return Regex.Replace(desc.Groups[1].Value, @"\W", "");
    }

    /// <summary>Collects the distinct absolute http/https URLs that occur in an HTML string.</summary>
    /// <param name="htmlStr">HTML text to scan; null or empty yields an empty list.</param>
    /// <returns>The URLs in order of first appearance, without duplicates.</returns>
    public static List<string> getLink(string htmlStr)
    {
        List<string> list = new List<string>();
        if (string.IsNullOrEmpty(htmlStr))
        {
            return list;
        }

        // The set gives O(1) duplicate detection; the list preserves first-seen order.
        HashSet<string> seen = new HashSet<string>();
        foreach (Match m in LinkPattern.Matches(htmlStr))
        {
            if (seen.Add(m.Value))
            {
                list.Add(m.Value);
            }
        }
        return list;
    }

    /// <summary>Returns the text inside the page's &lt;body&gt; element.</summary>
    /// <param name="url">Absolute URL of the page.</param>
    /// <returns>
    /// The body with tags and all whitespace removed (see <see cref="parseHtml"/>), or an
    /// empty string when no body element is found.
    /// </returns>
    public static string getBody(string url)
    {
        string htmlStr = getHtml(url);
        Match m = Regex.Match(htmlStr, @"(?is)<body[^>]*>(?:(?!</?body\b).)*</body>");
        return m.Success ? parseHtml(m.Value) : string.Empty;
    }

    /// <summary>Returns every &lt;img ...&gt; tag of the page as raw markup.</summary>
    /// <param name="url">Absolute URL of the page.</param>
    /// <returns>The matched tags in document order.</returns>
    public static List<string> getImg(string url)
    {
        List<string> list = new List<string>();
        string htmlStr = getHtml(url);
        foreach (Match m in Regex.Matches(htmlStr, @"<(IMG|img)[^>]+>"))
        {
            list.Add(m.Value);
        }
        return list;
    }

    /// <summary>Returns the URLs of the page's images, with relative paths made absolute.</summary>
    /// <param name="url">Absolute URL of the page.</param>
    /// <returns>
    /// The src values that end in .gif, .jpg, .jpeg, .bmp or .png (case-insensitive), in
    /// document order. Values already starting with http:// or https:// are returned as
    /// written; everything else is resolved against <paramref name="url"/>.
    /// </returns>
    public static List<string> getImgPath(string url)
    {
        List<string> list = new List<string>();
        string htmlStr = getHtml(url);
        string pat = @"<img\b[^<>]*?\bsrc[\s\t\r\n]*=[\s\t\r\n]*[""']?[\s\t\r\n]*(?<imgUrl>[^\s\t\r\n""'<>]*)[^<>]*?/?[\s\t\r\n]*>";
        MatchCollection matches = Regex.Matches(htmlStr, pat, RegexOptions.IgnoreCase | RegexOptions.Multiline);
        foreach (Match m in matches)
        {
            string imgPath = m.Groups["imgUrl"].Value.Trim();

            // Second match: keep only sources that point at an image file.
            if (!Regex.IsMatch(imgPath, @"\w+\.(gif|jpe?g|bmp|png)$", RegexOptions.IgnoreCase))
            {
                continue;
            }

            bool isAbsolute = imgPath.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
                || imgPath.StartsWith("https://", StringComparison.OrdinalIgnoreCase);
            list.Add(isAbsolute ? imgPath : toAbsoluteUrl(url, imgPath));
        }
        return list;
    }

    /// <summary>
    /// Downloads an image into the directory returned by <c>Server.MapPath("")</c>, naming
    /// the file with the current timestamp plus the extension of the URL's path.
    /// </summary>
    /// <param name="fileurl">
    /// Absolute URL of the image, e.g. http://xxx.com/img/logo.jpg. Anything that is not
    /// an absolute URI is ignored.
    /// </param>
    public void DownloadImg(string fileurl)
    {
        Uri source;
        if (!Uri.TryCreate(fileurl, UriKind.Absolute, out source))
        {
            return;
        }

        // Take the extension from the path only, so a dot in the host name or in the
        // query string cannot end up in the file name.
        string extension = Path.GetExtension(source.AbsolutePath);
        string imgName = DateTime.Now.ToString("yyyyMMddHHmmssffff") + extension;
        string filepath = Path.Combine(System.Web.HttpContext.Current.Server.MapPath(""), imgName);

        using (WebClient mywebclient = new WebClient())
        {
            mywebclient.DownloadFile(source, filepath);
        }
    }

    /// <summary>Strips markup from an HTML fragment.</summary>
    /// <param name="html">HTML text; null or empty yields an empty string.</param>
    /// <returns>
    /// The text with all tags, any leftover angle brackets and all whitespace removed.
    /// </returns>
    public static string parseHtml(string html)
    {
        if (string.IsNullOrEmpty(html))
        {
            return string.Empty;
        }

        string value = Regex.Replace(html, "<[^>]*>", string.Empty);
        value = value.Replace("<", string.Empty);
        value = value.Replace(">", string.Empty);
        return Regex.Replace(value, @"\s+", "");
    }

    /// <summary>Returns the "directory" part of a URL, always ending in a slash.</summary>
    /// <param name="url">URL to trim.</param>
    /// <returns>
    /// http://www.xxx.com gives http://www.xxx.com/ and
    /// http://www.xxx.com/art.aspx gives http://www.xxx.com/.
    /// </returns>
    public static string getUrl(string url)
    {
        int schemeEnd = url.IndexOf("://", StringComparison.Ordinal);
        int authorityStart = schemeEnd < 0 ? 0 : schemeEnd + 3;
        int lastSlash = url.LastIndexOf('/');

        // No slash after the scheme separator: the URL is a bare host, so just append
        // one rather than cutting inside "://".
        if (lastSlash < authorityStart)
        {
            return url + "/";
        }
        return url.Substring(0, lastSlash + 1);
    }

    // Issues one GET for url and returns the body bytes (gunzipped when the response is
    // gzip-encoded), or null when the status is not 200 OK or the declared Content-Length
    // is MaxContentLength or more. headerCharset receives the Content-Type charset.
    private static byte[] fetchPage(string url, bool noCache, out string headerCharset)
    {
        headerCharset = null;

        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
        if (noCache)
        {
            request.Headers.Set("Pragma", "no-cache");
        }
        request.Timeout = RequestTimeoutMs;

        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        {
            if (response.StatusCode != HttpStatusCode.OK || response.ContentLength >= MaxContentLength)
            {
                return null;
            }
            headerCharset = response.CharacterSet;

            Stream body = response.GetResponseStream();
            if (string.Equals(response.ContentEncoding, "gzip", StringComparison.OrdinalIgnoreCase))
            {
                body = new GZipStream(body, CompressionMode.Decompress);
            }

            using (body)
            using (MemoryStream buffer = new MemoryStream())
            {
                byte[] chunk = new byte[8192];
                int read;
                while ((read = body.Read(chunk, 0, chunk.Length)) > 0)
                {
                    buffer.Write(chunk, 0, read);
                }
                return buffer.ToArray();
            }
        }
    }

    // Picks the charset name for a downloaded page: in-document declaration first, then
    // the HTTP header value, then the system default.
    private static string detectCharset(byte[] page, string headerCharset)
    {
        // A charset declaration consists of ASCII characters, so an ASCII view is enough to find it.
        Match declared = CharsetPattern.Match(Encoding.ASCII.GetString(page));
        if (declared.Success)
        {
            return declared.Groups["charset"].Value;
        }
        if (!string.IsNullOrEmpty(headerCharset))
        {
            return headerCharset;
        }
        return Encoding.Default.BodyName;
    }

    // Maps a charset name to an Encoding, falling back to the system default when the
    // name is not recognised, so one bad declaration does not discard the whole page.
    private static Encoding resolveEncoding(string charsetName)
    {
        try
        {
            return Encoding.GetEncoding(charsetName);
        }
        catch (ArgumentException)
        {
            return Encoding.Default;
        }
    }

    // Resolves a relative, root-relative or protocol-relative path against the page URL.
    // Falls back to plain concatenation when the page URL cannot be parsed.
    private static string toAbsoluteUrl(string pageUrl, string path)
    {
        Uri pageUri;
        Uri resolved;
        if (Uri.TryCreate(pageUrl, UriKind.Absolute, out pageUri) && Uri.TryCreate(pageUri, path, out resolved))
        {
            return resolved.AbsoluteUri;
        }
        return getUrl(pageUrl) + path;
    }
}
解决了一起结贴
http://topic.csdn.net/u/20111031/15/2c1ae90f-9232-466f-afab-e9296ef8dd94.html?seed=112401756&r=76254699
1.获取采集列表
2.获取当前页的产品信息标题指向的URL。
3.进入产品页获取HTML。
4.正则分析你要的数据
5.执行下一个产品页面数据的获取。
6.跳到下一页,继续执行第2步。
有不明白的,QQ:444758708