求一个C#的网页数据抓取源代码

// Download a page and decode its body as GB2312 text.
string netUrl = "http://store.taobao.com/shop/view_shop.htm?asker=wangwang&shop_nick=琳子68";
HttpWebRequest myWebRequest = (HttpWebRequest)WebRequest.Create(netUrl);
string content;
// The response and reader hold the network connection; the using blocks release it
// as soon as the body has been read, even if reading throws.
using (WebResponse resp = myWebRequest.GetResponse())
using (StreamReader oStreamRd = new StreamReader(resp.GetResponseStream(), Encoding.GetEncoding("GB2312")))
{
    content = oStreamRd.ReadToEnd();
}

大哥，这只是把网页的数据读取下，不过还是谢谢你！
最好还能筛选出标题和内容（静态HTML），然后把它们插入数据库（SQL Server 2005）
麻烦大家了，虚心向高手请教啊

抓取网页服务器端源码可能性不大，估计需要黑客类的软件协助，但HTML源码倒不成问题。

public class WebForm1 : System.Web.UI.Page
{
  protected System.Web.UI.WebControls.DataGrid dgData;

  /// <summary>
  /// Page load handler: binds the table produced by getInfo() to the dgData grid.
  /// </summary>
  private void Page_Load(object sender, System.EventArgs e)
  {
   // Build the table first, then hand it to the grid and bind.
   DataTable newsTable = getInfo();
   dgData.DataSource = newsTable;
   dgData.DataBind();
  }
  /// <summary>
  /// Downloads the page at the given URL and returns its HTML decoded as GB2312.
  /// </summary>
  /// <param name="Url">Address of the page to fetch.</param>
  /// <returns>The page text, or an empty string when the request or the read fails.</returns>
  private string GetWebContent1(string Url)
  {
   try
   {
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);
    // Timeout is in milliseconds (30 seconds).
    request.Timeout = 30000;
    request.Headers.Set("Pragma", "no-cache");
    System.Text.Encoding encoding = System.Text.Encoding.GetEncoding("GB2312");
    // The using blocks release the connection even when reading throws;
    // the original left the response, stream and reader open.
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    using (Stream streamReceive = response.GetResponseStream())
    using (StreamReader streamReader = new StreamReader(streamReceive, encoding))
    {
     return streamReader.ReadToEnd();
    }
   }
   catch
   {
    // Best-effort contract kept from the original: any failure yields "".
    return "";
   }
  }
  // Extracts hyperlinks, article titles and content
  // Scrapes a news index page and returns one row per article link found.
  // Each row holds the link text ("title"), the absolute article URL ("URL"),
  // the article body with <img> tags replaced ("content") and the date cell text ("newsdate").
  // NOTE(review): every Regex.Split(...)[n] below indexes a fixed position; if a marker is
  // missing (for example when GetWebContent1 returned "") the indexer throws.
  private DataTable getInfo()
  {
   // Create the result table
   DataTable dt = new DataTable();
   dt.Columns.Add("title",typeof(string));
   dt.Columns.Add("URL",typeof(string));
   dt.Columns.Add("content",typeof(string));
   dt.Columns.Add("newsdate",typeof(string));
   // Name the table, then download the index page (the URL here is a placeholder)
   dt.TableName="newsthief";   string html=GetWebContent1("http://www.xxxxxxxxx");

   // Marker that opens each table cell holding an article link
   string strPattern0=@"<td width=84% >";
   // Count how many times the marker occurs
   int Count=0;
   MatchCollection Matches=Regex.Matches(html,strPattern0,RegexOptions.IgnoreCase|RegexOptions.Compiled);
   foreach(Match NextMatch in Matches)
   {
    Count++;
   }

   string sHtml=html;
   // Captures the href value as "Link" and the anchor text as "Text"
   string strPattern=@"a[\s]+href=(?<Link>[^\s>]+)[^>]*>(?<Text>[^<]*)</a>";

   // Visits segments 1..Count-1 of the split page, so the segment after the last marker is
   // not processed. NOTE(review): unclear whether skipping the last cell is intentional.
   for(int j=0;j<Count-1;j++)// upper bound is Count-1

   {

    // Text following the (j+1)-th marker
    string sTemp=Regex.Split(sHtml,"<td width=84% >",RegexOptions.IgnoreCase)[j+1];
    // Cell content up to the first </td>: contains the anchor(s)
    string sHref=Regex.Split(sTemp,"</td>",RegexOptions.IgnoreCase)[0];
    // Text between the first and second </td>, then the part after the date cell's opening tag
    string sDateTemp=Regex.Split(sTemp,"</td>",RegexOptions.IgnoreCase)[1];
    string sDate=Regex.Split(sDateTemp,"<td Width=12% >",RegexOptions.IgnoreCase)[1];
    Matches=Regex.Matches(sHref,strPattern,RegexOptions.IgnoreCase|RegexOptions.Compiled);
    foreach(Match NextMatch in Matches)
    {
     // Article links are appended to a fixed site prefix
     string URL="http://www.china-insurance.com/news-center/"+NextMatch.Groups["Link"].Value.ToString().Trim();
     string title=NextMatch.Groups["Text"].Value.ToString().Trim();
     // Download the article page and cut out the body: first the text between the layout
     // table's opening tag and the next </table>, then the piece at index 2 after splitting
     // on <font id=zoom>, up to the following </font>
     string htmlContent=GetWebContent1(URL);
     string sContentTemp=Regex.Split(htmlContent,"<table width=\"99%\"   height=\"100%\" border=\"0\" cellspacing=\"0\" cellpadding=\"0\" align=\"center\">",RegexOptions.IgnoreCase)[1];
     string sContent=Regex.Split(sContentTemp,"</table>",RegexOptions.IgnoreCase)[0];
     sContentTemp=Regex.Split(sContent,"<font id=zoom>",RegexOptions.IgnoreCase)[2];
     sContent=Regex.Split(sContentTemp,"</font>",RegexOptions.IgnoreCase)[0];
     // Replace <img ...> tags with "&nbsp" (an earlier pattern was @"\<IMG(.[^\<]*)?()\>)")
     string sContent1=Regex.Replace(sContent,@"<img[\s\S]*?>","&nbsp",RegexOptions.IgnoreCase|RegexOptions.Compiled);
     DataRow dr = dt.NewRow();
     dr["title"]=title;
     dr["URL"]=URL;
     dr["content"]=sContent1;
     dr["newsdate"]=sDate;
     dt.Rows.Add(dr);
    }
   }
   return dt;

/// <summary>
/// Helpers for issuing HTTP requests and reading the response body as text.
/// </summary>
public   class PageUtil
    {
        // Referer header sent when the caller does not supply one.
        private const string DefaultReferral = @"http://www.bfor.cn";

        #region 得到网络流
        /// <summary>Requests <paramref name="url"/> with the default referer and returns the response.</summary>
        public static HttpWebResponse GetResponse(string url)
        {
            return GetResponse(url, DefaultReferral);
        }

        /// <summary>Creates a request for <paramref name="url"/> with the default referer.</summary>
        public static HttpWebRequest CreateRequest(string url)
        {
            return CreateRequest(url, DefaultReferral);
        }

        /// <summary>
        /// Creates a request for <paramref name="url"/> with a fixed user agent, the given
        /// referer and a 60-second timeout.
        /// </summary>
        /// <param name="url">Address to request.</param>
        /// <param name="referral">Value for the Referer header (previously ignored in favour of a hard-coded value).</param>
        /// <returns>The configured request, or null when the URL does not produce an HttpWebRequest.</returns>
        public static HttpWebRequest CreateRequest(string url, string referral)
        {
            HttpWebRequest wreq = WebRequest.Create(url) as HttpWebRequest;
            if (null != wreq)
            {
                wreq.UserAgent = "xiaoxiao";
                wreq.Referer = referral;
                wreq.Timeout = 60000;
            }
            return wreq;
        }

        /// <summary>Requests <paramref name="url"/> with the given referer and returns the response.</summary>
        /// <exception cref="System.NotSupportedException">The URL does not use an HTTP scheme.</exception>
        public static HttpWebResponse GetResponse(string url, string referral)
        {
            // Pass the referer through; the original dropped it by calling CreateRequest(url).
            HttpWebRequest request = CreateRequest(url, referral);
            if (null == request)
            {
                // Fail with a clear message instead of a NullReferenceException.
                throw new System.NotSupportedException("Only HTTP/HTTPS URLs are supported: " + url);
            }
            return (HttpWebResponse)request.GetResponse();
        }

        /// <summary>
        /// Downloads <paramref name="url"/> and decodes the body with the charset named in the
        /// Content-Type header, falling back to gb2312 when none is given.
        /// </summary>
        /// <param name="url">Address of the page to download.</param>
        /// <param name="referral">Despite its name, receives the name of the encoding used to decode the page.</param>
        /// <returns>The decoded page text.</returns>
        public static string GetPageText(string url, out string referral)
        {
            // Disposing the response closes the connection even when decoding throws;
            // the original only called Close() on the success path.
            using (HttpWebResponse response = GetResponse(url))
            using (Stream s = response.GetResponseStream())
            {
                string enc = getEncoding(response);
                if (enc == "")
                    enc = "gb2312";
                referral = enc;
                Encoding encode = System.Text.Encoding.GetEncoding(enc);
                using (StreamReader sr = new StreamReader(s, encode))
                {
                    return sr.ReadToEnd();
                }
            }
        }

        /// <summary>
        /// Extracts the charset name from the response's Content-Type header.
        /// </summary>
        /// <returns>The charset name, or an empty string when the header carries none.</returns>
        private static string getEncoding(HttpWebResponse httpResp)
        {
            // Typical values: "text/html; charset=utf-8;", "text/html; charset=utf-8" or "text/html".
            // The charset may or may not be followed by a semicolon.
            string contentType = httpResp.ContentType;
            if (string.IsNullOrEmpty(contentType))
            {
                return string.Empty;
            }
            int i = contentType.IndexOf("charset=", System.StringComparison.OrdinalIgnoreCase);
            if (i < 0)
            {
                return string.Empty;
            }
            i += 8; // length of "charset="
            int j = contentType.IndexOf(';', i);
            string charset = j >= i ? contentType.Substring(i, j - i) : contentType.Substring(i);
            // Some servers quote the value or pad it with spaces.
            return charset.Trim().Trim('"', '\'');
        }
        #endregion
    }调用  string en;
            // Holds the downloaded page text; stays empty if the download fails.
            string response = string.Empty;
            try
            {
                // `en` receives the encoding name reported by GetPageText.
                response = PageUtil.GetPageText(url, out en);
                // Show the text in the editor, then close this dialog and report success.
                ServiceManager.EditorContainer.LoadText(url, response);
                Close();

                this.DialogResult = DialogResult.OK;
            }
            catch (Exception ex)
            {
                Util.ShowErrorMessageBox("打开网址'{0}'时出现错误 :\n\n{1}",url, ex.Message);
            }

其中的 out 参数用于返回页面所使用的编码。

<@Aattention Content="本Blog原创文章，转载或引用请注明转载" From="Robby.cnblogs.com"@>      由于自己的搜索引擎中做到了这一块内容，所以今天说说如何抓取网页数据、分析并且去除Html标签，给大家提供一个参考。我的平台是Visual Studio2005，C#。—————————————————————割—————————————————————————      首先将网页内容整个抓取下来，这个我就不说了，不是本次话题的重点。假设抓取的数据放在RecvBuffer这个byte[]中（数据从网络上传输过来时不是字符串的形式而是byte），那么我们的第一步就是将RecvBuffer转化为String，以便于对其操作，实例如下：
  // Append the received bytes, decoded as ASCII, to the response string.
  // NOTE(review): ASCII cannot represent non-ASCII text such as Chinese — confirm the intended encoding.
  strResponse += Encoding.ASCII.GetString(RecvBuffer, 0, nBytes);      strResponse即是保存数据的字符串，此处用系统自带的System.Text.Encoding的方法转化RecvBuffer，GetString的第一个参数RecvBuffer就是我们的原始数据，即包含需要解码的字节序列的字节数组；第二个参数0代表第一个要解码的字节的索引，一般就从0开始；第三个参数nBytes为要解码的字节数，可以自己调整。      得到了数据的字符串形式，然后可以对网页进行解析了（其实就是对字符串的各种操作和正则表达式的应用）。下面我以几个例子来说明对网页数据的解析：
  // Parse the page and look for links.
  // Still needs extending: some link forms are not recognised (only quoted
  // href/src/action attribute values are matched).
  string strRef = @"(href|HREF|src|SRC|action|ACTION|Action)[ ]*=[ ]*[""'][^""'#>]+[""']";
  MatchCollection matches = new Regex(strRef).Matches(strResponse);
  strStatus += "找到: "+matches.Count+" 个链接\r\n";      上面的例子将网页中的链接解析出来，strRef变量表示了正则表达式的模式，变量matches表示符合匹配的项目的集合，后面的Regex(strRef).Matches(strResponse)就是创建正则规则使得strResponse里符合strRef模式的字符串都返回。然后调用matches的变量就可以取得各种信息了。
      当然，这里只能识别一些基本的链接形式，像script中的链接和一些不带“”的链接都没有被支持，这个的扩展还是蛮简单的。
      再举几个更简单点的解析的例子，大家学习学习：
  // Page title: the text between <title> and </title>.
  Regex titlePattern = new Regex("<title>([^<]*)</title>", RegexOptions.IgnoreCase | RegexOptions.Multiline);
  Match TitleMatch = titlePattern.Match(strResponse);
  title = TitleMatch.Groups[1].Value;

  // Description: the content attribute of the DESCRIPTION meta tag.
  Regex descPattern = new Regex("<Meta name=\"DESCRIPTION\" content=\"([^<]*)\">", RegexOptions.IgnoreCase | RegexOptions.Multiline);
  Match Desc = descPattern.Match(strResponse);
  strdesc = Desc.Groups[1].Value;

  // Page size, measured as the number of characters in the response string.
  size = strResponse.Length;
—————————————————————割—————————————————————————      好了，下面说一下如何去除Html标签，这个想必有很多初学者很需要。其实还是正则表达式和字符串基本操作的应用，由于这个功能还是比较常用的，所以例子写成了函数，便于调用：
  /// <summary>
  /// Removes HTML tags from a string and HTML-escapes any angle brackets left over.
  /// </summary>
  /// <param name="strHtml">The HTML text to strip; may be null or empty.</param>
  /// <returns>The text without tags; an empty string for null or empty input.</returns>
  private string stripHtml(string strHtml)
  {
     if (string.IsNullOrEmpty(strHtml))
     {
        return string.Empty;
     }
     // [\s\S] matches any character including newlines, so tags that span lines are removed too.
     string strOutput = Regex.Replace(strHtml, @"<[\s\S]+?>", "");
     // Escape brackets that were not part of a tag. The published code replaced "<" with "<"
     // and ">" with ">", which did nothing; the entity names had been lost.
     strOutput = strOutput.Replace("<", "&lt;");
     strOutput = strOutput.Replace(">", "&gt;");
     return strOutput;
  }      ok，这样一来Html标签就基本没了，但是有些例外会使得去除不干净，所以建议连续两次转化，这样就搞定了。但是还没结束，如果你留意的话，可以看到上面的函数其实是将Html标签转化为了空格。太多连续的空格会影响之后对字符串的操作。所以再加入这样的语句：
  // Collapse every run of whitespace into a single space.
  Regex r = new Regex(@"\s+");
  // Trim() returns a new string, so its result must be assigned; the original
  // called wordsOnly.Trim() and discarded the trimmed value.
  wordsOnly = r.Replace(strResponse, " ").Trim();      好了，大功告成，这里的wordsOnly便是我们最终的成果----去除了Html标签，并去除了多余空格的字符串。
      希望对大家有用！

// WebClient is IDisposable; the using block releases it once the download has finished.
using (WebClient client = new WebClient())
{
                Console.WriteLine("开始获取网页");

                // Download the whole page as a string and hand it back to the caller.
                string input = client.DownloadString(URL);

                return input;
}
然后用正则分析即可

这个问题不错，刚好我也遇到要开发这个，用winform做。要是找到了答案给我发份塞 QQ：253260227

我有个完整的多线程网页抓取程序带源码（c#）不过要收费的：QQ473324702

楼主解决否，我现在也正在做与之相关的（winform）可否给我发一份，谢谢邮箱[email protected]

LZ,有了吗？可以给我发一份吗？谢谢。邮箱：[email protected]

楼主问题解决了吧？给我发个吧我现在很需要发我邮箱[email protected] 谢谢

这个问题面临的好多噢，我是初学者，能否给发一份完整版的，谢谢了高手们！！邮箱[email protected]

也发我一个楼主！发我邮箱[email protected]

// Download a page, decode it as GB2312 and print it to the console.
string netUrl = "http://www.265.com";
HttpWebRequest myWebRequest = (HttpWebRequest)WebRequest.Create(netUrl);
string content;
// Release the connection as soon as the body has been read, even if reading throws.
using (WebResponse resp = myWebRequest.GetResponse())
using (StreamReader oStreamRd = new StreamReader(resp.GetResponseStream(),Encoding.GetEncoding("GB2312")))
{
    content = oStreamRd.ReadToEnd();
}
Console.WriteLine(content);
// Keep the console window open until the user presses Enter.
Console.ReadLine();

好多高手前辈，晚辈也求教一份，谢谢[email protected]

楼主！小弟虚心求一份！ [email protected]

小弟求一份 [email protected]
谢谢
谢谢
谢谢

昨天主管突然找我,说月底有个游戏要发新补丁,但是没有新增部分的资料,找到一个网站却不知道怎么下载到我们的数据库中.
我看了一下,网页为了防抓取,都是用js来生成内容,不过还是让我找到具体位置,所有物品有3万多,需要的还要过滤,有用的只有3千多,人工来做几乎不太现实的,于是写了一个winform来抓它的信息来写到数据库中
动作在DocumentCompleted事件中完成.        private const string SQL_DATA = "select * from Table1";
        private const string SQL_INSERT_1 = "insert into Table1 (WebID,Name,NeedLevel,Content) values (";
        private const string Sql_INSERT_2 = ")";
/// <summary>
/// Runs when the browser control reports a completed document: extracts the item level and
/// name from the page text and, when a name was found, inserts a row (current ID, name,
/// level, full document text) into Table1.
/// </summary>
/// <exception cref="Exception">Thrown with the message "no good" when the INSERT fails; the cause is attached as InnerException.</exception>
private void webBrowser1_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
        {
            string mydocument = webBrowser1.DocumentText;
            // Item level found in the page
            int mylevel = GetLevel(mydocument);
            // Item name found in the page
            string myname = GetName(mydocument);
            if (string.IsNullOrEmpty(myname))
            {
                return;
            }

            // The name and document come from a scraped page, so they are passed as parameters
            // instead of being concatenated into the SQL text. Concatenation broke on any quote
            // character and allowed SQL injection.
            string SQL_INSERT = SQL_INSERT_1 + "@WebID,@Name,@NeedLevel,@Content" + Sql_INSERT_2;
            using (SqlConnection cn = new SqlConnection(SQL_CONNECTION))
            using (SqlCommand sqlcmd = new SqlCommand(SQL_INSERT, cn))
            {
                sqlcmd.Parameters.AddWithValue("@WebID", ID);
                sqlcmd.Parameters.AddWithValue("@Name", myname);
                sqlcmd.Parameters.AddWithValue("@NeedLevel", mylevel);
                sqlcmd.Parameters.AddWithValue("@Content", mydocument);
                cn.Open();
                try
                {
                    sqlcmd.ExecuteNonQuery();
                }
                catch (Exception ex)
                {
                    // Same exception type and message as before, with the cause kept for diagnosis.
                    throw new Exception("no good", ex);
                }
            }
        }
取得物品名称和等级：www.shiapifa.com
        /// <summary>
        /// Extracts the item name: the text between the first start marker (the end of an
        /// anchor's href) and the first closing anchor/div marker that follows it.
        /// </summary>
        /// <param name="mydocument">Full page text; may be null or empty.</param>
        /// <returns>The name, or an empty string when either marker is missing.</returns>
        private string GetName(string mydocument)
        {
            const string startMarker = ".html\">";
            const string endMarker = "</a></div>";

            if (string.IsNullOrEmpty(mydocument)) return string.Empty;
            int pos_start = mydocument.IndexOf(startMarker, StringComparison.Ordinal);
            if (pos_start == -1) return string.Empty;

            // Look for the end marker only after the start marker. The original searched from
            // index 0, so an earlier "</a></div>" in the page made it return an empty name.
            int nameStart = pos_start + startMarker.Length;
            int pos_end = mydocument.IndexOf(endMarker, nameStart, StringComparison.Ordinal);
            if (pos_end == -1) return string.Empty;

            return mydocument.Substring(nameStart, pos_end - nameStart);
        }

        /// <summary>
        /// Extracts the required level: the number between the "需要等级" label and the next
        /// closing div tag.
        /// </summary>
        /// <param name="mydocument">Full page text; may be null or empty.</param>
        /// <returns>The parsed level, or 0 when the label or tag is missing or the text is not an integer.</returns>
        private int GetLevel(string mydocument)
        {
            if (string.IsNullOrEmpty(mydocument)) return 0;
            int pos_start = mydocument.IndexOf("需要等级", StringComparison.Ordinal);
            if (pos_start == -1) return 0;
            int pos_end = mydocument.IndexOf("</div>", pos_start, StringComparison.Ordinal);
            if (pos_end == -1) return 0;

            // The value starts 5 characters after the label position: the 4-character label plus
            // one more character. NOTE(review): presumably a separator such as a colon — confirm
            // against the page markup.
            int valueStart = pos_start + 5;
            // When the tag directly follows the label there is no value; the original passed a
            // negative length to Substring here and threw.
            if (valueStart > pos_end) return 0;

            string Level = mydocument.Substring(valueStart, pos_end - valueStart);
            int intleve;
            // TryParse replaces Convert.ToInt32 inside an empty catch; unparsable text still gives 0.
            return int.TryParse(Level, out intleve) ? intleve : 0;
        }
无法判断浏览器是否完成加载，所以用timer控件来完成: www.wanxinyi.com
        private const int START_ID = 1;
        private const int END_ID = 40000; // previously 32999
        // Id of the page most recently requested; also stored as WebID by the DocumentCompleted handler.
        private static int ID = 1;

        /// <summary>
        /// Timer tick: when the browser is idle and ids remain, advances ID by one, navigates to
        /// that item's page and shows the id in textBox4.
        /// </summary>
        private void timer1_Tick(object sender, EventArgs e)
        {
            // NOTE(review): ID is incremented before navigating, so with ID starting at 1 the
            // first page requested is id=2; START_ID is not referenced in the visible code.
            if (ID < END_ID && !webBrowser1.IsBusy)
            {
                ID = ID + 1;
                webBrowser1.Navigate("http://xxxx.com/xx.php?id=" + ID.ToString());
                textBox4.Text = ID.ToString();
            }
        }
timer的interval控制在1000ms，40000条数据用了11个多小时，从昨天晚上10点到今天早上，刚来的时候看到数据全都乖乖的在数据库呆着了．呵呵，搞定，交差．．．

/// <summary>
/// Page load handler: downloads a fixed URL, decodes it as UTF-8 and writes it to the response.
/// </summary>
protected void Page_Load(object sender, EventArgs e)
        {
            // Address of the page to fetch
            string strurl = "http://www.baidu.com";

            byte[] pagedata;
            // The WebClient declaration had been swallowed by a trailing // comment, leaving
            // myWebClient undeclared. It is restored here and disposed once the download is done.
            using (WebClient myWebClient = new WebClient())
            {
                // Credentials used to authenticate the request to the Internet resource
                myWebClient.Credentials = CredentialCache.DefaultCredentials;
                // Download the resource as a raw byte array
                pagedata = myWebClient.DownloadData(strurl);
            }

            // Decode with the character set the target page uses; use exactly one of:
            //   Encoding.Default.GetString(pagedata)  - for a GB2312 page (relies on the system default code page)
            //   Encoding.UTF8.GetString(pagedata)     - for a UTF-8 page
            string result = Encoding.UTF8.GetString(pagedata);

            // Write the fetched content into the web page output
            Response.Write(result);
        }这个比较简单点。

我用PHP做过一个，你的需求是截取一部分代码存储在数据库吧？
在查询出来的时候又能显示网页的信息？

string str = @"D:\Raymond's Documents\My Pictures\247129.jpg";
            byte[] imgbyte;
            // Close the file handle as soon as the bytes have been read; the original never
            // disposed the FileStream or the BinaryReader.
            using (FileStream fs = new FileStream(str, FileMode.Open, FileAccess.Read))
            using (BinaryReader by = new BinaryReader(fs))
            {
                int length = (int)fs.Length;
                imgbyte = by.ReadBytes(length);
            }
            // The MemoryStream is deliberately left open: Image.FromStream requires the stream
            // to stay open for the lifetime of the Image.
            MemoryStream ms = new MemoryStream(imgbyte);
            ms.Seek(0, SeekOrigin.Begin);
            Image image = Image.FromStream(ms);

        /// <summary>
        /// Downloads the resource at <paramref name="url"/> and returns it decoded as text.
        /// </summary>
        /// <param name="url">Address of the target page.</param>
        /// <param name="code">
        /// Pass "utf8" (or "utf-8", in any letter case) when the target page is UTF-8. Any other
        /// value, including null, decodes with the system default encoding, which the original
        /// author relied on for GB2312 pages.
        /// </param>
        /// <returns>The decoded page source.</returns>
        public static string GetWebresourceFile(string url, string code)
        {
            byte[] myDataBuffer;
            // Dispose the client once the bytes are downloaded; the original never released it.
            using (WebClient myWebClient = new WebClient())
            {
                myDataBuffer = myWebClient.DownloadData(url);
            }

            // Accept the common spellings of the UTF-8 name; the original matched only the exact
            // string "utf8", so "UTF8" or "utf-8" silently fell back to the default encoding.
            bool isUtf8 = string.Equals(code, "utf8", System.StringComparison.OrdinalIgnoreCase)
                || string.Equals(code, "utf-8", System.StringComparison.OrdinalIgnoreCase);

            // NOTE(review): Encoding.Default is GB2312-compatible only where that is the system
            // default code page — confirm on the deployment machine.
            Encoding encoding = isUtf8 ? Encoding.UTF8 : Encoding.Default;
            return encoding.GetString(myDataBuffer);
        }

调试易

求一个C#的网页数据抓取源代码

解决方案 »