大家好,我做一个winform程序抓取网页代码,使用了如下函数:  /// <summary>
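        /// Fetches the HTML source of the page at the given URL.
        /// </summary>
        /// <param name="Url">Target URL.</param>
        /// <returns>The page source as a string, decoded as UTF-8 (not GB2312); an empty string if the request fails.</returns>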
        private string GetWebContent(string Url)
        {
            string strResult = "";
            try
            {
                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);
                //声明一个HttpWebRequest请求
                request.Timeout = 30000;
                //设置连接超时时间
                request.Headers.Set("Pragma", "no-cache");
                HttpWebResponse response = (HttpWebResponse)request.GetResponse();
                Stream streamReceive = response.GetResponseStream();
                Encoding encoding = Encoding.GetEncoding("UTF-8");
                StreamReader streamReader = new StreamReader(streamReceive, encoding);
                strResult = streamReader.ReadToEnd();
            }
            catch
            {
                MessageBox.Show("出错");
            }
            return strResult;
        }
            //要抓取的URL地址
            string Url = "http://d1.7l.mop.com/statistics";            //得到指定Url的源码
            string strWebContent = this.GetPageContent(Url);
结果这个网站需要登录,于是只能抓到登录页面,非常郁闷。请大家帮我想想办法,100分相赠!谢谢啦!

解决方案 »

  1.   

    using System;
    using System.Collections.Generic;
    using System.Text;
    using System.Net;
    using System.IO;
    namespace bot
    {
        /// <summary>
        /// Helpers that log in to a site with <see cref="HttpWebRequest"/> and then
        /// fetch pages using the cookies produced by that login.
        /// </summary>
        public class Html
        {
            /// <summary>
            /// A bundle of the HttpWebRequest header properties used by this class.
            /// </summary>
            public struct RequestPPT
            {
                private string strAccept;
                /// <summary>
                /// Value copied to the request's Accept property
                /// (the content types the client accepts).
                /// </summary>
                public string Accept
                {
                    get
                    {
                        return strAccept;
                    }
                    set
                    {
                        strAccept = value;
                    }
                }
                private string strContentType;
                /// <summary>
                /// Value copied to the request's ContentType property
                /// (the media type of the request body).
                /// </summary>
                public string ContentType
                {
                    get
                    {
                        return strContentType;
                    }
                    set
                    {
                        strContentType = value;
                    }
                }
                private string strUserAgent;
                /// <summary>
                /// Value copied to the request's UserAgent property
                /// (the client identification string).
                /// </summary>
                public string UserAgent
                {
                    get
                    {
                        return strUserAgent;
                    }
                    set
                    {
                        strUserAgent = value;
                    }
                }
                private string strMethod;
                /// <summary>
                /// Value copied to the request's Method property, e.g. GET or POST.
                /// When left null or empty, the methods of <see cref="Html"/> fall back
                /// to POST for requests that carry a body and GET otherwise.
                /// </summary>
                public string Method
                {
                    get
                    {
                        return strMethod;
                    }
                    set
                    {
                        strMethod = value;
                    }
                }
            }

            /// <summary>
            /// Sends the login form to <paramref name="url"/> and returns the cookies of the
            /// resulting session together with the HTML of the response.
            /// </summary>
            /// <param name="url">Address the login form is submitted to.</param>
            /// <param name="post">Encoded form body to send.</param>
            /// <param name="strHtml">Receives the response body.</param>
            /// <param name="rppt">Header values to apply to the request.</param>
            /// <param name="server">Not used; kept so existing callers keep compiling.</param>
            /// <returns>The cookies obtained from the login request.</returns>
            /// <exception cref="ArgumentNullException"><paramref name="post"/> is null.</exception>
            public CookieCollection funGetCookie(string url, byte[] post, out string strHtml, RequestPPT rppt, string server)
            {
                if (post == null)
                {
                    throw new ArgumentNullException("post");
                }

                CookieContainer cc = new CookieContainer();
                HttpWebRequest hwRequest = CreateRequest(url, cc, rppt, "POST");
                WriteBody(hwRequest, post);

                CookieCollection ckclReturn = new CookieCollection();
                using (HttpWebResponse hwResponse = (HttpWebResponse)hwRequest.GetResponse())
                {
                    strHtml = ReadBody(hwResponse);

                    // A login usually answers with a redirect. Cookies set on the
                    // intermediate responses are kept in the container but may be missing
                    // from the final response's Cookies, so merge both sources.
                    // CookieCollection.Add replaces an equal cookie rather than duplicating it.
                    ckclReturn.Add(cc.GetCookies(hwRequest.RequestUri));
                    ckclReturn.Add(cc.GetCookies(hwResponse.ResponseUri));
                    ckclReturn.Add(hwResponse.Cookies);
                }
                return ckclReturn;
            }

            /// <summary>
            /// Fetches <paramref name="strUri"/> without a request body, sending the given cookies.
            /// </summary>
            /// <param name="strUri">Address of the page to fetch.</param>
            /// <param name="ccl">Cookies obtained from a previous login.</param>
            /// <param name="rppt">Header values to apply to the request.</param>
            /// <returns>The response body as text.</returns>
            public string funGetHtmlByCookies(string strUri, CookieCollection ccl, RequestPPT rppt)
            {
                CookieContainer cc = new CookieContainer();
                HttpWebRequest hwRequest = CreateRequest(strUri, cc, rppt, "GET");
                cc.Add(ccl);
                hwRequest.ContentLength = 0;

                using (HttpWebResponse hwResponse = (HttpWebResponse)hwRequest.GetResponse())
                {
                    return ReadBody(hwResponse);
                }
            }

            /// <summary>
            /// Sends <paramref name="post"/> to <paramref name="strUri"/> with the given cookies
            /// and returns the response body.
            /// </summary>
            /// <param name="strUri">Address the data is sent to.</param>
            /// <param name="post">Encoded request body.</param>
            /// <param name="ccl">Cookies obtained from a previous login.</param>
            /// <param name="rppt">Header values to apply to the request.</param>
            /// <returns>The response body as text.</returns>
            /// <exception cref="ArgumentNullException"><paramref name="post"/> is null.</exception>
            public string funGetHtmlByCookies(string strUri, byte[] post, CookieCollection ccl, RequestPPT rppt)
            {
                if (post == null)
                {
                    throw new ArgumentNullException("post");
                }

                CookieContainer cc = new CookieContainer();
                HttpWebRequest hwRequest = CreateRequest(strUri, cc, rppt, "POST");
                cc.Add(ccl);
                WriteBody(hwRequest, post);

                using (HttpWebResponse hwResponse = (HttpWebResponse)hwRequest.GetResponse())
                {
                    return ReadBody(hwResponse);
                }
            }

            /// <summary>
            /// Builds a request for <paramref name="url"/> bound to <paramref name="cookies"/>
            /// and carrying the header values from <paramref name="rppt"/>.
            /// </summary>
            private static HttpWebRequest CreateRequest(string url, CookieContainer cookies, RequestPPT rppt, string fallbackMethod)
            {
                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(new Uri(url));
                request.CookieContainer = cookies;
                request.Accept = rppt.Accept;
                request.ContentType = rppt.ContentType;
                request.UserAgent = rppt.UserAgent;
                // HttpWebRequest.Method rejects null/empty, which is what a
                // default-initialised RequestPPT holds.
                request.Method = string.IsNullOrEmpty(rppt.Method) ? fallbackMethod : rppt.Method;
                return request;
            }

            /// <summary>
            /// Writes <paramref name="body"/> as the request body and closes the request stream.
            /// </summary>
            private static void WriteBody(HttpWebRequest request, byte[] body)
            {
                request.ContentLength = body.Length;
                using (Stream requestStream = request.GetRequestStream())
                {
                    requestStream.Write(body, 0, body.Length);
                }
            }

            /// <summary>
            /// Reads the whole response body as text and closes the response stream.
            /// </summary>
            private static string ReadBody(HttpWebResponse response)
            {
                using (Stream responseStream = response.GetResponseStream())
                using (StreamReader reader = new StreamReader(responseStream, ResolveEncoding(response)))
                {
                    return reader.ReadToEnd();
                }
            }

            /// <summary>
            /// Picks the encoding named by the charset parameter of the response's Content-Type
            /// header, falling back to Encoding.Default when none is declared or it is unknown.
            /// </summary>
            private static Encoding ResolveEncoding(HttpWebResponse response)
            {
                const string marker = "charset=";
                string contentType = response.ContentType;
                if (!string.IsNullOrEmpty(contentType))
                {
                    int start = contentType.IndexOf(marker, StringComparison.OrdinalIgnoreCase);
                    if (start >= 0)
                    {
                        string charset = contentType.Substring(start + marker.Length);
                        int end = charset.IndexOf(';');
                        if (end >= 0)
                        {
                            charset = charset.Substring(0, end);
                        }
                        charset = charset.Trim(' ', '\t', '"', '\'');
                        if (charset.Length > 0)
                        {
                            try
                            {
                                return Encoding.GetEncoding(charset);
                            }
                            catch (ArgumentException)
                            {
                                // Unknown or unsupported charset name: use the fallback below.
                            }
                        }
                    }
                }
                return Encoding.Default;
            }
        }
    }
      

  2.   

    你必须先模拟登录一次,用上面的方法可以获取到 cookies。
    但在此之前你需要一个抓包工具,把你登录时的请求包抓下来,然后按同样的方式构建 post 数据,再带着获取到的 cookies 用 HttpWebRequest 去 get 目标页面就 OK 了。
      

  3.   

    HttpAnalyzerFullV2.exe 很好,是专门的 HTTP 抓包工具,我一直用它,嘿嘿。
    关于验证码,你可以在验证码那部分人工识别,获取一下它的 cookies,这样以后都能自动登录,不用再去理会验证码了。你的2个帖子我都回了,另外一个帖子上面有一些具体的调用方法,你看看,应该对你有帮助。下班了下班了,有问题明天继续哈,早日解决早日结贴啊,俺可是巴巴地望着你的分呢,呵呵。