采集网页的代码我是这么写的:
// Holds the downloaded page text. It is only assigned inside the StatusCode==OK branch,
// so it stays empty when the request fails or any exception is swallowed below.
string rePageInfo=String.Empty;
Stream responseStream=null;
StreamReader sr=null;
try
{
   HttpWebRequest myWebRequest=(HttpWebRequest)WebRequest.Create(@urlstr);
   // HttpWebRequest.Timeout is expressed in milliseconds, so this allows 1 second.
   myWebRequest.Timeout=1000;
   myWebRequest.Method="GET";      
   HttpWebResponse res=(HttpWebResponse)myWebRequest.GetResponse();
   if(res.StatusCode==HttpStatusCode.OK)
   {
       responseStream=res.GetResponseStream();
       // The body is always decoded with Encoding.Default (a system default encoding);
       // the charset declared by the page or by the HTTP headers is not consulted here.
       sr=new StreamReader(responseStream,System.Text.Encoding.Default);
       rePageInfo=sr.ReadToEnd();
   }
   // NOTE(review): clearing these references does not close the response;
   // presumably the elided finally block below does that - confirm.
   res=null;
   myWebRequest=null;
}
catch
{
   // NOTE(review): every exception (timeout, network error, ...) is swallowed here
   // without any diagnostic, which leaves rePageInfo as the empty string.
}
finally
{
   // Resource-release code (elided in this snippet)
   ......
}
采集到网页后进行分析。现在的问题是:当采集的页面使用 utf-8、gb2312、big5 等不同的编码时,采集程序采集到的要么是一片空白(没东西),要么是乱码。然后我在读取流的时候加了个识别代码,如下:
   if(res.StatusCode==HttpStatusCode.OK)
   {
       string conntype=res.Headers["Content-Type"];
       //提取<meta http-equiv="Content-Type" content="text/html; charset=GB2312" />中的content的值,并进一步提取charset的值,并赋值给conntype       responseStream=res.GetResponseStream();
       sr=new StreamReader(responseStream,System.Text.Encoding.GetEncoding(conntype.Trim()));
       rePageInfo=sr.ReadToEnd();
       ......
    ......
但这样,好像是可以同时采集 utf-8 和 gb2312 的网页了,但却不稳定,有的能行,可有的又不行!尤其是当服务器是 unix 或 linux 等系统时,采集到的数据的换行符都成了小黑块。还有就是 res.Headers["Content-Type"] 获取的值很多都只有“text/html”,而没有后边的 charset=.+ 。请问哪位高手有这方面的经验呀?采集网页的时候怎么进行页面的编码转换呢?

解决方案 »

  1.   

    /// <summary>
    /// Downloads the resource at <c>m_uri</c> and returns it as text.
    /// </summary>
    /// <remarks>
    /// The response body is fetched once and kept in memory. It is first decoded with
    /// <see cref="Encoding.Default"/> so that the charset declaration can be located with
    /// <c>Tools.GetEncoding</c>; if a different encoding is declared and available, the same
    /// bytes are decoded again with that encoding. Line endings in the returned text are
    /// normalised to CR LF, and every line (including the last) is terminated.
    /// Responses whose content type does not start with "text/" are handed to
    /// <c>SaveBinaryFile</c> instead.
    /// </remarks>
    /// <returns>
    /// The decoded page text, or <c>null</c> when the response is not textual or when a
    /// <see cref="WebException"/> or <see cref="IOException"/> occurs during the download.
    /// </returns>
    private string GetPage()
    {
        try
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(m_uri);

            using (WebResponse response = request.GetResponse())
            {
                // A missing Content-Type header is treated as "not text".
                string contentType = response.ContentType ?? string.Empty;
                if (!contentType.StartsWith("text/", StringComparison.OrdinalIgnoreCase))
                {
                    SaveBinaryFile(response);
                    return null;
                }

                byte[] body;
                using (Stream stream = response.GetResponseStream())
                {
                    body = ReadAllBytes(stream);
                }

                // First pass: the charset declaration is plain ASCII, so it survives
                // decoding with the default encoding even if the rest is garbled.
                string sniffed = DecodeText(body, Encoding.Default);

                Encoding encoding = ResolveEncoding(Tools.GetEncoding(sniffed));
                if (encoding.Equals(Encoding.Default))
                {
                    return sniffed;
                }

                // Second pass over the same bytes; no second HTTP request is needed.
                return DecodeText(body, encoding);
            }
        }
        catch (WebException e)
        {
            System.Console.WriteLine("Can't download:" + e);
            return null;
        }
        catch (IOException e)
        {
            System.Console.WriteLine("Can't download:" + e);
            return null;
        }
    }

    /// <summary>Reads <paramref name="source"/> to its end and returns everything read.</summary>
    private static byte[] ReadAllBytes(Stream source)
    {
        using (MemoryStream collected = new MemoryStream())
        {
            byte[] chunk = new byte[8192];
            int read;
            while ((read = source.Read(chunk, 0, chunk.Length)) > 0)
            {
                collected.Write(chunk, 0, read);
            }

            return collected.ToArray();
        }
    }

    /// <summary>
    /// Decodes <paramref name="data"/> with <paramref name="encoding"/>, terminating every
    /// line with CR LF.
    /// </summary>
    private static string DecodeText(byte[] data, Encoding encoding)
    {
        StringBuilder text = new StringBuilder();

        // A StreamReader (rather than Encoding.GetString) keeps byte-order-mark handling.
        using (StreamReader reader = new StreamReader(new MemoryStream(data), encoding))
        {
            string line;
            while ((line = reader.ReadLine()) != null)
            {
                text.Append(line).Append("\r\n");
            }
        }

        return text.ToString();
    }

    /// <summary>
    /// Maps a charset name (as returned by <c>Tools.GetEncoding</c>) to an
    /// <see cref="Encoding"/>, falling back to <see cref="Encoding.Default"/> when the name
    /// is empty, unknown, or not supported on this platform.
    /// </summary>
    private static Encoding ResolveEncoding(string charsetName)
    {
        if (string.IsNullOrEmpty(charsetName))
        {
            return Encoding.Default;
        }

        switch (charsetName)
        {
            case "UTF-8":
                return Encoding.UTF8;
            case "UTF-7":
                return Encoding.UTF7;
            case "UNICODE":
                return Encoding.Unicode;
        }

        try
        {
            // Covers other declared charsets such as gb2312 or big5.
            return Encoding.GetEncoding(charsetName);
        }
        catch (ArgumentException)
        {
            return Encoding.Default;
        }
        catch (NotSupportedException)
        {
            return Encoding.Default;
        }
    }
    #region Get HTML document encoding

    // Matches charset=NAME, charset="NAME" and charset='NAME'. The name is restricted to
    // the characters used by charset labels so that trailing markup such as "/> or ; is
    // never captured. Built once because compiling a Regex is expensive.
    private static readonly Regex s_charsetRegex = new Regex(
        "charset\\s*=\\s*[\"']?([A-Za-z0-9][A-Za-z0-9._:+\\-]*)",
        RegexOptions.IgnoreCase | RegexOptions.Compiled);

    /// <summary>
    /// Finds the first <c>charset=...</c> declaration in an HTML document.
    /// </summary>
    /// <param name="inputString">The HTML text to search; may be null or empty.</param>
    /// <returns>
    /// The declared charset name in upper case (for example "UTF-8" or "GB2312"),
    /// or an empty string when no declaration is found.
    /// </returns>
    public static string GetEncoding(string inputString)
    {
        if (string.IsNullOrEmpty(inputString))
        {
            return string.Empty;
        }

        Match m = s_charsetRegex.Match(inputString);

        // Invariant casing keeps the result stable regardless of the current culture.
        return m.Success ? m.Groups[1].Value.ToUpperInvariant() : string.Empty;
    }

    #endregion
    分给真少.