c#关于截取网页内容

我是这样写的
                    // Download the raw bytes of the page using the current user's default network credentials.
                    WebClient MyWebClient = new WebClient();
                    MyWebClient.Credentials = CredentialCache.DefaultCredentials; // credentials used to authenticate the request
                    // Fixed: the method is DownloadData (the original "DownloadDatag" does not exist).
                    Byte[] pageData = MyWebClient.DownloadData("http://123.sogou.com/");
                    // Decode with the system default code page; use this line when the page is served as GB2312.
                    string pageHtml = Encoding.Default.GetString(pageData);
                    // Fixed: show the decoded page (the original assigned an undefined variable "strallstrm").
                    richTextBox1.Text = pageHtml;
                    // Question from the original post: written this way, richTextBox1 only shows the site's HTML
                    // source rather than the rendered page. What code has to be added so that it displays the
                    // page content instead?

解决方案 »

免费领取超大流量手机卡，每月29元包185G流量+100分钟通话, 中国电信官方发货

/// <summary>
        /// Downloads a page and returns its HTML source decoded to a string.
        /// </summary>
        /// <param name="url">Address of the page to download.</param>
        /// <param name="charSet">Name of the encoding to decode with. When null or empty, the charset is
        /// taken from the page's own &lt;meta&gt; declaration, falling back to UTF-8.</param>
        /// <returns>The decoded page source, or null when the download or the decoding fails.</returns>
        public static string GetHtml(string url, string charSet)
        {
            try
            {
                byte[] rawBytes;
                using (WebClient client = new WebClient())
                {
                    rawBytes = client.DownloadData(url);
                }

                string encodingName = charSet;
                if (string.IsNullOrEmpty(encodingName))
                {
                    // Decode once with the system default purely to read the <meta> charset declaration.
                    string probe = System.Text.Encoding.Default.GetString(rawBytes);
                    // Capture only the charset token itself; a greedy ".*" would swallow the rest of the line
                    // whenever more quotes follow (always the case on single-line/minified pages).
                    Match declared = Regex.Match(
                        probe,
                        "<meta[^>]*?charset\\s*=\\s*[\"']?\\s*([A-Za-z0-9_\\-]+)",
                        RegexOptions.IgnoreCase);
                    encodingName = declared.Success ? declared.Groups[1].Value : "utf-8";
                }

                System.Text.Encoding encoding;
                try
                {
                    encoding = System.Text.Encoding.GetEncoding(encodingName.Trim());
                }
                catch (ArgumentException)
                {
                    // Unknown or misspelled charset name: fall back to UTF-8 instead of failing the download.
                    encoding = System.Text.Encoding.UTF8;
                }

                return encoding.GetString(rawBytes);
            }
            catch (Exception)
            {
                // Callers rely on null to signal failure.
                return null;
            }
        }
//在控制台输入获取的内容这样的话我richTextBox1里面的内容只是该网站的源代码而不是网页内容。各位高手，我还要添加个什么代码才能使它显示网页内容啊？ 答：使用 WebBrowser 控件。
[Quote=引用楼主 zhang308337299 的回复:]
我说在我的那个基础上加，我记得只加一两句代码，用不了你那么麻烦，拜托写下
RichTextBoxStreamType.PlainText
RichTextBoxStreamType.RichText
RichTextBoxStreamType.UnicodePlainText richTextBox1.LoadFile("d:\\source.rtf", RichTextBoxStreamType.RichText);http://msdn.microsoft.com/zh-cn/library/system.windows.forms.richtextboxstreamtype(v=vs.80).aspx
其实你写的代码并没有错误，只是在指定URL的时候要到具体页面：MyWebClient.DownloadData("http://123.sogou.com/index.html");//不过还需要指定编码
/// <summary>
/// Requests the page at <paramref name="sUrl"/> and returns its source decoded as UTF-8.
/// </summary>
/// <param name="sUrl">Address of the page to fetch.</param>
/// <returns>The page source, or an empty string when the request fails (the error text is shown in a message box).</returns>
private string GetWebContent(string sUrl)
        {
            string strResult = "";
            try
            {
                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(sUrl);
                // Timeout is expressed in milliseconds, so this allows 50 minutes.
                // NOTE(review): possibly meant to be 30000 (30 s) - confirm before changing.
                request.Timeout = 3000000;
                request.Headers.Set("Pragma", "no-cache");

                // Dispose the response, stream and reader so the underlying connection is released.
                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                using (Stream body = response.GetResponseStream())
                using (StreamReader reader = new StreamReader(body, Encoding.UTF8))
                {
                    strResult = reader.ReadToEnd();
                }
            }
            catch (Exception exp)
            {
                MessageBox.Show(exp.Message);
            }
            return strResult;
        }
//截取网页内容主流就是 substring 和正则表达式
请问你设置后面的编码了么string pageHtml = GetHtml("http://123.sogou.com/index.html", "gbk");
我以为你看到那个枚举就会明白的，原来你非要人把方法写出来，智能提示都懒得看
RichTextBox.LoadFile 方法 (Stream, RichTextBoxStreamType)
这个有什么看不懂的啊LZ
直接复制过去调用就好了string str=GetWebContent("http://www.baidu.com");//网页源码
// Download the page as text.
string pageData = MyWebClient.DownloadString("http://123.sogou.com/");
// RichTextBox.LoadFile has no TextReader overload; it needs a Stream, so wrap the text's bytes in one.
using (System.IO.MemoryStream rtfStream = new System.IO.MemoryStream(System.Text.Encoding.Default.GetBytes(pageData)))
{
    // An exception is thrown if the data is not valid RTF.
    richTextBox1.LoadFile(rtfStream, RichTextBoxStreamType.RichText);
}
// Using HttpWebResponse.GetResponseStream would be more convenient, since it already yields a Stream.
http://msdn.microsoft.com/en-us/library/system.net.httpwebresponse.getresponsestream.aspx
貌似不行，使用 WebBrowser 控件吧。还有，使用 httpclient 下载有些网站时会有编码问题；如果你只是针对几个网站，这样可以，否则换成其他方式请求网页吧。
你要截取内容，首先要得到源码，然后有两个方案可行：
1.根据正则表达式匹配出你想要的内容
2.利用Winista.Htmlparser.Net 解析Html。这是.NET平台下解析Html的开源代码，网上有源码下载，百度一下就能搜到，这里就不提供了。并且有英文的帮助文档。找不到的留下邮箱。
个人认为这是.net平台下解析html不错的解决方案，基本上能够满足我们对html的解析工作
使用webbrowser 控件，可以显示页面效果
using System;
using System.Collections.Generic;
using System.Text;
using System.Net;
using System.IO;namespace 网页采集器
{
    /// <summary>
    /// Download and file-system helpers for the page scraper.
    /// Every file and directory is created beneath <see cref="OutputRoot"/>.
    /// </summary>
    public class Helper
    {
        /// <summary>Folder under which all downloaded files and directories are placed.</summary>
        private const string OutputRoot = "D:\\博客圆\\";

        /// <summary>
        /// Requests a URL and returns the response body as text.
        /// </summary>
        /// <param name="strUrl">Address to request.</param>
        /// <param name="enterType">When 1, "\r\n" is appended after every line read; any other value joins the lines without separators.</param>
        /// <param name="EnCodeType">Encoding used to decode the response body.</param>
        /// <returns>The response text, or "请求错误：" followed by the exception message when the request fails.</returns>
        public static string GetRequestString(string strUrl, int enterType, Encoding EnCodeType)
        {
            try
            {
                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(strUrl);
                request.Timeout = 30000;

                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                using (Stream body = response.GetResponseStream())
                using (StreamReader reader = new StreamReader(body, EnCodeType))
                {
                    StringBuilder text = new StringBuilder();
                    string line;
                    // Read until ReadLine reports end-of-stream. Peek() can return -1 on a network stream
                    // while more data is still on its way, which would truncate the page.
                    while ((line = reader.ReadLine()) != null)
                    {
                        text.Append(line);
                        if (enterType == 1)
                        {
                            text.Append("\r\n");
                        }
                    }
                    return text.ToString();
                }
            }
            catch (Exception err)
            {
                return "请求错误：" + err.Message;
            }
        }

        /// <summary>
        /// Downloads a URL to a file below the output root. Failures are reported on standard error
        /// and do not propagate to the caller.
        /// </summary>
        /// <param name="url">Absolute address of the resource to download.</param>
        /// <param name="strPath">Target path, relative to the output root.</param>
        public static void DownFile(string url, string strPath)
        {
            try
            {
                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(new Uri(url));
                request.Method = "GET";
                request.ContentType = "application/x-www-form-urlencoded";

                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                using (Stream source = response.GetResponseStream())
                using (FileStream target = new FileStream(OutputRoot + strPath, FileMode.Create, FileAccess.Write))
                {
                    byte[] buffer = new byte[1024];
                    int read;
                    // Copy until Read reports end-of-stream instead of trusting ContentLength (which is -1
                    // when the server does not send it), and write only the bytes actually read so the
                    // file is not padded with stale buffer content.
                    while ((read = source.Read(buffer, 0, buffer.Length)) > 0)
                    {
                        target.Write(buffer, 0, read);
                    }
                }
            }
            catch (Exception ex)
            {
                // Best effort: one failed asset must not abort the whole scrape, but do not hide the failure.
                Console.Error.WriteLine("DownFile failed for " + url + ": " + ex.Message);
            }
        }

        /// <summary>
        /// Writes text to a file below the output root, overwriting an existing file or creating a new one.
        /// </summary>
        /// <param name="fileName">Target path, relative to the output root.</param>
        /// <param name="content">Text to write.</param>
        public static void WriteFile(string fileName, string content)
        {
            using (StreamWriter writer = new StreamWriter(OutputRoot + fileName, false))
            {
                writer.Write(content);
            }
        }

        /// <summary>
        /// Makes sure a directory exists below the output root, creating it when it is missing.
        /// </summary>
        /// <param name="path">Directory path, relative to the output root.</param>
        public static void CheckDirectory(string path)
        {
            // CreateDirectory does nothing when the directory already exists.
            Directory.CreateDirectory(OutputRoot + path);
        }
    }
}using System;
using System.Collections.Generic;
using System.Text;
using System.Text.RegularExpressions;namespace 网页采集器
{
    /// <summary>
    /// Console entry point: mirrors the site's home page together with the stylesheets, scripts and
    /// images it references, using the Helper download and file routines.
    /// </summary>
    class Program
    {
        /// <summary>Root address of the site being mirrored.</summary>
        private const string SiteUrl = "http://www.cnblogs.com/";

        static void Main(string[] args)
        {
            // Fetch the home page.
            string html = Helper.GetRequestString(SiteUrl, 1, Encoding.UTF8);

            #region Stylesheets and scripts
            // [^"]* keeps every match inside a single attribute value, even when several
            // href/src attributes share one line.
            MatchCollection assetRefs = Regex.Matches(html, @"(href|src)=\""[^""]*(css|js)\""");
            foreach (Match assetRef in assetRefs)
            {
                bool isAbsolute = assetRef.Value.Contains("http");
                Match name = Regex.Match(assetRef.Value, @"\w*\.(css|js)");
                // For absolute URLs the \b anchors skip the "//" after the scheme, leaving only the path directory.
                Match dir = Regex.Match(assetRef.Value, isAbsolute ? @"\b/.*/\b" : @"/.*/");
                string reference = StripAttribute(assetRef.Value);
                string address = isAbsolute ? reference : ToAbsoluteUrl(reference);

                Helper.CheckDirectory(dir.Value);
                string content = Helper.GetRequestString(address, 1, Encoding.UTF8);
                Helper.WriteFile(dir.Value + name.Value, content);
            }
            #endregion

            #region Images
            MatchCollection imageTags = Regex.Matches(html, @"<\s?img[^>]+?>");
            foreach (Match imageTag in imageTags)
            {
                bool isAbsolute = imageTag.Value.Contains("http");
                Match name = Regex.Match(imageTag.Value, @"\w*\.(jpg|gif|png|bmp)");
                Match src = Regex.Match(imageTag.Value, @"src=\""[^""]*(jpg|gif|png|bmp)\""");
                if (!src.Success || !name.Success)
                {
                    // The tag has no recognisable image source, so there is nothing to download.
                    continue;
                }

                Match dir = Regex.Match(src.Value, isAbsolute ? @"\b/.*/\b" : @"/.*/");
                string reference = StripAttribute(src.Value);
                string address = isAbsolute ? reference : ToAbsoluteUrl(reference);

                Helper.CheckDirectory(dir.Value);
                Helper.DownFile(address, dir.Value + name.Value);
            }
            #endregion

            // Strip "http://<host>.com" prefixes so that links resolve against the local copy.
            // The host part is limited to one URL at a time; a greedy ".*" would delete everything
            // between the first and the last URL on a line.
            html = Regex.Replace(html, @"http://[^/\s""'<>]+\.com", "");
            Helper.WriteFile("index.html", html);

            Console.WriteLine("处理成功");
            Console.Read();
        }

        /// <summary>Removes the href="/src=" prefix and the quotes from a matched attribute, leaving the bare URL.</summary>
        private static string StripAttribute(string attribute)
        {
            return attribute.Replace("href=\"", "").Replace("src=\"", "").Replace("\"", "");
        }

        /// <summary>Resolves a site-relative reference against the site root.</summary>
        private static string ToAbsoluteUrl(string reference)
        {
            Uri resolved;
            if (Uri.TryCreate(new Uri(SiteUrl), reference, out resolved))
            {
                return resolved.AbsoluteUri;
            }

            // Not resolvable as a URI: fall back to plain concatenation.
            return SiteUrl + reference;
        }
    }
}
D:\\博客圆下会生成相关的文件,包括样式,JS,挂在IIS上可以访问,如果要直接打开,里面的样式路径与图片路径要修改.
用webbrowser两行代码就解决了，非要用你自己的写法，然后你还不会。。
网页很多内容就动态加载的，有通过js控制的，用webbrowser可以，它就是一个浏览器
public bool Parse(string sURL)
{
//sURL = @"d:\tmp\test1.htm";
            string sHtml=string.Empty;
WebClient wc = new WebClient();
            try
            {
                byte[] pagedata = wc.DownloadData(@sURL);                //转换字符、
                if (sURL.Contains("vancl.com"))
                {
                    sHtml = Encoding.UTF8.GetString(pagedata);
                }
                else
                {
                    sHtml = wc.DownloadString(sURL);                }

                //sHtml = sHtml.Replace("", "</P>");
                if (!sURL.Contains("taobao.com"))
                {
                    sHtml = sHtml.Replace("<<", "<");
                    sHtml = sHtml.Replace("> />", "/>");
                }
            }
            catch (Exception)
            {                return false;
            }