我想抓取网页,并且提取其中的英文词语。我找到了相关程序,但正则匹配得不好,大家能帮我看看吗?
{
    class Program
    {
        static void Main(string[] args)
        {
            byte[] buffer = getBytes("http://video.shishicai.cn/haoma/cqssc/list/120.aspx", null, null);
            string html = Encoding.UTF8.GetString(buffer);
            MatchCollection mc = Regex.Matches(html, @"{"(?(([a-z]|[A-Z])+(\d))"}");
           
            
            Console.WriteLine(mc);
          
            Console.WriteLine("Close!");
            Console.ReadKey();        }        // 读取网络资源,返回字节数组
        private static byte[] getBytes(string url, CookieContainer cookie, byte[] postData)
        {
            int c = url.IndexOf("/", 10);
            byte[] data = null;
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            request.AllowAutoRedirect = true;
            if (cookie != null) request.CookieContainer = cookie;
            request.Referer = (c > 0 ? url.Substring(0, c) : url);
            request.UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)";
            request.Headers[HttpRequestHeader.AcceptEncoding] = "gzip, deflate";

解决方案 »

  1.   

    System.Text.RegularExpressions.Regex.Match(string,@"[A-Za-z]{300}");
    试下这个.
      

  2.   

    我在公司,打不开那个网页。按你的需求,应该是先获取网页源代码,然后过滤掉html标签;当然,也可以同时过滤掉非英文单词,剩下的英文取前300个就行了。取网页源代码:
            private string GetHtmlCode(string url, Encoding encoding)
            {
                System.Net.HttpWebRequest request = (System.Net.HttpWebRequest)System.Net.WebRequest.Create(url);
                request.UserAgent = "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.04506.648; .NET CLR 3.5.21022)";
                request.Method = "Get";
                System.Net.WebResponse response = request.GetResponse();
                System.IO.Stream resStream = response.GetResponseStream();
                System.IO.StreamReader sr = new System.IO.StreamReader(resStream, encoding);
                string html = (sr.ReadToEnd());
                resStream.Close();
                sr.Close();
                return html;
            }过滤html标签,当然,视情况,如果网页中有脚本会稍复杂点
    Regex reg = new Regex(@"<[^>]*>");
    string result = reg.Replace(yourStr, "");
    或者可以利用webBrowser直接取得页面文字再过滤非英文后提取
      

  3.   

    webclient等抓取页面内容
    过滤HTML,Regex.Replace("",@"<[^>]*>",""),或Regex.Replace("",@"[\u4e00-\u9fa5]+",""); 替换中文
      

  4.   

    谢谢您,你的代码我不知道如何用,如果您有时间还是请你帮我看看我的这段代码有什么问题,我试过了,不过程序没有任何反应,没有结果输出,您看看这是什么问题:
    using System;
    using System.Text.RegularExpressions;
    using System.IO;
    using System.Text;using System.Net;
    using System.IO.Compression;
    using System.Web;
    using System.Collections;namespace 控制台测试
    {
        class Program
        {
            static void Main(string[] args)
            {
                byte[] buffer = getBytes("http://www.imqq.com/", null, null);
                string html = Encoding.UTF8.GetString(buffer);
                Regex rx = new Regex(@"[A-Za-z]{300}");
                MatchCollection mc = rx.Matches(html);            foreach (Match m in mc)
                  {
                     string word = m.Groups["word"].Value;
                     int index = m.Index;
                      Console.WriteLine("produce", word, index);
                   }
            
            }        // 读取网络资源,返回字节数组
            private static byte[] getBytes(string url, CookieContainer cookie, byte[] postData)
            {
                int c = url.IndexOf("/", 10);
                byte[] data = null;
                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
                request.AllowAutoRedirect = true;
                if (cookie != null) request.CookieContainer = cookie;
                request.Referer = (c > 0 ? url.Substring(0, c) : url);
                request.UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)";
                request.Headers[HttpRequestHeader.AcceptEncoding] = "gzip, deflate";            if (postData != null)                                           // 需要 Post 数据
                {
                    request.Method = "POST";
                    request.ContentType = "application/x-www-form-urlencoded";
                    request.ContentLength = postData.Length;
                    Stream requestStream = request.GetRequestStream();
                    requestStream.Write(postData, 0, postData.Length);
                    requestStream.Close();
                }            HttpWebResponse response = (HttpWebResponse)request.GetResponse();
                string ce = response.Headers[HttpResponseHeader.ContentEncoding];
                int ContentLength = (int)response.ContentLength;
                Stream s = response.GetResponseStream();
                c = 1024 * 10;
                if (ContentLength < 0)                                          // 不能获取数据的长度
                {
                    data = new byte[c];
                    MemoryStream ms = new MemoryStream();
                    int l = s.Read(data, 0, c);
                    while (l > 0)
                    {
                        ms.Write(data, 0, l);
                        l = s.Read(data, 0, c);
                    }
                    data = ms.ToArray();
                    ms.Close();
                }
                else                                                            // 数据长度已知
                {
                    data = new byte[ContentLength];
                    int pos = 0;
                    while (ContentLength > 0)
                    {
                        int l = s.Read(data, pos, ContentLength);
                        pos += l;
                        ContentLength -= l;
                    }
                }
                s.Close();
                response.Close();            if (ce == "gzip")                                               // 若数据是压缩格式,则要进行解压
                {
                    MemoryStream js = new MemoryStream();                       // 解压后的流   
                    MemoryStream ms = new MemoryStream(data);                   // 用于解压的流   
                    GZipStream g = new GZipStream(ms, CompressionMode.Decompress);
                    byte[] buffer = new byte[c];                                // 读数据缓冲区      
                    int l = g.Read(buffer, 0, c);                               // 一次读 10K      
                    while (l > 0)
                    {
                        js.Write(buffer, 0, l);
                        l = g.Read(buffer, 0, c);
                    }
                    g.Close();
                    ms.Close();
                    data = js.ToArray();
                    js.Close();
                }
                return data;                                                    // 返回字节数组
            }
        }
    }