我想抓取网页,并提取其中的英文词语。我找到了相关程序,但正则匹配得不好,大家能帮我看看吗?
{
class Program
{
// Entry point: downloads the page, decodes it as UTF-8 and prints every
// English word (run of ASCII letters) found in the text.
static void Main(string[] args)
{
    byte[] buffer = getBytes("http://video.shishicai.cn/haoma/cqssc/list/120.aspx", null, null);
    string html = Encoding.UTF8.GetString(buffer);

    // The previous pattern contained unescaped double quotes inside a verbatim
    // string, which ends the literal early and does not compile. One or more
    // ASCII letters is what "an English word" means here.
    MatchCollection mc = Regex.Matches(html, @"[A-Za-z]+");

    // MatchCollection does not override ToString(), so writing the collection
    // itself only prints its type name; print each match instead.
    foreach (Match m in mc)
    {
        Console.WriteLine(m.Value);
    }

    Console.WriteLine("Close!");
    Console.ReadKey();
}
// Reads a network resource and returns it as a byte array
private static byte[] getBytes(string url, CookieContainer cookie, byte[] postData)
{
int c = url.IndexOf("/", 10);
byte[] data = null;
HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
request.AllowAutoRedirect = true;
if (cookie != null) request.CookieContainer = cookie;
request.Referer = (c > 0 ? url.Substring(0, c) : url);
request.UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)";
request.Headers[HttpRequestHeader.AcceptEncoding] = "gzip, deflate";
{
class Program
{
// Entry point: downloads the page, decodes it as UTF-8 and prints every
// English word (run of ASCII letters) found in the text.
static void Main(string[] args)
{
    byte[] buffer = getBytes("http://video.shishicai.cn/haoma/cqssc/list/120.aspx", null, null);
    string html = Encoding.UTF8.GetString(buffer);

    // The previous pattern contained unescaped double quotes inside a verbatim
    // string, which ends the literal early and does not compile. One or more
    // ASCII letters is what "an English word" means here.
    MatchCollection mc = Regex.Matches(html, @"[A-Za-z]+");

    // MatchCollection does not override ToString(), so writing the collection
    // itself only prints its type name; print each match instead.
    foreach (Match m in mc)
    {
        Console.WriteLine(m.Value);
    }

    Console.WriteLine("Close!");
    Console.ReadKey();
}
// Reads a network resource and returns it as a byte array
private static byte[] getBytes(string url, CookieContainer cookie, byte[] postData)
{
int c = url.IndexOf("/", 10);
byte[] data = null;
HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
request.AllowAutoRedirect = true;
if (cookie != null) request.CookieContainer = cookie;
request.Referer = (c > 0 ? url.Substring(0, c) : url);
request.UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)";
request.Headers[HttpRequestHeader.AcceptEncoding] = "gzip, deflate";
试下这个.
// Downloads the resource at `url` with an HTTP GET request and returns the
// response body decoded with `encoding`.
//   url      - address passed to WebRequest.Create; must be an http/https URL,
//              otherwise the cast to HttpWebRequest fails.
//   encoding - text encoding used by the StreamReader to decode the body.
// Returns the whole response body as a string.
private string GetHtmlCode(string url, Encoding encoding)
{
    System.Net.HttpWebRequest request = (System.Net.HttpWebRequest)System.Net.WebRequest.Create(url);
    request.UserAgent = "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.04506.648; .NET CLR 3.5.21022)";
    // HTTP method names are case-sensitive on the wire; use the canonical spelling.
    request.Method = "GET";

    // The using blocks release the response, its stream and the reader even when
    // reading throws; previously they were only closed on the success path and
    // the response object itself was never closed.
    using (System.Net.WebResponse response = request.GetResponse())
    using (System.IO.Stream resStream = response.GetResponseStream())
    using (System.IO.StreamReader sr = new System.IO.StreamReader(resStream, encoding))
    {
        return sr.ReadToEnd();
    }
}
// Next step: strip the HTML tags. Depending on the page this can be a bit more
// involved, e.g. when the page contains scripts.
Regex reg = new Regex(@"<[^>]*>");
string result = reg.Replace(yourStr, "");
或者可以利用 WebBrowser 控件直接取得页面文字,过滤掉非英文内容后再提取。
过滤 HTML 标签可以用 Regex.Replace(html, @"<[^>]*>", "");去掉中文可以用 Regex.Replace(html, @"[\u4e00-\u9fa5]+", "")(第一个参数是待处理的字符串)。
using System;
using System.Text.RegularExpressions;
using System.IO;
using System.Text;using System.Net;
using System.IO.Compression;
using System.Web;
using System.Collections;namespace 控制台测试
{
// Console sample: downloads a web page and prints the English words it contains.
class Program
{
    // Size of the scratch buffer used while draining a stream (10 KB per read).
    private const int BufferSize = 1024 * 10;

    // Entry point: downloads the page, decodes it as UTF-8 and prints every
    // English word (run of ASCII letters) together with its position in the text.
    static void Main(string[] args)
    {
        byte[] buffer = getBytes("http://www.imqq.com/", null, null);
        string html = Encoding.UTF8.GetString(buffer);

        // "{300}" required exactly 300 consecutive letters, so ordinary words never
        // matched; "+" accepts a word of any length.
        Regex rx = new Regex(@"[A-Za-z]+");

        foreach (Match m in rx.Matches(html))
        {
            // The pattern has no group named "word", so the matched text is read
            // from m.Value. The format string needs placeholders, otherwise the
            // extra arguments are silently ignored.
            Console.WriteLine("{0} (index {1})", m.Value, m.Index);
        }
    }

    // Reads a network resource and returns its (decompressed) body as a byte array.
    //   url      - http/https address to request.
    //   cookie   - optional cookie container attached to the request; may be null.
    //   postData - optional form-encoded body; when not null the request is sent
    //              as a POST, otherwise the request method is left at its default.
    private static byte[] getBytes(string url, CookieContainer cookie, byte[] postData)
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
        request.AllowAutoRedirect = true;
        if (cookie != null)
        {
            request.CookieContainer = cookie;
        }

        // Referer is the scheme + host part of the URL. Taking it from the parsed
        // Uri avoids the fixed offset of 10, which threw for short URLs and cut
        // at the wrong slash for short host names.
        request.Referer = request.RequestUri.GetLeftPart(UriPartial.Authority);
        request.UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)";

        // Let the framework advertise and decode both gzip and deflate. The manual
        // code advertised deflate but only decoded gzip, and compared the
        // Content-Encoding header case-sensitively.
        request.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;

        if (postData != null) // data has to be posted
        {
            request.Method = "POST";
            request.ContentType = "application/x-www-form-urlencoded";
            request.ContentLength = postData.Length;
            using (Stream requestStream = request.GetRequestStream())
            {
                requestStream.Write(postData, 0, postData.Length);
            }
        }

        // using blocks close the response and its stream even if reading fails.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream s = response.GetResponseStream())
        {
            return ReadToEnd(s);
        }
    }

    // Drains `source` until Read reports end of stream and returns all bytes read.
    // Reading to EOF (instead of trusting Content-Length) cannot spin forever when
    // the connection ends early and does not depend on the length fitting an int.
    private static byte[] ReadToEnd(Stream source)
    {
        using (MemoryStream ms = new MemoryStream())
        {
            byte[] chunk = new byte[BufferSize];
            int read;
            while ((read = source.Read(chunk, 0, chunk.Length)) > 0)
            {
                ms.Write(chunk, 0, read);
            }
            return ms.ToArray();
        }
    }
}
}