看了大家的回复,在别人的基础上我编写了一段网页提取的C#代码(我想提取英文网页的文本内容),但不知道为什么没有任何输出,大家帮我看看问题出在哪?请尽量给出具体代码,我在线等,谢谢了
using System;
using System.Text.RegularExpressions;
using System.IO;
using System.Text;using System.Net;
using System.IO.Compression;
using System.Web;
using System.Collections;namespace 控制台测试
{
class Program
{
static void Main(string[] args)
{
byte[] buffer = getBytes("http://www.imqq.com/", null, null);
string html = Encoding.UTF8.GetString(buffer);
Regex rx = new Regex(@"[A-Za-z]{300}");
MatchCollection mc = rx.Matches(html); foreach (Match m in mc)
{
string word = m.Groups["word"].Value;
int index = m.Index;
Console.WriteLine("produce", word, index);
}
} // 读取网络资源,返回字节数组
private static byte[] getBytes(string url, CookieContainer cookie, byte[] postData)
{
int c = url.IndexOf("/", 10);
byte[] data = null;
HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
request.AllowAutoRedirect = true;
if (cookie != null) request.CookieContainer = cookie;
request.Referer = (c > 0 ? url.Substring(0, c) : url);
request.UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)";
request.Headers[HttpRequestHeader.AcceptEncoding] = "gzip, deflate"; if (postData != null) // 需要 Post 数据
{
request.Method = "POST";
request.ContentType = "application/x-www-form-urlencoded";
request.ContentLength = postData.Length;
Stream requestStream = request.GetRequestStream();
requestStream.Write(postData, 0, postData.Length);
requestStream.Close();
} HttpWebResponse response = (HttpWebResponse)request.GetResponse();
string ce = response.Headers[HttpResponseHeader.ContentEncoding];
int ContentLength = (int)response.ContentLength;
Stream s = response.GetResponseStream();
c = 1024 * 10;
if (ContentLength < 0) // 不能获取数据的长度
{
data = new byte[c];
MemoryStream ms = new MemoryStream();
int l = s.Read(data, 0, c);
while (l > 0)
{
ms.Write(data, 0, l);
l = s.Read(data, 0, c);
}
data = ms.ToArray();
ms.Close();
}
else // 数据长度已知
{
data = new byte[ContentLength];
int pos = 0;
while (ContentLength > 0)
{
int l = s.Read(data, pos, ContentLength);
pos += l;
ContentLength -= l;
}
}
s.Close();
response.Close(); if (ce == "gzip") // 若数据是压缩格式,则要进行解压
{
MemoryStream js = new MemoryStream(); // 解压后的流
MemoryStream ms = new MemoryStream(data); // 用于解压的流
GZipStream g = new GZipStream(ms, CompressionMode.Decompress);
byte[] buffer = new byte[c]; // 读数据缓冲区
int l = g.Read(buffer, 0, c); // 一次读 10K
while (l > 0)
{
js.Write(buffer, 0, l);
l = g.Read(buffer, 0, c);
}
g.Close();
ms.Close();
data = js.ToArray();
js.Close();
}
return data; // 返回字节数组
}
}
}
using System;
using System.Text.RegularExpressions;
using System.IO;
using System.Text;using System.Net;
using System.IO.Compression;
using System.Web;
using System.Collections;namespace 控制台测试
{
class Program
{
static void Main(string[] args)
{
byte[] buffer = getBytes("http://www.imqq.com/", null, null);
string html = Encoding.UTF8.GetString(buffer);
Regex rx = new Regex(@"[A-Za-z]{300}");
MatchCollection mc = rx.Matches(html); foreach (Match m in mc)
{
string word = m.Groups["word"].Value;
int index = m.Index;
Console.WriteLine("produce", word, index);
}
} // 读取网络资源,返回字节数组
private static byte[] getBytes(string url, CookieContainer cookie, byte[] postData)
{
int c = url.IndexOf("/", 10);
byte[] data = null;
HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
request.AllowAutoRedirect = true;
if (cookie != null) request.CookieContainer = cookie;
request.Referer = (c > 0 ? url.Substring(0, c) : url);
request.UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)";
request.Headers[HttpRequestHeader.AcceptEncoding] = "gzip, deflate"; if (postData != null) // 需要 Post 数据
{
request.Method = "POST";
request.ContentType = "application/x-www-form-urlencoded";
request.ContentLength = postData.Length;
Stream requestStream = request.GetRequestStream();
requestStream.Write(postData, 0, postData.Length);
requestStream.Close();
} HttpWebResponse response = (HttpWebResponse)request.GetResponse();
string ce = response.Headers[HttpResponseHeader.ContentEncoding];
int ContentLength = (int)response.ContentLength;
Stream s = response.GetResponseStream();
c = 1024 * 10;
if (ContentLength < 0) // 不能获取数据的长度
{
data = new byte[c];
MemoryStream ms = new MemoryStream();
int l = s.Read(data, 0, c);
while (l > 0)
{
ms.Write(data, 0, l);
l = s.Read(data, 0, c);
}
data = ms.ToArray();
ms.Close();
}
else // 数据长度已知
{
data = new byte[ContentLength];
int pos = 0;
while (ContentLength > 0)
{
int l = s.Read(data, pos, ContentLength);
pos += l;
ContentLength -= l;
}
}
s.Close();
response.Close(); if (ce == "gzip") // 若数据是压缩格式,则要进行解压
{
MemoryStream js = new MemoryStream(); // 解压后的流
MemoryStream ms = new MemoryStream(data); // 用于解压的流
GZipStream g = new GZipStream(ms, CompressionMode.Decompress);
byte[] buffer = new byte[c]; // 读数据缓冲区
int l = g.Read(buffer, 0, c); // 一次读 10K
while (l > 0)
{
js.Write(buffer, 0, l);
l = g.Read(buffer, 0, c);
}
g.Close();
ms.Close();
data = js.ToArray();
js.Close();
}
return data; // 返回字节数组
}
}
}
我只想提取英文单词和相应的数量,您能帮我改改正则法则吗?太谢谢了