using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.IO;
using System.IO.Compression;
using System.Net;
using System.Windows.Forms;
using System.Web;
namespace SearchEngine
{
class Crawl
{
string patternCode = @"<meta[\s\S]+?charset=[\s]*[""]?(.*?)""[\s]*[\S]?>";//用于分析网站编码
string patternUrl = "<a\\s+href\\s*=\\s*\"?(.*?)[\"|>]";
//content="text/html; charset=gb2312"
string filePath = @"E:\学习\各种项目\智能搜索\htmldownload\";//文件保存路径
string dlErrorName = @"E:\学习\各种项目\智能搜索\htmldownload\downloadErrorLog.txt";
string getCodeErrorName = @"E:\学习\各种项目\智能搜索\htmldownload\getCodeErrorLog.txt";
/// <summary>
/// 下载指定url的html文档,并保存在本地
/// </summary>
/// <param name="url"></param>
/// <returns>返回html文本</returns>
public string download(string url)
{
string encoding="ASSIC";
string html=string.Empty;
try
{
HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
request.Timeout = 5000;
request.Method = "get";
request.ContentType = "text/html";
HttpWebResponse response = (HttpWebResponse)request.GetResponse();
Stream resStream = response.GetResponseStream();
StreamReader strRead;
encoding = getEncoding(url);
//判断网页是否经过gzip压缩,如果是则解压缩
if (response.ContentEncoding != null && response.ContentEncoding.Equals("gzip", StringComparison.InvariantCultureIgnoreCase))
{
strRead = new StreamReader(new GZipStream(resStream, CompressionMode.Decompress), Encoding.GetEncoding(encoding), true);
}
else
strRead = new StreamReader(resStream, Encoding.GetEncoding(encoding));
html = strRead.ReadToEnd();
resStream.Close();
//MessageBox.Show("1");
saveHtml(html, encoding);
analyseLink(html);
}
catch (Exception ex)
{
/*一旦产生异常就将异常信息写入错误日志*/
try
{
FileStream fs = new FileStream(dlErrorName, FileMode.Append, FileAccess.Write);
StreamWriter sw = new StreamWriter(fs);
sw.WriteLine(ex.Message + url + "||");
sw.Close();
fs.Close();
}
catch (Exception e)
{
return "";
}
}
return html;
} /// <summary>
/// 提取网页中的链接
/// </summary>
/// <param name="html">传入的字符串</param>
private void analyseLink(string html)
{
Regex urlReg = new Regex(patternUrl, RegexOptions.IgnoreCase | RegexOptions.Multiline);
MatchCollection matches = urlReg.Matches(html);
foreach (Match match in matches)
{
string url = match.Groups[1].Value;
download(url);
}
} static int name = 1;//将保存的文件命名
/// <summary>
/// 讲指定的网页信息保存至本地
/// </summary>
/// <param name="html">需要保存的网页信息字符串</param>
/// <param name="url">网页的地址</param>
public void saveHtml(string html,string code)
{
string filename = filePath + name.ToString() + ".htm";
StreamWriter sw = new StreamWriter(filename, false, Encoding.GetEncoding(code));
sw.Write(html);
sw.Close();
name++;
} /// <summary>
/// 根据url获取网页编码
/// </summary>
/// <param name="url">网页的域名地址</param>
/// <returns>返回string类型的编码</returns>
public string getEncoding(string url)
{
HttpWebRequest wr = (HttpWebRequest)WebRequest.Create(url);
HttpWebResponse wrs = (HttpWebResponse)wr.GetResponse();
string code = "utf-8";
try
{
Stream stream = wrs.GetResponseStream();
StreamReader sr = null;
if (wrs.ContentEncoding != null&&wrs.ContentEncoding.Equals("gzip",StringComparison.InvariantCultureIgnoreCase))
{
sr=new StreamReader(new GZipStream(stream, CompressionMode.Decompress));
}
else
sr=new StreamReader(stream);
string line;
Regex regex = new Regex(patternCode, RegexOptions.IgnoreCase|RegexOptions.Multiline);
Match m;
while ((line = sr.ReadLine())!=null)
{
if (regex.IsMatch(line))
{
m = regex.Match(line);
code = m.Groups[1].Value;
break;
}
}
}
catch (Exception ex)
{
try
{
FileStream fs = new FileStream(getCodeErrorName, FileMode.Append, FileAccess.Write);
StreamWriter sw = new StreamWriter(fs);
sw.WriteLine(ex.Message + " " + url + "||");
sw.Close();
fs.Close();
}
catch
{
return "";
}
}
return code;
}
}
}我用的根地址是http://www.sohu.com/ 这个页面是可以抓取并且保存在本地的,然后我调试了一下 发现第二次调用函数getEncoding(string url)的时候 程序会暂停在HttpWebResponse wrs = (HttpWebResponse)wr.GetResponse();不动 但是并没有报出异常,求解啊
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.IO;
using System.IO.Compression;
using System.Net;
using System.Windows.Forms;
using System.Web;
namespace SearchEngine
{
class Crawl
{
string patternCode = @"<meta[\s\S]+?charset=[\s]*[""]?(.*?)""[\s]*[\S]?>";//用于分析网站编码
string patternUrl = "<a\\s+href\\s*=\\s*\"?(.*?)[\"|>]";
//content="text/html; charset=gb2312"
string filePath = @"E:\学习\各种项目\智能搜索\htmldownload\";//文件保存路径
string dlErrorName = @"E:\学习\各种项目\智能搜索\htmldownload\downloadErrorLog.txt";
string getCodeErrorName = @"E:\学习\各种项目\智能搜索\htmldownload\getCodeErrorLog.txt";
/// <summary>
/// 下载指定url的html文档,并保存在本地
/// </summary>
/// <param name="url"></param>
/// <returns>返回html文本</returns>
public string download(string url)
{
string encoding="ASSIC";
string html=string.Empty;
try
{
HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
request.Timeout = 5000;
request.Method = "get";
request.ContentType = "text/html";
HttpWebResponse response = (HttpWebResponse)request.GetResponse();
Stream resStream = response.GetResponseStream();
StreamReader strRead;
encoding = getEncoding(url);
//判断网页是否经过gzip压缩,如果是则解压缩
if (response.ContentEncoding != null && response.ContentEncoding.Equals("gzip", StringComparison.InvariantCultureIgnoreCase))
{
strRead = new StreamReader(new GZipStream(resStream, CompressionMode.Decompress), Encoding.GetEncoding(encoding), true);
}
else
strRead = new StreamReader(resStream, Encoding.GetEncoding(encoding));
html = strRead.ReadToEnd();
resStream.Close();
//MessageBox.Show("1");
saveHtml(html, encoding);
analyseLink(html);
}
catch (Exception ex)
{
/*一旦产生异常就将异常信息写入错误日志*/
try
{
FileStream fs = new FileStream(dlErrorName, FileMode.Append, FileAccess.Write);
StreamWriter sw = new StreamWriter(fs);
sw.WriteLine(ex.Message + url + "||");
sw.Close();
fs.Close();
}
catch (Exception e)
{
return "";
}
}
return html;
} /// <summary>
/// 提取网页中的链接
/// </summary>
/// <param name="html">传入的字符串</param>
private void analyseLink(string html)
{
Regex urlReg = new Regex(patternUrl, RegexOptions.IgnoreCase | RegexOptions.Multiline);
MatchCollection matches = urlReg.Matches(html);
foreach (Match match in matches)
{
string url = match.Groups[1].Value;
download(url);
}
} static int name = 1;//将保存的文件命名
/// <summary>
/// 讲指定的网页信息保存至本地
/// </summary>
/// <param name="html">需要保存的网页信息字符串</param>
/// <param name="url">网页的地址</param>
public void saveHtml(string html,string code)
{
string filename = filePath + name.ToString() + ".htm";
StreamWriter sw = new StreamWriter(filename, false, Encoding.GetEncoding(code));
sw.Write(html);
sw.Close();
name++;
} /// <summary>
/// 根据url获取网页编码
/// </summary>
/// <param name="url">网页的域名地址</param>
/// <returns>返回string类型的编码</returns>
public string getEncoding(string url)
{
HttpWebRequest wr = (HttpWebRequest)WebRequest.Create(url);
HttpWebResponse wrs = (HttpWebResponse)wr.GetResponse();
string code = "utf-8";
try
{
Stream stream = wrs.GetResponseStream();
StreamReader sr = null;
if (wrs.ContentEncoding != null&&wrs.ContentEncoding.Equals("gzip",StringComparison.InvariantCultureIgnoreCase))
{
sr=new StreamReader(new GZipStream(stream, CompressionMode.Decompress));
}
else
sr=new StreamReader(stream);
string line;
Regex regex = new Regex(patternCode, RegexOptions.IgnoreCase|RegexOptions.Multiline);
Match m;
while ((line = sr.ReadLine())!=null)
{
if (regex.IsMatch(line))
{
m = regex.Match(line);
code = m.Groups[1].Value;
break;
}
}
}
catch (Exception ex)
{
try
{
FileStream fs = new FileStream(getCodeErrorName, FileMode.Append, FileAccess.Write);
StreamWriter sw = new StreamWriter(fs);
sw.WriteLine(ex.Message + " " + url + "||");
sw.Close();
fs.Close();
}
catch
{
return "";
}
}
return code;
}
}
}我用的根地址是http://www.sohu.com/ 这个页面是可以抓取并且保存在本地的,然后我调试了一下 发现第二次调用函数getEncoding(string url)的时候 程序会暂停在HttpWebResponse wrs = (HttpWebResponse)wr.GetResponse();不动 但是并没有报出异常,求解啊
解决方案 »
免费领取超大流量手机卡,每月29元包185G流量+100分钟通话, 中国电信官方发货