估计应该是用正则去分析html(不用正则估计会很费劲),功能类似于网站采集软件“火车头”那样的,但是没有那么复杂。我把我的需求说一下吧,大家帮忙看看有没有成熟的框架可用,如果有我就在框架基础上修改算了,从头做感觉太过复杂。1.检查网址是否正常:返回200和60为正常,404为故障,301和302为跳转。左面列出网站地址,右面对应着相应网站的状态。2.取到一些网站的文章标题以及文章内容。
PS:取文章中比较复杂,P表示段落,整合到程序中要处理换行。3.分析百度,比如输入一个关键词,我要得到三部分内容,
1)搜出来的那些普通数据,
2)以及顶端的推广内容
3)右侧的推广内容
PS:取文章中比较复杂,P表示段落,整合到程序中要处理换行。3.分析百度,比如输入一个关键词,我要得到三部分内容,
1)搜出来的那些普通数据,
2)以及顶端的推广内容
3)右侧的推广内容
// Probe a URL, show its HTTP status code, and save the response body to c:\b.html.
HttpWebRequest hwr = (HttpWebRequest)WebRequest.Create("http://www.baidu.com");
hwr.AllowAutoRedirect = false; // do not follow redirects, so 301/302 are reported as-is
hwr.Timeout = 10000;           // request timeout, in milliseconds
hwr.Method = "GET";            // HTTP verb: GET, HEAD, POST, PUT, DELETE, TRACE or OPTIONS
try
{
    using (HttpWebResponse hwrs = (HttpWebResponse)hwr.GetResponse())
    {
        // Status code of a successful (non-error) response, e.g. 200 or 302.
        MessageBox.Show(((int)hwrs.StatusCode).ToString());

        // Character set declared by the response.
        MessageBox.Show(hwrs.CharacterSet);

        // Fall back to UTF-8 when the server declares no charset or an unknown one,
        // instead of letting Encoding.GetEncoding throw.
        Encoding encoding = Encoding.UTF8;
        if (!string.IsNullOrEmpty(hwrs.CharacterSet))
        {
            try
            {
                encoding = Encoding.GetEncoding(hwrs.CharacterSet);
            }
            catch (ArgumentException)
            {
                encoding = Encoding.UTF8;
            }
        }

        // Read and write with the same encoding so the saved file matches the page.
        using (Stream stream = hwrs.GetResponseStream())
        using (StreamReader sr = new StreamReader(stream, encoding))
        using (StreamWriter sw = new StreamWriter("c:\\b.html", false, encoding))
        {
            sw.Write(sr.ReadToEnd());
        }
    }
}
catch (WebException ex)
{
    // GetResponse() throws for 4xx/5xx answers; the real status code (e.g. 404)
    // is only available on the response attached to the exception.
    HttpWebResponse errorResponse = ex.Response as HttpWebResponse;
    if (errorResponse != null)
    {
        using (errorResponse)
        {
            MessageBox.Show(((int)errorResponse.StatusCode).ToString());
        }
    }
    else
    {
        // No HTTP response at all (DNS failure, timeout, connection refused, ...).
        MessageBox.Show(ex.ToString());
    }
}
catch (Exception ex)
{
    MessageBox.Show(ex.ToString());
}
在研究HtmlAgilityPack 写过的朋友分享下源码呀!
string[] values = docStockContext.DocumentNode.SelectSingleNode("./tr[2]").InnerText.Trim().Split('/n'); 这行代码报错了:.Split('/n') 提示“字符文本中的字符太多”,请问这行代码应该如何修改?(原因:'/n' 是两个字符,不是合法的字符常量;换行符的转义要用反斜杠,写成 '\n'。)
<a[^>]*?href=['\""](.*?)['\""][^>]title=""(.*?)""*?> 这个正则表达式符合大部分的url,然后再一层一层地去解析,得到详细页的信息
更正下正则<a[^>]*?href=['\""](.*?)['\""][^>]*?>
Header: 時間, Value: 12:00
Header: 成交, Value: 79.9
Header: 買進, Value: 79.9
Header: 賣出, Value: 80.0
Header: 漲跌, Value: ▽2.0
Header: 張數, Value: 18,847
Header: 昨收, Value: 81.9
Header: 開盤, Value: 81.4
Header: 最高, Value: 81.4
Header: 最低, Value: 79.8
Header: 個股資料, Value:
Completed.
代码如下:
// 下载 Yahoo 奇摩股市资料 (范例为 2317 鸿海)
// Download the Yahoo! Taiwan stock quote page (sample symbol: 2317) as raw bytes.
byte[] pageBytes;
using (WebClient client = new WebClient())
{
    pageBytes = client.DownloadData("http://tw.stock.yahoo.com/q/q?s=2317");
}

// Decode explicitly as Big5. Encoding.Default is the machine's ANSI code page, so on a
// non-Big5 system (e.g. GBK) the Traditional Chinese headers come out as mojibake.
// NOTE(review): assumes the page is still served as Big5 - confirm against the live response.
HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
using (MemoryStream ms = new MemoryStream(pageBytes))
{
    doc.Load(ms, Encoding.GetEncoding("big5"));
}

// Locate the quote table; the XPath result is null when the page layout does not match.
HtmlNode quoteTable = doc.DocumentNode.SelectSingleNode("/html[1]/body[1]/center[1]/table[2]/tr[1]/td[1]/table[1]");
if (quoteTable == null)
{
    richTextBox1.AppendText("Quote table not found; the page layout may have changed.\r\n");
}
else
{
    // Re-parse the table on its own so the relative XPath queries below start at its rows.
    HtmlAgilityPack.HtmlDocument docStockContext = new HtmlAgilityPack.HtmlDocument();
    docStockContext.LoadHtml(quoteTable.InnerHtml);

    // Row 1 holds the column headers, row 2 the matching values (one per line of text).
    HtmlNodeCollection nodeHeaders = docStockContext.DocumentNode.SelectNodes("./tr[1]/th");
    HtmlNode valueRow = docStockContext.DocumentNode.SelectSingleNode("./tr[2]");
    if (nodeHeaders == null || valueRow == null)
    {
        richTextBox1.AppendText("Header or value row not found in the quote table.\r\n");
    }
    else
    {
        string[] values = valueRow.InnerText.Trim().Split('\n');
        int i = 0;
        foreach (HtmlNode nodeHeader in nodeHeaders)
        {
            // A header without a matching value line is printed with an empty value
            // instead of throwing IndexOutOfRangeException.
            string value = i < values.Length ? values[i].Trim() : string.Empty;
            richTextBox1.AppendText(string.Format("Header: {0}, Value: {1}\r\n", nodeHeader.InnerText, value));
            i++;
        }
    }
}

richTextBox1.AppendText("Completed.");
NOTE
目前 HTML Agility Pack 预设编码应是法文编码,所以如果是读取中文 HTML 内容的话,无法直接使用 HtmlDocument.LoadHtml() 方法,而要透过 MemoryStream 使用 HtmlDocument.Load() 方法,才可以指定中文的编码。我尝试改了一下,但都报错,这是怎么回事啊?
不就是获取网页源码吗、//根据url路径获取网页源码
/// <summary>
/// Downloads the page at <paramref name="sUrl"/> and returns its body decoded as UTF-8.
/// </summary>
/// <param name="sUrl">Absolute URL of the page to fetch.</param>
/// <returns>
/// The page source, or an empty string when the request fails for any reason
/// (the exception message is passed to <c>writeLog</c>).
/// </returns>
private string GetWebContent(string sUrl)
{
    try
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(sUrl);
        // Timeout is in milliseconds; 3000000 ms is 50 minutes.
        request.Timeout = 3000000;
        request.Headers.Set("Pragma", "no-cache");

        // Dispose the response, stream and reader so the underlying connection is
        // released even when reading throws.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream streamReceive = response.GetResponseStream())
        using (StreamReader streamReader = new StreamReader(streamReceive, Encoding.UTF8))
        {
            // The body is always decoded as UTF-8; pages in GB2312 or other encodings
            // would need a different Encoding here.
            return streamReader.ReadToEnd();
        }
    }
    catch (Exception exp)
    {
        // Best-effort fetch: log the failure and report "no content" to the caller.
        writeLog(exp.Message, DateTime.Now);
        return "";
    }
}
// Load the Yahoo! Taiwan stock quote page (sample symbol: 2317).
// NOTE(review): hw is presumably an HtmlAgilityPack.HtmlWeb declared outside this snippet - confirm.
HtmlAgilityPack.HtmlDocument doc = hw.Load("http://tw.stock.yahoo.com/q/q?s=2317");

// Locate the quote table; the XPath result is null when the page layout does not match.
HtmlNode quoteTable = doc.DocumentNode.SelectSingleNode("/html[1]/body[1]/center[1]/table[2]/tr[1]/td[1]/table[1]");
if (quoteTable == null)
{
    richTextBox1.AppendText("Quote table not found; the page layout may have changed.\r\n");
}
else
{
    // Re-parse the table on its own so the relative XPath queries below start at its rows.
    HtmlAgilityPack.HtmlDocument docStockContext = new HtmlAgilityPack.HtmlDocument();
    docStockContext.LoadHtml(quoteTable.InnerHtml);

    // Row 1 holds the column headers, row 2 the matching values (one per line of text).
    HtmlNodeCollection nodeHeaders = docStockContext.DocumentNode.SelectNodes("./tr[1]/th");
    HtmlNode valueRow = docStockContext.DocumentNode.SelectSingleNode("./tr[2]");
    if (nodeHeaders == null || valueRow == null)
    {
        richTextBox1.AppendText("Header or value row not found in the quote table.\r\n");
    }
    else
    {
        string[] values = valueRow.InnerText.Trim().Split('\n');
        int i = 0;
        foreach (HtmlNode nodeHeader in nodeHeaders)
        {
            // A header without a matching value line is printed with an empty value
            // instead of throwing IndexOutOfRangeException.
            string value = i < values.Length ? values[i].Trim() : string.Empty;
            richTextBox1.AppendText(string.Format("Header: {0}, Value: {1}\r\n", nodeHeader.InnerText, value));
            i++;
        }
    }
}
// This variant has no WebClient or MemoryStream to clean up. The original final
// "Completed." output was commented out together with ms.Close() and stays disabled.
/// <summary>
/// Gets the HTML source of the specified URL.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="encoding">
/// Encoding used to decode the body; when <c>null</c> the encoding is detected
/// automatically by <c>DecodeData</c>.
/// </param>
/// <returns>
/// The page source when the server answers 200 OK; <see cref="string.Empty"/> when
/// sending the request fails; <c>null</c> for any other status code or error.
/// </returns>
public static string GetWebHtml(string url, Encoding encoding)
{
    try
    {
        HttpWebRequest hwr = (HttpWebRequest)WebRequest.Create(url);
        HttpWebResponse res;
        try
        {
            res = (HttpWebResponse)hwr.GetResponse();
        }
        catch (Exception)
        {
            // Request-level failure (including 4xx/5xx, which GetResponse reports by
            // throwing): callers receive an empty string for this case.
            return string.Empty;
        }

        // Dispose the response so the underlying connection is always released.
        using (res)
        {
            if (res.StatusCode != HttpStatusCode.OK)
            {
                return null;
            }

            using (Stream mystream = res.GetResponseStream())
            {
                // No encoding supplied: let DecodeData detect it.
                if (encoding == null)
                {
                    return DecodeData(mystream, res);
                }

                // Caller supplied an explicit encoding.
                using (StreamReader reader = new StreamReader(mystream, encoding))
                {
                    return reader.ReadToEnd();
                }
            }
        }
    }
    catch (Exception)
    {
        // Deliberate best-effort: any other failure is reported as null.
        return null;
    }
}
/// <summary>
/// Reads the whole response body and decodes it with the charset declared in the
/// Content-Type header, or failing that the first <c>charset=</c> declaration found
/// in the body itself. Falls back to gb2312 when no usable charset is found.
/// </summary>
/// <param name="responseStream">Body stream of the response; it is read to the end and closed.</param>
/// <param name="response">Response whose Content-Type header is inspected.</param>
/// <returns>The decoded body text.</returns>
private static string DecodeData(Stream responseStream, HttpWebResponse response)
{
    // 1. Prefer the charset announced in the HTTP header.
    string name = ExtractCharsetName(response.Headers["content-type"]);

    // 2. Buffer the raw bytes so they can be sniffed and then decoded.
    byte[] body;
    using (MemoryStream buffer = new MemoryStream())
    {
        byte[] chunk = new byte[0x400];
        int read;
        while ((read = responseStream.Read(chunk, 0, chunk.Length)) > 0)
        {
            buffer.Write(chunk, 0, read);
        }
        body = buffer.ToArray();
    }
    responseStream.Close();

    // 3. No header charset: look for one in the markup. ASCII is enough for this scan
    //    because charset names are plain ASCII.
    if (name == null)
    {
        name = ExtractCharsetName(Encoding.ASCII.GetString(body));
    }

    // 4. Resolve the encoding; unknown or missing names fall back to gb2312.
    Encoding encoding;
    try
    {
        if (name == null || string.Equals(name, "GBK", StringComparison.OrdinalIgnoreCase))
        {
            name = "gb2312";
        }
        encoding = Encoding.GetEncoding(name);
    }
    catch (ArgumentException)
    {
        encoding = Encoding.GetEncoding("gb2312");
    }

    // 5. Decode through StreamReader so a byte-order mark, if present, is still honoured.
    using (StreamReader reader = new StreamReader(new MemoryStream(body), encoding))
    {
        return reader.ReadToEnd();
    }
}

/// <summary>
/// Extracts the value of the first <c>charset=</c> declaration in <paramref name="text"/>.
/// Matching is case-insensitive and accepts bare, single-quoted and double-quoted values,
/// so both <c>text/html; charset=utf-8</c> and <c>&lt;meta charset="utf-8"&gt;</c> work.
/// </summary>
/// <param name="text">A Content-Type header value or HTML markup; may be null.</param>
/// <returns>The charset name, or <c>null</c> when none is declared.</returns>
private static string ExtractCharsetName(string text)
{
    if (string.IsNullOrEmpty(text))
    {
        return null;
    }

    System.Text.RegularExpressions.Match match = System.Text.RegularExpressions.Regex.Match(
        text,
        @"charset\s*=\s*[""']?\s*([A-Za-z0-9_\-.:]+)",
        System.Text.RegularExpressions.RegexOptions.IgnoreCase);
    return match.Success ? match.Groups[1].Value : null;
}
http://blog.csdn.net/w3031213101/article/details/5801888