我是这样写的
// Downloads the raw bytes of the page and shows the decoded HTML source in richTextBox1.
using (WebClient MyWebClient = new WebClient())
{
    // Network credentials used to authenticate the request to the Internet resource.
    MyWebClient.Credentials = CredentialCache.DefaultCredentials;

    // Download the data from the given site.
    Byte[] pageData = MyWebClient.DownloadData("http://123.sogou.com/");

    // Use this line when the page is served as GB2312.
    string pageHtml = Encoding.Default.GetString(pageData);

    // Show the retrieved content.
    richTextBox1.Text = pageHtml;
}
// Question: written this way, richTextBox1 only contains the site's source code, not the
// rendered page content. What code still has to be added so that it displays the page content?
// Downloads the raw bytes of the page and shows the decoded HTML source in richTextBox1.
using (WebClient MyWebClient = new WebClient())
{
    // Network credentials used to authenticate the request to the Internet resource.
    MyWebClient.Credentials = CredentialCache.DefaultCredentials;

    // Download the data from the given site.
    Byte[] pageData = MyWebClient.DownloadData("http://123.sogou.com/");

    // Use this line when the page is served as GB2312.
    string pageHtml = Encoding.Default.GetString(pageData);

    // Show the retrieved content.
    richTextBox1.Text = pageHtml;
}
// Question: written this way, richTextBox1 only contains the site's source code, not the
// rendered page content. What code still has to be added so that it displays the page content?
解决方案 »
- 如何在App_config写关于硬件配置的信息
- 请教怎么让自定义控件,属性更改里,在窗体的Load事件中自动加上一行代码
- richtextbox内容如何格式保存到文本文件中
- 水晶报表中无法将请求提交给后台处理?
- 怪事,为什么我的this.components 是nothing??
- c# winform 新建窗口,如何使新建的窗口不获得焦点?
- ComboBox的简单问题,请帮忙看看,谢谢!!
- 怎么实现类似vs的工具栏一样让工具栏一直靠左排列
- 一个水晶报表(Crystal Reports)问题,高手给看看如何解决.
- 看一下这段代码,错在哪里?
- 用什么控件才能写出像系统那样的时间?
- 怎么获得局域网另外一台计算机的文件目录?
/// <summary>
/// Downloads a web page and returns its HTML source as a decoded string (method four).
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="charSet">
/// Name of the encoding to decode the page with (for example "gb2312" or "utf-8").
/// When null or empty, the encoding is taken from the page's meta charset declaration,
/// falling back to UTF-8 when none is declared or the declared name is not supported.
/// </param>
/// <returns>The decoded HTML source, or null when the download or the decoding fails.</returns>
public static string GetHtml(string url, string charSet)
{
    try
    {
        byte[] myDataBuffer;
        using (WebClient myWebClient = new WebClient())
        {
            myDataBuffer = myWebClient.DownloadData(url);
        }

        // An explicitly requested encoding always wins. An unknown encoding name throws
        // and is reported as a failure (null) by the catch block below.
        if (!string.IsNullOrEmpty(charSet))
        {
            return System.Text.Encoding.GetEncoding(charSet).GetString(myDataBuffer);
        }

        // Charset names are plain ASCII, so an ASCII view of the bytes is enough to find
        // the declaration without knowing the real encoding yet.
        string asciiView = System.Text.Encoding.ASCII.GetString(myDataBuffer);
        Match charSetMatch = Regex.Match(
            asciiView,
            "<meta[^>]*charset\\s*=\\s*[\"']?\\s*([\\w.:-]+)",
            RegexOptions.IgnoreCase);

        System.Text.Encoding pageEncoding = System.Text.Encoding.UTF8;
        if (charSetMatch.Success)
        {
            try
            {
                pageEncoding = System.Text.Encoding.GetEncoding(charSetMatch.Groups[1].Value);
            }
            catch (ArgumentException)
            {
                // The page declares a charset name this runtime does not know: keep UTF-8.
            }
            catch (NotSupportedException)
            {
                // The declared code page is not available on this platform: keep UTF-8.
            }
        }

        return pageEncoding.GetString(myDataBuffer);
    }
    catch (Exception)
    {
        // Callers rely on null to signal "page could not be retrieved".
        return null;
    }
}
我说的是在我的那个基础上加,我记得只要加一两句代码,用不了你那么麻烦,拜托写一下
RichTextBoxStreamType.RichText
RichTextBoxStreamType.UnicodePlainText richTextBox1.LoadFile("d:\\source.rtf", RichTextBoxStreamType.RichText);http://msdn.microsoft.com/zh-cn/library/system.windows.forms.richtextboxstreamtype(v=vs.80).aspx
/// <summary>
/// Requests a web page over HTTP and returns its source decoded as UTF-8.
/// </summary>
/// <param name="sUrl">Address of the page to request.</param>
/// <returns>
/// The page source, or an empty string when the request fails (the exception message is
/// shown in a message box in that case).
/// </returns>
private string GetWebContent(string sUrl)
{
    string strResult = "";
    try
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(sUrl);
        // NOTE(review): Timeout is expressed in milliseconds, so this is 50 minutes.
        // Kept as in the original; confirm whether 30000 was intended.
        request.Timeout = 3000000;
        request.Headers.Set("Pragma", "no-cache");

        // Dispose the response, its stream and the reader even when reading throws,
        // otherwise the underlying connection stays open.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream streamReceive = response.GetResponseStream())
        using (StreamReader streamReader = new StreamReader(streamReceive, Encoding.UTF8))
        {
            strResult = streamReader.ReadToEnd();
        }
    }
    catch (Exception exp)
    {
        MessageBox.Show(exp.Message);
    }
    return strResult;
}
//截取网页内容 主流就是 substring 和正则表达式
RichTextBox.LoadFile 方法 (Stream, RichTextBoxStreamType)
直接复制过去调用就好了string str=GetWebContent("http://www.baidu.com");//网页源码
using(System.IO.StringReader sr = new System.IO.StringReader(pageData))
richTextBox1.LoadFile(sr, RichTextBoxStreamType.RichText);
如果返回的字符串不符合RTF规范,将抛出异常。用HttpWebResponse.GetResponseStream会更方便:
http://msdn.microsoft.com/en-us/library/system.net.httpwebresponse.getresponsestream.aspx
1.根据正则表达式匹配出你想要的内容
2.利用Winista.Htmlparser.Net 解析Html。这是.NET平台下解析Html的开源代码,网上有源码下载,百度一下就能搜到,这里就不提供了。并且有英文的帮助文档。找不到的留下邮箱。
个人认为这是.net平台下解析html不错的解决方案,基本上能够满足我们对html的解析工作
using System.Collections.Generic;
using System.Text;
using System.Net;
using System.IO;namespace 网页采集器
{
/// <summary>
/// Download and file helpers for the page collector. Every file and directory handled
/// here lives below <see cref="RootDirectory"/>.
/// </summary>
public class Helper
{
    /// <summary>Root folder that all relative paths are appended to.</summary>
    private const string RootDirectory = "D:\\博客圆\\";

    /// <summary>
    /// Requests a URL over HTTP and returns the response body as text.
    /// </summary>
    /// <param name="strUrl">Address to request.</param>
    /// <param name="enterType">When 1, a "\r\n" is appended after every line read; otherwise lines are concatenated directly.</param>
    /// <param name="EnCodeType">Encoding used to decode the response stream.</param>
    /// <returns>The response text, or "请求错误:" followed by the exception message when the request fails.</returns>
    public static string GetRequestString(string strUrl, int enterType, Encoding EnCodeType)
    {
        string strResult;
        try
        {
            HttpWebRequest myReq = (HttpWebRequest)WebRequest.Create(strUrl);
            myReq.Timeout = 30000;

            using (HttpWebResponse httpResp = (HttpWebResponse)myReq.GetResponse())
            using (Stream myStream = httpResp.GetResponseStream())
            using (StreamReader sr = new StreamReader(myStream, EnCodeType))
            {
                StringBuilder strBuilder = new StringBuilder();
                string line;
                // ReadLine() returning null is the only reliable end-of-stream signal;
                // Peek() may report -1 on a network stream while data is still arriving.
                while ((line = sr.ReadLine()) != null)
                {
                    strBuilder.Append(line);
                    if (enterType == 1)
                    {
                        strBuilder.Append("\r\n");
                    }
                }
                strResult = strBuilder.ToString();
            }
        }
        catch (Exception err)
        {
            strResult = "请求错误:" + err.Message;
        }
        return strResult;
    }

    /// <summary>
    /// Downloads a URL with an HTTP GET and stores the body in a file below the root folder.
    /// Failures do not throw; they are reported on the standard error stream.
    /// </summary>
    /// <param name="url">Absolute address of the resource to download.</param>
    /// <param name="strPath">Target file path, relative to the root folder.</param>
    public static void DownFile(string url, string strPath)
    {
        try
        {
            Uri uri = new Uri(url);
            HttpWebRequest mRequest = (HttpWebRequest)WebRequest.Create(uri);
            mRequest.Method = "GET";
            mRequest.ContentType = "application/x-www-form-urlencoded";

            using (HttpWebResponse wr = (HttpWebResponse)mRequest.GetResponse())
            using (Stream sIn = wr.GetResponseStream())
            using (FileStream fs = new FileStream(PrepareTarget(strPath), FileMode.Create, FileAccess.Write))
            {
                byte[] buffer = new byte[1024];
                int bytesRead;
                // Read until the stream ends instead of trusting ContentLength (which is -1
                // for chunked responses), and write only the bytes actually received.
                while ((bytesRead = sIn.Read(buffer, 0, buffer.Length)) > 0)
                {
                    fs.Write(buffer, 0, bytesRead);
                }
            }
        }
        catch (Exception ex)
        {
            // Best effort: one failed download must not stop the caller, but it should be visible.
            Console.Error.WriteLine("下载失败:" + url + " " + ex.Message);
        }
    }

    /// <summary>
    /// Writes text to a file below the root folder, overwriting the file if it exists
    /// and creating it (and its folder) otherwise.
    /// </summary>
    /// <param name="fileName">Target file path, relative to the root folder.</param>
    /// <param name="content">Text to write.</param>
    public static void WriteFile(string fileName, string content)
    {
        using (StreamWriter sw = new StreamWriter(PrepareTarget(fileName), false))
        {
            sw.Write(content);
        }
    }

    /// <summary>
    /// Makes sure a folder below the root folder exists, creating it when it does not.
    /// </summary>
    /// <param name="path">Folder path, relative to the root folder.</param>
    public static void CheckDirectory(string path)
    {
        // CreateDirectory does nothing when the directory already exists.
        Directory.CreateDirectory(RootDirectory + path);
    }

    /// <summary>
    /// Builds the full path for a path relative to the root folder and creates the
    /// containing folder so the file can be opened for writing.
    /// </summary>
    private static string PrepareTarget(string relativePath)
    {
        string fullPath = RootDirectory + relativePath;
        string directory = Path.GetDirectoryName(fullPath);
        if (!string.IsNullOrEmpty(directory))
        {
            Directory.CreateDirectory(directory);
        }
        return fullPath;
    }
}
}using System;
using System.Collections.Generic;
using System.Text;
using System.Text.RegularExpressions;namespace 网页采集器
{
/// <summary>
/// Console entry point of the page collector: downloads the home page of a site together
/// with the stylesheets, scripts and images it references, and stores them through
/// <c>Helper</c> under the same paths they have on the site.
/// </summary>
class Program
{
    /// <summary>
    /// Fetches the home page, mirrors its css/js files and images, strips the
    /// "http://….com" host prefixes from the page and writes it as index.html.
    /// </summary>
    /// <param name="args">Command-line arguments (not used).</param>
    static void Main(string[] args)
    {
        string siteUrl = "http://www.cnblogs.com/";
        Uri siteUri = new Uri(siteUrl);

        // Fetch the home page.
        string html = Helper.GetRequestString(siteUrl, 1, Encoding.UTF8);

        #region Stylesheets and scripts
        // Group 1 holds the bare URL, so both href="…" and src="…" references work.
        // [^"]* keeps the match inside a single attribute value.
        MatchCollection mcCss = Regex.Matches(html, @"(?:href|src)=""([^""]*?\.(?:css|js))""");
        foreach (Match ma in mcCss)
        {
            Uri assetUri;
            if (!TryResolve(siteUri, ma.Groups[1].Value, out assetUri))
            {
                continue;
            }

            string localPath = assetUri.AbsolutePath;
            Helper.CheckDirectory(GetDirectoryPart(localPath));
            string css = Helper.GetRequestString(assetUri.AbsoluteUri, 1, Encoding.UTF8);
            Helper.WriteFile(localPath, css);
        }
        #endregion

        #region Images
        MatchCollection mcImg = Regex.Matches(html, @"<\s?img[^>]+?>");
        foreach (Match ma in mcImg)
        {
            Match src = Regex.Match(ma.Value, @"src=""([^""]*?\.(?:jpg|gif|png|bmp))""");
            Uri imageUri;
            if (!src.Success || !TryResolve(siteUri, src.Groups[1].Value, out imageUri))
            {
                continue;
            }

            string localPath = imageUri.AbsolutePath;
            Helper.CheckDirectory(GetDirectoryPart(localPath));
            Helper.DownFile(imageUri.AbsoluteUri, localPath);
        }
        #endregion

        // Drop only the scheme-and-host prefix of each link; the character class stops the
        // match at the end of the host so text between two links is left untouched.
        html = Regex.Replace(html, @"http://[^/""'\s]*\.com", "");
        Helper.WriteFile("index.html", html);

        Console.WriteLine("处理成功");
        Console.Read();
    }

    /// <summary>
    /// Resolves a reference found in the page (absolute or relative) against the site
    /// address; succeeds only for http/https results, which are the ones Helper can fetch.
    /// </summary>
    private static bool TryResolve(Uri baseUri, string reference, out Uri resolved)
    {
        return Uri.TryCreate(baseUri, reference, out resolved)
            && (resolved.Scheme == Uri.UriSchemeHttp || resolved.Scheme == Uri.UriSchemeHttps);
    }

    /// <summary>
    /// Returns the folder part of a URL path, including the trailing slash
    /// ("/css/site.css" gives "/css/").
    /// </summary>
    private static string GetDirectoryPart(string urlPath)
    {
        return urlPath.Substring(0, urlPath.LastIndexOf('/') + 1);
    }
}
}
{
//sURL = @"d:\tmp\test1.htm";
string sHtml=string.Empty;
WebClient wc = new WebClient();
try
{
byte[] pagedata = wc.DownloadData(@sURL); //转换字符、
if (sURL.Contains("vancl.com"))
{
sHtml = Encoding.UTF8.GetString(pagedata);
}
else
{
sHtml = wc.DownloadString(sURL); }
//sHtml = sHtml.Replace("<!--", "<P>");
//sHtml = sHtml.Replace("-->", "</P>");
if (!sURL.Contains("taobao.com"))
{
sHtml = sHtml.Replace("<<", "<");
sHtml = sHtml.Replace("> />", "/>");
}
}
catch (Exception)
{ return false;
}