/// <summary>
/// On the first (non-postback) request, downloads a fixed article page, cuts out the
/// region between two literal markers, strips the markup and shows the result in txt1.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
protected void Page_Load(object sender, EventArgs e)
{
    // Nothing to do on postbacks.
    if (IsPostBack)
    {
        return;
    }

    // Literal markers that bracket the wanted region of the downloaded page.
    string beginMarker = "<div class=\"shoucang\">";
    string endMarker = "<div class=\"pagebreak\">";
    string pageUrl = "http://www.chinaz.com/Webmaster/Club/1021954U2009.html";

    // Download, extract the region, then remove every tag (image tags included).
    string pageHtml = GetHtml(pageUrl);
    string regionHtml = SubHtml(pageHtml, beginMarker, endMarker);
    this.txt1.Text = RemoveHTML(regionHtml);
}
/// <summary>
/// Downloads the resource at <paramref name="url"/> and returns its body decoded as GBK text.
/// </summary>
/// <param name="url">Address to request; passed unchanged to WebRequest.Create.</param>
/// <returns>
/// The decoded response body, or an empty string when creating the request, sending it,
/// or reading the response fails for any reason.
/// </returns>
public static string GetHtml(string url)
{
    try
    {
        WebRequest request = WebRequest.Create(url);

        // Dispose the response, its stream and the reader even when reading throws, so the
        // underlying connection is released promptly instead of waiting for finalization.
        using (WebResponse response = request.GetResponse())
        using (Stream body = response.GetResponseStream())
        using (StreamReader reader = new StreamReader(body, Encoding.GetEncoding("GBK")))
        {
            return reader.ReadToEnd();
        }
    }
    catch (Exception ex)
    {
        // Deliberate best-effort: callers receive "" when nothing could be downloaded.
        // The failure is traced so it is not lost completely.
        System.Diagnostics.Trace.TraceWarning("GetHtml failed for '{0}': {1}", url, ex.Message);
        return "";
    }
}
/// <summary>
/// Extracts the text found between a begin marker and an end marker in <paramref name="htmlCode"/>.
/// </summary>
/// <param name="htmlCode">Source text to search.</param>
/// <param name="strBegin">
/// Begin marker. It is inserted into a regular expression as-is, so any regex
/// metacharacters it contains are interpreted as pattern syntax. Matched case-insensitively.
/// </param>
/// <param name="strEnd">End marker; handled the same way as <paramref name="strBegin"/>.</param>
/// <returns>
/// The text captured between the markers (at least one character, matched lazily). When
/// several regions match, the last one is returned. Empty string when nothing matches or
/// when <paramref name="htmlCode"/> is null or empty.
/// </returns>
public string SubHtml(string htmlCode, string strBegin, string strEnd)
{
    // Regex.Match throws on null input; treat "no text" as "no region".
    if (string.IsNullOrEmpty(htmlCode))
    {
        return "";
    }

    // RegexOptions.Compiled is intentionally not used: the pattern is rebuilt on every
    // call, so compiling it costs more than interpreting it once.
    Regex region = new Regex(strBegin + @"(?<title>[\s\S]+?)" + strEnd, RegexOptions.IgnoreCase);

    string lastRegion = "";
    for (Match m = region.Match(htmlCode); m.Success; m = m.NextMatch())
    {
        // Each match overwrites the previous one, so the last region wins.
        lastRegion = m.Groups["title"].Value;
    }
    return lastRegion;
}

/// <summary>
/// Removes every markup tag from <paramref name="HtmlCode"/> and returns the remaining text.
/// </summary>
/// <param name="HtmlCode">Text that may contain tags.</param>
/// <returns>
/// The input with all tags removed. Image tags are removed like any other tag, so no
/// images remain in the result. Empty string when the input is null or empty.
/// </returns>
public string RemoveHTML(string HtmlCode)
{
    if (string.IsNullOrEmpty(HtmlCode))
    {
        return "";
    }

    // [^<>] also matches line breaks, so a tag whose attributes span several lines is
    // removed as well. A single Regex.Replace pass avoids rescanning the whole string
    // once per tag.
    return Regex.Replace(HtmlCode, "<[^<>]+>", "");
}
就采集以上这个网址,里面有图片的,我想把图片也显示出来,但被去除了,怎么办?
求各位哥哥帮小弟改一下方法吧~~
{
if (!IsPostBack)
{
string start = "<div class=\"shoucang\">";
string end = "<div class=\"pagebreak\">";
string url = "http://www.chinaz.com/Webmaster/Club/1021954U2009.html";
this.txt1.Text = RemoveHTML(SubHtml(GetHtml(url), start, end));
}
} /// <summary>
/// Downloads the resource at <paramref name="url"/> and returns its body decoded as GBK text.
/// </summary>
/// <param name="url">Address to request; passed unchanged to WebRequest.Create.</param>
/// <returns>
/// The decoded response body, or an empty string when creating the request, sending it,
/// or reading the response fails for any reason.
/// </returns>
public static string GetHtml(string url)
{
    try
    {
        WebRequest request = WebRequest.Create(url);

        // Dispose the response, its stream and the reader even when reading throws, so the
        // underlying connection is released promptly instead of waiting for finalization.
        using (WebResponse response = request.GetResponse())
        using (Stream body = response.GetResponseStream())
        using (StreamReader reader = new StreamReader(body, Encoding.GetEncoding("GBK")))
        {
            return reader.ReadToEnd();
        }
    }
    catch (Exception ex)
    {
        // Deliberate best-effort: callers receive "" when nothing could be downloaded.
        // The failure is traced so it is not lost completely.
        System.Diagnostics.Trace.TraceWarning("GetHtml failed for '{0}': {1}", url, ex.Message);
        return "";
    }
}
/// <summary>
/// Extracts the text found between a begin marker and an end marker in <paramref name="htmlCode"/>.
/// </summary>
/// <param name="htmlCode">Source text to search.</param>
/// <param name="strBegin">
/// Begin marker. It is inserted into a regular expression as-is, so any regex
/// metacharacters it contains are interpreted as pattern syntax. Matched case-insensitively.
/// </param>
/// <param name="strEnd">End marker; handled the same way as <paramref name="strBegin"/>.</param>
/// <returns>
/// The text captured between the markers (at least one character, matched lazily). When
/// several regions match, the last one is returned. Empty string when nothing matches or
/// when <paramref name="htmlCode"/> is null or empty.
/// </returns>
public string SubHtml(string htmlCode, string strBegin, string strEnd)
{
    // Regex.Match throws on null input; treat "no text" as "no region".
    if (string.IsNullOrEmpty(htmlCode))
    {
        return "";
    }

    // RegexOptions.Compiled is intentionally not used: the pattern is rebuilt on every
    // call, so compiling it costs more than interpreting it once.
    Regex region = new Regex(strBegin + @"(?<title>[\s\S]+?)" + strEnd, RegexOptions.IgnoreCase);

    string lastRegion = "";
    for (Match m = region.Match(htmlCode); m.Success; m = m.NextMatch())
    {
        // Each match overwrites the previous one, so the last region wins.
        lastRegion = m.Groups["title"].Value;
    }
    return lastRegion;
}

/// <summary>
/// Removes every markup tag from <paramref name="HtmlCode"/> and returns the remaining text.
/// </summary>
/// <param name="HtmlCode">Text that may contain tags.</param>
/// <returns>
/// The input with all tags removed. Image tags are removed like any other tag, so no
/// images remain in the result. Empty string when the input is null or empty.
/// </returns>
public string RemoveHTML(string HtmlCode)
{
    if (string.IsNullOrEmpty(HtmlCode))
    {
        return "";
    }

    // [^<>] also matches line breaks, so a tag whose attributes span several lines is
    // removed as well. A single Regex.Replace pass avoids rescanning the whole string
    // once per tag.
    return Regex.Replace(HtmlCode, "<[^<>]+>", "");
}
就采集以上这个网址,里面有图片的,我想把图片也显示出来,但被去除了,怎么办?
求各位哥哥帮小弟改一下方法吧~~
解决方案 »
- dreamweaver如何让图片居中(水平和垂直都要)
- 如何保存选定的节点
- excel嵌入网页编辑,像gmail一样
- 我自己建了一个类.为什么在里面打不出DataSet的Relations属性??
- 求解啊 求解啊!!!1
- 关于数组的赋值问题
- 帮忙看看:以下源码无法利用DataGrid的AllowCustomPaging属性实现自动分页
- 关于网站开发时 委托的使用 请高手解释解释???????????
- 请问怎么通过web service调用SQL Server2000中已经建立的存储过程?(基于C#)
- asp.net不能在Script中调用客户端的控件?
- .net如果要正常使用,需要给C盘的哪些文件夹加上哪些权限呢
- 关于radiobuttonlist 和 dropdownlist 的问题
/// <summary>
/// Collects the image address of every <c>&lt;img&gt;</c> tag found in <paramref name="HtmlCode"/>.
/// </summary>
/// <param name="HtmlCode">HTML text to scan.</param>
/// <param name="imgHttp">The http:// prefix handed to GetImg to complete each address.</param>
/// <returns>
/// The value GetImg returns for each tag, in document order, each followed by the
/// separator "||". Empty string when no tag is found or the input is null or empty.
/// </returns>
public string GetImgSrc(string HtmlCode, string imgHttp)
{
    // Regex.Matches throws on null input; treat "no text" as "no images".
    if (string.IsNullOrEmpty(HtmlCode))
    {
        return "";
    }

    System.Text.StringBuilder joined = new System.Text.StringBuilder();

    // IgnoreCase also finds <IMG ...>; [^>] (unlike '.') matches line breaks, so a tag
    // whose attributes span several lines is found too.
    foreach (Match tag in Regex.Matches(HtmlCode, @"<img\b[^>]*>", RegexOptions.IgnoreCase))
    {
        // The tag is passed on with its original casing: URL paths can be case-sensitive,
        // so lower-casing here could produce an address that no longer resolves.
        joined.Append(GetImg(tag.Value.Trim(), imgHttp)).Append("||");
    }
    return joined.ToString();
}
/// <summary>
/// Extracts the <c>src</c> address from a single <c>&lt;img&gt;</c> tag and completes it
/// with <paramref name="imgHttp"/>.
/// </summary>
/// <param name="ImgString">Text of one <c>&lt;img src="" /&gt;</c> tag.</param>
/// <param name="imgHttp">Base address (the http:// part) used to complete a relative <c>src</c>.</param>
/// <returns>
/// When <paramref name="imgHttp"/> is a valid absolute URI: the <c>src</c> value resolved
/// against it (an already absolute <c>src</c> is returned as its own absolute URI).
/// Otherwise <paramref name="imgHttp"/> followed by the raw <c>src</c> value.
/// Empty string when the tag has no non-empty <c>src</c> attribute.
/// </returns>
public string GetImg(string ImgString, string imgHttp)
{
    if (string.IsNullOrEmpty(ImgString))
    {
        return "";
    }

    // Accept double-quoted, single-quoted and unquoted values; the quotes themselves are
    // not captured. The look-behind keeps attributes such as data-src from matching.
    // The original casing of the address is preserved.
    Match src = Regex.Match(
        ImgString,
        @"(?<![\w-])src\s*=\s*(?:""(?<url>[^""]*)""|'(?<url>[^']*)'|(?<url>[^\s""'>]+))",
        RegexOptions.IgnoreCase);
    if (!src.Success)
    {
        return "";
    }

    string address = src.Groups["url"].Value.Trim();
    if (address.Length == 0)
    {
        return "";
    }

    // Resolve against the base the way a browser would: this inserts the missing slash,
    // handles root-relative paths and leaves absolute addresses untouched.
    Uri baseUri;
    Uri resolved;
    if (Uri.TryCreate(imgHttp, UriKind.Absolute, out baseUri)
        && Uri.TryCreate(baseUri, address, out resolved))
    {
        return resolved.AbsoluteUri;
    }

    // No usable base address: fall back to plain concatenation.
    return imgHttp + address;
}
实际图片地址
http://www.chinaz.com/upimg/allimg/091021/1656210.jpg
而采集到的是
upimg/allimg/091021/1656210.jpg,少了前面部分,请问怎么解决?
{
ArrayList al = new ArrayList();
Match match = Regex.Match(htmlCode, @"http(s)?://+(((/?)+[\w-.]+(/))*)+[\w-./]+\.+(jpg|jpeg|png|bmp|gif)", RegexOptions.IgnoreCase);
while (match.Success)
{
if (!al.Contains(match.Value)) al.Add(match.Value);
match = match.NextMatch();
}
return al;
}WebRequest request = WebRequest.Create("");
// Copies the response body of the request created on the preceding line to D:\a.gif in
// 512-byte chunks. The response and both streams are disposed even if the copy throws.
using (WebResponse response = request.GetResponse())
using (Stream reader = response.GetResponseStream())
// FileMode.Create truncates an existing file. OpenOrCreate would keep the old content and
// leave stale trailing bytes whenever the new download is shorter than the old file.
using (FileStream writer = new FileStream("D:\\a.gif", FileMode.Create, FileAccess.Write))
{
    byte[] buff = new byte[512];
    int c;
    // Read returns 0 at the end of the stream.
    while ((c = reader.Read(buff, 0, buff.Length)) > 0)
    {
        writer.Write(buff, 0, c);
    }
}