正则需要Html里的规律 比如图片基本都有一个共性<img src= ...> List<string> links = new List<string>(); foreach(var match in new Regex("(?is)<img.*\\ssrc=(['\"]?)(\w+)\1.*>").Matches(htmlcode)) links.Add(match.Group[2].Value);
using System; using System.Collections.Generic; using System.Linq; using System.Text; using System.Net; using System.IO; using System.IO.Compression; using System.Text.RegularExpressions;namespace WikiPageCreater.Common { public class PageHelper { /// <summary> /// 根据 url 获取网页编码 /// </summary> /// <param name="url"></param> /// <returns></returns> public static string GetEncoding(string url) { HttpWebRequest request = null; HttpWebResponse response = null; StreamReader reader = null; try { request = (HttpWebRequest)WebRequest.Create(url); request.Timeout = 20000; request.AllowAutoRedirect = false; response = (HttpWebResponse)request.GetResponse(); if (response.StatusCode == HttpStatusCode.OK && response.ContentLength < 1024 * 1024) { if (response.ContentEncoding != null && response.ContentEncoding.Equals("gzip", StringComparison.InvariantCultureIgnoreCase)) reader = new StreamReader(new GZipStream(response.GetResponseStream(), CompressionMode.Decompress)); else reader = new StreamReader(response.GetResponseStream(), Encoding.ASCII); string html = reader.ReadToEnd(); Regex reg_charset = new Regex(@"charset\b\s*=\s*(?<charset>[^""]*)"); if (reg_charset.IsMatch(html)) { return reg_charset.Match(html).Groups["charset"].Value; } else if (response.CharacterSet != string.Empty) { return response.CharacterSet; } else return Encoding.Default.BodyName; } } catch { } finally { if (response != null) { response.Close(); response = null; } if (reader != null) reader.Close(); if (request != null) request = null; } return Encoding.Default.BodyName; } /// <summary> /// 根据 url 和 encoding 获取当前url页面的 html 源代码 /// </summary> /// <param name="url"></param> /// <param name="encoding"></param> /// <returns></returns> public static string GetHtml(string url, Encoding encoding) { HttpWebRequest request = null; HttpWebResponse response = null; StreamReader reader = null; try { request = (HttpWebRequest)WebRequest.Create(url); request.Timeout = 20000; request.AllowAutoRedirect = false; response = (HttpWebResponse)request.GetResponse(); if (response.StatusCode == HttpStatusCode.OK && response.ContentLength < 1024 * 1024) { if (response.ContentEncoding != null && response.ContentEncoding.Equals("gzip", StringComparison.InvariantCultureIgnoreCase)) reader = new StreamReader(new GZipStream(response.GetResponseStream(), CompressionMode.Decompress), encoding); else reader = new StreamReader(response.GetResponseStream(), encoding); string html = reader.ReadToEnd(); return html; } } catch { } finally { if (response != null) { response.Close(); response = null; } if (reader != null) reader.Close(); if (request != null) request = null; } return string.Empty; } } } 抓取后用正则来取
HttpWebRequest request = (HttpWebRequest)WebRequest.Create(@"http://csdnimg.cn/www/images/csdnindex_logo.gif");
request.Method = "GET";
HttpWebResponse response = (HttpWebResponse)request.GetResponse();
Stream streamReceive = response.GetResponseStream();
System.Drawing.Image image1 = System.Drawing.Image.FromStream(streamReceive);
image1.Save(Server.MapPath("~/a.gif"));
streamReceive.Close();
比如图片基本都有一个共性<img src= ...>
List<string> links = new List<string>();
foreach(var match in new Regex("(?is)<img.*\\ssrc=(['\"]?)(\w+)\1.*>").Matches(htmlcode))
links.Add(match.Group[2].Value);
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Net;
using System.IO;
using System.IO.Compression;
using System.Text.RegularExpressions;namespace WikiPageCreater.Common
{
public class PageHelper
{
/// <summary>
/// 根据 url 获取网页编码
/// </summary>
/// <param name="url"></param>
/// <returns></returns>
public static string GetEncoding(string url)
{
HttpWebRequest request = null;
HttpWebResponse response = null;
StreamReader reader = null;
try
{
request = (HttpWebRequest)WebRequest.Create(url);
request.Timeout = 20000;
request.AllowAutoRedirect = false; response = (HttpWebResponse)request.GetResponse();
if (response.StatusCode == HttpStatusCode.OK && response.ContentLength < 1024 * 1024)
{
if (response.ContentEncoding != null && response.ContentEncoding.Equals("gzip", StringComparison.InvariantCultureIgnoreCase))
reader = new StreamReader(new GZipStream(response.GetResponseStream(), CompressionMode.Decompress));
else
reader = new StreamReader(response.GetResponseStream(), Encoding.ASCII); string html = reader.ReadToEnd(); Regex reg_charset = new Regex(@"charset\b\s*=\s*(?<charset>[^""]*)");
if (reg_charset.IsMatch(html))
{
return reg_charset.Match(html).Groups["charset"].Value;
}
else if (response.CharacterSet != string.Empty)
{
return response.CharacterSet;
}
else
return Encoding.Default.BodyName;
}
}
catch
{
}
finally
{ if (response != null)
{
response.Close();
response = null;
}
if (reader != null)
reader.Close(); if (request != null)
request = null; } return Encoding.Default.BodyName;
} /// <summary>
/// 根据 url 和 encoding 获取当前url页面的 html 源代码
/// </summary>
/// <param name="url"></param>
/// <param name="encoding"></param>
/// <returns></returns>
public static string GetHtml(string url, Encoding encoding)
{
HttpWebRequest request = null;
HttpWebResponse response = null;
StreamReader reader = null;
try
{
request = (HttpWebRequest)WebRequest.Create(url);
request.Timeout = 20000;
request.AllowAutoRedirect = false; response = (HttpWebResponse)request.GetResponse();
if (response.StatusCode == HttpStatusCode.OK && response.ContentLength < 1024 * 1024)
{
if (response.ContentEncoding != null && response.ContentEncoding.Equals("gzip", StringComparison.InvariantCultureIgnoreCase))
reader = new StreamReader(new GZipStream(response.GetResponseStream(), CompressionMode.Decompress), encoding);
else
reader = new StreamReader(response.GetResponseStream(), encoding);
string html = reader.ReadToEnd(); return html;
}
}
catch
{
}
finally
{ if (response != null)
{
response.Close();
response = null;
}
if (reader != null)
reader.Close(); if (request != null)
request = null; } return string.Empty;
}
}
}
抓取后用正则来取