比如这个网站:http://www.kingold.com.cn/2009product/index2.asp?class_2=0 。上面有很多图片,需要按类别一次性抓取完。同一类别分页后,URL 变成 http://www.kingold.com.cn/2009product/index2.asp?class_2=0&offset=24 ,之后每翻一页 offset 增加 24。请问大侠们,如何一次性抓取同一类别的所有图片,以及每张图片对应的编码和克重?我现在用了 WebRequest,只能一页一页地抓取。实现思路是先把页面下载到本地,然后用 js 提取相关信息。
解决方案 »
- 求通用的js 验证框架
- win8 iis8.0 权限问题
- 使用jmail时无法发送附件,可以发出去,但是发的不是附件,是文本格式
- 不用用户控件,怎样实现asp的include功能
- 大哥大姐帮帮忙吧!有分哦!先谢了..
- 从客户端(zuopinzhanshi_zpjs="<IMG height=182 alt=...")中检测到有潜在危险的 Request.Form 值。
- help
- ASP.NET MVC 连接oracle报错,请问如何解决?
- 急!!!关于DropDownList数据绑定的问题!!!
- 使用asp.net显示文本文件问题——不能换行
- 根据用户填写的邮箱,发送随机数字到该邮箱里。.net(C#)实现该功能
- 有关asp.net文件依赖方式的缓存的问题,请高人指点
/// <summary>
/// Downloads a remote file with a synchronous HTTP GET (MSXML2.XMLHTTP) and saves it locally,
/// overwriting any existing file at the target path.
/// </summary>
/// <param name="Url">Address of the remote file.</param>
/// <param name="Path">Local save path; the value returned by GetFileExtends(Url) is appended
/// to it (presumably the file extension of the URL - confirm against GetFileExtends).</param>
/// <returns>Full path of the saved file.</returns>
/// <exception cref="System.Net.WebException">The request did not complete, or the server
/// answered with a non-2xx status code.</exception>
public static string RemoteSave(string Url, string Path)
{
    string targetPath = Path + GetFileExtends(Url);

    MSXML2.XMLHTTP xmlHttp = new MSXML2.XMLHTTPClass();
    xmlHttp.open("GET", Url, false, null, null);
    xmlHttp.send("");

    if (xmlHttp.readyState != 4)
    {
        throw new System.Net.WebException(xmlHttp.statusText);
    }

    // readyState 4 only means "finished"; without this check an error page
    // (404, 500, ...) would be written to disk as if it were the requested file.
    if (xmlHttp.status < 200 || xmlHttp.status >= 300)
    {
        throw new System.Net.WebException(xmlHttp.status + " " + xmlHttp.statusText);
    }

    // WriteAllBytes creates or overwrites the file and always releases the handle,
    // even when the write fails part-way.
    System.IO.File.WriteAllBytes(targetPath, (byte[])xmlHttp.responseBody);
    return targetPath;
}
用正则表达式找出所有图片。关于分页,就是一页读完后跳转到下一页(offset 每次加 24),直到页面上不再有图片为止。
using System.Net;
Pic_Remote("<img src=http://static.tianya.cn/w250/images/20081206/6262712/1228548462160.jpg>") // Example call: save the remote image locally
/// <summary>
/// Creates today's image folder ("images/yyyy-MM/dd", resolved through Server.MapPath)
/// and hands the HTML to SaveUrlPics so its remote images are stored there.
/// </summary>
/// <param name="news_Content">HTML fragment that may contain &lt;img&gt; tags.</param>
/// <returns>The HTML string returned by SaveUrlPics.</returns>
private string Pic_Remote(string news_Content)
{
    // Read the clock once so the year-month and day parts cannot disagree
    // when the call happens to straddle midnight.
    DateTime now = DateTime.Now;
    string nowyymm = now.ToString("yyyy-MM"); // current year and month
    string nowdd = now.ToString("dd");        // current day of the month

    string path = "images/" + nowyymm + "/" + nowdd;
    Directory.CreateDirectory(Server.MapPath(path));

    return SaveUrlPics(news_Content, path, nowyymm, nowdd);
} // Download the images to local disk
/// <summary>
/// Downloads every image URL that GetImgTag finds in <paramref name="strHTML"/> into
/// <paramref name="path"/> and rewrites those URLs inside the HTML.
/// </summary>
/// <param name="strHTML">HTML whose &lt;img&gt; tags are scanned.</param>
/// <param name="path">Virtual folder (resolved with Server.MapPath) that receives the files.</param>
/// <param name="nowyymm">Year-month folder segment used in the rewritten URL.</param>
/// <param name="nowdd">Day folder segment used in the rewritten URL.</param>
/// <returns>
/// The HTML with the URL of each successfully downloaded image replaced.
/// Images that could not be downloaded keep their original URL.
/// </returns>
public string SaveUrlPics(string strHTML, string path, string nowyymm, string nowdd)
{
    string[] imgurlAry = GetImgTag(strHTML);
    string localDir = Server.MapPath(path);

    // NOTE(review): files are written under `path`, but the HTML is rewritten to
    // "/Files/Remoteupfile/..." - confirm that both resolve to the same folder.
    string imgPath = "/Files/Remoteupfile/" + nowyymm + "/" + nowdd;

    using (WebClient wc = new WebClient())
    {
        for (int i = 0; i < imgurlAry.Length; i++)
        {
            string imgUrl = imgurlAry[i];

            // GetImgTag yields "" for tags without a usable URL; DownloadFile and
            // string.Replace would both throw on an empty value.
            if (string.IsNullOrEmpty(imgUrl))
            {
                continue;
            }

            // Fixed, culture-independent timestamp: the default DateTime format can
            // contain '/' or other characters that are not valid in a file name.
            string preStr = DateTime.Now.ToString("yyyyMMddHHmmss", System.Globalization.CultureInfo.InvariantCulture) + "_";
            string fileName = preStr + imgUrl.Substring(imgUrl.LastIndexOf("/") + 1);

            try
            {
                wc.DownloadFile(imgUrl, localDir + "/" + fileName);
            }
            catch (Exception ex)
            {
                // Best effort, as before: a failed image must not abort the rest.
                // The failure is traced instead of being dropped silently.
                System.Diagnostics.Trace.TraceWarning("SaveUrlPics: could not download {0}: {1}", imgUrl, ex.Message);
                continue;
            }

            // Point the HTML at the local copy only after the download succeeded.
            strHTML = strHTML.Replace(imgUrl, imgPath + "/" + fileName);
        }
    }

    return strHTML;
} // Get the image tags
/// <summary>
/// Finds every &lt;img ...&gt; tag in the HTML and returns one entry per tag, in document
/// order, holding whatever GetImgUrl extracts from that tag.
/// </summary>
/// <param name="htmlStr">HTML to scan.</param>
/// <returns>Array with one GetImgUrl result per matched tag.</returns>
private string[] GetImgTag(string htmlStr)
{
    Regex imgTagPattern = new Regex("<img.+?>", RegexOptions.Compiled | RegexOptions.IgnoreCase);
    MatchCollection tags = imgTagPattern.Matches(htmlStr);

    string[] urls = new string[tags.Count];
    for (int index = 0; index < urls.Length; index++)
    {
        urls[index] = GetImgUrl(tags[index].Value);
    }

    return urls;
} // Get the image URL
// Absolute http/https image URL. The character class stops at whitespace, quotes and
// angle brackets so a match cannot run on into the next attribute, and the dot before
// the extension is escaped so "xjpg" is not accepted as ".jpg".
private static readonly Regex ImgUrlRegex = new Regex(
    @"https?://[^\s""'<>]+\.(?:jpe?g|gif|bmp|png)",
    RegexOptions.Compiled | RegexOptions.IgnoreCase);

/// <summary>
/// Extracts the first absolute image URL (http or https, ending in jpg/jpeg/gif/bmp/png)
/// from an &lt;img&gt; tag.
/// </summary>
/// <param name="imgTagStr">Text of a single &lt;img&gt; tag.</param>
/// <returns>The URL, or an empty string when the tag contains no such URL.</returns>
private string GetImgUrl(string imgTagStr)
{
    if (string.IsNullOrEmpty(imgTagStr))
    {
        return "";
    }

    Match found = ImgUrlRegex.Match(imgTagStr);
    return found.Success ? found.Value : "";
}
using System;
using System.Collections.Generic;
using System.IO;
using System.IO.Compression;
using System.Linq;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;

namespace WikiPageCreater.Common
{
/// <summary>
/// Helpers for downloading a web page and detecting the character set it declares.
/// Both public methods are best effort: any failure yields a default value instead of
/// an exception.
/// </summary>
public class PageHelper
{
    /// <summary>Request timeout in milliseconds.</summary>
    private const int RequestTimeoutMs = 20000;

    /// <summary>
    /// Responses that declare a Content-Length of this many bytes or more are skipped.
    /// </summary>
    private const long MaxContentLength = 1024 * 1024;

    // Captures the charset name from both content="text/html; charset=gb2312" and
    // <meta charset="utf-8">. The optional quote is consumed before the capture so the
    // HTML5 form does not produce an empty name.
    private static readonly Regex CharsetRegex = new Regex(
        @"charset\b\s*=\s*[""']?(?<charset>[\w.:+-]+)",
        RegexOptions.Compiled | RegexOptions.IgnoreCase);

    /// <summary>
    /// Determines the character set of the page at <paramref name="url"/>.
    /// </summary>
    /// <param name="url">Absolute URL of the page.</param>
    /// <returns>
    /// The charset declared in the page markup if there is one, otherwise the charset
    /// reported by the HTTP response, otherwise <c>Encoding.Default.BodyName</c>
    /// (also returned when the page cannot be downloaded).
    /// </returns>
    public static string GetEncoding(string url)
    {
        string html;
        string headerCharset;

        // ASCII is sufficient here: only the charset declaration is inspected.
        if (TryDownload(url, Encoding.ASCII, out html, out headerCharset))
        {
            Match declared = CharsetRegex.Match(html);
            if (declared.Success)
            {
                return declared.Groups["charset"].Value;
            }

            if (!string.IsNullOrEmpty(headerCharset))
            {
                return headerCharset;
            }
        }

        return Encoding.Default.BodyName;
    }

    /// <summary>
    /// Downloads the HTML source of <paramref name="url"/>, decoded with
    /// <paramref name="encoding"/>.
    /// </summary>
    /// <param name="url">Absolute URL of the page.</param>
    /// <param name="encoding">Encoding used to decode the response body.</param>
    /// <returns>
    /// The page source, or an empty string when the request fails, the status is not
    /// 200 OK, or the declared content length is 1 MiB or more.
    /// </returns>
    public static string GetHtml(string url, Encoding encoding)
    {
        string html;
        string headerCharset;

        return TryDownload(url, encoding, out html, out headerCharset) ? html : string.Empty;
    }

    /// <summary>
    /// Shared download routine: issues a GET without following redirects and reads the
    /// (optionally gzip-compressed) body.
    /// </summary>
    /// <param name="url">Absolute URL of the page.</param>
    /// <param name="encoding">Encoding used to decode the response body.</param>
    /// <param name="html">Receives the decoded body; empty on failure.</param>
    /// <param name="headerCharset">Receives the response's CharacterSet; null on failure.</param>
    /// <returns>True when a 200 OK response under the size limit was read completely.</returns>
    private static bool TryDownload(string url, Encoding encoding, out string html, out string headerCharset)
    {
        html = string.Empty;
        headerCharset = null;

        try
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            request.Timeout = RequestTimeoutMs;
            request.AllowAutoRedirect = false;

            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            {
                // Only a declared length is checked; a response whose ContentLength is
                // below the limit (including "not declared") is read in full.
                if (response.StatusCode != HttpStatusCode.OK || response.ContentLength >= MaxContentLength)
                {
                    return false;
                }

                headerCharset = response.CharacterSet;

                Stream body = response.GetResponseStream();
                if (string.Equals(response.ContentEncoding, "gzip", StringComparison.OrdinalIgnoreCase))
                {
                    body = new GZipStream(body, CompressionMode.Decompress);
                }

                // Disposing the reader also disposes the stream(s) it wraps.
                using (StreamReader reader = new StreamReader(body, encoding))
                {
                    html = reader.ReadToEnd();
                }

                return true;
            }
        }
        catch (Exception ex)
        {
            // Deliberately best effort (callers rely on a default value instead of an
            // exception), but the failure is traced rather than silently dropped.
            System.Diagnostics.Trace.TraceWarning("PageHelper: could not download {0}: {1}", url, ex.Message);
            html = string.Empty;
            headerCharset = null;
            return false;
        }
    }
}
}
先抓取 HTML,再用正则表达式提取也可以。
要用到的表格
<table width="100%" border="0" cellspacing="0" cellpadding="0">
<tr>
<td width="37%"><a href="/2009product/kingold_product/upimages/20107291631053181.jpg" target="_blank"><img src="/2009product/kingold_product/upimages/20107291631053181.jpg" width="225" border="0"></a></td>
<td width="63%"><table width="100%" border="0" cellspacing="0" cellpadding="0">
<tr>
<td> </td>
<td align="left" class="a12">金凰首饰</td>
</tr>
<tr>
<td> </td>
<td align="left" class="a12">【编码】</td>
</tr>
<tr>
<td> </td>
<td align="left" class="a12"> 02220002</td>
</tr>
<tr>
<td> </td>
<td align="left" class="a12">【克重】</td>
</tr>
<tr>
<td width="4%"> </td>
<td align="left" class="a12">55克</td>
</tr>
</table></td>
</tr>
</table>不知思路是否正确,请大侠指点。若正确,我已经可以得到页面上所有的 img,但不知如何得到每张图片对应的文字信息(编码、克重)。
得到 src 还是很简单的。string html = "这里是你下载下来的HTML码源";
// One match per product image; the image address is captured in the "src" group.
Regex re = new Regex(@"<a href=""[^""]*""[^>]*target=""_blank""><img src=""(?<src>[^""]*)"" width=""225"" border=""0""></a>", RegexOptions.None);
// The value of each field sits in the first class="a12" cell that follows its label.
Regex reCode = new Regex(@"【编码】.*?class=""a12"">\s*(?<value>[^<]*?)\s*</td>", RegexOptions.Singleline);
Regex reWeight = new Regex(@"【克重】.*?class=""a12"">\s*(?<value>[^<]*?)\s*</td>", RegexOptions.Singleline);

MatchCollection mc = re.Matches(html);
for (int i = 0; i < mc.Count; i++)
{
    Match ma = mc[i];

    // Only search the markup between this image and the next one, so a product
    // that lacks a field never picks up the value of the following product.
    int detailStart = ma.Index + ma.Length;
    int detailEnd = (i + 1 < mc.Count) ? mc[i + 1].Index : html.Length;
    string detail = html.Substring(detailStart, detailEnd - detailStart);

    string src = ma.Groups["src"].Value;                        // src of this img (site-relative in the sample table)
    string code = reCode.Match(detail).Groups["value"].Value;     // product code, "" when absent
    string weight = reWeight.Match(detail).Groups["value"].Value; // weight text such as "55克", "" when absent

    // use src / code / weight here
}
img 的 src 我可以得到,但是还要得到每个 img 对应的克重、编码信息。
后台用 WebClient.DownloadFile 下载图片,貌似有点慢,然后用一个事务提交到数据库。这是个笨方法,不知有没有高手有更好的方法。