抓取别的网站图片信息到本地

比如这个网站：http://www.kingold.com.cn/2009product/index2.asp?class_2=0 ，上面有很多图片，需要按类别一次性抓取完。同一类别分页后，URL 变成 http://www.kingold.com.cn/2009product/index2.asp?class_2=0&offset=24 ，之后 offset 每翻一页增加 24。请问大侠们，如何一次性抓取同一类别的所有图片，以及每张图片对应的编码和克重？我现在用了 WebRequest，只能一页一页地抓取。目前的实现思路是先把页面加载到本地，然后用 js 得到相关信息。

解决方案 »

免费领取超大流量手机卡，每月29元包185G流量+100分钟通话, 中国电信官方发货

/// <summary>
        /// Downloads a remote file with a synchronous MSXML2 XMLHTTP GET request and
        /// writes the response body to disk.
        /// </summary>
        /// <param name="Url">Address of the remote file.</param>
        /// <param name="Path">Destination path; the value returned by
        /// <c>GetFileExtends(Url)</c> is appended to it to form the final file path.</param>
        /// <returns>The full path of the file that was written.</returns>
        /// <exception cref="Exception">The request did not complete or the server answered
        /// with a non-success status; the message carries the HTTP status text.</exception>
        public static string RemoteSave(string Url, string Path)
        {
            string StringFilePath = Path + GetFileExtends(Url);
            MSXML2.XMLHTTP _xmlhttp = new MSXML2.XMLHTTPClass();
            _xmlhttp.open("GET", Url, false, null, null);
            _xmlhttp.send("");

            // readyState 4 only says the exchange finished. Without the status check an
            // error page (404, 500, ...) would be written to disk as if it were the file.
            if (_xmlhttp.readyState != 4 || _xmlhttp.status < 200 || _xmlhttp.status >= 300)
            {
                throw new Exception(_xmlhttp.statusText);
            }

            // FileMode.Create overwrites an existing file, replacing the former
            // Exists/Delete/CreateNew sequence. The using blocks release the file handle
            // even when the write throws.
            using (System.IO.FileStream fs = new System.IO.FileStream(StringFilePath, System.IO.FileMode.Create))
            using (System.IO.BinaryWriter w = new System.IO.BinaryWriter(fs))
            {
                w.Write((byte[])_xmlhttp.responseBody);
            }

            return StringFilePath;
        }
用正则表达式找出所有图片；关于分页，就一页读完后跳转到下一页。
using System.Text.RegularExpressions;
using System.Net;
Pic_Remote("<img src=http://static.tianya.cn/w250/images/20081206/6262712/1228548462160.jpg>") // Example call: fetch the remote image and store it locally
      /// <summary>
      /// Creates today's image folder ("images/yyyy-MM/dd", resolved through
      /// Server.MapPath) and hands the HTML to <c>SaveUrlPics</c> for processing.
      /// </summary>
      /// <param name="news_Content">HTML fragment that may contain remote img tags.</param>
      /// <returns>The string returned by <c>SaveUrlPics</c>.</returns>
      private string Pic_Remote(string news_Content)
      {
          // Read the clock once so the month and day parts cannot disagree when the
          // call happens to straddle midnight.
          DateTime now = DateTime.Now;
          string nowyymm = now.ToString("yyyy-MM");    // current year and month
          string nowdd = now.ToString("dd");           // current day of month
          string path = "images/" + nowyymm + "/" + nowdd;
          Directory.CreateDirectory(Server.MapPath(path));
          return SaveUrlPics(news_Content, path, nowyymm, nowdd);
      }      // Download the images to local storage
      /// <summary>
      /// Downloads every image URL found in the HTML into the folder mapped from
      /// <paramref name="path"/> and rewrites the HTML so those URLs point at the local copies.
      /// </summary>
      /// <param name="strHTML">HTML fragment to scan and rewrite.</param>
      /// <param name="path">Virtual folder the files are saved to (resolved with Server.MapPath).</param>
      /// <param name="nowyymm">Year-month segment used in the rewritten URL.</param>
      /// <param name="nowdd">Day segment used in the rewritten URL.</param>
      /// <returns>The HTML with each successfully downloaded image URL replaced.
      /// URLs whose download failed are left untouched.</returns>
      public string SaveUrlPics(string strHTML, string path, string nowyymm, string nowdd)
      {
          string[] imgurlAry = GetImgTag(strHTML);

          // NOTE(review): files are written under `path`, but the HTML is rewritten to
          // "/Files/Remoteupfile/..." - confirm both resolve to the same physical folder.
          string imgPath = "/Files/Remoteupfile/" + nowyymm + "/" + nowdd;

          for (int i = 0; i < imgurlAry.Length; i++)
          {
              string imgUrl = imgurlAry[i];

              // Tags without a usable URL come back empty; DownloadFile and
              // string.Replace("") would both throw on them.
              if (string.IsNullOrEmpty(imgUrl))
              {
                  continue;
              }

              // Fixed, culture-independent timestamp: the default DateTime format can
              // contain '/' (or other separators), which breaks the target file path.
              string preStr = DateTime.Now.ToString("yyyyMMddHHmmss", System.Globalization.CultureInfo.InvariantCulture) + "_";
              string localName = preStr + imgUrl.Substring(imgUrl.LastIndexOf('/') + 1);

              try
              {
                  using (WebClient wc = new WebClient())
                  {
                      wc.DownloadFile(imgUrl, Server.MapPath(path) + "/" + localName);
                  }
              }
              catch (Exception ex)
              {
                  // Best effort, as before: a failed image must not abort the remaining
                  // ones, but the failure is no longer silent.
                  System.Diagnostics.Trace.TraceWarning("SaveUrlPics: could not download {0}: {1}", imgUrl, ex.Message);
                  continue;
              }

              // Point the HTML at the local copy only after the download succeeded.
              strHTML = strHTML.Replace(imgUrl, imgPath + "/" + localName);
          }

          return strHTML;
      }      // Get the image tags
      /// <summary>
      /// Finds every &lt;img&gt; tag in the HTML and returns, in document order, the value
      /// <c>GetImgUrl</c> produces for each tag.
      /// </summary>
      /// <param name="htmlStr">HTML to scan.</param>
      /// <returns>One entry per img tag found; empty array when there are none.</returns>
      private string[] GetImgTag(string htmlStr)
      {
          // [^>]* (instead of .+?) lets a tag span several lines and stops a bare
          // "<img>" from swallowing the text up to the next '>'; \b rejects "<imgfoo>".
          // RegexOptions.Compiled is omitted: compiling a throw-away instance on every
          // call costs more than it saves.
          Regex regObj = new Regex(@"<img\b[^>]*>", RegexOptions.IgnoreCase);

          // Run the pattern once and reuse the result for both the count and the loop.
          MatchCollection tags = regObj.Matches(htmlStr);
          string[] strAry = new string[tags.Count];
          for (int i = 0; i < tags.Count; i++)
          {
              strAry[i] = GetImgUrl(tags[i].Value);
          }
          return strAry;
      }      // Get the image URL
      // Absolute http/https URL ending in a known image extension. The URL may not
      // contain whitespace, quotes or angle brackets, so the match cannot run past the
      // attribute it starts in; the dot before the extension is a literal dot.
      private static readonly Regex ImgUrlPattern = new Regex(
          @"https?://[^\s""'<>]+\.(?:jpg|gif|bmp|png)",
          RegexOptions.Compiled | RegexOptions.IgnoreCase);

      /// <summary>
      /// Extracts the first absolute image URL (jpg, gif, bmp or png) from an img tag.
      /// </summary>
      /// <param name="imgTagStr">Text of a single img tag.</param>
      /// <returns>The URL, or an empty string when the tag contains none.</returns>
      private string GetImgUrl(string imgTagStr)
      {
          Match found = ImgUrlPattern.Match(imgTagStr);
          return found.Success ? found.Value : "";
      }
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Net;
using System.IO;
using System.IO.Compression;
using System.Text.RegularExpressions;namespace WikiPageCreater.Common
{
    /// <summary>
    /// Helpers for downloading a web page and detecting its character encoding.
    /// Both entry points are best-effort: on any failure they return a fallback value
    /// instead of throwing.
    /// </summary>
    public class PageHelper
    {
        // Responses declaring a Content-Length of 1 MiB or more are not read.
        private const int MaxContentLength = 1024 * 1024;

        // Request timeout in milliseconds.
        private const int RequestTimeout = 20000;

        // Finds charset=xxx with or without quotes around the value, so both
        // content="text/html; charset=gb2312" and <meta charset="utf-8"> are handled.
        private static readonly Regex CharsetPattern = new Regex(
            @"charset\s*=\s*[""']?(?<charset>[\w.:-]+)",
            RegexOptions.Compiled | RegexOptions.IgnoreCase);

        /// <summary>
        /// Detects the character encoding name of the page at <paramref name="url"/>.
        /// </summary>
        /// <param name="url">Address of the page.</param>
        /// <returns>
        /// The charset declared in the page source if one is found, otherwise the
        /// charset of the response's Content-Type header, otherwise
        /// <c>Encoding.Default.BodyName</c>. The latter is also returned when the
        /// request fails, does not answer 200 OK, or the page is too large.
        /// </returns>
        public static string GetEncoding(string url)
        {
            try
            {
                HttpWebRequest request = CreateRequest(url);
                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                {
                    if (IsReadable(response))
                    {
                        string html;
                        // ASCII is enough here: only the charset declaration is of interest.
                        using (StreamReader reader = OpenReader(response, Encoding.ASCII))
                        {
                            html = reader.ReadToEnd();
                        }

                        Match declared = CharsetPattern.Match(html);
                        if (declared.Success)
                        {
                            return declared.Groups["charset"].Value;
                        }

                        // CharacterSet may be null as well as empty.
                        if (!string.IsNullOrEmpty(response.CharacterSet))
                        {
                            return response.CharacterSet;
                        }
                    }
                }
            }
            catch (Exception ex)
            {
                // Deliberately best-effort; record the reason instead of hiding it.
                System.Diagnostics.Trace.TraceWarning("PageHelper.GetEncoding({0}) failed: {1}", url, ex.Message);
            }

            return Encoding.Default.BodyName;
        }

        /// <summary>
        /// Downloads the HTML source of the page at <paramref name="url"/>, decoding it
        /// with <paramref name="encoding"/>.
        /// </summary>
        /// <param name="url">Address of the page.</param>
        /// <param name="encoding">Encoding used to decode the response body.</param>
        /// <returns>
        /// The page source, or an empty string when the request fails, does not answer
        /// 200 OK, or the page is too large.
        /// </returns>
        public static string GetHtml(string url, Encoding encoding)
        {
            try
            {
                HttpWebRequest request = CreateRequest(url);
                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                {
                    if (IsReadable(response))
                    {
                        using (StreamReader reader = OpenReader(response, encoding))
                        {
                            return reader.ReadToEnd();
                        }
                    }
                }
            }
            catch (Exception ex)
            {
                // Deliberately best-effort; record the reason instead of hiding it.
                System.Diagnostics.Trace.TraceWarning("PageHelper.GetHtml({0}) failed: {1}", url, ex.Message);
            }

            return string.Empty;
        }

        // Builds the request shared by both entry points: 20 s timeout, redirects not
        // followed (so a redirecting URL does not yield 200 OK and hits the fallback).
        private static HttpWebRequest CreateRequest(string url)
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            request.Timeout = RequestTimeout;
            request.AllowAutoRedirect = false;
            return request;
        }

        // True when the response is 200 OK and its declared length is under the limit.
        private static bool IsReadable(HttpWebResponse response)
        {
            return response.StatusCode == HttpStatusCode.OK
                && response.ContentLength < MaxContentLength;
        }

        // Wraps the response body in a reader, transparently decompressing gzip content.
        private static StreamReader OpenReader(HttpWebResponse response, Encoding encoding)
        {
            Stream body = response.GetResponseStream();
            if (response.ContentEncoding != null
                && response.ContentEncoding.Equals("gzip", StringComparison.OrdinalIgnoreCase))
            {
                body = new GZipStream(body, CompressionMode.Decompress);
            }
            return new StreamReader(body, encoding);
        }
    }
}
抓取HTML 正则获取也可以
我的思路是。正则获取页面上所有img，然后正则获取该img下的相关信息。
要用到的表格
<table width="100%"  border="0" cellspacing="0" cellpadding="0">
                              <tr>
                                <td width="37%"><a href="/2009product/kingold_product/upimages/20107291631053181.jpg" target="_blank"><img src="/2009product/kingold_product/upimages/20107291631053181.jpg" width="225" border="0"></a></td>
                                <td width="63%"><table width="100%"  border="0" cellspacing="0" cellpadding="0">
                                    <tr>
                                      <td> </td>
                                      <td align="left" class="a12">金凰首饰</td>
                                    </tr>
                                    <tr>
                                      <td> </td>
                                      <td align="left" class="a12">【编码】</td>
                                      </tr>
                                    <tr>
                                      <td> </td>
                                      <td align="left" class="a12"> 02220002</td>
                                      </tr>
                                    <tr>
                                      <td> </td>
                                      <td align="left" class="a12">【克重】</td>
                                      </tr>
                                    <tr>
                                      <td width="4%"> </td>
                                      <td align="left" class="a12">55克</td>
                                      </tr>
                                </table></td>
                              </tr>
                          </table>不知思路是否正确。请大侠指点。若正确，我可以得到页面上所有img，不知如何得到相关文字信息。
得到SRC还是很简单。。string html = "这里是你下载下来的HTML码源"
// Matches the text that sits between `<a href="..." target="_blank"><img src="` and
// `" width="225" border="0"></a>`, i.e. the src value of each such img tag.
Regex srcPattern = new Regex(@"(?<=<a href="".*?target=""_blank""><img src=\"").*?(?="" width=""225"" border=""0""></a>)", RegexOptions.None);
foreach (Match srcMatch in srcPattern.Matches(html ))
{
   // srcMatch.Value is the src of one img
}
得到img的src 我可以得到。但是还要得到img对应的那些个重量、编码信息
哎，搞出来了。用正则取到图片所属table。然后把table添加到自己的页面。然后用js得到图片的url、编号、克重。
后台用WebClient.DownloadFile下载图片……额，貌似有点慢。然后用一个事务提交到数据库。笨方法，不知有没有高手有更好的方法。