http://www.kingold.com.cn/2009product/index2.asp?class_2=0 比如这个网站,上面有很多图片,需要按类别一次性抓取完。同一类分页后 url 变成 http://www.kingold.com.cn/2009product/index2.asp?class_2=0&offset=24 ,之后每翻一页 offset 增加 24。请问大侠们,如何一次性抓取同一类的所有图片、图片对应的编码和克重?我现在用了 WebRequest,只能一页一页地抓取。实现思路是先把页面加载到本地,然后用 js 得到相关信息。

解决方案 »

  1.   

     /// <summary>
            /// 保存远程文件
            /// </summary>
            /// <param name="Url">地址</param>
            /// <param name="Path">保存路径</param>
            /// <param name="FileName">保存文件名</param>
            /// <returns>文件名</returns>
            public static string RemoteSave(string Url, string Path)
            {
                string StringFilePath = Path + GetFileExtends(Url);
                MSXML2.XMLHTTP _xmlhttp = new MSXML2.XMLHTTPClass();
                _xmlhttp.open("GET", Url, false, null, null);
                _xmlhttp.send("");
                if (_xmlhttp.readyState == 4)
                {
                    if (System.IO.File.Exists(StringFilePath))
                        System.IO.File.Delete(StringFilePath);
                    System.IO.FileStream fs = new System.IO.FileStream(StringFilePath, System.IO.FileMode.CreateNew);
                    System.IO.BinaryWriter w = new System.IO.BinaryWriter(fs);
                    w.Write((byte[])_xmlhttp.responseBody);
                    w.Close();
                    fs.Close();
                }
                else
                    throw new Exception(_xmlhttp.statusText);
                return StringFilePath;
            } 
    用正则表达式找出所有图片;关于分页,就是一页读完后跳转到下一页继续抓取。
      

  2.   

    using System.Text.RegularExpressions;
    using System.Net;
    Pic_Remote("<img src=http://static.tianya.cn/w250/images/20081206/6262712/1228548462160.jpg>") //Example call: saves the remote image locally
          private string Pic_Remote(string news_Content)
          {
              string htmlStr = news_Content;
              string nowyymm = DateTime.Now.ToString("yyyy-MM");    //当前年月
              string nowdd = DateTime.Now.ToString("dd"); //当天号数
              string path = "images/" + nowyymm + "/" + nowdd;
              Directory.CreateDirectory(Server.MapPath(path));
              string returnValue = "";
              returnValue = SaveUrlPics(htmlStr, path, nowyymm, nowdd);
              return returnValue;
          }      //下载图片到本地
          public string SaveUrlPics(string strHTML, string path, string nowyymm, string nowdd)
          {
              string[] imgurlAry = GetImgTag(strHTML);
              try
              {
                  for (int i = 0; i < imgurlAry.Length; i++)
                  {
                      //WebRequest req = WebRequest.Create(imgurlAry[i]);
                      string preStr = System.DateTime.Now.ToString() + "_";
                      preStr = preStr.Replace("-", "");
                      preStr = preStr.Replace(":", "");
                      preStr = preStr.Replace(" ", "");
                      WebClient wc = new WebClient();
                      wc.DownloadFile(imgurlAry[i], Server.MapPath(path) + "/" + preStr + imgurlAry[i].Substring(imgurlAry[i].LastIndexOf("/") + 1));
                      //替换原图片地址
                      string imgPath = "/Files/Remoteupfile/" + nowyymm + "/" + nowdd;
                      strHTML = strHTML.Replace(imgurlAry[i], imgPath + "/" + preStr + imgurlAry[i].Substring(imgurlAry[i].LastIndexOf("/") + 1));
                  }
              }
              catch (Exception ex)
              {
                  //return ex.Message;
              }
              return strHTML;
          }      //获取图片标志
          private string[] GetImgTag(string htmlStr)
          {
              Regex regObj = new Regex("<img.+?>", RegexOptions.Compiled | RegexOptions.IgnoreCase);
              string[] strAry = new string[regObj.Matches(htmlStr).Count];
              int i = 0;
              foreach (Match matchItem in regObj.Matches(htmlStr))
              {
                  strAry[i] = GetImgUrl(matchItem.Value);
                  i++;
              }
              return strAry;
          }      //获取图片URL地址
          private string GetImgUrl(string imgTagStr)
          {
              string str = "";
              Regex regObj = new Regex("http://.+.(?:jpg|gif|bmp|png)", RegexOptions.Compiled | RegexOptions.IgnoreCase);
              foreach (Match matchItem in regObj.Matches(imgTagStr))
              {
                  str = matchItem.Value;
              }
              return str;
          }[/code]
      

  3.   

    using System;
    using System.Collections.Generic;
    using System.Linq;
    using System.Text;
    using System.Net;
    using System.IO;
    using System.IO.Compression;
    using System.Text.RegularExpressions;namespace WikiPageCreater.Common
    {
        public class PageHelper
        {
            /// <summary>
            /// 根据 url 获取网页编码
            /// </summary>
            /// <param name="url"></param>
            /// <returns></returns>
            public static string GetEncoding(string url)
            {
                HttpWebRequest request = null;
                HttpWebResponse response = null;
                StreamReader reader = null;
                try
                {
                    request = (HttpWebRequest)WebRequest.Create(url);
                    request.Timeout = 20000;
                    request.AllowAutoRedirect = false;                response = (HttpWebResponse)request.GetResponse();
                    if (response.StatusCode == HttpStatusCode.OK && response.ContentLength < 1024 * 1024)
                    {
                        if (response.ContentEncoding != null && response.ContentEncoding.Equals("gzip", StringComparison.InvariantCultureIgnoreCase))
                            reader = new StreamReader(new GZipStream(response.GetResponseStream(), CompressionMode.Decompress));
                        else
                            reader = new StreamReader(response.GetResponseStream(), Encoding.ASCII);                    string html = reader.ReadToEnd();                    Regex reg_charset = new Regex(@"charset\b\s*=\s*(?<charset>[^""]*)");
                        if (reg_charset.IsMatch(html))
                        {
                            return reg_charset.Match(html).Groups["charset"].Value;
                        }
                        else if (response.CharacterSet != string.Empty)
                        {
                            return response.CharacterSet;
                        }
                        else
                            return Encoding.Default.BodyName;
                    }
                }
                catch
                {
                }
                finally
                {                if (response != null)
                    {
                        response.Close();
                        response = null;
                    }
                    if (reader != null)
                        reader.Close();                if (request != null)
                        request = null;            }            return Encoding.Default.BodyName;
            }        /// <summary>
            /// 根据 url 和 encoding 获取当前url页面的 html 源代码        
           /// </summary>
            /// <param name="url"></param>
            /// <param name="encoding"></param>
            /// <returns></returns>
            public static string GetHtml(string url, Encoding encoding)
            {
                HttpWebRequest request = null;
                HttpWebResponse response = null;
                StreamReader reader = null;
                try
                {
                    request = (HttpWebRequest)WebRequest.Create(url);
                    request.Timeout = 20000;
                    request.AllowAutoRedirect = false;                response = (HttpWebResponse)request.GetResponse();
                    if (response.StatusCode == HttpStatusCode.OK && response.ContentLength < 1024 * 1024)
                    {
                        if (response.ContentEncoding != null && response.ContentEncoding.Equals("gzip", StringComparison.InvariantCultureIgnoreCase))
                            reader = new StreamReader(new GZipStream(response.GetResponseStream(), CompressionMode.Decompress), encoding);
                        else
                            reader = new StreamReader(response.GetResponseStream(), encoding);
                        string html = reader.ReadToEnd();                    return html;
                    }
                }
                catch
                {
                }
                finally
                {                if (response != null)
                    {
                        response.Close();
                        response = null;
                    }
                    if (reader != null)
                        reader.Close();                if (request != null)
                        request = null;            }            return string.Empty;
            }
        }
    }
    抓取HTML 正则获取也可以
      

  4.   

    我的思路是。正则获取页面上所有img,然后正则获取该img下的相关信息。
    要用到的表格
    <table width="100%"  border="0" cellspacing="0" cellpadding="0"> 
                                  <tr> 
                                    <td width="37%"><a href="/2009product/kingold_product/upimages/20107291631053181.jpg" target="_blank"><img src="/2009product/kingold_product/upimages/20107291631053181.jpg" width="225" border="0"></a></td> 
                                    <td width="63%"><table width="100%"  border="0" cellspacing="0" cellpadding="0"> 
                                        <tr> 
                                          <td>&nbsp;</td> 
                                          <td align="left" class="a12">金凰首饰</td> 
                                        </tr> 
                                        <tr> 
                                          <td>&nbsp;</td> 
                                          <td align="left" class="a12">【编码】</td> 
                                          </tr> 
                                        <tr> 
                                          <td>&nbsp;</td> 
                                          <td align="left" class="a12">&nbsp;02220002</td> 
                                          </tr> 
                                        <tr> 
                                          <td>&nbsp;</td> 
                                          <td align="left" class="a12">【克重】</td> 
                                          </tr> 
                                        <tr> 
                                          <td width="4%">&nbsp;</td> 
                                          <td align="left" class="a12">55克</td> 
                                          </tr> 
                                    </table></td> 
                                  </tr> 
                              </table>不知思路是否正确。请大侠指点。若正确,我可以得到页面上所有img,不知如何得到相关文字信息。
      

  5.   


    得到SRC还是很简单。。string html = "这里是你下载下来的HTML码源"
    //The lookbehind and lookahead keep the surrounding markup out of the match,
    //so every match consists of the src value only.
    const string srcPattern = @"(?<=<a href="".*?target=""_blank""><img src=\"").*?(?="" width=""225"" border=""0""></a>)";
    foreach (Match srcMatch in Regex.Matches(html, srcPattern, RegexOptions.None))
    {
        //srcMatch.Value is the src of one img
    }
      

  6.   


    得到img的src 我可以得到。但是还要得到img对应的那些个 重量、编码信息
      

  7.   

    哎,搞出来了。用正则取到图片所属table。然后把table添加到自己的页面。然后用js得到图片的url、编号、克重。
    后台用WebClient.DownloadFile下载图片。额,貌似有点慢。然后用一个事务提交到数据库。笨方法,不知有没有高手有更好的方法。