private void button1_Click(object sender, EventArgs e)
        {
            string str = "";
            string FileTxt = "";
            int  count = 0;
            FileTxt = this.filetxt.Text;
            if (FileTxt=="")
            { FileTxt = "html.txt"; }
            int begindex = 0;
            int endindex = 0;
            string num = "";
            byte[] buf = new byte[38192000];
         
                num = this.tb1.Text.ToString();
                string keyword = this.keyword.Text;
                string kwend = this.kwend.Text;
                int index = num.IndexOf(keyword);
                int indexend = num.IndexOf(kwend);
                int start=0;
                index= index + keyword.Length;
                          begindex = Convert.ToInt32(num.Substring(index, indexend-index));
                endindex = Convert.ToInt32(this.tb2.Text);                for (; begindex <= endindex; begindex++)
                {
                    string page = tb1.Text.Substring(0, index - 1) + begindex + tb1.Text.Substring(indexend);
                    HttpWebRequest request = (HttpWebRequest)
                     WebRequest.Create(page);
                    HttpWebResponse response = (HttpWebResponse)
                    request.GetResponse();
                    Stream resStream = response.GetResponseStream();
                        count = resStream.Read(buf, 0, buf.Length);                
                        str = str + Encoding.Default.GetString(buf, 0, count);
                
                                  resStream.Close();
                }
                this.LoadToText(str, FileTxt);            }
当点击抓取按钮时,程序根网页url(123.asp?id=456&page=1)里 id的范围抓取网页html源码,所有的网页能全部抓到了,但都不全,是 byte[]错了吗?高手指教!

解决方案 »

  1.   

    有没考虑用 XMLHTTP来抓呢?
      

  2.   

    count = resStream.Read(buf, 0, buf.Length);                这个地方要分几次读, 你的缓冲区new byte[38192000] 太大了,
    ,内部socket的缓冲区一次根本读不下那么多,  所以你得循环几次来读取。你不考虑下,你的内存么
      

  3.   

    使用下边这个通用获取远程web函数来试验,专门使用StreamReader实例来获取流比较稳当点
    /// <summary>
    /// 判断页面是否存在,并返回页面快照信息
    /// </summary>
    /// <param name="urlstr">远程地址</param>
    /// <param name="strEncoding">编码格式</param>
    /// <returns>获取的数据,没有则返回空</returns>
    private StringBuilder FValidAndGetURL(string urlstr,string strEncoding) 
    {
    //lock_GetPageInfo.WaitOne(); //线程加锁
    WebResponse response = null;
    Stream stream = null;
    StreamReader reader = null; try
    {
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(@urlstr.Trim()); request.Timeout=60000; request.Method="GET";
    request.AllowAutoRedirect=true;
    request.UserAgent="Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)";
    request.Accept=@"*/*";
    //request.CookieContainer=this.Get_CookieContainer(urlstr.Trim());

    response = request.GetResponse(); if (!response.ContentType.ToLower().StartsWith("text/"))
    {
    //SaveBinaryFile(response);
    MessageBox.Show("获取的不是web信息:" + response.ContentType.Trim());
    return null;
    }                stream = response.GetResponseStream(); System.Text.Encoding encoding; switch(strEncoding.Trim().ToUpper())
    {
    case "UTF-8":
    {
    encoding = Encoding.UTF8;
    break;
    }
    case "UTF-7":
    {
    encoding = Encoding.UTF7;
    break;
    }
    case "UNICODE":
    {
    encoding = Encoding.Unicode;
    break;
    }
    default:
    {
    encoding = Encoding.Default;
    break;
    }
    } reader = new StreamReader(stream, encoding); StringBuilder buffer=new StringBuilder(); buffer.Append(reader.ReadToEnd());                 

    reader.Close();
    stream.Close();
    response.Close();
    return buffer;
    }
    catch (WebException e)
    {
    MessageBox.Show(e.Message);
    return null;
    }
    catch (IOException e)
    {
    MessageBox.Show(e.Message);
    return null;
    }
    finally
    {
    if (reader != null)
    {
    reader.Close();
    } if (stream != null)
    {
    stream.Close();
    } if (response != null)
    {
    response.Close();
    }
    } //lock_GetPageInfo.ReleaseMutex();//线程解锁

    }