/// <summary>
/// Handles the fetch button: downloads every page in a numeric id range and
/// hands the concatenated HTML to <c>LoadToText</c>.
/// </summary>
/// <remarks>
/// The URL template comes from <c>tb1</c>. The number found between the marker
/// texts in the <c>keyword</c> and <c>kwend</c> controls is the first id, and
/// <c>tb2</c> holds the last id (inclusive). The output name comes from
/// <c>filetxt</c>, defaulting to "html.txt" when that box is empty.
/// </remarks>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void button1_Click(object sender, EventArgs e)
{
    string fileName = this.filetxt.Text;
    if (fileName == "")
    {
        fileName = "html.txt";
    }

    string url = this.tb1.Text;
    string startMarker = this.keyword.Text;
    string endMarker = this.kwend.Text;

    // Position just past the start marker, and position of the end marker.
    int index = url.IndexOf(startMarker) + startMarker.Length;
    int indexend = url.IndexOf(endMarker);

    int firstId = Convert.ToInt32(url.Substring(index, indexend - index));
    int lastId = Convert.ToInt32(this.tb2.Text);

    // NOTE(review): "index - 1" drops the last character before the number
    // (the final character of the start marker). Kept exactly as the original
    // behaves; confirm against the marker text users actually enter.
    string prefix = url.Substring(0, index - 1);
    string suffix = url.Substring(indexend);

    StringBuilder html = new StringBuilder();

    // A small reusable chunk buffer is enough: Stream.Read returns whatever is
    // currently available, not the whole response, so the size of the buffer
    // never determined how much of the page was read.
    byte[] buf = new byte[8192];

    for (int id = firstId; id <= lastId; id++)
    {
        string page = prefix + id + suffix;
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(page);

        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream resStream = response.GetResponseStream())
        using (MemoryStream pageBytes = new MemoryStream())
        {
            // Keep reading until Read reports end of stream (0 bytes).
            int count;
            while ((count = resStream.Read(buf, 0, buf.Length)) > 0)
            {
                pageBytes.Write(buf, 0, count);
            }

            // Decode the complete page at once so a multi-byte character that
            // straddles two network chunks is not corrupted.
            html.Append(Encoding.Default.GetString(pageBytes.GetBuffer(), 0, (int)pageBytes.Length));
        }
    }

    this.LoadToText(html.ToString(), fileName);
}
当点击抓取按钮时,程序根据网页 url(123.asp?id=456&page=1)里 id 的范围抓取网页 html 源码。所有的网页都能抓到,但每个页面的内容都不全,是 byte[] 用错了吗?请高手指教!
// NOTE(review): this is a method body with no signature line; it repeats the
// statements of button1_Click. It reads a URL template from tb1, extracts the
// starting number between the keyword/kwend marker texts, and requests one
// page per number up to the value in tb2, passing the combined text to LoadToText.
{
string str = "";
string FileTxt = "";
int count = 0;
FileTxt = this.filetxt.Text;
// Fall back to a default output name when the file-name box is empty.
if (FileTxt=="")
{ FileTxt = "html.txt"; }
int begindex = 0;
int endindex = 0;
string num = "";
// One large buffer (about 38 MB) intended to hold a whole page.
byte[] buf = new byte[38192000];
num = this.tb1.Text.ToString();
string keyword = this.keyword.Text;
string kwend = this.kwend.Text;
int index = num.IndexOf(keyword);
int indexend = num.IndexOf(kwend);
// NOTE(review): 'start' is never used below.
int start=0;
// Move index to the first character after the keyword marker.
index= index + keyword.Length;
// The text between the two markers is parsed as the first page number.
begindex = Convert.ToInt32(num.Substring(index, indexend-index));
// tb2 supplies the last page number (inclusive).
endindex = Convert.ToInt32(this.tb2.Text); for (; begindex <= endindex; begindex++)
{
// NOTE(review): "index - 1" omits the character immediately before the number
// (the last character of the keyword marker) - confirm this is intended.
string page = tb1.Text.Substring(0, index - 1) + begindex + tb1.Text.Substring(indexend);
HttpWebRequest request = (HttpWebRequest)
WebRequest.Create(page);
HttpWebResponse response = (HttpWebResponse)
request.GetResponse();
Stream resStream = response.GetResponseStream();
// NOTE(review): Read is called only once per page. Stream.Read may return fewer
// bytes than requested, so only the first chunk of each response is kept.
count = resStream.Read(buf, 0, buf.Length);
str = str + Encoding.Default.GetString(buf, 0, count);
// Only the stream is closed here; the response object itself is not closed.
resStream.Close();
}
this.LoadToText(str, FileTxt); }
当点击抓取按钮时,程序根据网页 url(123.asp?id=456&page=1)里 id 的范围抓取网页 html 源码。所有的网页都能抓到,但每个页面的内容都不全,是 byte[] 用错了吗?请高手指教!
回复:内部 socket 的缓冲区一次根本读不下那么多,Read 每次只返回当前已到达的数据,所以你得循环多次读取,直到 Read 返回 0 为止。另外,一次分配这么大的 byte[],你不考虑一下内存占用吗?可以参考下面的写法:
/// <summary>
/// Checks that the page exists and returns the text of the response.
/// </summary>
/// <param name="urlstr">Remote address to request; surrounding whitespace is trimmed.</param>
/// <param name="strEncoding">
/// Name of the encoding used to decode the response: "UTF-8", "UTF-7" or
/// "UNICODE" (case-insensitive). Any other value falls back to
/// <see cref="Encoding.Default"/>.
/// </param>
/// <returns>
/// The downloaded text, or <c>null</c> when the response content type is not
/// <c>text/*</c> or when a <see cref="WebException"/> or
/// <see cref="IOException"/> occurs (a message box is shown in those cases).
/// </returns>
private StringBuilder FValidAndGetURL(string urlstr, string strEncoding)
{
    try
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(urlstr.Trim());
        request.Timeout = 60000;
        request.Method = "GET";
        request.AllowAutoRedirect = true;
        request.UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)";
        request.Accept = @"*/*";

        // 'using' disposes the response, stream and reader exactly once on
        // every exit path, replacing the manual close-then-close-again in finally.
        using (WebResponse response = request.GetResponse())
        {
            // Ordinal comparison: header values are protocol text, not
            // linguistic text, so the current culture must not affect the match.
            if (!response.ContentType.StartsWith("text/", StringComparison.OrdinalIgnoreCase))
            {
                MessageBox.Show("获取的不是web信息:" + response.ContentType.Trim());
                return null;
            }

            using (Stream stream = response.GetResponseStream())
            {
                Encoding encoding;

                // Invariant upper-casing: under cultures such as tr-TR,
                // ToUpper() maps 'i' to a dotted capital I, so "unicode" would
                // not match "UNICODE" and would silently fall through to the default.
                switch (strEncoding.Trim().ToUpperInvariant())
                {
                    case "UTF-8":
                        encoding = Encoding.UTF8;
                        break;
                    case "UTF-7":
                        encoding = Encoding.UTF7;
                        break;
                    case "UNICODE":
                        encoding = Encoding.Unicode;
                        break;
                    default:
                        encoding = Encoding.Default;
                        break;
                }

                using (StreamReader reader = new StreamReader(stream, encoding))
                {
                    // ReadToEnd keeps reading until the server closes the
                    // stream, so the whole page is returned, not just one chunk.
                    return new StringBuilder(reader.ReadToEnd());
                }
            }
        }
    }
    catch (WebException e)
    {
        MessageBox.Show(e.Message);
        return null;
    }
    catch (IOException e)
    {
        MessageBox.Show(e.Message);
        return null;
    }
}