想要采集的某个站,貌似被“反采集”了,懂采集的朋友帮看看 本帖最后由 xiongmzh 于 2011-08-21 00:37:40 编辑 解决方案 » 免费领取超大流量手机卡,每月29元包185G流量+100分钟通话, 中国电信官方发货 不过是 gzip 压缩了而已$url = 'http://price.pcauto.com.cn/dictionary/category/';$s = file_get_contents($url);echo gzdecode($s);function gzdecode($data) { $len = strlen($data); if ($len < 18 || strcmp(substr($data,0,2),"\x1f\x8b")) { return null; // Not GZIP format (See RFC 1952) } $method = ord(substr($data,2,1)); // Compression method $flags = ord(substr($data,3,1)); // Flags if ($flags & 31 != $flags) { // Reserved bits are set -- NOT ALLOWED by RFC 1952 return null; } // NOTE: $mtime may be negative (PHP integer limitations) $mtime = unpack("V", substr($data,4,4)); $mtime = $mtime[1]; $xfl = substr($data,8,1); $os = substr($data,8,1); $headerlen = 10; $extralen = 0; $extra = ""; if ($flags & 4) { // 2-byte length prefixed EXTRA data in header if ($len - $headerlen - 2 < 8) { return false; // Invalid format } $extralen = unpack("v",substr($data,8,2)); $extralen = $extralen[1]; if ($len - $headerlen - 2 - $extralen < 8) { return false; // Invalid format } $extra = substr($data,10,$extralen); $headerlen += 2 + $extralen; } $filenamelen = 0; $filename = ""; if ($flags & 8) { // C-style string file NAME data in header if ($len - $headerlen - 1 < 8) { return false; // Invalid format } $filenamelen = strpos(substr($data,8+$extralen),chr(0)); if ($filenamelen === false || $len - $headerlen - $filenamelen - 1 < 8) { return false; // Invalid format } $filename = substr($data,$headerlen,$filenamelen); $headerlen += $filenamelen + 1; } $commentlen = 0; $comment = ""; if ($flags & 16) { // C-style string COMMENT data in header if ($len - $headerlen - 1 < 8) { return false; // Invalid format } $commentlen = strpos(substr($data,8+$extralen+$filenamelen),chr(0)); if ($commentlen === false || $len - $headerlen - $commentlen - 1 < 8) { return false; // Invalid header format } $comment = substr($data,$headerlen,$commentlen); $headerlen += $commentlen + 1; } $headercrc = ""; if ($flags & 1) { // 2-bytes (lowest order) of CRC32 on header present if ($len - $headerlen - 2 < 8) { return false; // Invalid format } $calccrc = crc32(substr($data,0,$headerlen)) & 0xffff; $headercrc = unpack("v", substr($data,$headerlen,2)); $headercrc = $headercrc[1]; if ($headercrc != $calccrc) { return false; // Bad header CRC } $headerlen += 2; } // GZIP FOOTER - These be negative due to PHP's limitations $datacrc = unpack("V",substr($data,-8,4)); $datacrc = $datacrc[1]; $isize = unpack("V",substr($data,-4)); $isize = $isize[1]; // Perform the decompression: $bodylen = $len-$headerlen-8; if ($bodylen < 1) { // This should never happen - IMPLEMENTATION BUG! return null; } $body = substr($data,$headerlen,$bodylen); $data = ""; if ($bodylen > 0) { switch ($method) { case 8: // Currently the only supported compression method: $data = gzinflate($body); break; default: // Unknown compression method return false; } } else { // I'm not sure if zero-byte body content is allowed. // Allow it for now... Do nothing... } // Verifiy decompressed size and CRC32: // NOTE: This may fail with large data sizes depending on how // PHP's integer limitations affect strlen() since $isize // may be negative for large sizes. if ($isize != strlen($data) || crc32($data) != $datacrc) { // Bad format! Length or CRC doesn't match! return false; } return $data; } 确实被GZIP压缩,解压后正常! private static string DownloadWebPage(string url) { WebRequest request = WebRequest.Create(url); request.Credentials = CredentialCache.DefaultCredentials; HttpWebResponse response = (HttpWebResponse)request.GetResponse(); //Console.WriteLine(response.StatusDescription); Stream dataStream = response.GetResponseStream(); GZipStream gzip = new GZipStream(dataStream, CompressionMode.Decompress); StreamReader reader = new StreamReader(gzip, Encoding.GetEncoding("GB2312")); string responseFromServer = reader.ReadToEnd(); reader.Close(); dataStream.Close(); response.Close(); return responseFromServer; } static void Main(string[] args) { DownloadWebPage("http://price.pcauto.com.cn/dictionary/category/"); } 不过是 gzip 压缩了而已 版主很强大,请教一下:是如何看出来时经过gzip压缩的。以后学习留作参考。 呵呵,谢谢了,又学了一招我那问题已经搞定了,不过,用的。net采集的 如果被“反采集”,用file_get_contents采集到的应该是空白,但是我采集的是乱码,想必就因为这个,才判断是gzip压缩吧 今天面试php遇到几个问题不会,请大家帮我看看 php文件下载功能的问题 PHP可以替代jsp吗? php_iconv.dll,php_domxml.dll那里有的下载啊。???? 问个问题:PHP5调用MYSQL5存储过程的问题。 刚刚开始学PHP,谁能提供一段关于PHP+apache+Postgresql的代码。 这样包含文件为啥不行? 关于SESSION的问题。 请各位老哥帮我解决身份验证不能登陆的问题,给300分? 关于Apache服务器的一些问题??? 很基础的东西 求一个表的设计:每篇文章的关键字应该怎么建表?
$url = 'http://price.pcauto.com.cn/dictionary/category/';
$s = file_get_contents($url);
echo gzdecode($s);function gzdecode($data) {
$len = strlen($data);
if ($len < 18 || strcmp(substr($data,0,2),"\x1f\x8b")) {
return null; // Not GZIP format (See RFC 1952)
} $method = ord(substr($data,2,1)); // Compression method
$flags = ord(substr($data,3,1)); // Flags
if ($flags & 31 != $flags) {
// Reserved bits are set -- NOT ALLOWED by RFC 1952
return null;
} // NOTE: $mtime may be negative (PHP integer limitations)
$mtime = unpack("V", substr($data,4,4));
$mtime = $mtime[1];
$xfl = substr($data,8,1);
$os = substr($data,8,1);
$headerlen = 10;
$extralen = 0;
$extra = "";
if ($flags & 4) {
// 2-byte length prefixed EXTRA data in header
if ($len - $headerlen - 2 < 8) {
return false; // Invalid format
}
$extralen = unpack("v",substr($data,8,2));
$extralen = $extralen[1];
if ($len - $headerlen - 2 - $extralen < 8) {
return false; // Invalid format
}
$extra = substr($data,10,$extralen);
$headerlen += 2 + $extralen;
}
$filenamelen = 0;
$filename = "";
if ($flags & 8) {
// C-style string file NAME data in header
if ($len - $headerlen - 1 < 8) {
return false; // Invalid format
}
$filenamelen = strpos(substr($data,8+$extralen),chr(0));
if ($filenamelen === false || $len - $headerlen - $filenamelen - 1 < 8) {
return false; // Invalid format
}
$filename = substr($data,$headerlen,$filenamelen);
$headerlen += $filenamelen + 1;
}
$commentlen = 0;
$comment = "";
if ($flags & 16) {
// C-style string COMMENT data in header
if ($len - $headerlen - 1 < 8) {
return false; // Invalid format
}
$commentlen = strpos(substr($data,8+$extralen+$filenamelen),chr(0));
if ($commentlen === false || $len - $headerlen - $commentlen - 1 < 8) {
return false; // Invalid header format
}
$comment = substr($data,$headerlen,$commentlen);
$headerlen += $commentlen + 1;
}
$headercrc = "";
if ($flags & 1) {
// 2-bytes (lowest order) of CRC32 on header present
if ($len - $headerlen - 2 < 8) {
return false; // Invalid format
}
$calccrc = crc32(substr($data,0,$headerlen)) & 0xffff;
$headercrc = unpack("v", substr($data,$headerlen,2));
$headercrc = $headercrc[1];
if ($headercrc != $calccrc) {
return false; // Bad header CRC
}
$headerlen += 2;
}
// GZIP FOOTER - These be negative due to PHP's limitations
$datacrc = unpack("V",substr($data,-8,4));
$datacrc = $datacrc[1];
$isize = unpack("V",substr($data,-4));
$isize = $isize[1]; // Perform the decompression:
$bodylen = $len-$headerlen-8;
if ($bodylen < 1) {
// This should never happen - IMPLEMENTATION BUG!
return null;
}
$body = substr($data,$headerlen,$bodylen);
$data = "";
if ($bodylen > 0) {
switch ($method) {
case 8:
// Currently the only supported compression method:
$data = gzinflate($body);
break;
default:
// Unknown compression method
return false;
}
} else {
// I'm not sure if zero-byte body content is allowed.
// Allow it for now... Do nothing...
}
// Verifiy decompressed size and CRC32:
// NOTE: This may fail with large data sizes depending on how
// PHP's integer limitations affect strlen() since $isize
// may be negative for large sizes.
if ($isize != strlen($data) || crc32($data) != $datacrc) {
// Bad format! Length or CRC doesn't match!
return false;
}
return $data;
}
private static string DownloadWebPage(string url)
{
WebRequest request =
WebRequest.Create(url);
request.Credentials = CredentialCache.DefaultCredentials;
HttpWebResponse response = (HttpWebResponse)request.GetResponse();
//Console.WriteLine(response.StatusDescription);
Stream dataStream = response.GetResponseStream();
GZipStream gzip = new GZipStream(dataStream, CompressionMode.Decompress);
StreamReader reader = new StreamReader(gzip, Encoding.GetEncoding("GB2312"));
string responseFromServer = reader.ReadToEnd();
reader.Close();
dataStream.Close();
response.Close();
return responseFromServer;
}
static void Main(string[] args)
{
DownloadWebPage("http://price.pcauto.com.cn/dictionary/category/");
}
版主很强大,请教一下:是如何看出来时经过gzip压缩的。以后学习留作参考。