现在要用 PHP 写一个函数:
function get_zh($url,$str)
{
return ....;
}
要求:传入一个$url,把$url中的所有子页中的内容含有$str的子页url返回出来,并把子页含有$str的那句话也返回出来,返回格式是 $url_z|$str_z
如: 传入二个参数是 "http://www.hao123.com/" "网"
那么就要返回出http://www.hao123.com/页面中所有的子页中含有“网”字的子页的url与子页中含有“网”字的那句话也要返回出来
(如:返回出 http://www.hao123.com/netbuy.htm|淘宝网)大家帮我想想,多谢
function get_zh($url,$str)
{
return ....;
}
要求:传入一个$url,把$url中的所有子页中的内容含有$str的子页url返回出来,并把子页含有$str的那句话也返回出来,返回格式是 $url_z|$str_z
如: 传入二个参数是 "http://www.hao123.com/" "网"
那么就要返回出http://www.hao123.com/页面中所有的子页中含有“网”字的子页的url与子页中含有“网”字的那句话也要返回出来
(如:返回出 http://www.hao123.com/netbuy.htm|淘宝网)大家帮我想想,多谢
用XMLHTTP来读取页面内容,然后自己分析字符串就可以了
这里有VC使用XMLHTTP的范例
http://blog.csdn.net/SupermanKing/archive/2008/10/16/3082913.aspx
这里是VB使用XMLHTTP的范例:
// VB XMLHTTP example: http://blog.csdn.net/SupermanKing/archive/2008/10/16/3082860.aspx
// Using XMLHTTP from PHP is much like VB; only the object creation differs slightly.
$XMLObject = new com('Microsoft.XMLHTTP'); // create the COM object
$URLPath = 'http://www.baidu.com';         // address to request
$SendStr = '';                             // request body; empty here because this is a GET
$StrCode = 'gb2312';                       // charset used to decode the response bytes

// Third argument false = synchronous request, so responseBody is ready right after Send().
$XMLObject->open('GET', $URLPath, false);
// Add a header to the HTTP request.
$XMLObject->setRequestHeader('CONTENT-TYPE', 'application/x-www-form-urlencoded');
$XMLObject->Send($SendStr); // submit the request

// Decode the raw response bytes with the chosen charset and print them.
echo bytes2bstr($XMLObject->responseBody, $StrCode);

// Helper that converts the byte array into a string:
/**
 * Converts a COM byte array into a PHP string using an ADODB.Stream object.
 *
 * The bytes are written to the stream in binary mode, the stream is rewound
 * and switched to text mode, and the content is read back decoded with the
 * given charset.
 *
 * @param mixed  $body    Byte array as returned by XMLHTTP's responseBody.
 * @param string $StrCode Charset name understood by ADODB.Stream (e.g. 'gb2312').
 * @return string Decoded text.
 */
function bytes2bstr($body, $StrCode){
    $objStream = new com("adodb.stream");
    $objStream->Type = 1;            // 1 = adTypeBinary
    $objStream->Mode = 3;            // 3 = adModeReadWrite
    $objStream->open();
    $objStream->Write($body);
    $objStream->Position = 0;        // rewind before switching the stream type
    $objStream->Type = 2;            // 2 = adTypeText
    $objStream->Charset = $StrCode;
    $text = $objStream->ReadText();  // no argument: read to the end of the stream
    $objStream->Close();             // release the stream instead of leaking it
    return $text;
}
通过 new com(组件类名) 就可以创建出组件来使用,所以在 Windows 的 IIS 下 XMLHTTP 是可以用的,其他环境就难说了。
// Minimal alternative: fetch a page with file_get_contents() and show its source.
$url = "http://www.baidu.com";
$str = file_get_contents($url);
if ($str === false) {
    // The fetch failed (network error, allow_url_fopen disabled, ...).
    echo "Failed to fetch ".$url;
} else {
    // Escape the markup so the browser shows the source instead of rendering it.
    // ISO-8859-1 makes htmlspecialchars() escape only the ASCII specials and pass
    // every other byte through, so a GB2312/GBK page does not collapse to an
    // empty string the way it does with a UTF-8 default charset.
    echo htmlspecialchars($str, ENT_COMPAT, 'ISO-8859-1');
}
?>
$urlarr = get_zh('http://www.hao123.com', '网');
print_r($urlarr);

/**
 * Lists the links on a page whose link text contains a given string.
 *
 * Only the anchors found on $url itself are inspected; the linked pages are
 * not downloaded. The href is returned exactly as written in the page, so it
 * may be relative.
 *
 * @param string $url Page to scan.
 * @param string $str Text that the link's inner HTML must contain.
 * @return array List of "href|inner-html" strings; empty when nothing matches.
 */
function get_zh($url,$str)
{
    $arrUrl = array();

    // An empty needle cannot be searched for.
    if ((string)$str === '') {
        return $arrUrl;
    }

    $content = getUrlContent($url);
    if (!is_string($content) || $content === '') {
        return $arrUrl;
    }

    // [^>] keeps the match inside a single <a ...> tag, \s after "a" rejects
    // <abbr>/<area>, and the s modifier lets the link text span several lines.
    $pattern = '/<a\s[^>]*?href=[\'"]([^\'"]+)[\'"][^>]*>(.+?)<\/a>/is';
    if (!preg_match_all($pattern, $content, $matchs)) {
        return $arrUrl;
    }

    foreach ($matchs[2] as $key => $v)
    {
        // strpos() !== false rather than strstr() truthiness: strstr() returns
        // the falsy string "0" when the match is a trailing "0".
        if (strpos($v, $str) !== false) {
            $arrUrl[] = $matchs[1][$key].'|'.$v;
        }
    }
    return $arrUrl;
}
/**
 * Fetches the body of an http:// URL over a raw socket.
 *
 * @param string $url     URL to fetch; a leading "http://" is optional.
 * @param bool   $showerr When true, the socket error is echoed on connection failure.
 * @param int    $port    TCP port to connect to.
 * @param int    $outtime Connection timeout in seconds.
 * @return string Response body (everything after the header block), or "" on failure.
 */
function getUrlContent($url, $showerr = false, $port=80, $outtime=30)
{
    // Strip only the leading scheme; an "http://" inside the query string must survive.
    $tmp = preg_replace('/^http:\/\//i', '', $url);
    $slash = strpos($tmp, '/');
    if ($slash === false) {
        $domain = $tmp;
        $path = '/';
    } else {
        $domain = substr($tmp, 0, $slash);
        $path = substr($tmp, $slash);
    }

    $fp = fsockopen($domain, $port, $errno, $errstr, $outtime);
    if (!$fp)
    {
        if ($showerr)
        {
            echo "$errstr ($errno)<br />\n";
        }
        // Same return value on both failure paths.
        return "";
    }

    // HTTP/1.0 so that the server does not answer with a chunked body,
    // which this function does not decode.
    $out = "GET $path HTTP/1.0\r\n";
    $out .= "Host: {$domain}\r\n";
    //$out .= "Referer: {$url}\r\n";
    $out .= "Connection: Close\r\n\r\n";
    fwrite($fp, $out);

    $html = '';
    while (!feof($fp))
    {
        $chunk = fgets($fp, 128);
        if ($chunk === false)
        {
            // Read error or timeout: feof() may never become true, so stop here.
            break;
        }
        $html .= $chunk;
    }
    fclose($fp);

    // The body starts after the first blank line; without one there is no body.
    $sep = strpos($html, "\r\n\r\n");
    if ($sep === false)
    {
        return "";
    }
    return (string)substr($html, $sep + 4);
}
可你还没有把 hao123 子页中含“网”的内容列出来呢。
// Turn off all PHP error reporting for this script.
error_reporting(0);
// Remove the execution time limit (0 = no limit) so the crawl is not cut off.
set_time_limit(0);
/**
 * Reads up to 1 MiB from a URL (or any path fopen() can open).
 *
 * @param string $url Location to read.
 * @return string|false The content read, or false when the location cannot be opened.
 */
function _getUrlContent($url)
{
    $handle = fopen($url, "r");
    if (!$handle) {
        return false;
    }
    // Cap the read at 1 MiB so one huge page cannot exhaust memory.
    $content = stream_get_contents($handle, 1024*1024);
    // Close the handle; a crawl opens many pages and would otherwise leak them all.
    fclose($handle);
    return $content;
}
/**
 * Extracts the href value of every <a> tag in an HTML document.
 *
 * @param string $web_content HTML source.
 * @return array|null List of raw href values in document order, or null when
 *                    the document contains no anchor with an href.
 */
function _filterUrl($web_content){
    // <a\s+      : a real anchor tag (rejects <abbr>, <area>, "<|" ...)
    // (?:...\s)? : any other attributes before href, possibly on other lines
    // ['"]?      : the value may be quoted or bare
    // i modifier : tag and attribute names are case-insensitive
    $reg_tag_a = '/<a\s+(?:[^>]*?\s)?href\s*=\s*[\'"]?([^>\'"\s]*)[^>]*>/i';
    $result = preg_match_all($reg_tag_a,$web_content,$match_result);
    if($result){
        return $match_result[1];
    }
    return null;
}
/**
 * Turns the raw href values found on a page into absolute same-site URLs.
 *
 * - Absolute http/https links are kept only when their host equals the host
 *   of $base_url.
 * - Relative links are resolved against $base_url ("/x" against the site
 *   root, "x" against the directory of the base page, "?q" against the base
 *   page itself).
 * - Empty links, fragment-only links and non-http schemes (javascript:,
 *   mailto:, ...) are dropped.
 *
 * @param string $base_url URL of the page the links were found on.
 * @param mixed  $url_list Array of raw href values; anything else yields null.
 * @return array|null List of absolute URLs (possibly empty), or null when
 *                    $url_list is not an array or $base_url has no host.
 */
function _reviseUrl($base_url,$url_list){
    if(!is_array($url_list)){
        return null;
    }
    $url_info = parse_url($base_url);
    if(!is_array($url_info) || !isset($url_info["host"])){
        return null;
    }

    $scheme = isset($url_info["scheme"]) ? $url_info["scheme"] : 'http';
    $host = strtolower($url_info["host"]);

    // scheme://[user:pass@]host[:port] - prefix for every resolved relative link.
    $root = $scheme.'://';
    if(isset($url_info["user"]) && isset($url_info["pass"])){
        $root .= $url_info["user"].":".$url_info["pass"]."@";
    }
    $root .= $url_info["host"];
    if(isset($url_info["port"])){
        $root .= ":".$url_info["port"];
    }

    // Directory part of the base page, e.g. "/dir/" for "/dir/page.htm".
    $base_path = isset($url_info["path"]) ? $url_info["path"] : '/';
    $slash = strrpos($base_path, '/');
    $base_dir = ($slash === false) ? '/' : substr($base_path, 0, $slash + 1);

    $result = array();
    foreach ($url_list as $url_item)
    {
        $url_item = trim($url_item);
        if($url_item === '' || $url_item[0] === '#'){
            continue; // refers to the current page
        }
        if(substr($url_item, 0, 2) === '//'){
            // Protocol-relative link: inherits the scheme of the base page.
            $url_item = $scheme.':'.$url_item;
        }

        if(preg_match('/^[a-z][a-z0-9+.\-]*:/i', $url_item)){
            // The link carries its own scheme.
            if(!preg_match('/^https?:\/\//i', $url_item)){
                continue; // javascript:, mailto:, ftp:, ...
            }
            $item_host = parse_url($url_item, PHP_URL_HOST);
            if(is_string($item_host) && strtolower($item_host) === $host){
                $result[] = $url_item;
            }
            continue;
        }

        if($url_item[0] === '/'){
            $path = $url_item;
        }elseif($url_item[0] === '?'){
            $path = $base_path.$url_item;
        }else{
            $path = $base_dir.$url_item;
        }
        $result[] = $root._reviseUrlCollapseDots($path);
    }
    return $result;
}

/**
 * Removes "." and ".." segments from an absolute URL path, leaving any
 * query string or fragment untouched.
 *
 * @param string $path Path starting with "/", optionally followed by ?query or #fragment.
 * @return string Normalised path with the original query/fragment re-attached.
 */
function _reviseUrlCollapseDots($path){
    $tail = '';
    $cut = strcspn($path, '?#');
    if($cut < strlen($path)){
        $tail = substr($path, $cut);
        $path = substr($path, 0, $cut);
    }
    $segments = explode('/', $path);
    $last = count($segments) - 1;
    $out = array();
    foreach ($segments as $i => $seg)
    {
        if($seg === '..'){
            // Never pop the leading empty segment that stands for the root.
            if(count($out) > 1){
                array_pop($out);
            }
            if($i === $last){
                $out[] = ''; // "/a/.." names a directory: keep the trailing slash
            }
        }elseif($seg === '.'){
            if($i === $last){
                $out[] = '';
            }
        }else{
            $out[] = $seg;
        }
    }
    return implode('/', $out).$tail;
}

/**
 * Downloads a page and returns the absolute same-site URLs it links to.
 *
 * @param string $url Page to crawl.
 * @return array|null Non-empty list of URLs, or null when the page cannot be
 *                    read or yields no usable link.
 */
function crawler($url){
    $content = _getUrlContent($url);
    if(!$content){
        return null;
    }
    $url_list = _reviseUrl($url,_filterUrl($content));
    if($url_list){
        return $url_list;
    }
    return null;
}

/**
 * Cuts a substring out of a double-byte (GBK/GB2312 style) string without
 * splitting a double-byte character.
 *
 * Bytes above 0xA0 are treated as the first byte of a two-byte character.
 * Characters that begin at a byte offset in [$start, $start + $len) are kept
 * whole, so the result may be one byte longer than $len.
 *
 * @param string $str   Source string.
 * @param int    $start Byte offset of the first character to keep.
 * @param int    $len   Length of the window in bytes.
 * @return string The extracted substring.
 */
function msubstr($str, $start, $len) {
    $tmpstr = "";
    $strlen = $start + $len;
    for($i = 0; $i < $strlen; $i++) {
        $byte = substr($str, $i, 1);
        if($byte === false || $byte === '') {
            break; // past the end of the string
        }
        $width = (ord($byte) > 0xa0) ? 2 : 1;
        // The scan starts at offset 0 to stay aligned on character boundaries,
        // but only characters at or after $start are copied.
        if($i >= $start) {
            $tmpstr .= substr($str, $i, $width);
        }
        $i += $width - 1;
    }
    return $tmpstr;
}
/**
 * Downloads a page and looks for a keyword (or a keyword pair) in its Chinese text.
 *
 * The page is treated as GBK: tags are stripped, the text is converted to
 * UTF-8, everything except CJK ideographs (U+4E00..U+9FA5) is removed, and the
 * search runs on that UTF-8 text so a match can never straddle two characters.
 *
 * $str is either one keyword, or two keywords joined by "_" ("a_b"):
 * - one keyword: returns up to 6 characters starting at the first occurrence;
 * - two keywords: returns the span from "a" to "b" when at most 23 characters
 *   lie between them.
 *
 * NOTE(review): $str is assumed to be GBK-encoded, like the page text it was
 * originally compared against - confirm against the keyword file.
 *
 * @param string $url URL of the page; surrounding whitespace/newlines are ignored.
 * @param string $str Keyword or "first_second" keyword pair.
 * @return string The matched text re-encoded as GBK, or "" when nothing matches.
 */
function check_has_f($url,$str)
{
    echo $url; // progress output

    $has = "";

    // Callers pass lines read with fgets(), which still carry "\r\n".
    // Fetch failures are expected while crawling, hence the suppression.
    $HTML = @file_get_contents(trim($url));
    if($HTML === false || $HTML === '')
    {
        return $has;
    }

    $x = iconv("gbk","utf-8",strip_tags($HTML));
    if($x === false)
    {
        return $has;
    }
    // Keep only CJK ideographs.
    $text = preg_replace("/[^\x{4e00}-\x{9fa5}]+/u","",$x);
    if(!is_string($text) || $text === '')
    {
        return $has;
    }

    $needle = iconv("gbk","utf-8",$str);
    if($needle === false || $needle === '')
    {
        return $has;
    }

    $found = '';
    // Split after conversion: in GBK the byte "_" (0x5F) can be the second
    // byte of a character, in UTF-8 it cannot.
    $str_arr = explode("_",$needle);
    if(count($str_arr)>1)
    {
        if($str_arr[0] === '' || $str_arr[1] === '')
        {
            return $has;
        }
        // preg_quote() keeps keyword characters from acting as regex syntax.
        $a = preg_quote($str_arr[0], '/');
        $b = preg_quote($str_arr[1], '/');
        if(preg_match('/('.$a.')(.*)('.$b.')/u', $text, $matches)
            && iconv_strlen($matches[2], 'UTF-8') <= 23) // characters between the two keywords
        {
            $found = $matches[0];
        }
    }
    else
    {
        $num_o = iconv_strpos($text, $needle, 0, 'UTF-8');
        // Position 0 is a valid hit; only false means "not found".
        if($num_o !== false)
        {
            $found = iconv_substr($text, $num_o, 6, 'UTF-8');
        }
    }

    if(is_string($found) && $found !== '')
    {
        $gbk = iconv("utf-8","gbk",$found);
        if($gbk !== false)
        {
            $has = $gbk;
        }
    }
    return $has;
}
/**
 * Checks every crawled URL against every keyword and records the hits.
 *
 * With <site> being $current_url minus its first 7 bytes ("http://"):
 * - reads URLs from     "<site>_url.txt"     (one per line),
 * - reads keywords from "<site>_keyword.txt" (one per line),
 * - rewrites            "<site>_out.txt"     with one "url|matched text" line per hit.
 * Each hit is also echoed.
 *
 * @param string $current_url Site URL starting with "http://".
 * @return void
 */
function out_result($current_url)
{
    $cur_mat = substr($current_url,7);
    $txt     = $cur_mat."_url.txt";
    $key_txt = $cur_mat."_keyword.txt";
    $txt_out = $cur_mat."_out.txt";

    // Every run starts with a fresh result file.
    if(file_exists($txt_out))
    {
        unlink($txt_out);
    }
    $fp_out = fopen($txt_out,"ab");
    if(!$fp_out)
    {
        return;
    }

    $keywords = is_readable($key_txt) ? file($key_txt) : false;
    $urls     = is_readable($txt) ? file($txt) : false;
    if($keywords !== false && $urls !== false)
    {
        foreach ($keywords as $contents_key)
        {
            $contents_key = trim($contents_key);
            if($contents_key === '')
            {
                continue; // blank line in the keyword file
            }
            foreach ($urls as $contents)
            {
                $contents = trim($contents);
                if($contents === '')
                {
                    continue;
                }
                $check_has = check_has_f($contents,$contents_key);
                echo "\n"; // terminate the progress line printed by check_has_f()
                if((string)$check_has === '')
                {
                    continue;
                }
                $con_out = $contents."|".$check_has;
                echo $con_out."\n";
                fputs($fp_out,$con_out."\r\n");
            }
        }
    }
    fclose($fp_out);
}

// ---- Command-line entry point: php <script> <site-url> ----
$current_url_r = isset($argv[1]) ? trim($argv[1]) : '';
if($current_url_r === '')
{
    echo "Usage: php ".(isset($argv[0]) ? $argv[0] : 'crawler.php')." <site-url>\n";
}
else
{
    // preg_match() instead of eregi(), which no longer exists in PHP 7+.
    if(preg_match('/^http:\/\//i',$current_url_r))
    {
        $current_url = $current_url_r;
    }
    else
    {
        $current_url = "http://".$current_url_r;
    }

    // NOTE(review): the file names are built from the URL itself, so a URL with
    // a path ("site.com/dir") produces a name containing "/" - confirm this is intended.
    $cur_mat = substr($current_url,7);
    $txt     = $cur_mat."_url.txt";
    $key_txt = $cur_mat."_keyword.txt";

    // Make sure a keyword file exists so the user has something to fill in.
    if(!file_exists($key_txt))
    {
        touch($key_txt);
    }

    // Crawl only when no URL list has been collected yet.
    if(!file_exists($txt) || filesize($txt) <= 0)
    {
        $fp_puts = fopen($txt,"ab"); // appends newly found URLs
        $fp_gets = fopen($txt,"r");  // reads them back as the crawl queue
        if($fp_puts && $fp_gets)
        {
            $seen = array();          // URLs already written, as array keys
            // Separate loop variable: $current_url must stay intact for out_result().
            $crawl_url = $current_url;
            do
            {
                $result_url_arr = crawler($crawl_url);
                if($result_url_arr)
                {
                    foreach ($result_url_arr as $url)
                    {
                        $url = trim($url);
                        if($url === '' || isset($seen[$url]))
                        {
                            continue;
                        }
                        $seen[$url] = true;
                        echo $url."\n";
                        fputs($fp_puts,$url."\r\n");
                    }
                }
                // Next URL from the queue; strip the "\r\n" it was stored with.
                $line = fgets($fp_gets);
                $crawl_url = ($line === false) ? false : trim($line);
            }
            while ($crawl_url !== false && $crawl_url !== '');
        }
        if($fp_gets)
        {
            fclose($fp_gets);
        }
        if($fp_puts)
        {
            fclose($fp_puts);
        }
    }

    out_result($current_url);
}
?>
可以了,