页面中包含字的内容的问题，在线等

现要用php做个
function get_zh($url,$str)
{
return ....;
}
要求：传入一个$url,把$url中的所有子页中的内容含有$str的子页url返回出来，并把子页含有$str的那句话也返回出来，返回格式是 $url_z|$str_z
如: 传入二个参数是　　"http://www.hao123.com/"　　　　"网"
那么就要返回出http://www.hao123.com/页面中所有的子页中含有“网”字的子页的url与子页中含有“网”字的那句话也要返回出来
(如：返回出　http://www.hao123.com/netbuy.htm|淘宝网)大家帮我想想，多谢

解决方案 »

免费领取超大流量手机卡，每月29元包185G流量+100分钟通话, 中国电信官方发货

给你个思路：
用XMLHTTP来读取页面内容，然后自己分析字符串就可以了
这里有VC使用XMLHTTP的范例
http://blog.csdn.net/SupermanKing/archive/2008/10/16/3082913.aspx
这里是VB使用XMLHTTP的范例
http://blog.csdn.net/SupermanKing/archive/2008/10/16/3082860.aspx
PHP使用起来和VB差不多，只是创建对象的过程稍微有点差异而已：
$XMLObject = new com('Microsoft.XMLHTTP');   //创建对象
// Demo: fetch a page through the Windows-only Microsoft.XMLHTTP COM object
// ($XMLObject is created on the line above) and print the decoded response.
$URLPath = 'http://www.baidu.com';           // URL to request
$XMLObject->open('GET', $URLPath, false);    // pass the method and target URL to the object
// Add a header to the HTTP request
$XMLObject->setRequestHeader('CONTENT-TYPE', 'application/x-www-form-urlencoded');
$SendStr = '';                               // request body; was never defined in the original paste
$XMLObject->Send($SendStr);                  // submit the request
// This assignment had been swallowed by the trailing comment of the previous
// line, which left $StrCode undefined when bytes2bstr() was called.
$StrCode = 'gb2312';                         // charset used to decode the response
echo bytes2bstr($XMLObject->responseBody,$StrCode);   // decode the response bytes with that charset and print them
// bytes2bstr() (defined below) converts the byte array into a string.
/**
 * Convert a byte array into a text string using an ADODB.Stream COM object.
 *
 * Requires the PHP COM extension (Windows only).
 *
 * @param mixed  $body    Bytes to decode; the caller above passes
 *                        $XMLObject->responseBody.
 * @param string $StrCode Charset name assigned to the stream's Charset
 *                        property (the caller above uses 'gb2312').
 * @return mixed The stream's ReadText value, i.e. the decoded text.
 */
function bytes2bstr($body, $StrCode){
$objStream = new com("adodb.stream");
$objStream->Type = 1;      // 1 = adTypeBinary: accept the raw bytes
$objStream->Mode = 3;      // 3 = adModeReadWrite
$objStream->open();
$objStream->Write($body);
$objStream->Position = 0;  // rewind so the data can be re-read from the start
$objStream->Type = 2;      // 2 = adTypeText: reinterpret the same bytes as text
$objStream->Charset = $StrCode;
Return $objStream->ReadText;
}
你的不会是在Linux下的PHP吧，Windows中有COM组件，
通过new com(组件类名) 就可以创建出组件来使用，所以
在Windows的IIS下用XMLHTTP是可以用的，别的就难说了
难怪不行，Linux下没有XMLHTTP这个组件的。
那就用PHP自带的函数：
<?php
// Demo: download a page with PHP's built-in file_get_contents() and show its source.
$url = "http://www.baidu.com";
$str = file_get_contents($url);
if ($str === false) {
    // file_get_contents() returns false on failure; report it instead of
    // passing a boolean on to htmlspecialchars().
    echo "Failed to fetch {$url}";
} else {
    echo htmlspecialchars($str);   // escape the markup so the source is displayed as text
}
?>
$urlarr = get_zh('http://www.hao123.com', '网');
print_r($urlarr);

/**
 * List the links on a page whose link text contains a given string.
 *
 * Only the anchors found in the page at $url are inspected; the linked pages
 * themselves are not downloaded.
 *
 * @param string $url Page to scan (fetched with getUrlContent()).
 * @param string $str Text to look for inside each link's text.
 * @return array List of "href|link text" strings, empty when nothing matches.
 */
function get_zh($url,$str)
{
    $arrUrl = array();

    // An empty needle would either raise a warning or match every link,
    // depending on the PHP version, so treat it as "nothing to find".
    if ((string)$str === '') {
        return $arrUrl;
    }

    $content = getUrlContent($url);

    // Group 1 is the href value, group 2 the text between <a ...> and </a>.
    if (!preg_match_all('/\<a.+href=[\'\"](.+)[\'\"].*\>(.+)\<\/a\>/Ui', $content, $matchs)) {
        return $arrUrl;
    }

    foreach ($matchs[2] as $key => $v)
    {
        // strpos() !== false instead of a truthiness test on strstr(): strstr()
        // returns the tail of the string, and a tail of "0" is falsy in PHP.
        if (strpos($v, $str) !== false) {
            $arrUrl[] = $matchs[1][$key].'|'.$v;
        }
    }

    return $arrUrl;
}
/**
 * Download a page over plain HTTP through a raw socket and return its body.
 *
 * @param string $url     Address to fetch, with or without a leading "http://".
 * @param bool   $showerr When true, a connection error is echoed.
 * @param int    $port    TCP port to connect to.
 * @param int    $outtime Connection timeout in seconds.
 * @return string Response body (everything after the header block), or ""
 *                when the connection fails or no header/body separator is found.
 */
function getUrlContent($url, $showerr = false, $port=80, $outtime=30)
{
    // Strip only a leading scheme; a blanket str_replace() would also eat an
    // "http://" that appears later in the URL (e.g. inside a query string).
    $tmp = preg_replace('#^http://#i', '', $url);

    $slash = strpos($tmp, '/');
    if ($slash === false) {
        $domain = $tmp;
        $path   = '/';
    } else {
        $domain = substr($tmp, 0, $slash);
        $path   = substr($tmp, $slash);
    }

    $fp = fsockopen($domain, $port, $errno, $errstr, $outtime);
    if (!$fp) {
        if ($showerr) {
            echo "$errstr ($errno)<br />\n";
        }
        // Always return a string on failure; previously the $showerr path fell
        // off the end of the function and returned null.
        return "";
    }

    // HTTP/1.0 on purpose: an HTTP/1.1 server may answer with a chunked body,
    // which this function does not decode, so chunk-size lines would end up
    // inside the returned content.
    $out  = "GET $path HTTP/1.0\r\n";
    $out .= "Host: {$domain}\r\n";
    //$out .= "Referer: {$url}\r\n";
    $out .= "Connection: Close\r\n\r\n";

    fwrite($fp, $out);

    $html = '';
    while (!feof($fp)) {
        $line = fgets($fp, 128);
        if ($line === false) {
            break;
        }
        $html .= $line;
    }

    fclose($fp);

    // The body starts after the first blank line that ends the headers.
    $sep = strpos($html, "\r\n\r\n");
    if ($sep === false) {
        return "";
    }

    return substr($html, $sep + 4);
}
1楼的，用php来写ASP，让人感觉怪怪的，哈哈
应该是用ASP的方法来写PHP程序，呵呵
八楼的，多谢
可你还没有把hao123子页含网的列出来呢，
<?php
// The full open tag is used because the short "<?" form only works when the
// short_open_tag ini setting is enabled.
// NOTE(review): error_reporting(0) turns off all error reporting for this
// script, so failures in the code below are silent.
error_reporting(0);
// Remove the execution time limit for this script.
set_time_limit(0);
/**
 * Read up to 1 MiB from a URL (or any path fopen() can open).
 *
 * @param string $url Location to read.
 * @return string|false The data read, or false when the stream cannot be opened.
 */
function _getUrlContent($url)
{
    $handle = fopen($url, "r");
    if (!$handle) {
        return false;
    }

    $content = stream_get_contents($handle, 1024*1024);
    // The original never closed the handle, leaking one stream per fetched page.
    fclose($handle);

    return $content;
}
/**
 * Extract the href values of all <a> tags in an HTML string.
 *
 * @param string $web_content HTML source to scan.
 * @return array The href values in document order (quotes stripped); an empty
 *               array when the page contains no links.
 */
function _filterUrl($web_content){
    // <a\b      : only real anchor tags, in either case ("[a|A]" also matched a
    //             literal "|", and "<a.*?" matched tags such as <abbr>).
    // [^>]*?    : stay inside the tag; ".*?" could run past ">" and pick up the
    //             href of a later, unrelated tag on the same line.
    // capture   : the href value up to the closing quote, whitespace or ">".
    $reg_tag_a = '/<a\b[^>]*?\shref\s*=\s*[\'"]?([^>\'"\s]*)[^>]*>/i';
    if (preg_match_all($reg_tag_a, $web_content, $match_result)) {
        return $match_result[1];
    }
    // Return an array in every case so callers can iterate the result safely.
    return array();
}
/**
 * Turn the raw href values of a page into absolute, same-site URLs.
 *
 * Links that already start with "http" are kept only when they contain the
 * site key (the base host without its last label, e.g. "www.hao123" for
 * "www.hao123.com"). Every other link is treated as relative and appended to
 * the origin (scheme://[user:pass@]host[:port]) of $base_url.
 *
 * The origin now comes from parse_url(). The original located ".com" / ".cn"
 * with unescaped-dot regexes, so other TLDs got no host at all, hosts such as
 * "www.cnn.com" were cut at the wrong place, and an explicit port was dropped.
 *
 * @param string $base_url URL of the page the links were found on.
 * @param mixed  $url_list Array of href values (as returned by _filterUrl()).
 * @return array|null List of absolute URLs (possibly empty); null when
 *                    $url_list is not an array.
 */
function _reviseUrl($base_url,$url_list){
    if(!is_array($url_list)){
        return;
    }

    $url_info = parse_url($base_url);
    if(!is_array($url_info) || !isset($url_info["host"]) || $url_info["host"] === ''){
        // Without a host there is nothing to resolve links against.
        return array();
    }
    $host = $url_info["host"];

    $origin = (isset($url_info["scheme"]) ? $url_info["scheme"] : 'http').'://';
    if(isset($url_info["user"], $url_info["pass"])){
        $origin .= $url_info["user"].":".$url_info["pass"]."@";
    }
    $origin .= $host;
    if(isset($url_info["port"])){
        $origin .= ":".$url_info["port"];
    }

    // Site key used for the same-site test; computed once instead of running a
    // regex for every link. Falls back to the whole host when it has no dot.
    $last_dot = strrpos($host, '.');
    $site_key = ($last_dot === false || $last_dot === 0) ? $host : substr($host, 0, $last_dot);

    $result = array();
    foreach ($url_list as $url_item){
        if(preg_match('/^http/',$url_item)){
            // Already a complete URL: keep it only if it belongs to this site.
            if(strpos($url_item, $site_key) !== false){
                $result[] = $url_item;
            }
        } elseif(substr($url_item,0,1)=="/"){
            $result[] = $origin.$url_item;
        } else {
            // As before, relative links are resolved against the site root,
            // not against the directory of $base_url.
            $result[] = $origin.'/'.$url_item;
        }
    }
    return $result;
}

/**
 * Fetch a page and return the absolute same-site URLs it links to.
 *
 * @param string $url Page to fetch.
 * @return array|null Non-empty list of URLs, or null when the page could not
 *                    be read or yielded no usable links.
 */
function crawler($url){
    $content = _getUrlContent($url);
    if(!$content){
        return;
    }

    $url_list = _reviseUrl($url,_filterUrl($content));
    if(!$url_list){
        return;
    }

    return $url_list;
}

/**
 * Byte-oriented substring that does not split double-byte characters.
 *
 * A byte greater than 0xa0 is treated as the first byte of a two-byte
 * character (presumably GB2312/GBK text - confirm against callers). Offsets
 * are byte offsets; a character that starts before $start + $len is returned
 * whole, so the result may be one byte longer than $len.
 *
 * The original ignored $start and always returned the text from offset 0;
 * results for $start = 0 are unchanged.
 *
 * @param string $str   Input bytes.
 * @param int    $start Byte offset of the first character to return.
 * @param int    $len   Number of bytes wanted.
 * @return string The selected characters.
 */
function msubstr($str, $start, $len) {
    $str    = (string)$str;
    $tmpstr = "";
    $end    = $start + $len;
    $total  = strlen($str);

    // Always walk from byte 0 so that $i stays aligned on character boundaries.
    for($i = 0; $i < $end && $i < $total; $i++) {
        $width = (ord($str[$i]) > 0xa0) ? 2 : 1;
        if($i >= $start) {
            $tmpstr .= substr($str, $i, $width);
        }
        $i += $width - 1;
    }
    return $tmpstr;
}
/**
 * Look for a keyword (or a pair of keywords) in the Chinese text of a page.
 *
 * The page is read as GBK, stripped of tags and of every character outside
 * U+4E00..U+9FA5, and then searched.
 *
 * - Single keyword: returns up to 6 characters starting at its first occurrence.
 * - "first_second": returns the text from the first keyword through the second
 *   one, provided at most 23 characters lie between them.
 *
 * Changes from the original:
 * - Matching is done on the UTF-8 form of the text. The original searched raw
 *   GBK bytes, where a hit could straddle two characters and where a trail
 *   byte could equal "_" or a regex metacharacter.
 * - The keyword is passed through preg_quote().
 * - The function always returns a string.
 *
 * @param string $url Page to inspect; surrounding whitespace is ignored.
 * @param string $str GBK-encoded keyword, or two keywords joined by "_".
 * @return string GBK-encoded snippet, or "" when nothing was found.
 */
function check_has_f($url,$str)
{
    // Progress output, kept exactly as before (the raw argument is echoed).
    echo $url;

    // Callers pass lines read with fgets(), which still carry their line ending.
    $html = @file_get_contents(trim($url));
    if ($html === false || $html === '' || $str === null || $str === '') {
        return "";
    }

    $page_utf8 = iconv("gbk", "utf-8", strip_tags($html));
    $keys_utf8 = iconv("gbk", "utf-8", $str);
    if ($page_utf8 === false || $keys_utf8 === false) {
        return "";
    }

    // Keep only the Chinese characters of the page.
    $text = preg_replace('/[^\x{4e00}-\x{9fa5}]+/u', "", $page_utf8);
    if (!is_string($text) || $text === '') {
        return "";
    }

    // Split after the conversion: in GBK the byte of "_" can also be the second
    // byte of a double-byte character.
    $str_arr = explode("_", $keys_utf8);
    if (count($str_arr) > 1) {
        $a = $str_arr[0];
        $b = $str_arr[1];
        if ($a === '' || $b === '') {
            return "";
        }
        // NOTE(review): the gap is matched greedily, as in the original, so the
        // LAST occurrence of the second keyword decides the gap length.
        $pattern = '/'.preg_quote($a, '/').'(.*)'.preg_quote($b, '/').'/u';
        if (!preg_match($pattern, $text, $matches)) {
            return "";
        }
        // Number of characters between the two keywords.
        if (iconv_strlen($matches[1], 'UTF-8') > 23) {
            return "";
        }
        $has = $matches[0];
    } else {
        $num_o = strpos($text, $keys_utf8);
        if ($num_o === false) {
            return "";
        }
        // The original returned 12 GBK bytes, i.e. 6 two-byte characters.
        $has = iconv_substr(substr($text, $num_o), 0, 6, 'UTF-8');
    }

    $has_gbk = iconv("utf-8", "gbk", (string)$has);
    return ($has_gbk === false) ? "" : $has_gbk;
}
/**
 * Check every crawled URL against every keyword and record the hits.
 *
 * File names are derived from $current_url with its first 7 characters (the
 * "http://" prefix) removed:
 *   <name>_keyword.txt  one keyword per line (input)
 *   <name>_url.txt      one URL per line (input)
 *   <name>_out.txt      "url|snippet" lines (output, recreated on every run)
 *
 * Each hit is echoed and appended to the output file.
 *
 * Changes from the original:
 * - The output file is opened once and closed at the end (it used to be
 *   reopened for every hit and never closed).
 * - Both input files are read once.
 * - Blank lines are skipped.
 * - URLs are trimmed before being handed to check_has_f().
 *
 * @param string $current_url Start URL of the crawl, including "http://".
 * @return void
 */
function out_result($current_url)
{
    $cur_mat = substr($current_url,7);
    $txt     = $cur_mat."_url.txt";
    $key_txt = $cur_mat."_keyword.txt";
    $txt_out = $cur_mat."_out.txt";

    // Start from a fresh output file.
    if(file_exists($txt_out))
    {
        unlink($txt_out);
    }
    $fp_out = fopen($txt_out,"ab");

    $keywords = is_readable($key_txt) ? file($key_txt, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES) : false;
    $urls     = is_readable($txt) ? file($txt, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES) : false;

    if($keywords && $urls)
    {
        foreach($keywords as $keyword)
        {
            $keyword = trim($keyword);
            if($keyword === '')
            {
                continue;
            }
            foreach($urls as $url)
            {
                $url = trim($url);
                if($url === '')
                {
                    continue;
                }
                $check_has = check_has_f($url, $keyword);
                if((string)$check_has === '')
                {
                    continue;
                }
                $con_out = $url."|".$check_has;
                echo $con_out."\n";
                if($fp_out)
                {
                    fputs($fp_out, $con_out."\r\n");
                }
            }
        }
    }

    if($fp_out)
    {
        fclose($fp_out);
    }
}
// Start URL taken from the first command-line argument (empty when absent).
$current_url_r = isset($argv[1]) ? trim($argv[1]) : '';
// ---- Main script: crawl the site given on the command line, then report. ----

// Accept the start URL with or without a leading "http://".
// preg_match() replaces eregi(), which no longer exists as of PHP 7.
if(preg_match('#^http://#i', $current_url_r))
{
    $current_url = $current_url_r;
}
else
{
    $current_url = "http://".$current_url_r;
}

// Data file names are the start URL minus its 7-character "http://" prefix.
$cur_mat = substr($current_url,7);
$txt     = $cur_mat."_url.txt";
$key_txt = $cur_mat."_keyword.txt";

// Make sure the keyword file exists. The original created it empty and then
// re-encoded its (empty) contents, which amounts to the same thing.
if(!file_exists($key_txt))
{
    file_put_contents($key_txt, '');
}

// Crawl only when no URL list has been collected yet.
if(!file_exists($txt) || filesize($txt) <= 0)
{
    $fp_puts = fopen($txt,"ab");   // collected URLs, one per line

    // The original re-read the whole URL file for every candidate link and
    // used the same file as its work queue; both are kept in memory here.
    $seen  = array();
    $queue = array($current_url);

    // The queue has its own loop variable. The original reused $current_url,
    // which was false when the loop ended, so out_result() received false.
    for($i = 0; $i < count($queue); $i++)
    {
        $page_url = trim($queue[$i]);
        if($page_url === '')
        {
            continue;
        }

        $result_url_arr = crawler($page_url);
        if(!$result_url_arr)
        {
            continue;
        }

        foreach($result_url_arr as $url)
        {
            $url = trim($url);
            if($url === '' || isset($seen[$url]))
            {
                continue;
            }
            $seen[$url] = true;

            echo $url."\n";
            if($fp_puts)
            {
                fputs($fp_puts, $url."\r\n");
            }
            // The start page has already been crawled; do not queue it again.
            if($url !== $current_url)
            {
                $queue[] = $url;
            }
        }
    }

    if($fp_puts)
    {
        fclose($fp_puts);
    }
}

out_result($current_url);
?>
可以了，