我平时用正则从网页上抓取信息,主要是链接。现在要做个系统,能告诉我一下思路么?
解决方案 »
- 麻烦问下大家!smarty的fetch的用法!请大家指教
- 唉,还得努力,谁弄过呵
- 在线购物订单cookie问题。请各位有经验的人帮我疏导一下思路谢谢了
- 【远程下载】打开远程主机出错!内容为非文本类型或网址重定向! php_network_getaddresses: getaddrinfo failed: 不知道
- phpcms v9为什么专题上传缩略图,图会扩大一倍
- 请教cms问题
- 【百度地图api】通过php读取本地数据库的坐标后,如何绘制折线呢?
- 想学算法,有推荐书籍吗,基础较差
- Warning: mysql_num_rows(): supplied argument is not a valid MySQL result resourc
- 在windows中如何利用PHP重启httpd
- 帝国CMS那个默认新闻列表页模板怎么改 head 模板变量 没用 ?
- roundcube修改密码问题
一般抓取系统都要自己写匹配的标签,还有抓取的深度,这要靠循环来控制。
set_time_limit(0); // lift the execution time limit: one remote page is fetched per table row
// 2010-03-17: generates a static .htm snapshot for every row of the `test` table.
// NOTE(review): the mysql_* extension was removed in PHP 7; this snippet needs PHP 5
// (or a port to mysqli/PDO) to run.
$link = mysql_connect("localhost","root","123") or die("数据库未连接");
mysql_select_db("test") or die("数据库未选");
mysql_query("SET NAMES 'gbk'");
echo "--- 开始 ---<br>";
// Read the ids of all records that need a static page.
$rs = mysql_query("SELECT did FROM test "); // where did = '15'
if ($rs === false) {
    // Without a valid result resource mysql_fetch_row() would only emit warnings.
    die("查询失败: ".mysql_error());
}
$path = "html/"; // output directory; loop-invariant, so defined once
while (list($did) = mysql_fetch_row($rs)) {
    $g_htm = $path.$did.".htm";
    $f_get = file_get_contents("http://www.xxx.com/index.php?id=".$did);
    if ($f_get === false) {
        // Do not write an empty file (and do not claim success) when the download failed.
        echo "---htm 页面---".$did."---抓取失败------<br>";
        continue;
    }
    if (file_put_contents($g_htm, $f_get) === false) {
        echo "---htm 页面---".$did."---写入失败------<br>";
        continue;
    }
    echo "---htm 页面---".$did."---成功生成------<br>";
}
echo "--- 结束 ---<br>";
?>
// Scrapes chapters 1-163 of one novel from readnovel.com. The chapter URLs are numbered
// consecutively, from http://www.readnovel.com/novel/69764/1.html
// to http://www.readnovel.com/novel/69764/163.html, so a counting loop visits them all.
for ($i = 1; $i <= 163; $i++) {
    $sUrl = "http://www.readnovel.com/novel/69764/".$i.".html";
    $con = file_get_contents($sUrl); // full page source
    if ($con === false) {
        continue; // chapter could not be downloaded; do not insert an empty row
    }
    // Title and body live inside <div id="article">...</div>. Matching up to </div> directly
    // is unreliable because later </div> tags would be swallowed as well, so the match is
    // bounded by the <p class="shop"> tag that follows the article instead.
    // preg_match replaces eregi (removed in PHP 7); /i keeps it case-insensitive and /s lets
    // "." span newlines, as it did with the POSIX functions.
    if (!preg_match("/<div id=\"article\">(.*)<p class=\"shop\">/is", $con, $con_arr)) {
        continue;
    }
    // Split the captured fragment into title (<h2>...</h2>) and body (</h2>...</div>).
    // Index 1 of each match array is the captured group, i.e. the text we actually want.
    if (!preg_match("/<h2>(.*)<\/h2>/is", $con_arr[1], $title_arr)) {
        continue;
    }
    if (!preg_match("/<\/h2>(.*)<\/div>/is", $con_arr[1], $content_arr)) {
        continue;
    }
    // Escaped because both values are interpolated into the INSERT below.
    // NOTE(review): addslashes is not charset-aware; prefer mysql_real_escape_string or a
    // prepared statement once the (omitted) connection code is in place.
    $title = addslashes($title_arr[1]);
    $content = addslashes($content_arr[1]);
    $sql = "insert into novel(id,title,content) values('','$title','$content');";
    mysql_query($sql); // the database connection is expected to be opened beforehand (not shown)
}
// Done.
?>
我现在是单写匹配,用的主要就是file_get_contents()和preg_match_all(),就是需要哪些东西时,我单写一个php文档。现在要做个系统,规则可以往里面添加,可是有几个循环,该怎么弄呢?
还有要在前台显示出来
用file_get_contents,file等常用函数
要做就用curl或是fsockopen
// Controller that scrapes novel chapter lists and chapter texts from remote sites.
// NOTE(review): none of the properties below are read by the methods visible in this
// file; presumably they are consumed by the Controller base class — confirm.
class grep extends Controller {
// Table name associated with this controller.
var $tableName = 'grep';
// Page size (presumably rows per listing page — TODO confirm).
var $pagesize =31;
// ORDER BY clause fragment.
var $order_string = "grep_order desc,grep_id desc";
// Column name used for filtering (assumption from the name — verify against the base class).
var $filter_field = "grep_title";
// Column name used for duplicate checking (assumption from the name — verify against the base class).
var $check_repeat_field = "grep_title";
// Button definitions; empty here.
var $buttons = array(
);
// Human-readable label of this module.
var $description = "[爬取体系]";
/**
 * Crawls the chapter index page of every story with story_id < 448 on www.xiaoxiaoshuo.net.
 *
 * For each story the index page is downloaded, converted from GBK to UTF-8, stripped of
 * inline styles / title attributes / <li> tags, and split into volume blocks (from
 * <td bgcolor="#EDF5EA" up to the next </ul>). Each volume is resolved to a category row via
 * check_dcate(); volumes flagged dstorycate_ishot == 1 are skipped, all others have their
 * chapter links passed to add_storyxxs_info().
 */
function indexxxs()
{
    // Load the stories to crawl.
    $story_model = "story_model";
    $this->load->model($story_model);
    $where = array("story_id < 448");
    $rows_story = $this->$story_model->get($where);
    if (!$rows_story) {
        return; // nothing to crawl
    }
    foreach ($rows_story as $val_story) {
        $story_url = "http://www.xiaoxiaoshuo.net/".$val_story->storycate_vtitle."/".$val_story->story_vtitle;
        $src_content = file_get_contents($story_url);
        if ($src_content === false) {
            $this->log("<font color = red>抓取失败 $story_url </font>,跳过");
            continue;
        }
        $src_content = iconv("GBK","utf-8//IGNORE",$src_content);
        // This pattern is a regular expression (\s*, /i), so it must go through preg_replace;
        // the former str_replace call compared it literally and never removed anything.
        $src_content = preg_replace("/style=\"border-width:0px\s*1px\s*1px\s*0px;border-color:#C8D8B8;border-style:solid;padding:3px;float:left;width:313px;\"/i","",$src_content);
        $src_content = str_replace("style=\"BORDER-RIGHT: #c8d8b8 1px solid; PADDING-RIGHT: 3px; BORDER-TOP: #c8d8b8 0px solid; PADDING-LEFT: 3px; FLOAT: left; PADDING-BOTTOM: 3px; BORDER-LEFT: #c8d8b8 0px solid; WIDTH: 313px; PADDING-TOP: 3px; BORDER-BOTTOM: #c8d8b8 1px solid\"","",$src_content);
        // Drop title="..." attributes and <li> wrappers, and reduce every <a ...> to "<a href...".
        $src_content = preg_replace("/title=\"[^\"]*\"/iU","",$src_content);
        $src_content = preg_replace("/<LI[^>]*>/iU","",$src_content);
        $src_content = preg_replace("/<\/LI[^>]*>/iU","",$src_content);
        $src_content = preg_replace("/<a(?!href)[\d\D]*href/iU","<a href",$src_content);
        // Collapse whitespace runs, then turn remaining line breaks / tabs into spaces.
        $src_content = preg_replace('/\s(?=\s)/', '', $src_content);
        $src_content = preg_replace('/[\n\r\t]/', ' ', $src_content);
        $src_content = str_replace("http://www.xiaoxiaoshuo.net/yanqingxiaoshuo2/tijiaxinniang/","",$src_content);
        // One match per volume block.
        preg_match_all("/<td\s*bgcolor=\"#EDF5EA\"([\d\D]*)<\/ul>/iU",$src_content,$arr_dstorycate);
        if (empty($arr_dstorycate[1])) {
            continue;
        }
        foreach ($arr_dstorycate[1] as $val_dstory_cate) {
            preg_match_all("/<font\s*color=\"#000000\">([^<]*)<\/font>/i",$val_dstory_cate,$dcate_title);
            if (!isset($dcate_title[1][0])) {
                continue; // block without a volume heading: nothing to file the chapters under
            }
            $dtitle = $dcate_title[1][0];
            // Resolve the category row; check_dcate also marks earlier categories as published.
            $obj_storycate = $this->check_dcate($dtitle,$val_story);
            if (!$obj_storycate) {
                continue;
            }
            if ($obj_storycate->dstorycate_ishot == 1) {
                $this->log( "<font color = gray>已此章节已抓取完 $val_story->story_title - $dtitle </font>,跳过");
                continue;
            }
            // Group 1: chapter link targets, group 2: link texts (chapter titles).
            preg_match_all("/<a\s*href=\"([^\"]*)\"[^>]*>(?!<\/a>)([\d\D]*)<\/a>/iU",$val_dstory_cate,$dinfo_list);
            $list_story_url = $dinfo_list[1];
            $list_story_title = $dinfo_list[2];
            $this->add_storyxxs_info($obj_storycate,$list_story_url,$list_story_title,$story_url);
        }
    }
}
/**
 * Crawls the chapter index pages on www.xiaoshuo520.com for stories with story_id > 445.
 *
 * Stories with story_id < 612 are additionally skipped inside the loop. For each remaining
 * story the index page is fetched through CS_file_get_contents(), converted from GBK to
 * UTF-8, cleaned up, and split into volume blocks (from <div id="NclassTitle"> up to
 * <div id="ListEnd). Each volume is resolved via check_dcate(); volumes flagged
 * dstorycate_ishot == 1 are skipped, all others are passed to add_story520_info().
 */
function index()
{
    $story_model = "story_model";
    $this->load->model($story_model);
    $where = array("story_id > 445");
    $rows_story = $this->$story_model->get($where);
    if (!$rows_story) {
        return; // nothing to crawl
    }
    foreach ($rows_story as $val_story) {
        if ($val_story->story_id < 612) {
            continue;
        }
        // Rebuild the site-relative path from segments 1-4 of the stored story URL.
        $story_url_arr = explode("/",$val_story->story_url);
        $story_url = $story_url_arr[1]."/".$story_url_arr[2]."/".$story_url_arr[3]."/".$story_url_arr[4];
        $dest_url = "http://www.xiaoshuo520.com/".$story_url;
        $src_content = CS_file_get_contents($dest_url);
        if (!$src_content) {
            // NOTE(review): assumes CS_file_get_contents yields an empty/false value on failure — confirm.
            $this->log("<font color = red>抓取失败 $dest_url </font>,跳过");
            continue;
        }
        $src_content = iconv("GBK","utf-8//IGNORE",$src_content);
        // Drop title="..." attributes and <li> wrappers, and reduce every <a ...> to "<a href...".
        $src_content = preg_replace("/title=\"[^\"]*\"/iU","",$src_content);
        $src_content = preg_replace("/<LI[^>]*>/iU","",$src_content);
        $src_content = preg_replace("/<\/LI[^>]*>/iU","",$src_content);
        $src_content = preg_replace("/<a(?!href)[\d\D]*href/iU","<a href",$src_content);
        // Collapse whitespace runs, then turn remaining line breaks / tabs into spaces.
        $src_content = preg_replace('/\s(?=\s)/', '', $src_content);
        $src_content = preg_replace('/[\n\r\t]/', ' ', $src_content);
        $src_content = str_replace("http://www.xiaoxiaoshuo.net/yanqingxiaoshuo2/tijiaxinniang/","",$src_content);
        // Group the page by volume: one match per block.
        preg_match_all("/(<div\s*id=\"NclassTitle\">[\d\D]*)<div\s*id=\"ListEnd/iU",$src_content,$arr_dstorycate);
        if (empty($arr_dstorycate[1])) {
            continue;
        }
        foreach ($arr_dstorycate[1] as $val_dstory_cate) {
            preg_match_all("/<div\s*id=\"NclassTitle\">([\d\D]*)<\/div>/i",$val_dstory_cate,$dcate_title);
            if (!isset($dcate_title[1][0])) {
                continue; // block without a volume heading: nothing to file the chapters under
            }
            $dtitle = $dcate_title[1][0];
            // Resolve the category row; check_dcate also marks earlier categories as published.
            $obj_storycate = $this->check_dcate($dtitle,$val_story);
            if (!$obj_storycate) {
                continue;
            }
            if ($obj_storycate->dstorycate_ishot == 1) {
                $this->log( "<font color = gray>已此章节已抓取完 $val_story->story_title - $dtitle </font>,跳过");
                continue;
            }
            // Group 1: chapter link targets, group 2: link texts (chapter titles).
            preg_match_all("/<a\s*href=\"([^\"]*)\"[^>]*>(?!<\/a>)([\d\D]*)<\/a>/iU",$val_dstory_cate,$dinfo_list);
            $list_story_url = $dinfo_list[1];
            $list_story_title = $dinfo_list[2];
            $this->add_story520_info($obj_storycate,$list_story_url,$list_story_title,$story_url);
        }
    }
}
/**
 * Returns the category (volume) row for a story, creating it when it does not exist yet.
 *
 * Side effect: every category of the same story with a smaller dstorycate_id than the
 * returned one gets dstorycate_published = 1.
 *
 * @param string $title     Volume title as scraped from the index page.
 * @param object $obj_story Story row; story_id and story_title are read.
 * @return object|null The category row, or null when it could not be read back after insert.
 */
function check_dcate($title,$obj_story)
{
    $dstorycate_model = "dstorycate_model";
    $this->load->model($dstorycate_model);
    $story_id = intval($obj_story->story_id);
    // The title is scraped text and may contain quotes; escape() quotes and escapes it so
    // it cannot break (or inject into) the WHERE fragment.
    $title_sql = $this->db->escape((string) $title);
    $where = array("dstorycate_pid = $story_id","dstorycate_title = $title_sql");
    $rows = $this->$dstorycate_model->get($where);
    if (!$rows) {
        $datacate["dstorycate_pid"] = $obj_story->story_id;
        $datacate["dstorycate_title"] = $title;
        $this->$dstorycate_model->insert($datacate);
        $obj_cate_id = intval($this->db->insert_id());
        // Read the new row back so that callers get a full row object.
        $where = array("dstorycate_id = $obj_cate_id","dstorycate_title = $title_sql");
        $rows = $this->$dstorycate_model->get($where);
        $this->log( "此书没有相关类别,将进行添加 小说$obj_story->story_title - $title ");
    } else {
        $this->log( "已存在相关小说类别 $obj_story->story_title - $title ,跳过");
    }
    if (!$rows) {
        // Insert or read-back failed; without an id the UPDATE below would be malformed.
        return null;
    }
    $obj_cate = $rows[0];
    $cate_id = intval($obj_cate->dstorycate_id);
    // Mark all earlier categories of this story as published.
    $sql = "update dstorycate set dstorycate_published = 1 where dstorycate_pid = $story_id AND dstorycate_id < $cate_id ";
    $this->db->query($sql);
    return $obj_cate;
}
/**
 * Stores the chapters of one xiaoshuo520 volume.
 *
 * Chapter links whose list index is below the category's dstorycate_pvcount are skipped;
 * every other link is passed to check_dstory() with the "grep_520_info" fetch handler.
 *
 * @param object $cate_obj         Category row (dstorycate_pvcount is read).
 * @param array  $list_story_url   Chapter link targets, relative to $url.
 * @param array  $list_story_title Chapter titles, same keys as $list_story_url.
 * @param string $url              Site-relative path of the story.
 */
function add_story520_info($cate_obj,$list_story_url,$list_story_title,$url)
{
    $dstory_model = "dstory_model";
    $this->load->model($dstory_model);
    $first_new = (int) $cate_obj->dstorycate_pvcount;
    foreach ($list_story_url as $idx => $chapter_path) {
        if ($idx >= $first_new) {
            $chapter_url = "http://www.xiaoshuo520.com/".$url."/".$chapter_path;
            $this->check_dstory($cate_obj,$chapter_url,$list_story_title[$idx],"grep_520_info");
        }
    }
}

/**
 * Stores the chapters of one xiaoxiaoshuo volume.
 *
 * Same as add_story520_info(), except that skipped chapters are logged, $url is used as an
 * absolute prefix, and the fetch handler is "grep_xxs_info".
 *
 * @param object $cate_obj         Category row (dstorycate_pvcount, id and title are read).
 * @param array  $list_story_url   Chapter link targets, relative to $url.
 * @param array  $list_story_title Chapter titles, same keys as $list_story_url.
 * @param string $url              URL prefix of the story.
 */
function add_storyxxs_info($cate_obj,$list_story_url,$list_story_title,$url)
{
    $dstory_model = "dstory_model";
    $this->load->model($dstory_model);
    $first_new = (int) $cate_obj->dstorycate_pvcount;
    foreach ($list_story_url as $idx => $chapter_path) {
        if ($idx >= $first_new) {
            $this->check_dstory($cate_obj,$url."/".$chapter_path,$list_story_title[$idx],"grep_xxs_info");
        } else {
            $this->log("$cate_obj->dstorycate_id 号 $cate_obj->dstorycate_title ".$list_story_title[$idx]." 章 $idx < $first_new ");
        }
    }
}

/**
 * Re-fetches the text of every chapter row whose dstory_info is NULL using grep_xxs_info(),
 * then enables the output profiler.
 */
function patch_forxxs()
{
    $dstory_model = "dstory_model";
    $this->load->model($dstory_model);
    $dstorycate_model = "dstorycate_model";
    $this->load->model($dstorycate_model);
    $pending = $this->$dstory_model->get(array("dstory_info is null"));
    foreach ($pending as $chapter) {
        $fields = array("dstory_info" => $this->grep_xxs_info($chapter->dstory_url));
        $this->$dstory_model->update_by_id($fields,$chapter->dstory_id);
    }
    $this->output->enable_profiler(TRUE);
}

/**
 * Re-fetches chapters selected by getinfo() (text contains "<img", story_id > 455,
 * dstory_status != 2; second argument 1000) using grep_520_info() and sets their
 * dstory_status to 2.
 */
function patch_for520()
{
    $dstory_model = "dstory_model";
    $this->load->model($dstory_model);
    $dstorycate_model = "dstorycate_model";
    $this->load->model($dstorycate_model);
    $criteria = array("dstory_info like '%<img%'","story_id > 455","dstory_status != 2");
    $pending = $this->$dstory_model->getinfo($criteria,1000);
    foreach ($pending as $chapter) {
        $fields = array(
            "dstory_info" => $this->grep_520_info($chapter->dstory_url),
            "dstory_status" => 2,
        );
        $this->$dstory_model->update_by_id($fields,$chapter->dstory_id);
    }
}
/**
 * Stores one chapter unless a chapter with the same title already exists in the category.
 *
 * For a new chapter the text is fetched through the handler method named by $handle, the
 * row is inserted, and change_max_dstoryid() and update_jump() are run for the category.
 * An existing chapter is only logged.
 *
 * @param object $cate_obj Category row (dstorycate_id is read).
 * @param string $url      Absolute chapter URL.
 * @param string $title    Chapter title as scraped.
 * @param string $handle   Name of the method on this controller that fetches the chapter
 *                         text (callers pass "grep_520_info" or "grep_xxs_info").
 */
function check_dstory($cate_obj,$url,$title,$handle)
{
    $dstory_model = "dstory_model";
    $this->load->model($dstory_model);
    $cate_id = intval($cate_obj->dstorycate_id);
    // The title is scraped text and may contain quotes; escape() quotes and escapes it so
    // it cannot break (or inject into) the WHERE fragment.
    $title_sql = $this->db->escape((string) $title);
    $where = array("dstory_cid = $cate_id","dstory_title = $title_sql");
    $rows = $this->$dstory_model->get($where);
    if (!$rows) {
        $data["dstory_url"] = $url;
        $data["dstory_cid"] = $cate_obj->dstorycate_id;
        $data["dstory_title"] = $title;
        $data["dstory_info"] = $this->$handle($url);
        $this->$dstory_model->insert($data);
        $this->change_max_dstoryid($cate_obj->dstorycate_id);
        $this->log( "正在采$cate_obj->dstorycate_id :$title---- ");
        $this->update_jump($cate_obj);
    } else {
        $this->log( "已采集 $cate_obj->dstorycate_id ".$rows[0]->dstory_id."$title 跳过 ");
    }
}

/**
 * After a chapter was added, sets dstory_status = 0 on the chapter with the highest
 * dstory_id in the given category.
 *
 * @param int $cate_id Category id (dstory_cid).
 */
function change_max_dstoryid($cate_id)
{
    // NOTE(review): this raw echo looks like leftover debug output; kept so that visible
    // output is unchanged — confirm whether it can be dropped.
    echo $cate_id;
    $cate_id = intval($cate_id);
    $sql = "select max(dstory_id) as max_id from dstory where dstory_cid = $cate_id";
    $query = $this->db->query($sql);
    $row = $query->row();
    if (!$row || $row->max_id === null) {
        // No chapter in this category: max() is NULL and the UPDATE would be malformed.
        return;
    }
    $max_id = intval($row->max_id);
    $sql = "update dstory set dstory_status = 0 where dstory_id = $max_id ";
    $this->log($sql);
    $this->db->query($sql);
}
/**
 * Fetches the text of one xiaoshuo520 chapter.
 *
 * The chapter page embeds the text through an <iframe>; the iframe target is downloaded,
 * converted from GBK to UTF-8, and the <div id="BookText"> ... <script span is extracted.
 * Hidden <div style='display: none;'> blocks are removed from the result.
 *
 * @param string $url Chapter page URL.
 * @return string|null Extracted HTML, or null when the iframe or the text block is not found.
 */
function grep_520_info($url)
{
    $cont = (string) CS_file_get_contents($url);
    preg_match_all("/<iframe(?!src)[\d\D]*src=\"([^\"]*)\"[^>]*>/i",$cont,$inner_links);
    if (empty($inner_links[1][0])) {
        return null; // no iframe: nothing to download
    }
    $real_cont = (string) CS_file_get_contents($inner_links[1][0]);
    $real_cont = iconv("GBK","utf-8//IGNORE",$real_cont);
    preg_match_all("/<div\s*id=\"BookText\">(?!<script)([\d\D]*)<script/iU",$real_cont,$real_cont_info);
    if (!isset($real_cont_info[0][0])) {
        return null;
    }
    // Index 0 is the whole match, i.e. including the opening div and the trailing "<script".
    $dstory_info = $real_cont_info[0][0];
    // The pattern is a regular expression, so it needs preg_replace (str_replace compared it
    // literally and never matched); the "/" in </div> must be escaped inside /.../ delimiters.
    $cleaned = preg_replace("/<div\s*style='display:\s*none;'>[\d\D]*<\/div>/iU","",$dstory_info);
    // preg_replace returns null on a PCRE error; fall back to the unstripped text then.
    return $cleaned === null ? $dstory_info : $cleaned;
}

/**
 * Fetches the text of one xiaoxiaoshuo chapter: downloads the page, converts it from GBK
 * to UTF-8 and returns the inner HTML of the <div id="zf" ...> element.
 *
 * @param string $url Chapter page URL.
 * @return string|null Extracted HTML, or null when the element is not found.
 */
function grep_xxs_info($url)
{
    $cont = (string) CS_file_get_contents($url);
    $cont = iconv("GBK","utf-8//IGNORE",$cont);
    preg_match_all("/<div\s*id=\"zf\"\s*style=\"color:#000;font-size:16px;line-height:23px;padding:0px\s*6px\s*0px\s*20px;\">([\d\D]*)<\/div>/iU",$cont,$cont_arr);
    if (!isset($cont_arr[1][0])) {
        return null;
    }
    return $cont_arr[1][0];
}

/**
 * Stores the current number of chapters of a category in its dstorycate_pvcount column.
 * add_story520_info() / add_storyxxs_info() read that value to skip chapter links whose
 * list index is below it.
 *
 * @param object $cate_obj Category row (dstorycate_id is read).
 */
function update_jump($cate_obj)
{
    $dstorycate_model = "dstorycate_model";
    $this->load->model($dstorycate_model);
    $dstory_model = "dstory_model";
    $this->load->model($dstory_model);
    $where = array("dstory_cid = $cate_obj->dstorycate_id");
    $count = $this->$dstory_model->get_count($where);
    $data["dstorycate_pvcount"] = intval($count);
    $this->$dstorycate_model->update_by_id($data,$cate_obj->dstorycate_id);
}