最近用火车头、ET 采集小说,按它们的规则经常配置不出来,碰到像小说520 这种把正文放在 iframe 里的站点更是直接挂掉,只好自己写了一个。刚开始觉得就是两条正则(列表、内容)就能解决的事,写着写着便复杂起来了。
认真改了几版,碰到的最大麻烦是如何封装代码,使采集不同站点时的改动成本较小。这里小小地用了一个策略(决策者)模式,然后把该封装的功能封装起来;另外加了再次采集时对已采集章节的跳过机制,避免重采(毕竟一个小说站有好几万篇文章,中断一次却接不上去,是很郁闷的事)。
class grep extends Controller {
// Controller settings. None of these properties is read by the methods visible
// in this file; presumably the base Controller consumes them — TODO confirm.
var $tableName          = 'grep';
var $pagesize           = 31;
var $order_string       = "grep_order desc,grep_id desc";
var $filter_field       = "grep_title";
var $check_repeat_field = "grep_title";
var $buttons            = array();
// Label for this controller (the literal means "[crawl novels]").
var $description        = "[爬取小说]";
/**
 * Crawl the chapter index pages of www.xiaoxiaoshuo.net for every story with
 * story_id < 445.
 *
 * Each index page is fetched, converted from GBK to UTF-8, stripped of markup
 * noise and split into volume sections. For every section the volume record
 * is resolved through check_dcate() and its chapter links are handed to
 * add_storyxxs_info(). Volumes whose dstorycate_ishot flag equals 1 are
 * skipped.
 *
 * @return void
 */
function index()
{
    //get the story list
    $story_model = "story_model";
    $this->load->model($story_model);
    $where = array("story_id < 445");
    $rows_story = $this->$story_model->get($where);
    if (!$rows_story) {
        return;
    }
    foreach ($rows_story as $key => $val_story) {
        // Hard-coded resume point: the first 237 result rows are skipped.
        if ($key < 237) {
            continue;
        }
        $story_url = "http://www.xiaoxiaoshuo.net/".$val_story->storycate_vtitle."/".$val_story->story_vtitle;
        $src_content = file_get_contents($story_url);
        if ($src_content === false) {
            // Do not feed a failed download into the parsing pipeline.
            $this->log("<font color = red>抓取失败 $story_url </font>,跳过");
            continue;
        }
        $src_content = iconv("GBK","utf-8//IGNORE",$src_content);
        if ($src_content === false) {
            $this->log("<font color = red>转码失败 $story_url </font>,跳过");
            continue;
        }
        // The first pattern is a regular expression, so it must go through
        // preg_replace(); str_replace() treated it as a literal and never matched.
        $src_content = preg_replace("/style=\"border-width:0px\s*1px\s*1px\s*0px;border-color:#C8D8B8;border-style:solid;padding:3px;float:left;width:313px;\"/i","",$src_content);
        $src_content = str_replace("style=\"BORDER-RIGHT: #c8d8b8 1px solid; PADDING-RIGHT: 3px; BORDER-TOP: #c8d8b8 0px solid; PADDING-LEFT: 3px; FLOAT: left; PADDING-BOTTOM: 3px; BORDER-LEFT: #c8d8b8 0px solid; WIDTH: 313px; PADDING-TOP: 3px; BORDER-BOTTOM: #c8d8b8 1px solid\"","",$src_content);
        // Drop title attributes and list-item tags, then normalise every
        // anchor so that "href" directly follows "<a".
        $src_content = preg_replace("/title=\"[^\"]*\"/iU","",$src_content);
        $src_content = preg_replace("/<LI[^>]*>/iU","",$src_content);
        $src_content = preg_replace("/<\/LI[^>]*>/iU","",$src_content);
        $src_content = preg_replace("/<a(?!href)[\d\D]*href/iU","<a href",$src_content);
        // Collapse whitespace runs, then turn remaining line breaks/tabs into spaces.
        $src_content = preg_replace('/\s(?=\s)/', '', $src_content);
        $src_content = preg_replace('/[\n\r\t]/', ' ', $src_content);
        // NOTE(review): only this one story's absolute prefix is stripped;
        // looks like a leftover from testing a single book — confirm.
        $src_content = str_replace("http://www.xiaoxiaoshuo.net/yanqingxiaoshuo2/tijiaxinniang/","",$src_content);
        // One captured section per volume.
        preg_match_all("/<td\s*bgcolor=\"#EDF5EA\"([\d\D]*)<\/ul>/iU",$src_content,$arr_dstorycate);
        foreach ($arr_dstorycate[1] as $val_dstory_cate) {
            if (!preg_match("/<font\s*color=\"#000000\">([^<]*)<\/font>/i",$val_dstory_cate,$dcate_title)) {
                // Without a title check_dcate() would create an empty-title
                // volume and flag all earlier volumes, so skip the section.
                $this->log("<font color = red>未找到分卷标题 $story_url </font>,跳过");
                continue;
            }
            $dtitle = $dcate_title[1];
            // Resolve (or create) the volume record; check_dcate() also flags
            // the volumes that precede it.
            $obj_storycate = $this->check_dcate($dtitle,$val_story);
            if ($obj_storycate->dstorycate_ishot == 1) {
                $this->log( "<font color = gray>已此章节已抓取完 $val_story->story_title - $dtitle </font>,跳过");
                continue;
            }
            // Group 1 = chapter href, group 2 = chapter title.
            preg_match_all("/<a\s*href=\"([^\"]*)\"[^>]*>(?!<\/a>)([\d\D]*)<\/a>/iU",$val_dstory_cate,$dinfo_list);
            $this->add_storyxxs_info($obj_storycate,$dinfo_list[1],$dinfo_list[2],$story_url);
        }
    }
}
/**
 * Print crawl progress as "total--done--maxCateId--maxPid".
 *
 * Echoes the number of dstory rows, the number of dstory rows with
 * dstory_status = 1, and the largest dstorycate_id / dstorycate_pid.
 */
function status()
{
    $total_sql = "select count(dstory_id) as all_story from dstory;";
    $total_res = $this->db->query($total_sql);
    $total_row = $total_res->row();
    echo $total_row->all_story;

    $done_sql = "select count(dstory_id) as story1 from dstory where dstory_status = 1";
    $done_res = $this->db->query($done_sql);
    $done_row = $done_res->row();
    echo "--", $done_row->story1;

    $max_sql = "select max(dstorycate_id) as max_id,max(dstorycate_pid) as max_pid from dstorycate";
    $max_res = $this->db->query($max_sql);
    $max_row = $max_res->row();
    echo "--", $max_row->max_id, "--", $max_row->max_pid;
}
/**
 * Crawl the chapter index pages of www.xiaoshuo520.com for every story with
 * story_id > 445.
 *
 * Each index page is fetched through CS_file_get_contents(), converted from
 * GBK to UTF-8 and split into volume sections. For every section the volume
 * record is resolved through check_dcate() and its chapter links are handed
 * to add_story520_info(). Volumes whose dstorycate_ishot flag equals 1 are
 * skipped.
 *
 * @return void
 */
function index445()
{
    $story_model = "story_model";
    $this->load->model($story_model);
    $where = array("story_id > 445");
    $rows_story = $this->$story_model->get($where);
    if (!$rows_story) {
        return;
    }
    foreach ($rows_story as $val_story) {
        //get the story_content
        // Assumes story_url has at least five "/"-separated segments — TODO confirm.
        $story_url_arr = explode("/",$val_story->story_url);
        $story_url = $story_url_arr[1]."/".$story_url_arr[2]."/".$story_url_arr[3]."/".$story_url_arr[4];
        $dest_url = "http://www.xiaoshuo520.com/".$story_url;
        $src_content = CS_file_get_contents($dest_url);
        if (!$src_content) {
            // Do not feed a failed/empty download into the parsing pipeline.
            $this->log("<font color = red>抓取失败 $dest_url </font>,跳过");
            continue;
        }
        $src_content = iconv("GBK","utf-8//IGNORE",$src_content);
        // Split the page into one section per volume.
        preg_match_all("/(<div\s*id=\"NclassTitle\">[\d\D]*)<div\s*id=\"ListEnd/iU",$src_content,$arr_dstorycate);
        foreach ($arr_dstorycate[1] as $val_dstory_cate) {
            // NOTE(review): this pattern is greedy, so the title runs up to the
            // LAST </div> of the section — confirm that is intended.
            if (!preg_match("/<div\s*id=\"NclassTitle\">([\d\D]*)<\/div>/i",$val_dstory_cate,$dcate_title)) {
                // Without a title check_dcate() would create an empty-title
                // volume and flag all earlier volumes, so skip the section.
                $this->log("<font color = red>未找到分卷标题 $dest_url </font>,跳过");
                continue;
            }
            $dtitle = $dcate_title[1];
            // Resolve (or create) the volume record; check_dcate() also flags
            // the volumes that precede it.
            $obj_storycate = $this->check_dcate($dtitle,$val_story);
            if ($obj_storycate->dstorycate_ishot == 1) {
                $this->log( "<font color = gray>已此章节已抓取完 $val_story->story_title - $dtitle </font>,跳过");
                continue;
            }
            // Group 1 = chapter href, group 2 = chapter title.
            preg_match_all("/<a\s*href=\"([^\"]*)\"[^>]*>(?!<\/a>)([\d\D]*)<\/a>/iU",$val_dstory_cate,$dinfo_list);
            $this->add_story520_info($obj_storycate,$dinfo_list[1],$dinfo_list[2],$story_url);
        }
    }
}
/**
 * Look up the volume (dstorycate) record of a story by title, creating it
 * when it does not exist yet.
 *
 * As a side effect every volume of the same story with a smaller
 * dstorycate_id gets dstorycate_published = 1.
 * NOTE(review): index()/index445() test dstorycate_ishot, not
 * dstorycate_published, when deciding what to skip — confirm the two
 * columns are meant to differ.
 *
 * @param string $title     Volume title scraped from the index page.
 * @param object $obj_story Story row; story_id and story_title are read.
 * @return object The matching or newly inserted dstorycate row.
 */
function check_dcate($title,$obj_story)
{
    $dstorycate_model = "dstorycate_model";
    $this->load->model($dstorycate_model);
    // The title comes from scraped HTML: let the DB driver quote/escape it so
    // an apostrophe can neither break nor inject into the WHERE clause.
    $title_sql = $this->db->escape((string) $title);
    $where = array("dstorycate_pid = $obj_story->story_id","dstorycate_title = $title_sql");
    $rows = $this->$dstorycate_model->get($where);
    if (!$rows) {
        $datacate["dstorycate_pid"] = $obj_story->story_id;
        $datacate["dstorycate_title"] = $title;
        $this->$dstorycate_model->insert($datacate);
        $obj_cate_id = $this->db->insert_id();
        // Read the fresh row back so the caller gets a full record.
        $where = array("dstorycate_id = $obj_cate_id","dstorycate_title = $title_sql");
        $rows = $this->$dstorycate_model->get($where);
        $this->log( "此书没有相关类别,将进行添加 小说$obj_story->story_title - $title ");
    } else {
        $this->log( "已存在相关小说类别 $obj_story->story_title - $title ,跳过");
    }
    $obj_cate = $rows[0];
    // Flag every earlier volume of this story. Standard AND replaces the
    // MySQL-only "&&" operator.
    $sql = "update dstorycate set dstorycate_published = 1 where dstorycate_pid = $obj_story->story_id AND dstorycate_id < $obj_cate->dstorycate_id ";
    $this->db->query($sql);
    return $obj_cate;
}
/**
 * Store the chapters of one xiaoshuo520.com volume.
 *
 * Chapters whose list index is below the volume's dstorycate_pvcount are
 * skipped; every other chapter goes to check_dstory() with the
 * "grep_520_info" content handler.
 *
 * @param object $cate_obj         Volume row (dstorycate).
 * @param array  $list_story_url   Chapter hrefs, relative to $url.
 * @param array  $list_story_title Chapter titles, same keys as the hrefs.
 * @param string $url              Story path appended to the site root.
 */
function add_story520_info($cate_obj,$list_story_url,$list_story_title,$url)
{
    $model_name = "dstory_model";
    $this->load->model($model_name);
    $resume_from = intval($cate_obj->dstorycate_pvcount);
    foreach ($list_story_url as $idx => $chapter_path) {
        if ($idx >= $resume_from) {
            $chapter_url = "http://www.xiaoshuo520.com/".$url."/".$chapter_path;
            $this->check_dstory($cate_obj,$chapter_url,$list_story_title[$idx],"grep_520_info");
        }
    }
}

/**
 * Store the chapters of one xiaoxiaoshuo.net volume.
 *
 * Same contract as add_story520_info(), except that $url is already an
 * absolute story URL, skipped chapters are logged, and the content handler
 * is "grep_xxs_info".
 *
 * @param object $cate_obj         Volume row (dstorycate).
 * @param array  $list_story_url   Chapter hrefs, relative to $url.
 * @param array  $list_story_title Chapter titles, same keys as the hrefs.
 * @param string $url              Absolute story URL.
 */
function add_storyxxs_info($cate_obj,$list_story_url,$list_story_title,$url)
{
    $model_name = "dstory_model";
    $this->load->model($model_name);
    $resume_from = intval($cate_obj->dstorycate_pvcount);
    foreach ($list_story_url as $idx => $chapter_path) {
        if ($idx < $resume_from) {
            $this->log("$cate_obj->dstorycate_id 号 $cate_obj->dstorycate_title ".$list_story_title[$idx]." 章 $idx < $resume_from ");
        } else {
            $this->check_dstory($cate_obj,$url."/".$chapter_path,$list_story_title[$idx],"grep_xxs_info");
        }
    }
}
受字数限制,未完,接着贴。
认真改了几版,碰到的最大麻烦是如何封装代码,使采集不同站点时的改动成本较小。这里小小地用了一个策略(决策者)模式,然后把该封装的功能封装起来;另外加了再次采集时对已采集章节的跳过机制,避免重采(毕竟一个小说站有好几万篇文章,中断一次却接不上去,是很郁闷的事)。
class grep extends Controller {
// Controller settings. None of these properties is read by the methods visible
// in this file; presumably the base Controller consumes them — TODO confirm.
var $tableName          = 'grep';
var $pagesize           = 31;
var $order_string       = "grep_order desc,grep_id desc";
var $filter_field       = "grep_title";
var $check_repeat_field = "grep_title";
var $buttons            = array();
// Label for this controller (the literal means "[crawl novels]").
var $description        = "[爬取小说]";
/**
 * Crawl the chapter index pages of www.xiaoxiaoshuo.net for every story with
 * story_id < 445.
 *
 * Each index page is fetched, converted from GBK to UTF-8, stripped of markup
 * noise and split into volume sections. For every section the volume record
 * is resolved through check_dcate() and its chapter links are handed to
 * add_storyxxs_info(). Volumes whose dstorycate_ishot flag equals 1 are
 * skipped.
 *
 * @return void
 */
function index()
{
    //get the story list
    $story_model = "story_model";
    $this->load->model($story_model);
    $where = array("story_id < 445");
    $rows_story = $this->$story_model->get($where);
    if (!$rows_story) {
        return;
    }
    foreach ($rows_story as $key => $val_story) {
        // Hard-coded resume point: the first 237 result rows are skipped.
        if ($key < 237) {
            continue;
        }
        $story_url = "http://www.xiaoxiaoshuo.net/".$val_story->storycate_vtitle."/".$val_story->story_vtitle;
        $src_content = file_get_contents($story_url);
        if ($src_content === false) {
            // Do not feed a failed download into the parsing pipeline.
            $this->log("<font color = red>抓取失败 $story_url </font>,跳过");
            continue;
        }
        $src_content = iconv("GBK","utf-8//IGNORE",$src_content);
        if ($src_content === false) {
            $this->log("<font color = red>转码失败 $story_url </font>,跳过");
            continue;
        }
        // The first pattern is a regular expression, so it must go through
        // preg_replace(); str_replace() treated it as a literal and never matched.
        $src_content = preg_replace("/style=\"border-width:0px\s*1px\s*1px\s*0px;border-color:#C8D8B8;border-style:solid;padding:3px;float:left;width:313px;\"/i","",$src_content);
        $src_content = str_replace("style=\"BORDER-RIGHT: #c8d8b8 1px solid; PADDING-RIGHT: 3px; BORDER-TOP: #c8d8b8 0px solid; PADDING-LEFT: 3px; FLOAT: left; PADDING-BOTTOM: 3px; BORDER-LEFT: #c8d8b8 0px solid; WIDTH: 313px; PADDING-TOP: 3px; BORDER-BOTTOM: #c8d8b8 1px solid\"","",$src_content);
        // Drop title attributes and list-item tags, then normalise every
        // anchor so that "href" directly follows "<a".
        $src_content = preg_replace("/title=\"[^\"]*\"/iU","",$src_content);
        $src_content = preg_replace("/<LI[^>]*>/iU","",$src_content);
        $src_content = preg_replace("/<\/LI[^>]*>/iU","",$src_content);
        $src_content = preg_replace("/<a(?!href)[\d\D]*href/iU","<a href",$src_content);
        // Collapse whitespace runs, then turn remaining line breaks/tabs into spaces.
        $src_content = preg_replace('/\s(?=\s)/', '', $src_content);
        $src_content = preg_replace('/[\n\r\t]/', ' ', $src_content);
        // NOTE(review): only this one story's absolute prefix is stripped;
        // looks like a leftover from testing a single book — confirm.
        $src_content = str_replace("http://www.xiaoxiaoshuo.net/yanqingxiaoshuo2/tijiaxinniang/","",$src_content);
        // One captured section per volume.
        preg_match_all("/<td\s*bgcolor=\"#EDF5EA\"([\d\D]*)<\/ul>/iU",$src_content,$arr_dstorycate);
        foreach ($arr_dstorycate[1] as $val_dstory_cate) {
            if (!preg_match("/<font\s*color=\"#000000\">([^<]*)<\/font>/i",$val_dstory_cate,$dcate_title)) {
                // Without a title check_dcate() would create an empty-title
                // volume and flag all earlier volumes, so skip the section.
                $this->log("<font color = red>未找到分卷标题 $story_url </font>,跳过");
                continue;
            }
            $dtitle = $dcate_title[1];
            // Resolve (or create) the volume record; check_dcate() also flags
            // the volumes that precede it.
            $obj_storycate = $this->check_dcate($dtitle,$val_story);
            if ($obj_storycate->dstorycate_ishot == 1) {
                $this->log( "<font color = gray>已此章节已抓取完 $val_story->story_title - $dtitle </font>,跳过");
                continue;
            }
            // Group 1 = chapter href, group 2 = chapter title.
            preg_match_all("/<a\s*href=\"([^\"]*)\"[^>]*>(?!<\/a>)([\d\D]*)<\/a>/iU",$val_dstory_cate,$dinfo_list);
            $this->add_storyxxs_info($obj_storycate,$dinfo_list[1],$dinfo_list[2],$story_url);
        }
    }
}
/**
 * Print crawl progress as "total--done--maxCateId--maxPid".
 *
 * Echoes the number of dstory rows, the number of dstory rows with
 * dstory_status = 1, and the largest dstorycate_id / dstorycate_pid.
 */
function status()
{
    $total_sql = "select count(dstory_id) as all_story from dstory;";
    $total_res = $this->db->query($total_sql);
    $total_row = $total_res->row();
    echo $total_row->all_story;

    $done_sql = "select count(dstory_id) as story1 from dstory where dstory_status = 1";
    $done_res = $this->db->query($done_sql);
    $done_row = $done_res->row();
    echo "--", $done_row->story1;

    $max_sql = "select max(dstorycate_id) as max_id,max(dstorycate_pid) as max_pid from dstorycate";
    $max_res = $this->db->query($max_sql);
    $max_row = $max_res->row();
    echo "--", $max_row->max_id, "--", $max_row->max_pid;
}
/**
 * Crawl the chapter index pages of www.xiaoshuo520.com for every story with
 * story_id > 445.
 *
 * Each index page is fetched through CS_file_get_contents(), converted from
 * GBK to UTF-8 and split into volume sections. For every section the volume
 * record is resolved through check_dcate() and its chapter links are handed
 * to add_story520_info(). Volumes whose dstorycate_ishot flag equals 1 are
 * skipped.
 *
 * @return void
 */
function index445()
{
    $story_model = "story_model";
    $this->load->model($story_model);
    $where = array("story_id > 445");
    $rows_story = $this->$story_model->get($where);
    if (!$rows_story) {
        return;
    }
    foreach ($rows_story as $val_story) {
        //get the story_content
        // Assumes story_url has at least five "/"-separated segments — TODO confirm.
        $story_url_arr = explode("/",$val_story->story_url);
        $story_url = $story_url_arr[1]."/".$story_url_arr[2]."/".$story_url_arr[3]."/".$story_url_arr[4];
        $dest_url = "http://www.xiaoshuo520.com/".$story_url;
        $src_content = CS_file_get_contents($dest_url);
        if (!$src_content) {
            // Do not feed a failed/empty download into the parsing pipeline.
            $this->log("<font color = red>抓取失败 $dest_url </font>,跳过");
            continue;
        }
        $src_content = iconv("GBK","utf-8//IGNORE",$src_content);
        // Split the page into one section per volume.
        preg_match_all("/(<div\s*id=\"NclassTitle\">[\d\D]*)<div\s*id=\"ListEnd/iU",$src_content,$arr_dstorycate);
        foreach ($arr_dstorycate[1] as $val_dstory_cate) {
            // NOTE(review): this pattern is greedy, so the title runs up to the
            // LAST </div> of the section — confirm that is intended.
            if (!preg_match("/<div\s*id=\"NclassTitle\">([\d\D]*)<\/div>/i",$val_dstory_cate,$dcate_title)) {
                // Without a title check_dcate() would create an empty-title
                // volume and flag all earlier volumes, so skip the section.
                $this->log("<font color = red>未找到分卷标题 $dest_url </font>,跳过");
                continue;
            }
            $dtitle = $dcate_title[1];
            // Resolve (or create) the volume record; check_dcate() also flags
            // the volumes that precede it.
            $obj_storycate = $this->check_dcate($dtitle,$val_story);
            if ($obj_storycate->dstorycate_ishot == 1) {
                $this->log( "<font color = gray>已此章节已抓取完 $val_story->story_title - $dtitle </font>,跳过");
                continue;
            }
            // Group 1 = chapter href, group 2 = chapter title.
            preg_match_all("/<a\s*href=\"([^\"]*)\"[^>]*>(?!<\/a>)([\d\D]*)<\/a>/iU",$val_dstory_cate,$dinfo_list);
            $this->add_story520_info($obj_storycate,$dinfo_list[1],$dinfo_list[2],$story_url);
        }
    }
}
/**
 * Look up the volume (dstorycate) record of a story by title, creating it
 * when it does not exist yet.
 *
 * As a side effect every volume of the same story with a smaller
 * dstorycate_id gets dstorycate_published = 1.
 * NOTE(review): index()/index445() test dstorycate_ishot, not
 * dstorycate_published, when deciding what to skip — confirm the two
 * columns are meant to differ.
 *
 * @param string $title     Volume title scraped from the index page.
 * @param object $obj_story Story row; story_id and story_title are read.
 * @return object The matching or newly inserted dstorycate row.
 */
function check_dcate($title,$obj_story)
{
    $dstorycate_model = "dstorycate_model";
    $this->load->model($dstorycate_model);
    // The title comes from scraped HTML: let the DB driver quote/escape it so
    // an apostrophe can neither break nor inject into the WHERE clause.
    $title_sql = $this->db->escape((string) $title);
    $where = array("dstorycate_pid = $obj_story->story_id","dstorycate_title = $title_sql");
    $rows = $this->$dstorycate_model->get($where);
    if (!$rows) {
        $datacate["dstorycate_pid"] = $obj_story->story_id;
        $datacate["dstorycate_title"] = $title;
        $this->$dstorycate_model->insert($datacate);
        $obj_cate_id = $this->db->insert_id();
        // Read the fresh row back so the caller gets a full record.
        $where = array("dstorycate_id = $obj_cate_id","dstorycate_title = $title_sql");
        $rows = $this->$dstorycate_model->get($where);
        $this->log( "此书没有相关类别,将进行添加 小说$obj_story->story_title - $title ");
    } else {
        $this->log( "已存在相关小说类别 $obj_story->story_title - $title ,跳过");
    }
    $obj_cate = $rows[0];
    // Flag every earlier volume of this story. Standard AND replaces the
    // MySQL-only "&&" operator.
    $sql = "update dstorycate set dstorycate_published = 1 where dstorycate_pid = $obj_story->story_id AND dstorycate_id < $obj_cate->dstorycate_id ";
    $this->db->query($sql);
    return $obj_cate;
}
/**
 * Store the chapters of one xiaoshuo520.com volume.
 *
 * Chapters whose list index is below the volume's dstorycate_pvcount are
 * skipped; every other chapter goes to check_dstory() with the
 * "grep_520_info" content handler.
 *
 * @param object $cate_obj         Volume row (dstorycate).
 * @param array  $list_story_url   Chapter hrefs, relative to $url.
 * @param array  $list_story_title Chapter titles, same keys as the hrefs.
 * @param string $url              Story path appended to the site root.
 */
function add_story520_info($cate_obj,$list_story_url,$list_story_title,$url)
{
    $model_name = "dstory_model";
    $this->load->model($model_name);
    $resume_from = intval($cate_obj->dstorycate_pvcount);
    foreach ($list_story_url as $idx => $chapter_path) {
        if ($idx >= $resume_from) {
            $chapter_url = "http://www.xiaoshuo520.com/".$url."/".$chapter_path;
            $this->check_dstory($cate_obj,$chapter_url,$list_story_title[$idx],"grep_520_info");
        }
    }
}

/**
 * Store the chapters of one xiaoxiaoshuo.net volume.
 *
 * Same contract as add_story520_info(), except that $url is already an
 * absolute story URL, skipped chapters are logged, and the content handler
 * is "grep_xxs_info".
 *
 * @param object $cate_obj         Volume row (dstorycate).
 * @param array  $list_story_url   Chapter hrefs, relative to $url.
 * @param array  $list_story_title Chapter titles, same keys as the hrefs.
 * @param string $url              Absolute story URL.
 */
function add_storyxxs_info($cate_obj,$list_story_url,$list_story_title,$url)
{
    $model_name = "dstory_model";
    $this->load->model($model_name);
    $resume_from = intval($cate_obj->dstorycate_pvcount);
    foreach ($list_story_url as $idx => $chapter_path) {
        if ($idx < $resume_from) {
            $this->log("$cate_obj->dstorycate_id 号 $cate_obj->dstorycate_title ".$list_story_title[$idx]." 章 $idx < $resume_from ");
        } else {
            $this->check_dstory($cate_obj,$url."/".$chapter_path,$list_story_title[$idx],"grep_xxs_info");
        }
    }
}
受字数限制,未完,接着贴。
/**
 * Re-fetch the body of every stored chapter whose dstory_info is NULL.
 *
 * Each such row is downloaded again with grep_xxs_info(), passed through
 * img2local() and written back by id. The profiler output is enabled at the
 * end of the run.
 */
function patch_forxxs()
{
    $story_model_name = "dstory_model";
    $this->load->model($story_model_name);
    $cate_model_name = "dstorycate_model";
    $this->load->model($cate_model_name);

    $pending = $this->$story_model_name->get(array("dstory_info is null"));
    foreach ($pending as $story_row) {
        // $data is intentionally not reset between rows, matching the
        // original flow where img2local()'s result carries over.
        $data["dstory_info"] = $this->grep_xxs_info($story_row->dstory_url);
        $data = img2local($data);
        $this->$story_model_name->update_by_id($data, $story_row->dstory_id);
    }

    $this->output->enable_profiler(TRUE);
}
/**
 * Store one chapter unless a chapter with the same title already exists in
 * the volume.
 *
 * For a new chapter the body is fetched through the method named by $handle,
 * passed through img2local(), inserted, and the volume's resume counter is
 * refreshed via update_jump(). Existing chapters are only logged.
 *
 * @param object $cate_obj Volume row; dstorycate_id is read.
 * @param string $url      Absolute chapter URL.
 * @param string $title    Chapter title scraped from the index page.
 * @param string $handle   Name of the method on this controller that returns
 *                         the chapter body for a URL (e.g. "grep_520_info").
 */
function check_dstory($cate_obj,$url,$title,$handle)
{
    $dstory_model = "dstory_model";
    $this->load->model($dstory_model);
    // The title comes from scraped HTML: let the DB driver quote/escape it so
    // an apostrophe can neither break nor inject into the WHERE clause.
    $title_sql = $this->db->escape((string) $title);
    $where = array("dstory_cid = $cate_obj->dstorycate_id","dstory_title = $title_sql");
    $rows = $this->$dstory_model->get($where);
    if (!$rows) {
        $data["dstory_url"] = $url;
        $data["dstory_cid"] = $cate_obj->dstorycate_id;
        $data["dstory_title"] = $title;
        $data["dstory_info"] = $this->$handle($url);
        $data = img2local($data);
        $this->$dstory_model->insert($data);
        $this->log( "正在采$cate_obj->dstorycate_id :$title---- ");
        $this->update_jump($cate_obj);
    } else {
        $this->log( "已采集 $cate_obj->dstorycate_id ".$rows[0]->dstory_id."$title 跳过 ");
    }
}

/**
 * Fetch the body of a xiaoshuo520.com chapter.
 *
 * The chapter page embeds the text in an iframe: the iframe target is
 * downloaded, converted from GBK to UTF-8, and the BookText block is
 * extracted with its hidden divs removed.
 *
 * @param string $url Chapter page URL.
 * @return string Extracted HTML, or "" when the iframe or the BookText block
 *                cannot be found.
 */
function grep_520_info($url)
{
    $cont = CS_file_get_contents($url);
    // NOTE(review): the pattern is greedy, so it captures the LAST src="..."
    // that follows the first <iframe — confirm against the live markup.
    if (!preg_match("/<iframe(?!src)[\d\D]*src=\"([^\"]*)\"[^>]*>/i",$cont,$inner_links)) {
        return "";
    }
    $real_cont = CS_file_get_contents($inner_links[1]);
    $real_cont = iconv("GBK","utf-8//IGNORE",$real_cont);
    if (!preg_match("/<div\s*id=\"BookText\">(?!<script)([\d\D]*)<script/iU",$real_cont,$real_cont_info)) {
        return "";
    }
    // NOTE(review): the whole match is kept (opening div and trailing
    // "<script" included), as before; group 1 may have been intended.
    $dstory_info = $real_cont_info[0];
    // This is a regular expression, so it needs preg_replace() (and an escaped
    // "/" in "</div>"); str_replace() never matched it.
    $cleaned = preg_replace("/<div\s*style='display:\s*none;'>[\d\D]*<\/div>/iU","",$dstory_info);
    // preg_replace() yields null on a PCRE error: keep the unfiltered text then.
    return $cleaned === null ? $dstory_info : $cleaned;
}

/**
 * Fetch the body of a xiaoxiaoshuo.net chapter.
 *
 * @param string $url Chapter page URL.
 * @return string|null Inner HTML of the "zf" content div, or null when the
 *                     div is not found.
 */
function grep_xxs_info($url)
{
    $cont = CS_file_get_contents($url);
    $cont = iconv("GBK","utf-8//IGNORE",$cont);
    preg_match_all("/<div\s*id=\"zf\"\s*style=\"color:#000;font-size:16px;line-height:23px;padding:0px\s*6px\s*0px\s*20px;\">([\d\D]*)<\/div>/iU",$cont,$cont_arr);
    // Null (not "") on a miss, so the row still matches patch_forxxs()'s
    // "dstory_info is null" filter.
    return isset($cont_arr[1][0]) ? $cont_arr[1][0] : null;
}
// Update the skip (resume) counter
/**
 * Store the number of chapters already saved for a volume in its
 * dstorycate_pvcount column.
 *
 * @param object $cate_obj Volume row; dstorycate_id is read.
 */
function update_jump($cate_obj)
{
    $cate_model_name = "dstorycate_model";
    $this->load->model($cate_model_name);
    $story_model_name = "dstory_model";
    $this->load->model($story_model_name);

    $saved = $this->$story_model_name->get_count(array("dstory_cid = $cate_obj->dstorycate_id"));
    $this->$cate_model_name->update_by_id(
        array("dstorycate_pvcount" => intval($saved)),
        $cate_obj->dstorycate_id
    );
}
// Logging
/**
 * Pass a time-prefixed message to CS_log().
 *
 * The target file lives under DOCUMENT_ROOT/logs and its name carries the
 * current date, hour and minute.
 *
 * @param string $txt Message text (may contain HTML).
 */
function log($txt)
{
    $file_stamp = date("Y-m-dHi");
    $target = $_SERVER['DOCUMENT_ROOT']."/logs/".$file_stamp."grep.html";
    $line = date("H:i:s").$txt;
    CS_log($line, $target);
}
并非这样,只是大家对这段代码不感兴趣而已,也可能和标题有关:
人家一般叫"小偷程序",很少叫"采集";再者,这方面的内容大家已经看倦了,
还有,高手一般比较少用 CMS 的采集。