php文章内容抓取 本帖最后由 yanfangphp 于 2014-08-13 10:14:37 编辑 解决方案 » 免费领取超大流量手机卡,每月29元包185G流量+100分钟通话, 中国电信官方发货 抓取 研究研究 phpquery $url = 'http://sports.sohu.com/zhongchao.shtml';$s = file_get_contents($url);preg_match_all('/(?<=<div class="turn cons">)\s<table.+table>/isU', $s, $m);print_r(preg_grep('/名次/', $m[0]));Array( [2] => <table border=0 cellSpacing=0 cellPadding=0 width="100%"><tbody><tr><th width="15%">名次</th><th width="47%">球队</th><th width="9%">场次</th><th width="29%">积分</th></tr><tr><td>01</td><td><a href="http://sports.sohu.com/s2010/7742/s277701524/" target="_blank">广州恒大</a></td><td>20</td><td>45</td></tr><tr><td>02</td><td><a href="http://sports.sohu.com/s2006/7742/s242155493/" target="_blank">北京国安</a></td>......接下来自己做 可以使用preg_match去抓取对应的HTML代码然后再正则过滤你想要的数据即可。 给你推荐个类 simple_html_dominclude "simple_html_dom.class.php";$url = "http://sports.sohu.com/zhongchao.shtml";$dom = new simple_html_dom();$html = $dom->load(file_get_contents($url));$res = $html->find("div#turnIDB div.turn");# 积分榜echo $res[0]->outertext;# 射手榜echo $res[1]->outertext;结果 $str=file_get_contents("http://sports.sohu.com/zhongchao.shtml");preg_match_all('/<tr>\s*<td>(.+?)<\/td>\s*<td>(.+?)<\/td>\s*<td>(\d+)<\/td>\s*<td>(.+?)<\/td>\s*<\/tr>/i',$str,$match1);foreach($match1 as $k=>$v){ if($k!=0){ foreach($v as $k1=>$v1){ if($k1<=15){ $jifen[$k][]=$v1; }else{ $sheshou[$k][]=$v1; } } }}echo "<pre>";print_r($jifen);print_r($sheshou);echo "</pre>";/*Array( [1] => Array ( [0] => 01 [1] => 02 [2] => 03 [3] => 04 [4] => 05 [5] => 06 [6] => 07 [7] => 08 [8] => 09 [9] => 10 [10] => 11 [11] => 12 [12] => 13 [13] => 14 [14] => 15 [15] => 16 ) [2] => Array ( [0] => 广州恒大 [1] => 北京国安 [2] => 广州富力 [3] => 上海东亚 [4] => 贵州茅台 [5] => 山东鲁能 [6] => 天津泰达 [7] => 江苏舜天 [8] => 上海绿地 [9] => 长春亚泰 [10] => 杭州绿城 [11] => 大连阿尔滨 [12] => 上海申鑫 [13] => 河南建业 [14] => 辽宁宏运 [15] => 哈尔滨毅腾 ) [3] => Array ( [0] => 20 [1] => 19 [2] => 19 [3] => 19 [4] => 19 [5] => 19 [6] => 19 [7] => 18 [8] => 20 [9] => 19 [10] => 19 [11] => 19 [12] => 19 [13] => 19 
[14] => 19 [15] => 18 ) [4] => Array ( [0] => 45 [1] => 41 [2] => 34 [3] => 31 [4] => 30 [5] => 28 [6] => 27 [7] => 25 [8] => 23 [9] => 21 [10] => 21 [11] => 20 [12] => 19 [13] => 17 [14] => 16 [15] => 12 ))Array( [1] => Array ( [0] => 01 [1] => 02 [2] => 03 [3] => 04 [4] => 04 [5] => 04 [6] => 04 [7] => 08 [8] => 09 [9] => 09 [10] => 09 [11] => 09 [12] => 09 [13] => 09 [14] => 15 [15] => 15 ) [2] => Array ( [0] => 埃尔克森 [1] => 哈默德 [2] => 海森 [3] => 达维 [4] => 多利 [5] => 洛维 [6] => 拉蒙 [7] => 德扬 [8] => 巴塔拉 [9] => 布鲁诺 [10] => 里卡多 [11] => 武磊 [12] => 埃尼奥 [13] => 尤里 [14] => 莫雷诺 [15] => 雷内 ) [3] => Array ( [0] => 17 [1] => 16 [2] => 13 [3] => 9 [4] => 9 [5] => 9 [6] => 9 [7] => 8 [8] => 7 [9] => 7 [10] => 7 [11] => 7 [12] => 7 [13] => 7 [14] => 6 [15] => 6 ) [4] => Array ( [0] => 广州恒大 [1] => 广州富力 [2] => 上海东亚 [3] => 广州富力 [4] => 哈尔滨毅腾 [5] => 山东鲁能 [6] => 杭州绿城 [7] => 北京国安 [8] => 北京国安 [9] => 大连阿尔滨 [10] => 哈尔滨毅腾 [11] => 上海东亚 [12] => 长春亚泰 [13] => 贵州茅台 [14] => 上海绿地 [15] => 广州恒大 ))*/后面的自己处理吧 sohu的页面是gb2312的,采集后需要转utf8,否则会乱码echo '<meta http-equiv="content-type" content="text/html;charset=utf-8">';$url = 'http://sports.sohu.com/zhongchao.shtml';$s = file_get_contents($url);$s = iconv('GBK','UTF8', $s); // gb2312转utf8preg_match_all('/(?<=<div class="turn cons">)\s<table.+table>/isU', $s, $m);// 获取积分榜preg_match_all('/<tr>\s*<td>(.+?)<\/td>\s*<td>(.+?)<\/td>\s*<td>(\d+)<\/td>\s*<td>(.+?)<\/td>\s*<\/tr>/i',$m[0][2],$scores);$scoreboard = array();for($i=0,$len=count($scores[1]); $i<$len; $i++){ $tmp = array($scores[1][$i],strip_tags($scores[2][$i]),$scores[3][$i],$scores[4][$i]); array_push($scoreboard, $tmp);}print_r($scoreboard);// 射手榜preg_match_all('/<tr>\s*<td>(.+?)<\/td>\s*<td>(.+?)<\/td>\s*<td>(\d+)<\/td>\s*<td>(.+?)<\/td>\s*<\/tr>/i',$m[0][3],$shooters);$shooterboard = array();for($i=0,$len=count($shooters[1]); $i<$len; $i++){ $tmp = array($shooters[1][$i],strip_tags($shooters[2][$i]),$shooters[3][$i],$shooters[4][$i]); array_push($shooterboard, $tmp);}print_r($shooterboard);积分榜Array( 
[0] => Array ( [0] => 01 [1] => 广州恒大 [2] => 20 [3] => 45 ) [1] => Array ( [0] => 02 [1] => 北京国安 [2] => 19 [3] => 41 ) [2] => Array ( [0] => 03 [1] => 广州富力 [2] => 19 [3] => 34 ) [3] => Array ( [0] => 04 [1] => 上海东亚 [2] => 19 [3] => 31 ) [4] => Array ( [0] => 05 [1] => 贵州茅台 [2] => 19 [3] => 30 ) [5] => Array ( [0] => 06 [1] => 山东鲁能 [2] => 19 [3] => 28 ) [6] => Array ( [0] => 07 [1] => 天津泰达 [2] => 19 [3] => 27 ) [7] => Array ( [0] => 08 [1] => 江苏舜天 [2] => 18 [3] => 25 ) [8] => Array ( [0] => 09 [1] => 上海绿地 [2] => 20 [3] => 23 ) [9] => Array ( [0] => 10 [1] => 长春亚泰 [2] => 19 [3] => 21 ) [10] => Array ( [0] => 11 [1] => 杭州绿城 [2] => 19 [3] => 21 ) [11] => Array ( [0] => 12 [1] => 大连阿尔滨 [2] => 19 [3] => 20 ) [12] => Array ( [0] => 13 [1] => 上海申鑫 [2] => 19 [3] => 19 ) [13] => Array ( [0] => 14 [1] => 河南建业 [2] => 19 [3] => 17 ) [14] => Array ( [0] => 15 [1] => 辽宁宏运 [2] => 19 [3] => 16 ) [15] => Array ( [0] => 16 [1] => 哈尔滨毅腾 [2] => 18 [3] => 12 ))射手榜Array( [0] => Array ( [0] => 01 [1] => 埃尔克森 [2] => 17 [3] => 广州恒大 ) [1] => Array ( [0] => 02 [1] => 哈默德 [2] => 16 [3] => 广州富力 ) [2] => Array ( [0] => 03 [1] => 海森 [2] => 13 [3] => 上海东亚 ) [3] => Array ( [0] => 04 [1] => 达维 [2] => 9 [3] => 广州富力 ) [4] => Array ( [0] => 04 [1] => 多利 [2] => 9 [3] => 哈尔滨毅腾 ) [5] => Array ( [0] => 04 [1] => 洛维 [2] => 9 [3] => 山东鲁能 ) [6] => Array ( [0] => 04 [1] => 拉蒙 [2] => 9 [3] => 杭州绿城 ) [7] => Array ( [0] => 08 [1] => 德扬 [2] => 8 [3] => 北京国安 ) [8] => Array ( [0] => 09 [1] => 巴塔拉 [2] => 7 [3] => 北京国安 ) [9] => Array ( [0] => 09 [1] => 布鲁诺 [2] => 7 [3] => 大连阿尔滨 ) [10] => Array ( [0] => 09 [1] => 里卡多 [2] => 7 [3] => 哈尔滨毅腾 ) [11] => Array ( [0] => 09 [1] => 武磊 [2] => 7 [3] => 上海东亚 ) [12] => Array ( [0] => 09 [1] => 埃尼奥 [2] => 7 [3] => 长春亚泰 ) [13] => Array ( [0] => 09 [1] => 尤里 [2] => 7 [3] => 贵州茅台 ) [14] => Array ( [0] => 15 [1] => 莫雷诺 [2] => 6 [3] => 上海绿地 ) [15] => Array ( [0] => 15 [1] => 雷内 [2] => 6 [3] => 广州恒大 )) 如何获取新浪微薄分享成功的返回值 php 整站生成 html的思路 终于可以访问了 如何判断一个字符串是否含有汉字 请教php安装问题!! 怎样实现提前提醒? 
php调用java api问题! mysql 到底有没有day()函数啊? php之horde应用设置问题求解 目录问题: 如何判断 ./ ../111/ ../../abc/111/是指的同一个目录??? uploadify 上传问题 请教各位老师关于PHP数组的问题
// Download the raw page. $url is not assigned in this excerpt; the collapsed copy of
// this snippet earlier on the page sets it to the Sohu zhongchao.shtml address.
$s = file_get_contents($url);
// Collect every "<table ... table>" span that directly follows a <div class="turn cons">
// and one whitespace character. Flags: i = case-insensitive, s = dot also matches
// newlines, U = quantifiers are ungreedy by default (so each table is matched separately).
preg_match_all('/(?<=<div class="turn cons">)\s<table.+table>/isU', $s, $m);
// Print only the captured tables whose HTML contains the header text 名次.
// preg_grep keeps the original array keys, which is why the output starts at [2].
// NOTE(review): this only matches if the page bytes and this file share an encoding — confirm.
// The bare "Array" after the statement is the first line of the pasted output, not code.
print_r(preg_grep('/名次/', $m[0]));Array
(
[2] =>
<table border=0 cellSpacing=0 cellPadding=0 width="100%">
<tbody>
<tr>
<th width="15%">名次</th>
<th width="47%">球队</th>
<th width="9%">场次</th>
<th width="29%">积分</th></tr>
<tr>
<td>01</td>
<td><a href="http://sports.sohu.com/s2010/7742/s277701524/" target="_blank">广州恒大</a></td>
<td>20</td>
<td>45</td>
</tr>
<tr>
<td>02</td>
<td><a href="http://sports.sohu.com/s2006/7742/s242155493/" target="_blank">北京国安</a></td>
......接下来自己做
# Print the two ranking panels of the Sohu page using the simple_html_dom class.
include "simple_html_dom.class.php";

$url = "http://sports.sohu.com/zhongchao.shtml";

# Stays empty when the download fails, so the output section below prints nothing
# instead of dereferencing missing elements.
$res = array();

# Fetch separately so a failed download is reported rather than handed to the parser.
$page = file_get_contents($url);
if ($page === false) {
    trigger_error("Unable to fetch " . $url, E_USER_WARNING);
} else {
    $dom = new simple_html_dom();
    $html = $dom->load($page);
    # Each ranking panel is a div.turn inside div#turnIDB.
    $res = $html->find("div#turnIDB div.turn");
}

# Standings table (first panel) -- only printed when the selector matched it.
if (isset($res[0])) {
    echo $res[0]->outertext;
}

# Top-scorers table (second panel) -- only printed when the selector matched it.
if (isset($res[1])) {
    echo $res[1]->outertext;
}
结果
// NOTE(review): the enclosing `foreach($match1 as $k=>$v){` line is missing from this
// excerpt (it is visible in the collapsed copy of the snippet earlier on the page);
// the last lone `}` below closes that loop.
// $match1 comes from preg_match_all in its default pattern order: key 0 holds the
// full-pattern matches, keys 1..4 hold the four capture groups. Skip key 0.
if($k!=0){
// Walk every matched table row for this capture group.
foreach($v as $k1=>$v1){
// The first 16 matches (indexes 0..15) are stored as the standings ($jifen); every
// later match is stored as the scorers list ($sheshou).
// NOTE(review): this hard-codes a 16-row standings table — confirm against the page.
if($k1<=15){
$jifen[$k][]=$v1;
}else{
$sheshou[$k][]=$v1;
}
}
}
}
// Dump both arrays inside <pre> so the print_r layout survives in a browser.
echo "<pre>";
print_r($jifen);
print_r($sheshou);
echo "</pre>";
/*
Array
(
[1] => Array
(
[0] => 01
[1] => 02
[2] => 03
[3] => 04
[4] => 05
[5] => 06
[6] => 07
[7] => 08
[8] => 09
[9] => 10
[10] => 11
[11] => 12
[12] => 13
[13] => 14
[14] => 15
[15] => 16
) [2] => Array
(
[0] => 广州恒大
[1] => 北京国安
[2] => 广州富力
[3] => 上海东亚
[4] => 贵州茅台
[5] => 山东鲁能
[6] => 天津泰达
[7] => 江苏舜天
[8] => 上海绿地
[9] => 长春亚泰
[10] => 杭州绿城
[11] => 大连阿尔滨
[12] => 上海申鑫
[13] => 河南建业
[14] => 辽宁宏运
[15] => 哈尔滨毅腾
) [3] => Array
(
[0] => 20
[1] => 19
[2] => 19
[3] => 19
[4] => 19
[5] => 19
[6] => 19
[7] => 18
[8] => 20
[9] => 19
[10] => 19
[11] => 19
[12] => 19
[13] => 19
[14] => 19
[15] => 18
) [4] => Array
(
[0] => 45
[1] => 41
[2] => 34
[3] => 31
[4] => 30
[5] => 28
[6] => 27
[7] => 25
[8] => 23
[9] => 21
[10] => 21
[11] => 20
[12] => 19
[13] => 17
[14] => 16
[15] => 12
))
Array
(
[1] => Array
(
[0] => 01
[1] => 02
[2] => 03
[3] => 04
[4] => 04
[5] => 04
[6] => 04
[7] => 08
[8] => 09
[9] => 09
[10] => 09
[11] => 09
[12] => 09
[13] => 09
[14] => 15
[15] => 15
) [2] => Array
(
[0] => 埃尔克森
[1] => 哈默德
[2] => 海森
[3] => 达维
[4] => 多利
[5] => 洛维
[6] => 拉蒙
[7] => 德扬
[8] => 巴塔拉
[9] => 布鲁诺
[10] => 里卡多
[11] => 武磊
[12] => 埃尼奥
[13] => 尤里
[14] => 莫雷诺
[15] => 雷内
) [3] => Array
(
[0] => 17
[1] => 16
[2] => 13
[3] => 9
[4] => 9
[5] => 9
[6] => 9
[7] => 8
[8] => 7
[9] => 7
[10] => 7
[11] => 7
[12] => 7
[13] => 7
[14] => 6
[15] => 6
) [4] => Array
(
[0] => 广州恒大
[1] => 广州富力
[2] => 上海东亚
[3] => 广州富力
[4] => 哈尔滨毅腾
[5] => 山东鲁能
[6] => 杭州绿城
[7] => 北京国安
[8] => 北京国安
[9] => 大连阿尔滨
[10] => 哈尔滨毅腾
[11] => 上海东亚
[12] => 长春亚泰
[13] => 贵州茅台
[14] => 上海绿地
[15] => 广州恒大
))
*/
后面的自己处理吧
/**
 * Extract the data rows from the HTML of one ranking table.
 *
 * A row is recognised when a <tr> holds exactly four <td> cells and the third
 * cell consists only of digits. HTML tags are stripped from the second cell only
 * (in the standings table that cell wraps the team name in a link).
 *
 * @param string $tableHtml HTML of a single ranking table.
 * @return array List of rows, each array(cell 1, cell 2 without tags, cell 3, cell 4);
 *               empty when the input is not a non-empty string or no row matches.
 */
function extractRankingRows($tableHtml)
{
    $rows = array();
    if (!is_string($tableHtml) || $tableHtml === '') {
        return $rows;
    }

    $rowPattern = '/<tr>\s*<td>(.+?)<\/td>\s*<td>(.+?)<\/td>\s*<td>(\d+)<\/td>\s*<td>(.+?)<\/td>\s*<\/tr>/i';
    // PREG_SET_ORDER yields one entry per matched row, so no index bookkeeping is needed.
    if (!preg_match_all($rowPattern, $tableHtml, $matchedRows, PREG_SET_ORDER)) {
        return $rows;
    }

    foreach ($matchedRows as $cells) {
        $rows[] = array($cells[1], strip_tags($cells[2]), $cells[3], $cells[4]);
    }

    return $rows;
}

// This excerpt lost its "$url = ..." line; fall back to the address used by the
// other snippets on the page when the caller has not set one.
if (!isset($url)) {
    $url = 'http://sports.sohu.com/zhongchao.shtml';
}

// Defaults so the print_r calls below always have arrays to print, even on failure.
$scoreboard = array();
$shooterboard = array();
$m = array(array());

$s = file_get_contents($url);
if ($s === false) {
    trigger_error('Unable to fetch ' . $url, E_USER_WARNING);
} else {
    // Convert the GB2312/GBK page to UTF-8. 'UTF-8' is the canonical encoding name;
    // the bare 'UTF8' spelling is not accepted by every iconv implementation.
    $s = iconv('GBK', 'UTF-8', $s);
    if ($s === false) {
        trigger_error('Unable to convert the page from GBK to UTF-8', E_USER_WARNING);
    } else {
        // Every <table> that directly follows a <div class="turn cons">.
        preg_match_all('/(?<=<div class="turn cons">)\s<table.+table>/isU', $s, $m);

        // Standings: third captured table (index 2) in the original snippet.
        if (isset($m[0][2])) {
            $scoreboard = extractRankingRows($m[0][2]);
        }

        // Top scorers: fourth captured table (index 3) in the original snippet.
        if (isset($m[0][3])) {
            $shooterboard = extractRankingRows($m[0][3]);
        }
    }
}

print_r($scoreboard);
print_r($shooterboard);
积分榜
Array
(
[0] => Array
(
[0] => 01
[1] => 广州恒大
[2] => 20
[3] => 45
) [1] => Array
(
[0] => 02
[1] => 北京国安
[2] => 19
[3] => 41
) [2] => Array
(
[0] => 03
[1] => 广州富力
[2] => 19
[3] => 34
) [3] => Array
(
[0] => 04
[1] => 上海东亚
[2] => 19
[3] => 31
) [4] => Array
(
[0] => 05
[1] => 贵州茅台
[2] => 19
[3] => 30
) [5] => Array
(
[0] => 06
[1] => 山东鲁能
[2] => 19
[3] => 28
) [6] => Array
(
[0] => 07
[1] => 天津泰达
[2] => 19
[3] => 27
) [7] => Array
(
[0] => 08
[1] => 江苏舜天
[2] => 18
[3] => 25
) [8] => Array
(
[0] => 09
[1] => 上海绿地
[2] => 20
[3] => 23
) [9] => Array
(
[0] => 10
[1] => 长春亚泰
[2] => 19
[3] => 21
) [10] => Array
(
[0] => 11
[1] => 杭州绿城
[2] => 19
[3] => 21
) [11] => Array
(
[0] => 12
[1] => 大连阿尔滨
[2] => 19
[3] => 20
) [12] => Array
(
[0] => 13
[1] => 上海申鑫
[2] => 19
[3] => 19
) [13] => Array
(
[0] => 14
[1] => 河南建业
[2] => 19
[3] => 17
) [14] => Array
(
[0] => 15
[1] => 辽宁宏运
[2] => 19
[3] => 16
) [15] => Array
(
[0] => 16
[1] => 哈尔滨毅腾
[2] => 18
[3] => 12
))
射手榜
Array
(
[0] => Array
(
[0] => 01
[1] => 埃尔克森
[2] => 17
[3] => 广州恒大
) [1] => Array
(
[0] => 02
[1] => 哈默德
[2] => 16
[3] => 广州富力
) [2] => Array
(
[0] => 03
[1] => 海森
[2] => 13
[3] => 上海东亚
) [3] => Array
(
[0] => 04
[1] => 达维
[2] => 9
[3] => 广州富力
) [4] => Array
(
[0] => 04
[1] => 多利
[2] => 9
[3] => 哈尔滨毅腾
) [5] => Array
(
[0] => 04
[1] => 洛维
[2] => 9
[3] => 山东鲁能
) [6] => Array
(
[0] => 04
[1] => 拉蒙
[2] => 9
[3] => 杭州绿城
) [7] => Array
(
[0] => 08
[1] => 德扬
[2] => 8
[3] => 北京国安
) [8] => Array
(
[0] => 09
[1] => 巴塔拉
[2] => 7
[3] => 北京国安
) [9] => Array
(
[0] => 09
[1] => 布鲁诺
[2] => 7
[3] => 大连阿尔滨
) [10] => Array
(
[0] => 09
[1] => 里卡多
[2] => 7
[3] => 哈尔滨毅腾
) [11] => Array
(
[0] => 09
[1] => 武磊
[2] => 7
[3] => 上海东亚
) [12] => Array
(
[0] => 09
[1] => 埃尼奥
[2] => 7
[3] => 长春亚泰
) [13] => Array
(
[0] => 09
[1] => 尤里
[2] => 7
[3] => 贵州茅台
) [14] => Array
(
[0] => 15
[1] => 莫雷诺
[2] => 6
[3] => 上海绿地
) [15] => Array
(
[0] => 15
[1] => 雷内
[2] => 6
[3] => 广州恒大
))