如何取红色字体部分数据
<div class="fl">
<div class="bm">
<div class="bm_h">
<a href="forum.php?viewgid=0&mobile=yes">数据A</a> </div>
<div class="bm_c add">
<a href="forum.php?mod=forumdisplay&fid=86&mobile=yes" >1</a> <font class="xg1">(3)</font> </div>
<div class="bm_c even">
<a href="forum.php?mod=forumdisplay&fid=23&mobile=yes" >2</a> <font class="xg1">(2474)</font> </div>
<div class="bm_c add">
<a href="forum.php?mod=forumdisplay&fid=95&mobile=yes" >3</a> <font class="xg1">(83)</font> </div> </div>
</div>
<div class="fl">
<div class="bm">
<div class="bm_h">
<a href="forum.php?viewgid=1&mobile=yes">数据B</a> </div>
</div>
</div>
<div class="fl">
<div class="bm">
<div class="bm_h">
<a href="forum.php?viewgid=3923&mobile=yes">数据C</a> </div>
</div>
</div>
<div class="box">
<div class="fl">
<div class="bm">
<div class="bm_h">
<a href="forum.php?viewgid=0&mobile=yes">数据A</a> </div>
<div class="bm_c add">
<a href="forum.php?mod=forumdisplay&fid=86&mobile=yes" >1</a> <font class="xg1">(3)</font> </div>
<div class="bm_c even">
<a href="forum.php?mod=forumdisplay&fid=23&mobile=yes" >2</a> <font class="xg1">(2474)</font> </div>
<div class="bm_c add">
<a href="forum.php?mod=forumdisplay&fid=95&mobile=yes" >3</a> <font class="xg1">(83)</font> </div> </div>
</div>
<div class="fl">
<div class="bm">
<div class="bm_h">
<a href="forum.php?viewgid=1&mobile=yes">数据B</a> </div>
</div>
</div>
<div class="fl">
<div class="bm">
<div class="bm_h">
<a href="forum.php?viewgid=3923&mobile=yes">数据C</a> </div>
</div>
</div>
<div class="box">
<div class="bm_c add">
是不变的
百度 jsoup 或者 htmlparser,推荐使用 jsoup,后者已经停止更新
按你所需要的标签id去提取就行了
/**
 * Collects the text of selected elements from an HTML file into one string.
 *
 * <p>The file is parsed with jsoup using the GBK charset (switch the charset
 * argument to "UTF-8" for UTF-8 encoded pages). Text is gathered, in this
 * order, from every {@code title} element, every {@code table} element and
 * every {@code div} whose class attribute is exactly {@code post}. Each
 * element's text is followed by a single space.
 *
 * @param html the HTML file to read
 * @return the concatenated element text; an empty string if the file could
 *         not be read (the I/O error is printed, not rethrown)
 */
private static String getDocument(File html) {
    // StringBuilder avoids re-copying the accumulated text on every append.
    StringBuilder text = new StringBuilder();
    try {
        // Decode the file as GBK while parsing.
        org.jsoup.nodes.Document doc = Jsoup.parse(html, "GBK");
        // Selector order determines output order: title, tables, then post divs.
        String[] selectors = {"title", "table", "div[class=post]"};
        for (String selector : selectors) {
            for (org.jsoup.nodes.Element element : doc.select(selector)) {
                text.append(element.text()).append(" ");
            }
        }
    } catch (IOException e) {
        // Best effort: report the failure and fall through to return what we have.
        e.printStackTrace();
    }
    return text.toString();
}
// Placeholder for the HTML text to be parsed.
String htmlcontents = ".............";
// Parse the string as an HTML body fragment.
Document doc = Jsoup.parseBodyFragment(htmlcontents);
// Select every div element that carries the bm_c class.
Elements elements = doc.select("div.bm_c");
// The loop body is left empty in this snippet; per-element handling goes here.
for(Element element: elements){}
"<div class=\"bm\">"+
" <div class=\"bm_h\">"+
" <a href=\"forum.php?viewgid=0&mobile=yes\">数据A</a> </div>"+
" <div class=\"bm_c add\">"+
"<a href=\"forum.php?mod=forumdisplay&fid=86&mobile=yes\" >1</a> <font class=\"xg1\">(3)</font> </div>"+
"<div class=\"bm_c even\">"+
"<a href=\"forum.php?mod=forumdisplay&fid=23&mobile=yes\" >2</a> <font class=\"xg1\">(2474)</font> </div>"+
"<div class=\"bm_c add\">"+
"<a href=\"forum.php?mod=forumdisplay&fid=95&mobile=yes\" >3</a> <font class=\"xg1\">(83)</font> </div> </div>"+
" </div>"+
"<div class=\"fl\">"+
" <div class=\"bm\">"+
" <div class=\"bm_h\">"+
" <a href=\"forum.php?viewgid=1&mobile=yes\">数据B</a> </div>"+
" </div>"+
" </div>"+
" <div class=\"fl\">"+
" <div class=\"bm\">"+
" <div class=\"bm_h\">"+
" <a href=\"forum.php?viewgid=3923&mobile=yes\">数据C</a> </div>"+
" </div>"+
" </div>";
// The reluctant ".*?" makes each match stop at the nearest "</font> </div>".
// A greedy ".*" would run to the last "</font> </div>" in the input and merge
// all bm_c divs into a single match when the HTML is on one line.
Pattern p = Pattern.compile("<div class=\"bm_c (\\w)*\">(.*?</font>\\s*)</div>");
Matcher m = p.matcher(s);
while(m.find()){
// group(0) is the whole matched <div ...>...</div> text.
System.out.println(m.group(0));
}
(2)得到一个数组还是一个串?
你好,上面字符串中</font> 可能没有怎么办
下面列出在抓取网页中常用的正则规则,其中$content代表网页内容,$tmparray为抓取的结果存储数组。
抓取HTML中CSS里背景图片地址
preg_match_all('/(background|background-image):url\([\"\']?([^\)]+)[\"\']?\)/i', $content, $tmparray);
抓取HTML中图片标签中的地址
preg_match_all('/<img[^>]+src=[\"\']{1}([^\"\'\s]+)[\"\']{1}[^>]+>/i', $content, $tmparray);
抓取HTML中图片背景地址
preg_match_all('/background=[\"\']?([^\"\'\s>]+)[\"\']?/i', $content, $tmparray);
抓取HTML中的Flash地址
preg_match_all('/<embed[^>]+src=[\"\']{1}(([^\"\'\s]+)\.swf)[\"\']{1}[^>]+>/i', $content, $tmparray);
抓取HTML中的iframe的链接地址
preg_match_all('/<iframe[^>]+src=[\"\']?([\w\-\/\.]+)[\"\']?[^>]+><\/iframe>/i', $content, $tmparray);
抓取HTML中的超链接地址
preg_match_all('/<a[^>]+href=[\"\']{1}([^>\"\']+)[\"\']?[^>]+>([^<]+)<\/a>/i', $content, $iframeurlarray);
import java.util.regex.Matcher;
import java.util.regex.Pattern;
// import java.util.*;
/**
 * Demo: prints every {@code <div class="bm_c ...">...</div>} block found in a
 * hard-coded HTML string, one match per line.
 */
public class Test {
    /**
     * Matches one bm_c div. The reluctant ".+?" stops at the nearest closing
     * div tag, so adjacent divs on the same line are matched separately.
     */
    private static final Pattern BM_C_DIV =
            Pattern.compile("<div class=\"bm_c \\w+\">.+?</div>");

    /**
     * Runs the demo against the sample forum markup.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String str = "<div class=\"fl\">"+
                "<div class=\"bm\">"+
                " <div class=\"bm_h\">"+
                " <a href=\"forum.php?viewgid=0&mobile=yes\">数据A</a> </div>"+
                " <div class=\"bm_c add\">"+
                "<a href=\"forum.php?mod=forumdisplay&fid=86&mobile=yes\" >1</a> <font class=\"xg1\">(3)</font> </div>"+
                "<div class=\"bm_c even\">"+
                "<a href=\"forum.php?mod=forumdisplay&fid=23&mobile=yes\" >2</a> <font class=\"xg1\">(2474)</font> </div>"+
                "<div class=\"bm_c add\">"+
                "<a href=\"forum.php?mod=forumdisplay&fid=95&mobile=yes\" >3</a> <font class=\"xg1\">(83)</font> </div> </div>"+
                " </div>"+
                "<div class=\"fl\">"+
                " <div class=\"bm\">"+
                " <div class=\"bm_h\">"+
                " <a href=\"forum.php?viewgid=1&mobile=yes\">数据B</a> </div>"+
                " </div>"+
                " </div>"+
                " <div class=\"fl\">"+
                " <div class=\"bm\">"+
                " <div class=\"bm_h\">"+
                " <a href=\"forum.php?viewgid=3923&mobile=yes\">数据C</a> </div>"+
                " </div>"+
                " </div>";

        Matcher matcher = BM_C_DIV.matcher(str);
        while (matcher.find()) {
            System.out.println(matcher.group());
        }
    }
}
// ".+</div>" matches up to the last </div>; adding the question mark,
// ".+?</div>", matches up to the nearest </div>.