我在编写一个程序,功能是获取网页里面标签<b>和<p>里面的内容(包括这两个标签),可是我只能获取到<b>的内容,这是为什么呢??我的程序如下所示:
import java.io.*;
import java.net.URL;
import java.util.regex.*;public class Filter {
static String sourceContent;
//以下是获取网页内容
public static void getSourceContent(){
StringBuffer sb = new StringBuffer();
try{
URL url = new UR("http://www.wyu.cn/news/news_zxtz/200942111210750924.htm");
InputStreamReader isr = new InputStreamReader(url.openStream());
BufferedReader br = new BufferedReader(isr); String temp = null;
while((temp = br.readLine())!= null){
//System.out.println(temp);
pw.println(temp);
sb.append(temp + "\r");
sourceContent = sb.toString();
}
br.close();
}
catch(Exception e){
System.out.println(e);
}
}//end getSourceContent
//以下是运用正则表达式获取<bady></body>之间内容
public static void match(){
String patternStr = "(?is)<body.*?>.*?((<b>.*?</b>)|(<p.*?>.*?</p>)).*?</body>"; //正则表达式
StringBuffer sb1 = new StringBuffer();
Pattern p = Pattern.compile(patternStr); //匹配<body>开头,</body>结尾的文档
Matcher m = p.matcher(sourceContent); //开始编译
while(m.find()){
sb1.append(m.group(0)); //获取被匹配的部分
}
System.out.println(sb1);
}
public static void main(String args[]) throws IOException{ getSourceContent();
match();
}
}
import java.io.*;
import java.net.URL;
import java.util.regex.*;public class Filter {
static String sourceContent;
//以下是获取网页内容
public static void getSourceContent(){
StringBuffer sb = new StringBuffer();
try{
URL url = new UR("http://www.wyu.cn/news/news_zxtz/200942111210750924.htm");
InputStreamReader isr = new InputStreamReader(url.openStream());
BufferedReader br = new BufferedReader(isr); String temp = null;
while((temp = br.readLine())!= null){
//System.out.println(temp);
pw.println(temp);
sb.append(temp + "\r");
sourceContent = sb.toString();
}
br.close();
}
catch(Exception e){
System.out.println(e);
}
}//end getSourceContent
//以下是运用正则表达式获取<bady></body>之间内容
public static void match(){
String patternStr = "(?is)<body.*?>.*?((<b>.*?</b>)|(<p.*?>.*?</p>)).*?</body>"; //正则表达式
StringBuffer sb1 = new StringBuffer();
Pattern p = Pattern.compile(patternStr); //匹配<body>开头,</body>结尾的文档
Matcher m = p.matcher(sourceContent); //开始编译
while(m.find()){
sb1.append(m.group(0)); //获取被匹配的部分
}
System.out.println(sb1);
}
public static void main(String args[]) throws IOException{ getSourceContent();
match();
}
}
String patternStr = "(?is) <body.*?>.*?(( <b>.*? </b>)|( <p.*?>.*? </p>)).*? </body>"; //正则表达式
你把String patternStr这里的清清空格;
就是正则表达式的毛病,
我调试后运行后出来了import java.io.*;
import java.net.URL;
import java.util.regex.*;public class Filter {
static String sourceContent;
public static void main(String args[]) throws IOException { getSourceContent();
match();
}
// 以下是获取网页内容
public static void getSourceContent() {
StringBuffer sb = new StringBuffer();
try {
URL url = new URL("http://localhost:8080/CSDN_Question/index.html");
InputStreamReader isr = new InputStreamReader(url.openStream());
BufferedReader br = new BufferedReader(isr); String temp = null;
System.out.println("进入了getSourceContent============");
// temp=br.readLine();
while ((temp= br.readLine())!= null) { System.out.println(temp);
sb.append(temp + "\r");
sourceContent = sb.toString();
}
br.close();
} catch (Exception e) {
System.out.println(e);
}
}// end getSourceContent // 以下是运用正则表达式获取 <bady> </body>之间内容
public static void match() {
String patternStr = "(?is) <body.*?>.*?(( <b>.*?</b>)|(<p.*?>.*? </p>)).*?</body>"; // 正则表达式
StringBuffer sb1 = new StringBuffer();
Pattern p = Pattern.compile(patternStr); // 匹配 <body>开头, </body>结尾的文档
Matcher m = p.matcher(sourceContent); // 开始编译
System.out.println("进入了match()");
while (m.find()) {
System.out.println("进入了 m.find()");
sb1.append(m.group(0)); // 获取被匹配的部分
}
System.out.println(sb1);
}
import java.net.URL;
import java.util.Vector;
import java.util.regex.*;
/**
* 作者:李 世贵
* JDK: 1.6
* 来源: http://blog.csdn.net/lishigui
* 欢迎转接,请保留作者和来源,谢谢!
* 2009-4-26 14:12:48
*/
public class Filter {
private String sourceContent = null; public Filter() {
getSourceContent();
getBody();
System.out.println(sourceContent);
getB();
getP();
} private void getB() {
System.out.println("匹配 <body>体中的 <b>");
Pattern bP = Pattern.compile("<b>.*</b>");
Matcher bM = bP.matcher(sourceContent);
while (bM.find()) {
System.out.println(sourceContent.substring(bM.start(), bM.end()));
}
} private void getP() {
System.out.println("匹配 <body>体中的 <p>");
Pattern pP = Pattern.compile("<p>.*</p>");
Matcher pM = pP.matcher(sourceContent);
while (pM.find()) {
System.out.println(sourceContent.substring(pM.start(), pM.end()));
} } private void getBody() {
System.out.println("匹配 <body>开头, </body>结尾的文档 ");
Pattern p = Pattern.compile("<body.*");
Matcher mStart = p.matcher(sourceContent);
mStart.find();
p = Pattern.compile(".*</body>");
Matcher mEnd = p.matcher(sourceContent);
mEnd.find();
sourceContent = sourceContent.substring(mStart.start(), mEnd.end());
} private void getSourceContent() {
StringBuffer sb = new StringBuffer();
try {
URL url = new URL("http://www.wyu.cn/news/news_zxtz/200942111210750924.htm");
InputStreamReader isr = new InputStreamReader(url.openStream());
BufferedReader br = new BufferedReader(isr); String temp = null;
while ((temp = br.readLine()) != null) {
sb.append(temp + "\r");
sourceContent = sb.toString();
}
br.close();
} catch (Exception e) {
System.out.println(e);
}
} public static void main(String[] lsg) {
new Filter();
}
}
Pattern body = Pattern.compile("<body.*</body>");
Matcher mbody = body.matcher(sourceContent);
if(mbody.find()) {
System.out.println(sourceContent.substring(mbody.start(), mbody.end()));
}标签<p>的正则表达是:<p>.*</p>
标签<b>的正则表达是:<b>.*</b>
标签<body>的正则表达是:<body.*</body>
import java.io.*;
import java.net.URL;
import java.util.Vector;
import java.util.regex.*;
/**
* 作者:李 世贵
* JDK: 1.6
* 来源: http://blog.csdn.net/lishigui
* 欢迎转接,请保留作者和来源,谢谢!
* 2009-4-27 09:12:48
*/
public class Filter {
private String sourceContent = null;
private Vector vB = new Vector();
private Vector vP = new Vector();
public Filter() {
getSourceContent();
getBody();
System.out.println(sourceContent);
getB();
getP();
System.out.println("提取<body>的内容是:");
System.out.println(sourceContent);
System.out.println("提取<b>的内容是:");
for(int i = 0; i < vB.size(); i++){
System.out.println(vB.get(i).toString());
}
System.out.println("提取<p>的内容是:");
for(int i = 0; i < vP.size(); i++){
System.out.println(vP.get(i).toString());
}
} private void getB() {
System.out.println("匹配 <body>体中的 <b>");
Pattern bP = Pattern.compile("<b>.*</b>");
Matcher bM = bP.matcher(sourceContent);
while (bM.find()) {
vB.add(sourceContent.substring(bM.start(), bM.end()));
}
} private void getP() {
System.out.println("匹配 <body>体中的 <p>");
Pattern pP = Pattern.compile("<p>.*</p>");
Matcher pM = pP.matcher(sourceContent);
while (pM.find()) {
vP.add(sourceContent.substring(pM.start(), pM.end()));
} } private void getBody() {
Pattern body = Pattern.compile(" <body.* </body>");
Matcher mbody = body.matcher(sourceContent);
if(mbody.find()) {
sourceContent.substring(mbody.start(), mbody.end());
}
} private void getSourceContent() {
StringBuffer sb = new StringBuffer();
try {
URL url = new URL("http://www.wyu.cn/news/news_zxtz/200942111210750924.htm");
InputStreamReader isr = new InputStreamReader(url.openStream());
BufferedReader br = new BufferedReader(isr); String temp = null;
while ((temp = br.readLine()) != null) {
sb.append(temp + "\r");
sourceContent = sb.toString();
}
br.close();
} catch (Exception e) {
System.out.println(e);
}
} public static void main(String[] lsg) {
new Filter();
}
}
你的运行结果如下:提取<b>的内容是:
<b>学术讲座</b>
提取<p>的内容是:<p>的内容没有出来啊?
<b>和 <p>标签的匹配过程是一样的,既然<b>能匹配,那 <p>也应该能,你应该先看看 <p>和 </p>是不是没有匹对出现。
import java.net.URL;
import java.util.Vector;
import java.util.regex.*;
/**
* 作者:李 世贵
* JDK: 1.6
* 来源: http://blog.csdn.net/lishigui
* 欢迎转接,请保留作者和来源,谢谢!
* 2009-4-29 08:43:18
*/
public class Filter {
private String sourceContent = null;
private Vector vB = new Vector();
private Vector vP = new Vector();
public Filter() {
getSourceContent();
getBody();
System.out.println(sourceContent);
getB();
getP();
System.out.println("提取<body>的内容是:");
System.out.println(sourceContent);
System.out.println("提取<b>的内容是:");
for(int i = 0; i < vB.size(); i++){
System.out.println(vB.get(i).toString());
}
System.out.println("提取<p>的内容是:");
for(int i = 0; i < vP.size(); i++){
System.out.println(vP.get(i).toString());
}
} private void getB() {
System.out.println("匹配 <body>体中的 <b>");
Pattern bP = Pattern.compile("<b>.*</b>");
Matcher bM = bP.matcher(sourceContent);
while (bM.find()) {
vB.add(sourceContent.substring(bM.start(), bM.end()));
}
} private void getP() {
System.out.println("匹配 <body>体中的 <p>");
Pattern pP = Pattern.compile("<p>.*</p>");
Matcher pM = pP.matcher(sourceContent);
while (pM.find()) {
vP.add(sourceContent.substring(pM.start(), pM.end()));
} } private void getBody() {
Pattern body = Pattern.compile(" <body.* </body>");
Matcher mbody = body.matcher(sourceContent);
if(mbody.find()) {
sourceContent.substring(mbody.start(), mbody.end());
}
} private void getSourceContent() {
StringBuffer sb = new StringBuffer();
try {
URL url = new URL("http://www.wyu.cn/news/news_zxtz/200942111210750924.htm");
InputStreamReader isr = new InputStreamReader(url.openStream());
BufferedReader br = new BufferedReader(isr); String temp = null;
while ((temp = br.readLine()) != null) {
// sb.append(temp + "\r");
sb.append(temp);
sourceContent = sb.toString();
}
br.close();
} catch (Exception e) {
System.out.println(e);
}
} public static void main(String[] lsg) {
new Filter();
}
}