Java Web 开发中的正则问题

小弟所在的项目需要从网上找到相应网页,并取得网页内容,但是不想要其中的标签(就是取得一个网页的源代码,将其中的各种标签去掉)。请教各位大侠们,如何编写正则表达式去掉网页中的标签呀?

解决方案 »

免费领取超大流量手机卡,每月29元包185G流量+100分钟通话,中国电信官方发货

package org.regular;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class HtmlRegTest {
    public static String RMHTML(String Htmlstring) {
        if (Htmlstring.isEmpty())
            return "";
        Htmlstring = RegexPattern("<!--[\\w\\W]*?-->", "", Htmlstring);
        Htmlstring = RegexPattern("<div[\\w\\W]*?>", "", Htmlstring);
        Htmlstring = RegexPattern(
                "<\\s*?script[^>]*>[\\s\\S]*?<\\s*?/\\s*?script\\s*?>", "",
                Htmlstring);
        Htmlstring = RegexPattern("</font>", "", Htmlstring);
        Htmlstring = RegexPattern("</div>", "", Htmlstring);
        Htmlstring = RegexPattern("<font>", "", Htmlstring);
        Htmlstring = RegexPattern("^\\s*|\\s*$", "", Htmlstring);
        /*
         * Htmlstring = RegexPattern("&(quot|#34);", "\"", Htmlstring);
         *
         * Htmlstring = RegexPattern("&(amp|#38);", "&", Htmlstring);
         *
         * Htmlstring = RegexPattern("&(lt|#60);", "<", Htmlstring);
         *
         * Htmlstring = RegexPattern("&(gt|#62);", ">", Htmlstring);
         *
         * Htmlstring = RegexPattern("&(nbsp|#160);", " ", Htmlstring);
         *
         * Htmlstring = RegexPattern("&(iexcl|#161);", "\\xa1", Htmlstring);
         *
         * Htmlstring = RegexPattern("&(cent|#162);", "\\xa2", Htmlstring);
         *
         * Htmlstring = RegexPattern("&(pound|#163);", "\\xa3", Htmlstring);
         *
         * Htmlstring = RegexPattern("&(copy|#169);", "\\xa9", Htmlstring);
         *
         * Htmlstring = RegexPattern("&#(\\d+);", "", Htmlstring);
         *
         * Htmlstring = RegexPattern("<", "", Htmlstring);
         *
         * Htmlstring = RegexPattern(">", "", Htmlstring);
         */
        // Htmlstring.replace("\r\n", "",Htmlstring);
        return Htmlstring;
    }

    public static String RegexPattern(String pattern, String str, String content) {
        if (pattern != null && !pattern.equals("")) {
            Pattern p = Pattern.compile(pattern, 2);
            Matcher m = p.matcher(content);
            content = m.replaceAll(str);
        }
        return content;
    }
}

注释?把这几句看懂就ok了:
Pattern p = Pattern.compile(pattern, 2);
Matcher m = p.matcher(content);//匹配
content = m.replaceAll(str);//替换

网站配置二级域名,用JSP页面调不到类,急呀? jbittorrentapi-v1.1.zip BT 做过联通SP页面的近来看看,求教问题 今天遇到三个棘手问题 java删除linux下的文件怎么写? ghost做成的镜像最大是不是2G? sysbase连接池配置? 高分求教一个简单问题! js的 showModalDialog 怎么控制关闭窗口后返回的位置 关于SQL语句的执行结果。 Struts2 标签排列问题! FCKeditor怎样得到提交的文本内容,如果没用过那MyEclipse调试时怎么查看request的各个param的值
import java.io.InputStreamReader;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Regex-based helpers for removing a fixed set of HTML constructs from page source.
 *
 * <p>Only HTML comments, {@code <script>} elements (together with their content) and
 * {@code <div>} / {@code <font>} tags are removed. Every other tag, and every HTML
 * entity such as {@code &amp;} or {@code &lt;}, is left in the text unchanged.
 *
 * <p>All matching is case-insensitive.
 */
public class HtmlRegTest {

    /** Flag applied to every pattern in this class (the original code passed the literal 2). */
    private static final int FLAGS = Pattern.CASE_INSENSITIVE;

    /** An HTML comment, matched lazily so adjacent comments are removed one by one. */
    private static final Pattern COMMENT = Pattern.compile("<!--[\\w\\W]*?-->", FLAGS);

    /** An opening or closing div tag; the word boundary keeps tags like {@code <divider>} intact. */
    private static final Pattern DIV_TAG = Pattern.compile("</?div\\b[^>]*>", FLAGS);

    /** A whole script element: opening tag, body and closing tag. */
    private static final Pattern SCRIPT_ELEMENT =
            Pattern.compile("<\\s*?script[^>]*>[\\s\\S]*?<\\s*?/\\s*?script\\s*?>", FLAGS);

    /** An opening or closing font tag, with or without attributes. */
    private static final Pattern FONT_TAG = Pattern.compile("</?font\\b[^>]*>", FLAGS);

    /** Whitespace at the very start or very end of the input. */
    private static final Pattern OUTER_WHITESPACE = Pattern.compile("^\\s*|\\s*$", FLAGS);

    /**
     * Removes comments, script elements and div/font tags from the given HTML and trims
     * leading and trailing whitespace from the result.
     *
     * @param Htmlstring the HTML source to clean; may be {@code null}
     * @return the cleaned text, or the empty string when the input is {@code null} or empty
     */
    public static String RMHTML(String Htmlstring) {
        if (Htmlstring == null || Htmlstring.isEmpty()) {
            return "";
        }
        String text = Htmlstring;
        text = COMMENT.matcher(text).replaceAll("");
        text = DIV_TAG.matcher(text).replaceAll("");
        text = SCRIPT_ELEMENT.matcher(text).replaceAll("");
        text = FONT_TAG.matcher(text).replaceAll("");
        // Trim last: removing tags can expose whitespace at either end of the text.
        text = OUTER_WHITESPACE.matcher(text).replaceAll("");
        return text;
    }

    /**
     * Replaces every case-insensitive match of {@code pattern} in {@code content} with
     * {@code str}.
     *
     * <p>{@code str} is interpreted as a regex replacement string, so {@code $1}-style group
     * references and backslash escapes are honoured; pass it through
     * {@code java.util.regex.Matcher.quoteReplacement} to insert it literally.
     *
     * @param pattern the regular expression; when {@code null} or empty, {@code content} is
     *                returned unchanged
     * @param str     the replacement string
     * @param content the text to search; when {@code null}, {@code null} is returned
     * @return {@code content} with all matches replaced
     * @throws java.util.regex.PatternSyntaxException if {@code pattern} is not a valid regex
     */
    public static String RegexPattern(String pattern, String str, String content) {
        if (content == null || pattern == null || pattern.isEmpty()) {
            return content;
        }
        return Pattern.compile(pattern, FLAGS).matcher(content).replaceAll(str);
    }
}
把这几句看懂就ok了:
Pattern p = Pattern.compile(pattern, 2);
Matcher m = p.matcher(content);//匹配
content = m.replaceAll(str);//替换