请问如何去掉HTML代码???
就连<style .....>....</style> 与 <script ...>....</script> 里面的内容一起去掉承诺加分至200
就连<style .....>....</style> 与 <script ...>....</script> 里面的内容一起去掉承诺加分至200
解决方案 »
- 关于quartz,熟悉的请进来帮帮忙
- 使用poi类库读取excel中的数据时,如何将excel中的数值数据读出???
- 哎呀。JavaScript遇到问题。不知道怎么解决了。
- JSP页面刷新问题
- 关于相对路径
- LobCreator.setClobAsString不报错,但更新不起作用?求解
- 帅哥们!刚出炉的问题!大家快看一下啊!!!快快快快快快快
- jSP如何做成BLOG那种显示日期东西
- 我以前做了很长时间的ASP,现在要转JSP,请问需要什么样的过程?
- 我编了个copyright.js文件,如何在其它网页中调用。(在线等待)
- 提取字符串,从select中
- java.exe占用虚拟内存不断增加导致服务器挂掉的问题~~高手或有经验者帮下忙
if (Str == null || Str.equals(""))
return "";
int i,j;
Str = Str.replace(" "," ");
while(Str.indexOf("<")!=-1 && Str.indexOf(">")!=-1){
i=Str.indexOf("<");
j=Str.indexOf(">")+1;
Str=Str.replace(Str.substring(i,j),"");
}
return Str;
}
这些 < > 应该是写成: &lt; &gt; 的
先把 &lt; &gt; 转化成 <、>,再用楼上的正则表达式。
public String FormatHtml(String Str){
if (Str == null ¦ ¦ Str.equals(""))
return "";
int i,j;
Str = Str.replace(" "," ");
while(Str.indexOf(" <")!=-1 && Str.indexOf(">")!=-1){
i=Str.indexOf(" <");
j=Str.indexOf(">")+1;
Str=Str.replace(Str.substring(i,j),"");
}
return Str;
}
这个没有用过,不过感觉好像会有很多意外情况
-------------------------------------------------String noHtmlContent = content.replaceAll(" <[^>]*>","");
可以解决一些问题 例如:<div class="ddd" ....> xxxx</div> 会去掉<div class="ddd" ....> 与 </div>
但我想把 <script ...>....</script> 里面所有的东西也一起去掉。
--------------------------------------------------
最后,自己想办法解决了,与大家一起分享:
// Remove ALL whitespace first (this also joins words and attribute text together).
doc = doc.replaceAll("\\s+", "");
// Drop <style> and <script> elements together with their content. The quantifier is
// reluctant (.*?) so that each element is removed on its own; a greedy .* would delete
// everything between the first opening tag and the LAST closing tag. (?is) makes the
// match case-insensitive and lets '.' match any remaining line terminator.
doc = doc.replaceAll("(?is)<style[^>]*>.*?</style>", "");
doc = doc.replaceAll("(?is)<script[^>]*>.*?</script>", "");
// Finally remove every remaining tag.
doc = doc.replaceAll("<[^<>]+>", "");
--------------------------------------------------
不过还是了解了一些其他的东西,明天加分50,自己解决,省一些分
* 替换所有html标签,注释,脚本,css代码
* @param h 要替换的串(源字符串)
* @param replacement 用于替换的串,即用该串替换源字符串中的html标签,注释,脚本,css代码
* @return
*/
public static String cleanHTML(String h, String replacement) {
if (h == null) {
return h;
}
if (replacement == null) {
replacement = "";
}
String html = new String(h);
String[] htmlTag = {
"A", "ABBR", "ACRONYM", "ADDRESS", "APPLET", "AREA", "B", "BASE",
"BASEFONT", "BDO", "BIG", "BLOCKQUOTE", "BODY", "BR", "BUTTON",
"CAPTION", "CENTER", "CITE", "CODE", "COL", "COLGROUP", "DD", "DEL",
"DFN", "DIR", "DIV", "DL", "DT", "EM", "FIELDSET", "FONT", "FORM",
"FRAME", "FRAMESET", "H1", "H2", "H3", "H4", "H5", "H6", "HEAD", "HR",
"HTML", "I", "IFRAME", "IMG", "INPUT", "INS", "ISINDEX", "KBD", "LABEL",
"LEGEND", "LI", "LINK", "MAP", "MENU", "META", "NOFRAMES", "NOSCRIPT",
"OBJECT", "OL", "OPTGROUP", "OPTION", "P", "PARAM", "PRE"
, "Q", "S", "SAMP", "SCRIPT", "SELECT", "SMALL", "SPAN", "STRIKE",
"STRONG", "STYLE", "SUB", "SUP", "TABLE", "TBODY", "TD", "TEXTAREA",
"TFOOT", "TH", "THEAD", "TITLE", "TR", "TT", "U", "UL", "VAR"};
String regex = "";
//替换所有注释,脚本,css代码
html = Pattern.compile("<!--((?!<!--).)*-->", Pattern.DOTALL).matcher(html).
replaceAll(replacement);
html = Pattern.compile("<script((?!</script).)*</script>",
Pattern.DOTALL |
Pattern.CASE_INSENSITIVE).matcher(html).replaceAll(
replacement);
html = Pattern.compile("<style((?!</style).)*</style>",
Pattern.DOTALL |
Pattern.CASE_INSENSITIVE).matcher(html).replaceAll(
replacement); //循环替换所有html标签
for (int i = 0; i < htmlTag.length; i++) {
regex = "<" + htmlTag[i] + "[^<]*>";
html = Pattern.compile(regex, Pattern.CASE_INSENSITIVE).matcher(html).
replaceAll(replacement);
regex = "</" + htmlTag[i] + ">";
html = Pattern.compile(regex, Pattern.CASE_INSENSITIVE).matcher(html).
replaceAll(replacement);
}
html = html.replaceAll("\\s+", replacement);
return html;
}
import java.util.regex.Pattern;
import java.util.regex.Matcher;
String str = "<html><b>HTML去除,<go>Go!!!!</go>用正则才是王道!!!</b></html>";
String code = "<.*?>";
System.out.println(str.replaceAll(code, ""));
}
String str = "<html><head><script>function(){}</script><style>.FCool{margin:1px}
</style></head><b>HTML去除,<go>哈哈!!!!</go>用正则才是王道!!!</b></html>";
String code = "<.*?>(<script.*?>.*?</script.*?>)?(<style.*?>.*?</style.*?>)?";
System.out.println(str.replaceAll(code, ""));
}输出:HTML去除,哈哈!!!!用正则才是王道!!!
if (html == null || html.trim().equals("")) {
return "";
} else {
Pattern pattern = Pattern.compile("<[^<|^>]*>");
Matcher matcher = pattern.matcher(html);
StringBuffer txt = new StringBuffer();
while (matcher.find()) {
String group = matcher.group();
if (group.matches("<[\\s]*>")) {
matcher.appendReplacement(txt, group);
} else {
matcher.appendReplacement(txt, "");
}
}
matcher.appendTail(txt);
String t = txt.toString();
//替换通用的html片段为正常形式
t = t.replaceAll("“", "“");
t = t.replaceAll("”", "”");
t = t.replaceAll("&", "&");
t = t.replaceAll("‘", "‘");
t = t.replaceAll("’", "’");
t = t.replaceAll(" ", "");
t = t.replaceAll("<", "<");
t = t.replaceAll(">", ">");
System.out.println(t);
return t;
}
来自网络,奉献大家
return "";
} Pattern pattern = Pattern.compile("<[^<|>]*>");
Pattern pattern1 = Pattern.compile(" ");
Matcher matcher = pattern.matcher(str); String returnStr = matcher.replaceAll(""); Matcher matcher1 = pattern.matcher(returnStr);
return returnStr;
}
* 替换所有html标签,注释,脚本,css代码
* @param h 要替换的串(源字符串)
* @param replacement 用于替换的串,即用该串替换源字符串中的html标签,注释,脚本,css代码
* @return
*/
public static String cleanHTML(String h, String replacement) {
if (h == null) {
return h;
}
if (replacement == null) {
replacement = "";
}
String html = new String(h);
String[] htmlTag = {
"A", "ABBR", "ACRONYM", "ADDRESS", "APPLET", "AREA", "B", "BASE",
"BASEFONT", "BDO", "BIG", "BLOCKQUOTE", "BODY", "BR", "BUTTON",
"CAPTION", "CENTER", "CITE", "CODE", "COL", "COLGROUP", "DD", "DEL",
"DFN", "DIR", "DIV", "DL", "DT", "EM", "FIELDSET", "FONT", "FORM",
"FRAME", "FRAMESET", "H1", "H2", "H3", "H4", "H5", "H6", "HEAD", "HR",
"HTML", "I", "IFRAME", "IMG", "INPUT", "INS", "ISINDEX", "KBD", "LABEL",
"LEGEND", "LI", "LINK", "MAP", "MENU", "META", "NOFRAMES", "NOSCRIPT",
"OBJECT", "OL", "OPTGROUP", "OPTION", "P", "PARAM", "PRE"
, "Q", "S", "SAMP", "SCRIPT", "SELECT", "SMALL", "SPAN", "STRIKE",
"STRONG", "STYLE", "SUB", "SUP", "TABLE", "TBODY", "TD", "TEXTAREA",
"TFOOT", "TH", "THEAD", "TITLE", "TR", "TT", "U", "UL", "VAR"};
String regex = "";
//替换所有注释,脚本,css代码
html = Pattern.compile(" <!--((?! <!--).)*-->", Pattern.DOTALL).matcher(html).
replaceAll(replacement);
html = Pattern.compile(" <script((?! </script).)* </script>",
Pattern.DOTALL ¦
Pattern.CASE_INSENSITIVE).matcher(html).replaceAll(
replacement);
html = Pattern.compile(" <style((?! </style).)* </style>",
Pattern.DOTALL ¦
Pattern.CASE_INSENSITIVE).matcher(html).replaceAll(
replacement); //循环替换所有html标签
for (int i = 0; i < htmlTag.length; i++) {
regex = " <" + htmlTag[i] + "[^ <]*>";
html = Pattern.compile(regex, Pattern.CASE_INSENSITIVE).matcher(html).
replaceAll(replacement);
regex = " </" + htmlTag[i] + ">";
html = Pattern.compile(regex, Pattern.CASE_INSENSITIVE).matcher(html).
replaceAll(replacement);
}
html = html.replaceAll("\\s+", replacement);
return html;
}