/**
 * Recursively collects the text held by a DOM node's descendants.
 * Adapted from jQuery's text-gathering helper.
 *
 * Comment nodes (nodeType 8) are skipped, element nodes (nodeType 1) are
 * descended into, and every other node contributes its nodeValue.
 *
 * @param {Node} obj - Node whose childNodes are walked.
 * @returns {string} Concatenated text; "" when obj or its childNodes is missing.
 */
function GetText(obj) {
  var ret = "";
  if (!obj || !obj.childNodes) {
    return ret;
  }
  for (var i = 0; i < obj.childNodes.length; i++) {
    var o = obj.childNodes[i];
    if (o.nodeType === 8) {
      continue;
    }
    if (o.nodeType === 1) {
      ret += GetText(o);
    } else if (o.nodeValue != null) {
      // A null nodeValue would otherwise be appended as the literal "null".
      ret += o.nodeValue;
    }
  }
  return ret;
}
//用这个试试看 var regex=/\<[^\>]+?\>/igm;
写了个简单的,可能有考虑不全的地方。观察html中的文本大致上 文本节点无外乎以下两种情形 <node>cccc</node> <node>dddd<node>ffff</node>ggg</node> 他们的共同特点就是,文本内容夹在><之间 <table id="tbl1"> <tr> <td><font>aa</font></td> <td><font>bb </font></td> </tr> <tr> <td><font>cc</font></td> <td><font>dd</font></td> </tr> </table> <script type="text/javascript"> function $(sId){ return document.getElementById(sId); }var oTbl=$("tbl1"); var oReg=/>([^<>]+)</g; var sHtml=$("tbl1").innerHTML.replace(/\n+/g,""); var aTxt=[]; while(oReg.exec(sHtml)){ aTxt.push(RegExp.$1); }alert(aTxt); </script>
/**
 * Removes every HTML tag from a string, keeping the text between tags.
 *
 * @param {string} str - Markup to clean.
 * @returns {string} The input with each "<...>" sequence deleted.
 */
function delHtmlTag4Pager(str) {
  // Strip all HTML tags. The comment sits on its own line so it cannot
  // swallow the function's closing brace.
  return str.replace(/<[^>]+>/gi, "");
}
{
// Body of a tag-stripping routine; the method header is not part of this
// snippet, so `str` is presumably the string parameter being cleaned.
// Each statement deletes one family of tags, case-insensitively.

// <img ...> and </img>.
str = Regex.Replace(str, @"\<(img)[^>]*>|<\/(img)>", "", RegexOptions.IgnoreCase);
// Table markup. NOTE(review): the group ends with an empty alternative ("th|)"),
// so the name may match nothing and "\<()[^>]*>" then matches ANY "<...>" tag.
// This statement therefore appears to strip every tag, not only table ones - confirm intent.
str = Regex.Replace(str, @"\<(table|tbody|tr|td|th|)[^>]*>|<\/(table|tbody|tr|td|th|)>", "", RegexOptions.IgnoreCase);
// Block containers.
str = Regex.Replace(str, @"\<(div|blockquote|fieldset|legend)[^>]*>|<\/(div|blockquote|fieldset|legend)>", "", RegexOptions.IgnoreCase);
// Inline formatting and headings. NOTE(review): there is no boundary after the
// tag name, so "i", "u" and "s" also match longer names that start with them
// (e.g. <input>, <ul>, <span>).
str = Regex.Replace(str, @"\<(font|i|u|h[1-9]|s)[^>]*>|<\/(font|i|u|h[1-9]|s)>", "", RegexOptions.IgnoreCase);
// Only the <style>/<strong> tags themselves are removed; text between them is kept.
str = Regex.Replace(str, @"\<(style|strong)[^>]*>|<\/(style|strong)>", "", RegexOptions.IgnoreCase);
// Anchors. NOTE(review): the opening pattern matches any tag whose name starts with "a".
str = Regex.Replace(str, @"\<a[^>]*>|<\/a>", "", RegexOptions.IgnoreCase);
// Remaining structural tags ("tbody" is already listed in the table pattern above).
str = Regex.Replace(str, @"\<(meta|iframe|frame|span|tbody|layer)[^>]*>|<\/(iframe|frame|meta|span|tbody|layer)>", "", RegexOptions.IgnoreCase);
// No closing ">" in this pattern: removes "<a..." up to (not including) the next ">"
// or the end of the string - presumably to drop a truncated anchor tag.
str = Regex.Replace(str, @"\<a[^>]*", "", RegexOptions.IgnoreCase);
return str;
}
/// <summary>
/// Deletes every "&lt;...&gt;" tag from <paramref name="str"/>.
/// </summary>
/// <param name="str">Markup to clean; may be null.</param>
/// <returns>
/// The tag-free text, or the input unchanged when it is null or trims to
/// an empty string.
/// </returns>
public static string delHtml(string str)
{
    bool isBlank = str == null || str.Trim() == "";
    if (isBlank)
    {
        // Nothing to strip: hand the caller back exactly what was passed in.
        return str;
    }

    return Regex.Replace(str, "<[^>]+>", "");
}
一类:开始标签,到结束标签,及之间的内容要全部除去的。如:STYLE, SCRIPT, COMMENT
二类:只需要删除开始标签和结束标记本身,而保留标签之间的内容的,大部分标签是这一类。如B,U,I,A,TD......
<pre id="test">
<table>
<tr>
<td> <font>aa </font> </td>
<td> <font>bb </font> </td>
</tr>
<tr>
<td> <font>cc </font> </td>
<td> <font>dd </font> </td>
</tr>
</table>
</pre>
<script type="text/javascript">
// Read the markup inside the element with id "test".
var test=document.getElementById("test").innerHTML;
// Replace each tag (plus one newline directly after it) with a space, then
// collapse every whitespace run to a single space and show the result.
alert(test.replace(/<[^>]+>\n?/g, " ").replace(/\s+/g, " "));
</script>
<tr>
<td> <font>aa </font> </td>
<td> <font>bb </font> </td>
</tr>
<tr>
<td> <font>cc </font> </td>
<td> <font>dd </font> </td>
</tr>
</table> <script type="text/javascript">
/**
 * Shorthand for document.getElementById.
 *
 * @param {string} sId - Element id to look up.
 * @returns {HTMLElement|null} Whatever document.getElementById returns.
 */
function $(sId) {
  var element = document.getElementById(sId);
  return element;
}

// Show the table's text: innerText when it yields something truthy,
// textContent otherwise.
var oTbl = $("tbl1");
var sTxt = oTbl.innerText;
if (!sTxt) {
  sTxt = oTbl.textContent;
}
alert(sTxt);
</script>
我也知道可以用innerText或textContent,呵呵
不过LZ要的估计是字符串的处理吧
var ret = "";
for(var i = 0; i < obj.childNodes.length; i++){
var o = obj.childNodes[i];
if ( o.nodeType != 8 ){
ret += o.nodeType != 1 ? o.nodeValue : GetText(o);
}
}
return ret;
}抄jq的
// Try this pattern: it matches one tag, from "<" through the nearest ">"
// (lazy quantifier), with the i, g and m flags set.
var regex=/\<[^\>]+?\>/igm;
文本节点无外乎以下两种情形
<node>cccc</node>
<node>dddd<node>ffff</node>ggg</node>
他们的共同特点就是,文本内容夹在><之间
<table id="tbl1">
<tr>
<td><font>aa</font></td>
<td><font>bb </font></td>
</tr>
<tr>
<td><font>cc</font></td>
<td><font>dd</font></td>
</tr>
</table> <script type="text/javascript">
/**
 * Shorthand for document.getElementById.
 *
 * @param {string} sId - Element id to look up.
 * @returns {HTMLElement|null} Whatever document.getElementById returns.
 */
function $(sId){
  return document.getElementById(sId);
}

var oTbl=$("tbl1");
// Text sits between a ">" and the following "<"; group 1 captures it.
var oReg=/>([^<>]+)</g;
// Remove newlines from the table markup before scanning it.
var sHtml=oTbl.innerHTML.replace(/\n+/g,"");
var aTxt=[];
var aMatch;
// Read the capture from the array exec() returns rather than from the
// deprecated static RegExp.$1 property.
while((aMatch=oReg.exec(sHtml))!==null){
  aTxt.push(aMatch[1]);
}
alert(aTxt);
</script>
/**
 * Returns the given markup with every HTML tag deleted.
 *
 * @param {string} str - Markup to clean.
 * @returns {string} The text with each "<...>" sequence removed.
 */
function delHtmlTag4Pager(str) {
  // Matches any single tag: "<", one or more non-">" characters, then ">".
  var tagPattern = /<[^>]+>/gi;
  var plainText = str.replace(tagPattern, "");
  return plainText;
}
str = Regex.Replace(str, @"\<(table|tbody|tr|td|th|)[^>]*>|<\/(table|tbody|tr|td|th|)>", "", RegexOptions.IgnoreCase);
str = Regex.Replace(str, @"\<(div|blockquote|fieldset|legend)[^>]*>|<\/(div|blockquote|fieldset|legend)>", "", RegexOptions.IgnoreCase);
str = Regex.Replace(str, @"\<(font|i|u|h[1-9]|s)[^>]*>|<\/(font|i|u|h[1-9]|s)>", "", RegexOptions.IgnoreCase);
str = Regex.Replace(str, @"\<(style|strong)[^>]*>|<\/(style|strong)>", "", RegexOptions.IgnoreCase);
str = Regex.Replace(str, @"\<a[^>]*>|<\/a>", "", RegexOptions.IgnoreCase);
str = Regex.Replace(str, @"\<(meta|iframe|frame|span|tbody|layer)[^>]*>|<\/(iframe|frame|meta|span|tbody|layer)>", "", RegexOptions.IgnoreCase);
str = Regex.Replace(str, @"\<a[^>]*", "", RegexOptions.IgnoreCase);