// 取得下一段字符串 function GetNextToken(const s:string; const StartIdx:integer):string; var i:integer; begin result:=s[StartIdx]; if result='&' then begin for i:=StartIdx+1 to length(s) do begin if s[i] in ['&',' ',#13,'<'] then break; result:=result+s[i]; if s[i]=';' then break; end; end else if result='<' then begin for i:=StartIdx+1 to length(s) do begin result:=result+s[i]; if s[i]='>' then break; end; end else begin for i:=StartIdx+1 to length(s) do if s[i] in ['&','<'] then break else result:=result+s[i]; end; end;
//在str 中找到第 id 个 substr的位置, length(SubStr) =1 function NPos(SubStr, Str: String; Id: Integer) : Integer; var i, Count :integer; begin Count := 0; for i := 1 to Length(Str) do begin if Str[i] = SubStr then Inc(Count);
if Count >= Id then Break; end;
Result := i; end;
// 输入:<a href="http://delphigroup.yeah.net"> // 输出:http://delphigroup.yeah.net function GetLink(s:string):string; var LPos,RPos,LQuot,RQuot:integer; begin result:='';
// 去掉'....<' LPos:=pos('<',s); if LPos=0 then exit; delete(s,1,LPos); s:=Trim(s);
// 去掉'>....' RPos:=pos('>',s); if RPos=0 then exit; delete(s,RPos,MaxInt);
if uppercase(copy(s,1,2))='A ' then begin LPos:=pos('HREF',uppercase(s)); if LPos=0 then exit;
LQuot:=NPos('"',s,1); RQuot:=NPos('"',s,2);
if (LQuot<LPos) or (RQuot>RPos) then exit;
// 开头带'#'的超链接,视为无效 if s[LQuot+1]='#' then exit;
// 开头带'javascript:'的超链接,也视为无效 // 如:<div align=right><a href="javascript:window.close()"><IMG SRC="button_close.gif"></a></div> if copy(s,LQuot+1,11)='javascript:' then exit;
result:=copy(s,LQuot+1,RQuot-LQuot-1); end; end;
function MakeStr(Str: String; id : Integer):String; var i : integer; Buffer : String; begin Buffer := ''; for i := 1 to id do Buffer := Buffer + Str;
Result := Buffer; end;
function ReplaceStr(const S, Srch, Replace: string): string; var I: Integer; Source: string; begin Source := S; Result := ''; repeat I := Pos(uppercase(Srch), uppercase(Source)); if I > 0 then begin Result := Result + Copy(Source, 1, I - 1) + Replace; Source := Copy(Source, I + Length(Srch), MaxInt); end else Result := Result + Source; until I <= 0; end;
// 把所有&xxx的转义;所有<xxx>取消;其它照样返回 function ConvertHTMLToken(const s:string;var inPre:boolean):string; var s0,s0_2,s0_3,s0_4:string; begin if s[1]='&' then begin s0:=lowerCase(s); result:=''; if s0='&nbsp;' then result:=' ' else if s0='&quot;' then result:='"' else if s0='&gt;' then result:='>' else if s0='&lt;' then result:='<' else if s0='&middot;' then result:='·' else if s0='&trade;' then result:=' TM ' else if s0='&copy;' then result:='(c)' else if s0='&amp;' then result:='&' else if s0='&amp' then result:='&'; end else if s[1]='<' then begin s0:=lowerCase(s); s0_2:=copy(s0,1,2); s0_3:=copy(s0,1,3); s0_4:=copy(s0,1,4);
result:=''; // 将所有<hr>替换成为'------' if s0='<br>' then result:= CR else if s0_4='<pre' then // <pre 一定要在 <p 之前判断! begin inPre:=true;result:=CR; end else if s0_2='<p' then result:=CR+CR else if s0_3='<hr' then result:=CR+MakeStr('-',40)+CR else if s0_3='<ol' then result:=CR else if s0_3='<ul' then result:=CR // else if s0_4='</ol' then result:=CR // else if s0_4='</ul' then result:=CR else if s0_3='<li' then result:='·' else if s0_4='</li' then result:=CR else if s0_4='</tr' then result:=CR else if s0='</td>' then result:=#9 else if s0='<title>' then result:=' 标题 <<' else if s0='</title>' then result:='>>'+CR+CR else if s0='</pre>' then inPre:=false else if copy(s0,1,6)='<table' then result:=CR else if MarkLinks and (s0[2]='a') then begin CurrLink:=GetLink(s); if CurrLink<>'' then result:='['; end else if MarkLinks and (s0='</a>') then if CurrLink<>'' then result:=format(' %s ]',[CurrLink]); end else if inPre then result:=s else // 不在<pre>..</pre>内,则删除所有CR result:=ReplaceStr(s,CR,''); end;
begin s0:=HTMLText; result:=''; InputLen:=length(s0); InputIdx:=1; inPre:=false; CurrLink:='';
while InputIdx<=InputLen do begin NextToken:=GetNextToken(s0,InputIdx);
// 去除<style ...> -- </style>之间的内容 if lowercase(copy(NextToken,1,6))='<style' then begin while lowercase(NextToken)<>'</style>' do begin inc(InputIdx,length(NextToken)); NextToken:=GetNextToken(s0,InputIdx); end; inc(InputIdx,length(NextToken)); NextToken:=GetNextToken(s0,InputIdx); end;
// 去除<Script ...> -- </Script>之间的内容 if lowercase(copy(NextToken,1,7))='<script' then begin // while lowercase(NextToken)<>'</script>' do // begin // inc(InputIdx,length(NextToken)); // NextToken:=GetNextToken(s0,InputIdx); // end; // inc(InputIdx,length(NextToken)); // NextToken:=GetNextToken(s0,InputIdx); inc(InputIdx,length(NextToken)); inQuot:=false; i:=InputIdx-1; while I<InputLen do begin inc(i); if s0[i]='"' then begin inQuot:=not inQuot; continue; end; if not inQuot then // 去除<script>段里的<!-- ... -->注释段, 99.8.2 if copy(s0,i,4)='<!--' then begin HelpIdx:=pos('-->',copy(s0,i+4,MaxInt)); if HelpIdx>0 then begin inc(i,4+HelpIdx+2); end else begin i:=InputLen; break; end; end; if lowercase(copy(s0,i,9))='</script>' then begin break; end; end; InputIdx:=i; end;
j:=Pos('</html>',s);s2:=Copy(s,i+6,j-i-6);
pos 和copy 就可以满足一般的要求了
CONST CR = #13#10;
var
NextToken,s0:string;
i:integer;
HelpIdx:integer;
inQuot:boolean; // 去除<script>段之用
InputLen:integer;
InputIdx:integer; // 指向输入字符的下一个待处理字符
inPre:boolean; // 表示是否在<pre>...</pre>段内
CurrLink:string;
// 取得下一段字符串
function GetNextToken(const s:string; const StartIdx:integer):string;
var
i:integer;
begin
result:=s[StartIdx];
if result='&' then
begin
for i:=StartIdx+1 to length(s) do
begin
if s[i] in ['&',' ',#13,'<'] then break;
result:=result+s[i];
if s[i]=';' then break;
end;
end
else if result='<' then
begin
for i:=StartIdx+1 to length(s) do
begin
result:=result+s[i];
if s[i]='>' then break;
end;
end
else
begin
for i:=StartIdx+1 to length(s) do
if s[i] in ['&','<'] then break
else result:=result+s[i];
end;
end;
//在str 中找到第 id 个 substr的位置, length(SubStr) =1
function NPos(SubStr, Str: String; Id: Integer) : Integer;
var i, Count :integer;
begin
Count := 0;
for i := 1 to Length(Str) do
begin
if Str[i] = SubStr then Inc(Count);
if Count >= Id then Break;
end;
Result := i;
end;
// 输入:<a href="http://delphigroup.yeah.net">
// 输出:http://delphigroup.yeah.net
function GetLink(s:string):string;
var
LPos,RPos,LQuot,RQuot:integer;
begin
result:='';
// 去掉'....<'
LPos:=pos('<',s);
if LPos=0 then exit;
delete(s,1,LPos);
s:=Trim(s);
// 去掉'>....'
RPos:=pos('>',s);
if RPos=0 then exit;
delete(s,RPos,MaxInt);
if uppercase(copy(s,1,2))='A ' then
begin
LPos:=pos('HREF',uppercase(s));
if LPos=0 then exit;
LQuot:=NPos('"',s,1);
RQuot:=NPos('"',s,2);
if (LQuot<LPos) or (RQuot>RPos) then exit;
// 开头带'#'的超链接,视为无效
if s[LQuot+1]='#' then exit;
// 开头带'javascript:'的超链接,也视为无效
// 如:<div align=right><a href="javascript:window.close()"><IMG SRC="button_close.gif"></a></div>
if copy(s,LQuot+1,11)='javascript:' then exit;
result:=copy(s,LQuot+1,RQuot-LQuot-1);
end;
end;
function MakeStr(Str: String; id : Integer):String;
var i : integer;
Buffer : String;
begin
Buffer := '';
for i := 1 to id do Buffer := Buffer + Str;
Result := Buffer;
end;
function ReplaceStr(const S, Srch, Replace: string): string;
var
I: Integer;
Source: string;
begin
Source := S;
Result := '';
repeat
I := Pos(uppercase(Srch), uppercase(Source));
if I > 0 then begin
Result := Result + Copy(Source, 1, I - 1) + Replace;
Source := Copy(Source, I + Length(Srch), MaxInt);
end
else Result := Result + Source;
until I <= 0;
end;
// 把所有&xxx的转义;所有<xxx>取消;其它照样返回
function ConvertHTMLToken(const s:string;var inPre:boolean):string;
var
s0,s0_2,s0_3,s0_4:string;
begin
if s[1]='&' then
begin
s0:=lowerCase(s);
result:='';
if s0='&nbsp;' then result:=' '
else if s0='&quot;' then result:='"'
else if s0='&gt;' then result:='>'
else if s0='&lt;' then result:='<'
else if s0='&middot;' then result:='·'
else if s0='&trade;' then result:=' TM '
else if s0='&copy;' then result:='(c)'
else if s0='&amp;' then result:='&'
else if s0='&amp' then result:='&';
end
else if s[1]='<' then
begin
s0:=lowerCase(s);
s0_2:=copy(s0,1,2);
s0_3:=copy(s0,1,3);
s0_4:=copy(s0,1,4);
result:='';
// 将所有<hr>替换成为'------'
if s0='<br>' then result:= CR
else if s0_4='<pre' then // <pre 一定要在 <p 之前判断!
begin inPre:=true;result:=CR; end
else if s0_2='<p' then result:=CR+CR
else if s0_3='<hr' then result:=CR+MakeStr('-',40)+CR
else if s0_3='<ol' then result:=CR
else if s0_3='<ul' then result:=CR
// else if s0_4='</ol' then result:=CR
// else if s0_4='</ul' then result:=CR
else if s0_3='<li' then result:='·'
else if s0_4='</li' then result:=CR
else if s0_4='</tr' then result:=CR
else if s0='</td>' then result:=#9
else if s0='<title>' then result:=' 标题 <<'
else if s0='</title>' then result:='>>'+CR+CR
else if s0='</pre>' then inPre:=false
else if copy(s0,1,6)='<table' then result:=CR
else if MarkLinks and (s0[2]='a') then
begin
CurrLink:=GetLink(s);
if CurrLink<>'' then result:='[';
end
else if MarkLinks and (s0='</a>') then
if CurrLink<>'' then result:=format(' %s ]',[CurrLink]);
end
else if inPre then
result:=s
else // 不在<pre>..</pre>内,则删除所有CR
result:=ReplaceStr(s,CR,'');
end;
begin
s0:=HTMLText;
result:='';
InputLen:=length(s0);
InputIdx:=1;
inPre:=false;
CurrLink:='';
while InputIdx<=InputLen do
begin
NextToken:=GetNextToken(s0,InputIdx);
// 去除<style ...> -- </style>之间的内容
if lowercase(copy(NextToken,1,6))='<style' then
begin
while lowercase(NextToken)<>'</style>' do
begin
inc(InputIdx,length(NextToken));
NextToken:=GetNextToken(s0,InputIdx);
end;
inc(InputIdx,length(NextToken));
NextToken:=GetNextToken(s0,InputIdx);
end;
// 去除<Script ...> -- </Script>之间的内容
if lowercase(copy(NextToken,1,7))='<script' then
begin
// while lowercase(NextToken)<>'</script>' do
// begin
// inc(InputIdx,length(NextToken));
// NextToken:=GetNextToken(s0,InputIdx);
// end;
// inc(InputIdx,length(NextToken));
// NextToken:=GetNextToken(s0,InputIdx);
inc(InputIdx,length(NextToken));
inQuot:=false;
i:=InputIdx-1;
while I<InputLen do
begin
inc(i);
if s0[i]='"' then
begin
inQuot:=not inQuot;
continue;
end;
if not inQuot then
// 去除<script>段里的<!-- ... -->注释段, 99.8.2
if copy(s0,i,4)='<!--' then
begin
HelpIdx:=pos('-->',copy(s0,i+4,MaxInt));
if HelpIdx>0 then
begin
inc(i,4+HelpIdx+2);
end
else
begin
i:=InputLen;
break;
end;
end;
if lowercase(copy(s0,i,9))='</script>' then
begin
break;
end;
end;
InputIdx:=i;
end;
NextToken:=GetNextToken(s0,InputIdx);
inc(InputIdx,length(NextToken));
result:=result+ConvertHTMLToken(NextToken,inPre);
end;
end;