procedure TFindWebThread.AddToList; begin if Form1.ListBox1.Items.IndexOf (Addr) < 0 then begin Form1.ListBox1.Items.Add (Addr); Form1.DetailsList.Add (Text); end; end;
procedure TFindWebThread.Execute; begin GrabHtml; HtmlToList; Status := 'Done with ' + StrUrl; Synchronize (ShowStatus); end;
procedure TFindWebThread.HtmlToList; var strAddr, strText: string; nText: integer; nBegin, nEnd: Integer; begin Status := 'Extracting data for: ' + StrUrl; Synchronize (ShowStatus); strRead := LowerCase (strRead); repeat // find the initial part HTTP reference nBegin := Pos ('href=http', strRead); if nBegin <> 0 then begin // get the remaining part of the string, starting with 'http' strRead := Copy (strRead, nBegin + 5, 1000000); // find the end of the HTTP reference nEnd := Pos ('>', strRead); strAddr := Copy (strRead, 1, nEnd - 1); // move on strRead := Copy (strRead, nEnd + 1, 1000000); // add the URL if 'google' is not in it if Pos ('google', strAddr) = 0 then begin nText := Pos ('</a>', strRead); strText := copy (strRead, 1, nText - 1); // remove cached references and duplicates if (Pos ('cached', strText) = 0) then begin Addr := strAddr; Text := strText; AddToList; end; end; end; until nBegin = 0; end;
procedure TFindWebThread.HttpWork(Sender: TObject; AWorkMode: TWorkMode; const AWorkCount: Integer); begin Status := 'Received ' + IntToStr (AWorkCount) + ' for ' + strUrl; Synchronize (ShowStatus); end;
procedure TFindWebThread.ShowStatus; begin Form1.StatusBar1.SimpleText := Status; end;
interface
uses
Classes, IdComponent, SysUtils, IdHTTP;
type
TFindWebThread = class(TThread)
protected
Addr, Text, Status: string;
procedure Execute; override;
procedure AddToList;
procedure ShowStatus;
procedure GrabHtml;
procedure HtmlToList;
procedure HttpWork (Sender: TObject; AWorkMode: TWorkMode;
const AWorkCount: Integer);
public
strUrl: string;
strRead: string;
end;
implementation
{ TFindWebThread }
uses
WebFindF;
procedure TFindWebThread.AddToList;
begin
if Form1.ListBox1.Items.IndexOf (Addr) < 0 then
begin
Form1.ListBox1.Items.Add (Addr);
Form1.DetailsList.Add (Text);
end;
end;
procedure TFindWebThread.Execute;
begin
GrabHtml;
HtmlToList;
Status := 'Done with ' + StrUrl;
Synchronize (ShowStatus);
end;
procedure TFindWebThread.GrabHtml;
var
Http1: TIdHTTP;
begin
Status := 'Sending query: ' + StrUrl;
Synchronize (ShowStatus);
Http1 := TIdHTTP.Create (nil);
try
Http1.Request.UserAgent := 'User-Agent: NULL';
Http1.OnWork := HttpWork;
strRead := Http1.Get (StrUrl);
finally
Http1.Free;
end;
end;
procedure TFindWebThread.HtmlToList;
var
strAddr, strText: string;
nText: integer;
nBegin, nEnd: Integer;
begin
Status := 'Extracting data for: ' + StrUrl;
Synchronize (ShowStatus);
strRead := LowerCase (strRead);
repeat
// find the initial part HTTP reference
nBegin := Pos ('href=http', strRead);
if nBegin <> 0 then
begin
// get the remaining part of the string, starting with 'http'
strRead := Copy (strRead, nBegin + 5, 1000000);
// find the end of the HTTP reference
nEnd := Pos ('>', strRead);
strAddr := Copy (strRead, 1, nEnd - 1);
// move on
strRead := Copy (strRead, nEnd + 1, 1000000);
// add the URL if 'google' is not in it
if Pos ('google', strAddr) = 0 then
begin
nText := Pos ('</a>', strRead);
strText := copy (strRead, 1, nText - 1);
// remove cached references and duplicates
if (Pos ('cached', strText) = 0) then
begin
Addr := strAddr;
Text := strText;
AddToList;
end;
end;
end;
until nBegin = 0;
end;
procedure TFindWebThread.HttpWork(Sender: TObject; AWorkMode: TWorkMode;
const AWorkCount: Integer);
begin
Status := 'Received ' + IntToStr (AWorkCount) + ' for ' + strUrl;
Synchronize (ShowStatus);
end;
procedure TFindWebThread.ShowStatus;
begin
Form1.StatusBar1.SimpleText := Status;
end;
end.
http://www.delphipages.cn/dispbbs.asp?boardID=8&ID=207&page=1希望对你有用!
<a href="/bbb/aaa.htm">aaa</a> <----相对,对应根目录
<a href="../bbb/aaa.htm">aaa</a> <----相对,上级
<a href="http://www.csdn.net/aaa.htm">aaa</a> <----绝对地址,可以直接用
<a href="javascript:window.open('aaa.htm','');">aaa</a> <----脚本模式,相对
<a href="#AAAA">aaa</a> <----这个是特殊标记,用于到当前页定位的
<a href="#" onclick="xxxxx();">aaa</a> <----这个是脚本调用的另一种形式,#是指向本页,所以你还得访问标记 onclick 事件内容,这个就复杂了。。自己拆字符串的话,就可以根据这个标记的一些特征,如,开头必定是<A ,然后找结尾</a>,再分析<a href=的位置,再找出>,再找href=后面是否有"或是' 字符,如果没有则内容为URL,一直找到空格或>表示URL查找完毕,要做好解析工作还是很复杂的,可以使用一些html的解析库,JEDI 的就提供一个。2、使用正则表达式,这个是现在处理字符串常用的方法,前提是你的正则表达式写的够好,那么提取数据就很容易了;这个是我原来提取baidu 的mp3的url正则,可能有问题,可以供参考。
(?i)\<\s*a[^\>]*(?<!(href\s*=".*\w+.mp3"))[\s ]*target=_blank\>.*</\s*a\s*\>delphi用正则的库有,名字叫做 TRegExpr ,它速度我不清楚,我一般是用c#或script。相关例子可以访问www.2ccc.com ,刚好前几天有人传了个下载baidu mp3的工具程序代码,写的很不错,可以学习。
3、用IE自己的功能,这里我做了个例子,就是列出所有LINK,这个恐怕是最简单的很强大的了。。首先,你放一个TWebBrowser ,我这里设置名称为:wb代码如下:
procedure TForm1.Button1Click(Sender: TObject);
begin
wb.Navigate('http://www.csdn.net/'); //这里指定地址
end;procedure TForm1.Button2Click(Sender: TObject);
var links :OleVariant;
uri : string;
i:integer;
begin
links := wb.OleObject.document.getElementsByTagName('A'); //访问标记A的集合对象
for i:=0 to Integer(links.length)-1 do
begin
uri := links.item(i).outerHtml; //获取索引元素的HTML
ListBox1.Items.Add(uri); //显示到listbox1
end;
end;