刚好在实现了这样的功能,我的是还要查找里面的Email地址 初步测试成功r,e:TRegExpr; link:string; begin idp.Request.UserAgent:='Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; SV1; TheWorld)'; memo1.Text:=idp.Get(trim(edit1.text)); r:=TRegExpr.Create; e:=TRegExpr.Create; //HTTP: '^http:\/\/[A-Za-z0-9]+\.[A-Za-z0-9]+[\/=\?%\-&_~`@[\]\':+!]*([^\"\"])*$' //href="*[A-Za-z0-9\.\/\/=\?%\-&_\\\':+!]*[email|abort|contact|serv|sendmessage]+ r.Expression:='(http://)*[0-9a-zA-Z\/=\?%\-&_\\\.:]*(contact|about|lianxi|email|service|support|send)+[0-9a-zA-Z\/=\?%\-&_\\\.]*'; e.Expression:='[_a-zA-Z\d\-\.]+@[_a-zA-Z\d\-]+(\.[_a-zA-Z\d\-]+)+';if not connDB then showmessage('数据库链接失败!'); if r.Exec (memo1.text) then begin repeat if pos('http',r.Match[0])=0 then link:=trim(edit1.text)+'/'+r.Match[0] else if pos('https',r.Match[0])=0 then link:=r.Match[0] else r.ExecNext; link:=stringreplace(link,'//','/',[rfreplaceall]); if pos('http://',link)=0 then link:=stringreplace(link,'http:/','http://',[rfreplaceall]); memo2.Lines.Add(link); with dm.query do begin Close; sql.Clear; sql.Add('select * from linkList where link='+quotedstr(link)); open; if recordcount=0 then begin Append; FieldByName('link').value:=link; FieldByName('addTime').value:=now(); FieldByName('lKeyId').value:='0'; post; end; end; until not r.ExecNext; end;r.Expression:='[_a-zA-Z\d\-\.]+@[_a-zA-Z\d\-]+(\.[_a-zA-Z\d\-]+)+'; with dm.query do begin close; sql.Clear; sql.Add('select lid,link from linkList where sendTime is null'); open; showmessage(fieldbyname('link').asstring); while not eof do begin try if r.Exec(idp.Get(trim(fieldbyname('link').AsString))) then begin with Tadoquery.create(nil) do begin connection:=dm.conn; repeat; Close; sql.Clear; sql.Add('select * from linkList where link='+quotedstr(link)); open; if recordcount=0 then begin Append; FieldByName('link').value:=link; FieldByName('addTime').value:=now(); FieldByName('lKeyId').value:='0'; post; end; until not r.ExecNext; free; end; end else begin //link:='delete from linkList where lid='+fe;end; except r.ExecNext; end; end;end; r.free; end;procedure TEmail.CheckBox6Click(Sender: TObject); begin showmessage('因为google经常给墙,所以不建议使用,用yahoo!'); end;
比如flash里的链接,就不好获取。2、另外,大多数网站都有外部链接到其他网站。
比如你抓新浪网,上面有个链接是到腾讯网站的,腾讯网站有个是到搜狐网站的。
如果你这么找下去,啥时候能结束呢?
呵呵
估计这么一直爬下去,你有可能抓出所有互联网链接。嘿,这些数据,你可以做个简单搜索引擎啦 哈哈
用webbrowser 遍历又感觉太费资源。
初步测试成功r,e:TRegExpr;
link:string;
begin
idp.Request.UserAgent:='Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; SV1; TheWorld)';
memo1.Text:=idp.Get(trim(edit1.text));
r:=TRegExpr.Create;
e:=TRegExpr.Create;
//HTTP: '^http:\/\/[A-Za-z0-9]+\.[A-Za-z0-9]+[\/=\?%\-&_~`@[\]\':+!]*([^\"\"])*$'
//href="*[A-Za-z0-9\.\/\/=\?%\-&_\\\':+!]*[email|abort|contact|serv|sendmessage]+
r.Expression:='(http://)*[0-9a-zA-Z\/=\?%\-&_\\\.:]*(contact|about|lianxi|email|service|support|send)+[0-9a-zA-Z\/=\?%\-&_\\\.]*';
e.Expression:='[_a-zA-Z\d\-\.]+@[_a-zA-Z\d\-]+(\.[_a-zA-Z\d\-]+)+';if not connDB then showmessage('数据库链接失败!');
if r.Exec (memo1.text) then begin repeat
if pos('http',r.Match[0])=0 then
link:=trim(edit1.text)+'/'+r.Match[0]
else
if pos('https',r.Match[0])=0 then
link:=r.Match[0]
else
r.ExecNext; link:=stringreplace(link,'//','/',[rfreplaceall]);
if pos('http://',link)=0 then
link:=stringreplace(link,'http:/','http://',[rfreplaceall]);
memo2.Lines.Add(link); with dm.query do begin
Close;
sql.Clear;
sql.Add('select * from linkList where link='+quotedstr(link));
open;
if recordcount=0 then begin
Append;
FieldByName('link').value:=link;
FieldByName('addTime').value:=now();
FieldByName('lKeyId').value:='0';
post;
end; end;
until not r.ExecNext;
end;r.Expression:='[_a-zA-Z\d\-\.]+@[_a-zA-Z\d\-]+(\.[_a-zA-Z\d\-]+)+';
with dm.query do begin
close;
sql.Clear;
sql.Add('select lid,link from linkList where sendTime is null');
open;
showmessage(fieldbyname('link').asstring);
while not eof do begin
try
if r.Exec(idp.Get(trim(fieldbyname('link').AsString))) then begin
with Tadoquery.create(nil) do begin
connection:=dm.conn;
repeat;
Close;
sql.Clear;
sql.Add('select * from linkList where link='+quotedstr(link));
open;
if recordcount=0 then begin
Append;
FieldByName('link').value:=link;
FieldByName('addTime').value:=now();
FieldByName('lKeyId').value:='0';
post;
end;
until not r.ExecNext;
free;
end;
end else begin
//link:='delete from linkList where lid='+fe;end;
except
r.ExecNext;
end;
end;end;
r.free;
end;procedure TEmail.CheckBox6Click(Sender: TObject);
begin
showmessage('因为google经常给墙,所以不建议使用,用yahoo!');
end;