下面是一段网页源文件,如何通过字符串操作或其他方式查询出这段源文件中所有的链接地址和对应的链接文字并存储到access中
<!doctype html public "-//w3c//dtd xhtml 1.0 transitional//en" "http://www.w3.org/tr/xhtml1/dtd/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" lang="gb2312">
<head>
<meta http-equiv="content-type" content="text/html; charset=gb2312">
<title>百谷虎山寨搜索-baigoohoo---百度Google雅虎一起搜!</title>
<link rel="search" type="application/opensearchdescription+xml" href="search/find.xml" title="百谷虎山寨搜索-baigoohoo---百度Google雅虎一起搜!">
<link rel="stylesheet" href="search/style/index_baidu.css" type="text/css" media="screen">
<link rel="stylesheet" href="css.css" type="text/css" media="screen">
<link rel="shortcut icon" href="search/favicon.ico">
<META content=百谷虎,baigoohoo,搜索,百度,Google,雅虎,山寨搜索,yahoo,百Google度,yahoo,msn,baidu,搜索引擎,baigoogledu
name=keywords>
<META content=百谷虎,baigoohoo,搜索,百度,Google,雅虎,山寨搜索,yahoo,百Google度,yahoo,msn,baidu,搜索引擎,baigoogledu
name=description>
<style type="text/css">
<!--
.STYLE2 {
color: #FF0000;
font-weight: bold;
}
-->
</style>
<script src="Scripts/AC_RunActiveContent.js" type="text/javascript"></script>
</head><body onLoad="_load();" scroll="yes"><table width="100%" border="0" cellspacing="0" cellpadding="5">
<tr>
<td> </td>
<td align="right"><a onClick="this.style.behavior='url(#default#homepage)';this.setHomePage('http://www.baigoohoo.com/');return false;" href=# id=topnav>设为主页</a> |
<a href="javascript:void(0);" onClick='window.external.AddFavorite(location.href, document.title);'>加入收藏</a> | <a href="search/getcode.asp" id=topnav>搜索代码</a> |
<a href="http://www.dyciii.com">论坛</a></td>
</tr>
</table> <table width="497" border="0" align="center" cellpadding="0" cellspacing="0">
<tr>
<td width="22"> </td>
<td width="77"><table width="60" border="0" align="center" cellpadding="2" cellspacing="0" background="images/navbg.gif">
<tr>
<td height="21" align="center"><a href="http://www.baigoohoo.cn" target="_top" class="navfoot">淘宝铺</a></td>
</tr>
</table></td>
<td width="77"><table width="60" border="0" align="center" cellpadding="2" cellspacing="0" background="images/navbg.gif">
<tr>
<td height="21" align="center"><a href="http://www.lanrw.com" target="_top" class="navfoot">网址导航</a></td>
</tr>
</table></td>
<td width="77"><table width="60" border="0" align="center" cellpadding="2" cellspacing="0" background="images/navbg.gif">
<tr>
<td height="21" align="center"><a href="http://www.baigoohoo.com/radio/" target="_top" class="navfoot">网络电台</a></td>
</tr>
</table></td>
<td width="77"><table width="60" border="0" align="center" cellpadding="2" cellspacing="0" background="images/navbg.gif">
<tr>
<td height="21" align="center"><a href="http://www.jixiedai.com" target="_top" class="navfoot">系鞋带网</a></td>
</tr>
</table></td>
<td width="77"><table width="60" border="0" align="center" cellpadding="2" cellspacing="0" background="images/navbg.gif">
<tr>
<td height="21" align="center"><a href="http://www.yagezi.com" target="_top" class="navfoot">呀格子</a></td>
</tr>
</table></td>
<td width="160"> </td>
</tr>
</table></div>
<table width="627" border="0" align="center" cellpadding="0" cellspacing="0">
<tr>
<td height="70" colspan="4" align="center"><a href="http://www.baigoohoo.cn" target="_blank"><img src="http://www.baigoohoo.com/images/tbp.jpg" alt="百谷虎淘宝铺上线" width="500" height="60" border="0"></a></td>
</tr>
<tr>
<td width="25%" height="25" align="center"><a href="http://www.baigoohoo.cn" target="_blank">淘宝网拍拍网网店推广</a></td>
<td width="26%" height="25" align="center"><a href="http://shop34469444.taobao.com/" target="_blank">江南E栈淘宝网网网网网</a></td>
<td width="26%" height="25" align="center"><a href="http://www.you369.com/" target="_blank">You369机票预订</a></td>
<td width="23%" height="25" align="center"><a href="http://shanzhaiji.com/forumdisplay.php?fid=6" target="_blank">首部山寨电影招山寨演员</a><a href="http://www.baigoohoo.cn/viewthread.php?tid=13&extra=page%3D1" target="_blank"></a></td>
</tr>
</table>
<div id="footer01">
<br>版权所有 © 2008-2010 <a href=http://www.baigoohoo.com target="_blank">百谷虎山寨搜索-baigoohoo---百度Google雅虎一起搜!</a><div class="help"><script src="http://s73.cnzz.com/stat.php?id=1075371&web_id=1075371&show=pic" language="JavaScript" charset="gb2312"></script></div>
</div><table width="608" border="0" align="center" cellpadding="0" cellspacing="0">
<tr>
<td align="center">友情链接:</td>
<td height="22" align="left"><a href="http://www.baigoohoo.com" target="_blank">百谷虎山寨搜索</a> <a href="http://www.591man.com/" target="_blank">wo男人帮商城</a> <a href="http://xuxule.net/" target="_blank">嘘嘘乐</a> <a href="http://ylzt888.blog.163.com/" target="_blank">一路涨停</a> <a href="http://everbaidu.com" target="_blank">百度天下</a> <a href="http://www.zxko.com" target="_blank">小游戏</a> <a href="http://www.lanrw.com" target="_blank">懒人网址</a> <a href="http://www.18vr.com/?5index.html" target="_blank">策略大师</a></td>
</tr>
<tr>
<td align="center"> </td>
<td height="22" align="left"><a href="http://www.lepaw.cn/" target="_blank">乐趴网</a> <a href="http://www.zzmo.cn" target="_blank">猪猪手机书</a>
<a href="http://www.jingjiweiji.com/" target="_blank">经济危机</a><a href="http://www.zxko.com" target="_blank"></a> <a href="http://www.dyciii.com" target="_blank">第一次论坛</a> <a href="http://www.jeanscoo.com" target="_blank">牛仔品牌</a> <a href="http://www.shanzhaiba.com/" target="_blank">山寨吧</a> <a href="http://www.qegee.com" target="_blank">前景网</a>
<a href="http://www.shanzhaiji.com" target="_blank">山寨机</a>
<a href="link.html" target="_blank">更多>></a></td>
</tr>
</table><table width="500" border="0" align="center" cellpadding="0" cellspacing="0">
<tr>
<td align="center" class="STYLE2"> </td>
</tr>
<tr>
<td align="center" class="STYLE2">李春啸《民工也疯狂》之《社会主义好》 </td>
</tr>
<tr>
<td align="center"><embed src="http://player.youku.com/player.php/sid/XNzAzMTI0NDA=/v.swf" quality="high" width="480" height="400" align="middle" allowScriptAccess="sameDomain" type="application/x-shockwave-flash"></embed></td>
</tr>
<tr>
<td align="center">大家可能觉得这个视频放在这里不太合适,不过实在佩服他的创意,拍的很不错<br>
<a href="http://www.baigoohoo.cn/mgyfk.html" target="_blank"><span class="STYLE2">点击查看民工也疯狂系列全集</span></a></td>
</tr>
</table>
<p> </p>
</body>
</html>
<!doctype html public "-//w3c//dtd xhtml 1.0 transitional//en" "http://www.w3.org/tr/xhtml1/dtd/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" lang="gb2312">
<head>
<meta http-equiv="content-type" content="text/html; charset=gb2312">
<title>百谷虎山寨搜索-baigoohoo---百度Google雅虎一起搜!</title>
<link rel="search" type="application/opensearchdescription+xml" href="search/find.xml" title="百谷虎山寨搜索-baigoohoo---百度Google雅虎一起搜!">
<link rel="stylesheet" href="search/style/index_baidu.css" type="text/css" media="screen">
<link rel="stylesheet" href="css.css" type="text/css" media="screen">
<link rel="shortcut icon" href="search/favicon.ico">
<META content=百谷虎,baigoohoo,搜索,百度,Google,雅虎,山寨搜索,yahoo,百Google度,yahoo,msn,baidu,搜索引擎,baigoogledu
name=keywords>
<META content=百谷虎,baigoohoo,搜索,百度,Google,雅虎,山寨搜索,yahoo,百Google度,yahoo,msn,baidu,搜索引擎,baigoogledu
name=description>
<style type="text/css">
<!--
.STYLE2 {
color: #FF0000;
font-weight: bold;
}
-->
</style>
<script src="Scripts/AC_RunActiveContent.js" type="text/javascript"></script>
</head><body onLoad="_load();" scroll="yes"><table width="100%" border="0" cellspacing="0" cellpadding="5">
<tr>
<td> </td>
<td align="right"><a onClick="this.style.behavior='url(#default#homepage)';this.setHomePage('http://www.baigoohoo.com/');return false;" href=# id=topnav>设为主页</a> |
<a href="javascript:void(0);" onClick='window.external.AddFavorite(location.href, document.title);'>加入收藏</a> | <a href="search/getcode.asp" id=topnav>搜索代码</a> |
<a href="http://www.dyciii.com">论坛</a></td>
</tr>
</table> <table width="497" border="0" align="center" cellpadding="0" cellspacing="0">
<tr>
<td width="22"> </td>
<td width="77"><table width="60" border="0" align="center" cellpadding="2" cellspacing="0" background="images/navbg.gif">
<tr>
<td height="21" align="center"><a href="http://www.baigoohoo.cn" target="_top" class="navfoot">淘宝铺</a></td>
</tr>
</table></td>
<td width="77"><table width="60" border="0" align="center" cellpadding="2" cellspacing="0" background="images/navbg.gif">
<tr>
<td height="21" align="center"><a href="http://www.lanrw.com" target="_top" class="navfoot">网址导航</a></td>
</tr>
</table></td>
<td width="77"><table width="60" border="0" align="center" cellpadding="2" cellspacing="0" background="images/navbg.gif">
<tr>
<td height="21" align="center"><a href="http://www.baigoohoo.com/radio/" target="_top" class="navfoot">网络电台</a></td>
</tr>
</table></td>
<td width="77"><table width="60" border="0" align="center" cellpadding="2" cellspacing="0" background="images/navbg.gif">
<tr>
<td height="21" align="center"><a href="http://www.jixiedai.com" target="_top" class="navfoot">系鞋带网</a></td>
</tr>
</table></td>
<td width="77"><table width="60" border="0" align="center" cellpadding="2" cellspacing="0" background="images/navbg.gif">
<tr>
<td height="21" align="center"><a href="http://www.yagezi.com" target="_top" class="navfoot">呀格子</a></td>
</tr>
</table></td>
<td width="160"> </td>
</tr>
</table></div>
<table width="627" border="0" align="center" cellpadding="0" cellspacing="0">
<tr>
<td height="70" colspan="4" align="center"><a href="http://www.baigoohoo.cn" target="_blank"><img src="http://www.baigoohoo.com/images/tbp.jpg" alt="百谷虎淘宝铺上线" width="500" height="60" border="0"></a></td>
</tr>
<tr>
<td width="25%" height="25" align="center"><a href="http://www.baigoohoo.cn" target="_blank">淘宝网拍拍网网店推广</a></td>
<td width="26%" height="25" align="center"><a href="http://shop34469444.taobao.com/" target="_blank">江南E栈淘宝网网网网网</a></td>
<td width="26%" height="25" align="center"><a href="http://www.you369.com/" target="_blank">You369机票预订</a></td>
<td width="23%" height="25" align="center"><a href="http://shanzhaiji.com/forumdisplay.php?fid=6" target="_blank">首部山寨电影招山寨演员</a><a href="http://www.baigoohoo.cn/viewthread.php?tid=13&extra=page%3D1" target="_blank"></a></td>
</tr>
</table>
<div id="footer01">
<br>版权所有 © 2008-2010 <a href=http://www.baigoohoo.com target="_blank">百谷虎山寨搜索-baigoohoo---百度Google雅虎一起搜!</a><div class="help"><script src="http://s73.cnzz.com/stat.php?id=1075371&web_id=1075371&show=pic" language="JavaScript" charset="gb2312"></script></div>
</div><table width="608" border="0" align="center" cellpadding="0" cellspacing="0">
<tr>
<td align="center">友情链接:</td>
<td height="22" align="left"><a href="http://www.baigoohoo.com" target="_blank">百谷虎山寨搜索</a> <a href="http://www.591man.com/" target="_blank">wo男人帮商城</a> <a href="http://xuxule.net/" target="_blank">嘘嘘乐</a> <a href="http://ylzt888.blog.163.com/" target="_blank">一路涨停</a> <a href="http://everbaidu.com" target="_blank">百度天下</a> <a href="http://www.zxko.com" target="_blank">小游戏</a> <a href="http://www.lanrw.com" target="_blank">懒人网址</a> <a href="http://www.18vr.com/?5index.html" target="_blank">策略大师</a></td>
</tr>
<tr>
<td align="center"> </td>
<td height="22" align="left"><a href="http://www.lepaw.cn/" target="_blank">乐趴网</a> <a href="http://www.zzmo.cn" target="_blank">猪猪手机书</a>
<a href="http://www.jingjiweiji.com/" target="_blank">经济危机</a><a href="http://www.zxko.com" target="_blank"></a> <a href="http://www.dyciii.com" target="_blank">第一次论坛</a> <a href="http://www.jeanscoo.com" target="_blank">牛仔品牌</a> <a href="http://www.shanzhaiba.com/" target="_blank">山寨吧</a> <a href="http://www.qegee.com" target="_blank">前景网</a>
<a href="http://www.shanzhaiji.com" target="_blank">山寨机</a>
<a href="link.html" target="_blank">更多>></a></td>
</tr>
</table><table width="500" border="0" align="center" cellpadding="0" cellspacing="0">
<tr>
<td align="center" class="STYLE2"> </td>
</tr>
<tr>
<td align="center" class="STYLE2">李春啸《民工也疯狂》之《社会主义好》 </td>
</tr>
<tr>
<td align="center"><embed src="http://player.youku.com/player.php/sid/XNzAzMTI0NDA=/v.swf" quality="high" width="480" height="400" align="middle" allowScriptAccess="sameDomain" type="application/x-shockwave-flash"></embed></td>
</tr>
<tr>
<td align="center">大家可能觉得这个视频放在这里不太合适,不过实在佩服他的创意,拍的很不错<br>
<a href="http://www.baigoohoo.cn/mgyfk.html" target="_blank"><span class="STYLE2">点击查看民工也疯狂系列全集</span></a></td>
</tr>
</table>
<p> </p>
</body>
</html>
解决方案 »
- 急!!!关于excel表导入dbgrid的问题
- 使用MS SQL2000数据库,table.post时出错。
- 多个SpinEdit关联问题
- FastReport 数据设定问题
- 如何实现不间断打印主从表(非分组报表问题)
- 在delphi中使用什么报表控件打印的报表效果较好
- 为什么QReport预览的时候ColumnHeader显示不出来,而且Detail区只能显示一条数据?
- 数据库更新
- stringgrid 问题高手请进
- 为什么我回复别人的贴子都不见了呢?哪位好心告诉我一下。我怎么找到?
- Printer 为何在点个按钮时,可打印出内容,为何用别的按钮调用这个这按钮时,为何打印机只走纸,却没有内容呢?
- 资源文件已经放进去了,为何还是在释放时的报错
分不够可以再加,关键是能解决问题
Windows, Messages, SysUtils, Variants, Classes, Graphics, Controls, Forms,
Dialogs, StdCtrls, OleCtrls, SHDocVw,mshtml,ActiveX;type
TForm1 = class(TForm)
WebBrowser1: TWebBrowser;
Button1: TButton;
Memo1: TMemo;
procedure Button1Click(Sender: TObject);
procedure FormCreate(Sender: TObject);
private
{ Private declarations }
public
{ Public declarations }
end;var
Form1: TForm1;implementation{$R *.dfm}procedure TForm1.Button1Click(Sender: TObject);
var
i : integer;
begin WebBrowser1.OleObject.Document.body.innerHTML := Memo1.Text;
for i := 0 to WebBrowser1.OleObject.document.anchors.Length - 1 do
begin
Memo1.lines.append( WebBrowser1.OleObject.document.anchors.item(i).href);
end;
end;procedure TForm1.FormCreate(Sender: TObject);
begin
WebBrowser1.Navigate('about:blank');
end;end.
unit Unit1;interfaceuses
Windows, Messages, SysUtils, Variants, Classes, Graphics, Controls, Forms,
Dialogs, StdCtrls,StrUtils;type
TForm1 = class(TForm)
Button1: TButton;
Memo_Log: TMemo;
procedure Button1Click(Sender: TObject);
private
{ Private declarations }
public
{ Public declarations }
end;var
Form1: TForm1;implementation{$R *.dfm}function GetLinkTextByUniqueURL(HTML,URL : string):string;
var
iPos : Integer;
begin
iPos := Pos(URL,HTML);
if iPos > 0 then
begin
Delete(HTML,1,iPos);
iPos := Pos('>',HTML);
if iPos>0 then
begin
Delete(HTML,1,iPos);
iPos := Pos('<',HTML);
if iPos>0 then
Result := Trim(Copy(HTML,1,iPos-1));
end;
end;
end;function ExtractHtmlTagValues(const HtmlText: string; TagName, AttribName: string; var Values: TStringList): integer; function FindFirstCharAfterSpace(const Line: string; StartPos: integer): Integer;
var i: integer;
begin
Result := -1;
for i := StartPos to Length(Line) do
begin
if (Line[i] <> ' ') then
begin
Result := i;
exit;
end;
end;
end; function FindFirstSpaceAfterChars(const Line: string; StartPos: integer): Integer;
begin
Result := PosEx(' ', Line, StartPos);
end; function FindFirstSpaceBeforeChars(const Line: string; StartPos: integer): Integer;
var i: integer;
begin
Result := 1;
for i := StartPos downto 1 do
begin
if (Line[i] = ' ') then
begin
Result := i;
exit;
end;
end;
end;var InnerTag: string;
LastPos, LastInnerPos: Integer;
SPos, LPos, RPos: Integer;
AttribValue: string;
ClosingChar: char;
TempAttribName: string;
begin
Result := 0;
LastPos := 1;
while (true) do
begin
// find outer tags '<' & '>'
LPos := PosEx('<', HtmlText, LastPos);
if (LPos <= 0) then break;
RPos := PosEx('>', HtmlText, LPos+1);
if (RPos <= 0) then
LastPos := LPos + 1
else
LastPos := RPos + 1; // get inner tag
InnerTag := Copy(HtmlText, LPos+1, RPos-LPos-1);
InnerTag := Trim(InnerTag); // remove spaces
if (Length(InnerTag) < Length(TagName)) then continue; // check tag name
if (SameText(Copy(InnerTag, 1, Length(TagName)), TagName)) then
begin
// found tag
AttribValue := '';
LastInnerPos := Length(TagName)+1;
while (LastInnerPos < Length(InnerTag)) do
begin
// find first '=' after LastInnerPos
RPos := PosEx('=', InnerTag, LastInnerPos);
if (RPos <= 0) then break; // this way you can check for multiple attrib names and not a specific attrib
SPos := FindFirstSpaceBeforeChars(InnerTag, RPos);
TempAttribName := Trim(Copy(InnerTag, SPos, RPos-SPos));
if (true) then
begin
// found correct tag
LPos := FindFirstCharAfterSpace(InnerTag, RPos+1);
if (LPos <= 0) then
begin
LastInnerPos := RPos + 1;
continue;
end;
LPos := FindFirstCharAfterSpace(InnerTag, LPos); // get to first char after '='
if (LPos <= 0) then continue;
if ((InnerTag[LPos] <> '"') and (InnerTag[LPos] <> '''')) then
begin
// AttribValue is not between '"' or ''' so get it
RPos := FindFirstSpaceAfterChars(InnerTag, LPos+1);
if (RPos <= 0) then
AttribValue := Copy(InnerTag, LPos, Length(InnerTag)-LPos+1)
else
AttribValue := Copy(InnerTag, LPos, RPos-LPos+1);
end
else
begin
// get url between '"' or '''
ClosingChar := InnerTag[LPos];
RPos := PosEx(ClosingChar, InnerTag, LPos+1);
if (RPos <= 0) then
AttribValue := Copy(InnerTag, LPos+1, Length(InnerTag)-LPos-1)
else
AttribValue := Copy(InnerTag, LPos+1, RPos-LPos-1)
end; if (SameText(TempAttribName, AttribName)) and (AttribValue <> '') then
begin
Values.Add(AttribValue);
inc(Result);
end;
end; if (RPos <= 0) then
LastInnerPos := Length(InnerTag)
else
LastInnerPos := RPos+1;
end;
end;
end;
end;procedure TForm1.Button1Click(Sender: TObject);
var
Codes,URLs:TStringList;
i:Integer;
begin
Codes:=TStringList.Create;
URLs:=TStringList.Create;
try
Codes.LoadFromFile('11.htm');
ExtractHtmlTagValues(Codes.Text,'','href',URLs);
Memo_Log.Text:=URLs.Text;
for i:=0 to URLs.Count-1 do
begin
Memo_Log.Lines.Add(GetLinkTextByUniqueURL(Codes.Text,URLs[i]));
end;
finally
if Assigned(Codes) then FreeAndNil(Codes);
if Assigned(URLs) then FreeAndNil(URLs);
end;
end;end.
解决了,不是很完美,能基本的取你要的东西了,有更好的解决方法的朋友,分享下
for i:=0 to WebBrowser1.OleObject.document.anchors.Length - 1 do
這裏出什麼錯??MEMO!裏面放的是你貼出來的HTML源碼
问什么我把“WebBrowser1.Navigate('about:blank');”放到"procedure TForm1.Button1Click(Sender: TObject);"中,循环那里就报错
代码:
procedure TForm1.Button1Click(Sender: TObject);
var
i : integer;
begin
WebBrowser1.Navigate('http://del.cnblogs.com');
// WebBrowser1.OleObject.Document.body.innerHTML := Memo1.Text;
for i := 0 to WebBrowser1.OleObject.document.anchors.Length - 1 do
begin
Memo2.lines.append( WebBrowser1.OleObject.document.anchors.item(i).innerText+'-->'+WebBrowser1.OleObject.document.anchors.item(i).href);
end;
end;procedure TForm1.FormCreate(Sender: TObject);
begin
//WebBrowser1.Navigate('http://del.cnblogs.com');
end;
begin
while WebBrowser1.ReadyState<>4 do
Application.ProcessMessages;
end;
procedure TForm1.Button3Click(Sender: TObject);
var
i : integer;
begin
WebBrowser1.Navigate('http://del.cnblogs.com');
while WebBrowser1.ReadyState<>4 do
Application.ProcessMessages;
for i := 0 to WebBrowser1.OleObject.document.anchors.Length - 1 do
begin
Memo1.lines.append( WebBrowser1.OleObject.document.anchors.item(i).innerText+'-->'+WebBrowser1.OleObject.document.anchors.item(i).href);
end;
end;
放到CREATE中,或等待加載完成,不然會出錯的。
WebBrowser1.OleObject.document.anchors.item(i).href;
为什么用上面方法不能抓取出部分网站的链接?比如Google、sohu等网站的,
yahoo的可以
如果积分真的对你那么重要的话,我可以开个送分贴给你,不会让你白忙活的!