各位老师们好,小生在VB中利用WB打开网页,然后提取里面的URL地址;但一个页面里面有很多杂七杂八的链接;
比如:http://hzk520.blog.hexun.com
这个页面,我要提取里面的最新访问列表及留言板中用户的链接地址,其格式为:http://hexun.com/******/default.html但根据自己的代码运行后,却把http://hzk520.blog.hexun.com这个里面的所有链接都一起给保存了……但我只想要:
http://hexun.com/******/default.html
这个格式的URL地址……
请各位老师支招,谢谢……
我的代码如下:'URL为网页地址
'URLS为网页代码中的URL组
Function Url_In_Html(URL As String, ByRef URLS() As String)
Dim i As Long, j As Integer URL = Replace(URL, "\", "/") '将网页地址中可能含有的“\“全部替换成成”/”,这样地址中的分割符合就一致。 i = inStr_n(URL, "/") '比如给的路径是http://www.codefans.net
If i = 2 Then URL = URL & "/"
i = InStrRev(URL, "/")
Dim URL1 As String
URL1 = Left(URL, i) '查找地址的绝对地址路径 Dim HTML As String
HTML = viewSource(URL)
If HTML = "" Then Exit Function
HTML = UCase(HTML) '将网页源码全部转换成大写
Dim N As Integer
Dim index() As Long
N = inStr_n(HTML, "HREF", index)
'MsgBox "总共有" & N & "个href标签"
Dim Temp As String
Dim Temp1 As String
Dim Temp2 As Integer
Dim Temp3 As Integer
Dim Temp4 As Integer
Dim Temp5 As Integer
Dim temp6 As Integer
Dim temp7 As Integer
Dim M As Integer
For i = 0 To N - 1
Temp = Mid(HTML, index(i) + 5, 300)
'这里取url的长度为300,如果超过则检测不到,这300个字符中可能包含下一个或几个HREF标签,但这不用担心,程序会分析每个标签的。
'为什么取那么多,是因为很多网页的URL编码可能很长,比如百度推广的广告和陶宝网的网址都很长。
For j = 2 To Len(Temp)
If Mid(Temp, j, 1) = """" Or Mid(Temp, j, 1) = ">" Or Mid(Temp, j, 1) = "'" Then
Temp1 = Left(Temp, j - 1)
Temp2 = InStr(Temp1, "+") 'URL中含有+号的(比如<a href="'+location.href+'">)
Temp3 = InStr(Temp1, "#") 'URL中含有#号的(比如<a href="#top">)
Temp4 = InStr(Temp1, " ") 'URL中含有空格的(比如<a href=http://www.codefans.net/ target=_blank>)
Temp5 = InStr(Temp1, "MAILTO") 'URL中含有空格的(比如<a href="mailto:[email protected]">)
temp6 = InStr(Temp1, "http://hexun.com/")
temp7 = InStr(Temp1, "/default.html")
'没有能检查所以的情况
If Temp2 = 0 And Temp3 = 0 And Temp4 = 0 And Temp5 = 0 Then
ReDim Preserve URLS(M)
If Left(Temp1, 1) = """" Or Left(Temp1, 1) = "'" Then Temp1 = Right(Temp1, Len(Temp1) - 1) 'URL前面可能还有个引号或单引号
If Temp2 = InStr(Temp1, ":") <> 0 And temp6 = InStr(Temp1, "http://hexun.com/") <> 0 And temp7 = InStr(Temp1, "/default.html") <> 0 Then
'存在冒号,说明是绝对路径(HTTP://),没有用判断HTTP来判断,是因为windows可以用HTTP给文件夹命名,而不可以用冒号
If Left(Temp1, 17) <> "http://hexun.com/" And Right(Temp1, 13) <> "/default.html" Then
Temp1 = URL1 & Right(Temp1, Len(Temp1) - 1)
Else
Temp1 = URL1 & Temp1
End If
End If
URLS(M) = Temp1
M = M + 1
Exit For
End If
End If
Next
NextEnd Function
比如:http://hzk520.blog.hexun.com
这个页面,我要提取里面的最新访问列表及留言板中用户的链接地址,其格式为:http://hexun.com/******/default.html但根据自己的代码运行后,却把http://hzk520.blog.hexun.com这个里面的所有链接都一起给保存了……但我只想要:
http://hexun.com/******/default.html
这个格式的URL地址……
请各位老师支招,谢谢……
我的代码如下:'URL为网页地址
'URLS为网页代码中的URL组
Function Url_In_Html(URL As String, ByRef URLS() As String)
Dim i As Long, j As Integer URL = Replace(URL, "\", "/") '将网页地址中可能含有的“\“全部替换成成”/”,这样地址中的分割符合就一致。 i = inStr_n(URL, "/") '比如给的路径是http://www.codefans.net
If i = 2 Then URL = URL & "/"
i = InStrRev(URL, "/")
Dim URL1 As String
URL1 = Left(URL, i) '查找地址的绝对地址路径 Dim HTML As String
HTML = viewSource(URL)
If HTML = "" Then Exit Function
HTML = UCase(HTML) '将网页源码全部转换成大写
Dim N As Integer
Dim index() As Long
N = inStr_n(HTML, "HREF", index)
'MsgBox "总共有" & N & "个href标签"
Dim Temp As String
Dim Temp1 As String
Dim Temp2 As Integer
Dim Temp3 As Integer
Dim Temp4 As Integer
Dim Temp5 As Integer
Dim temp6 As Integer
Dim temp7 As Integer
Dim M As Integer
For i = 0 To N - 1
Temp = Mid(HTML, index(i) + 5, 300)
'这里取url的长度为300,如果超过则检测不到,这300个字符中可能包含下一个或几个HREF标签,但这不用担心,程序会分析每个标签的。
'为什么取那么多,是因为很多网页的URL编码可能很长,比如百度推广的广告和陶宝网的网址都很长。
For j = 2 To Len(Temp)
If Mid(Temp, j, 1) = """" Or Mid(Temp, j, 1) = ">" Or Mid(Temp, j, 1) = "'" Then
Temp1 = Left(Temp, j - 1)
Temp2 = InStr(Temp1, "+") 'URL中含有+号的(比如<a href="'+location.href+'">)
Temp3 = InStr(Temp1, "#") 'URL中含有#号的(比如<a href="#top">)
Temp4 = InStr(Temp1, " ") 'URL中含有空格的(比如<a href=http://www.codefans.net/ target=_blank>)
Temp5 = InStr(Temp1, "MAILTO") 'URL中含有空格的(比如<a href="mailto:[email protected]">)
temp6 = InStr(Temp1, "http://hexun.com/")
temp7 = InStr(Temp1, "/default.html")
'没有能检查所以的情况
If Temp2 = 0 And Temp3 = 0 And Temp4 = 0 And Temp5 = 0 Then
ReDim Preserve URLS(M)
If Left(Temp1, 1) = """" Or Left(Temp1, 1) = "'" Then Temp1 = Right(Temp1, Len(Temp1) - 1) 'URL前面可能还有个引号或单引号
If Temp2 = InStr(Temp1, ":") <> 0 And temp6 = InStr(Temp1, "http://hexun.com/") <> 0 And temp7 = InStr(Temp1, "/default.html") <> 0 Then
'存在冒号,说明是绝对路径(HTTP://),没有用判断HTTP来判断,是因为windows可以用HTTP给文件夹命名,而不可以用冒号
If Left(Temp1, 17) <> "http://hexun.com/" And Right(Temp1, 13) <> "/default.html" Then
Temp1 = URL1 & Right(Temp1, Len(Temp1) - 1)
Else
Temp1 = URL1 & Temp1
End If
End If
URLS(M) = Temp1
M = M + 1
Exit For
End If
End If
Next
NextEnd Function
解决方案 »
- GB18030实现工具
- 字段名模糊查询
- vb 发送邮件代码
- 请问一下,下面的话中用户指什么?是指不同的客户机还是指数据库中不同的用户。???
- 请问如何用VB在自己的程序中使用象“资源管理器”中的显示为“树型”文件夹的控件
- 高分求解---vb中关于日期类变量的算法
- help me!!!在线等待,100分相送,在线等待
- 谁会使MICROSOFT AGENT呀
- 如何用VB访问网络上带密码的文件夹,急急急........
- 高分求解:如何在VB6中用API设置自定义打印机纸张类型和大小即打印纸的宽,高,请给出例程!
- adors.delete 无法删除
- 向各位老师请教:VB自动填写表单中关于textarea控件无法附值的问题
If Instr(s,"http://hexun.com/")>0 And Instr(s,"/default.html")>0 then
s为链接
Function RegExpTest(patrn, strng) 'patrn:需要查找的字符 strng:被查找的字符串
Dim regEx, Match, Matches ' 创建变量。
Set regEx = New RegExp ' 创建正则表达式。
regEx.Pattern = patrn ' 设置模式。'"\w+([-+.]\w+)*@\w+([-.]\w+)*\.\w+([-.]\w+)*"'
regEx.IgnoreCase = True ' 设置是否区分大小写。
regEx.Global = True ' 设置全程匹配。
Set Matches = regEx.Execute(strng) ' 执行搜索。
For Each Match In Matches ' 循环遍历Matches集合。
RetStr = RetStr & Match.Value & vbCrLf
Next
RegExpTest = RetStr
End Function
Private Sub Command1_Click()
Dim URLRegExp As String, MailRegExp As String, ChiniRegExp As String
Dim FileName As String, sFile As String, MuName As String, Chans As String
Dim i As Long, arr() As String, arr1() As String, arr2() As String URLRegExp = "http://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?" 'URL正则表达式
MailRegExp = "\w+([-+.]\w+)*@\w+([-.]\w+)*\.\w+([-.]\w+)*" '电子邮件正则表达式
ChiniRegExp = "[^\x00-\xff]* " Open "c:\temp.html" For Binary As #1 '你下载的网页源码
sFile = Space(LOF(1))
Get #1, , sFile
Close #1
'Text1.Text = RegExpTest(URLRegExp, sFile)
Chans = RegExpTest(URLRegExp, sFile)
arr = Split(Chans, vbCrLf)
For i = 0 To UBound(arr) - 1
'http://hexun.com/******/default.html
If Right(arr(i), 12) = "default.html" And Left(arr(i), 16) = "http://hexun.com" Then
Text1.Text = Text1.Text & arr(i) & vbCrLf
End If
Next i
End Sub
下面的代码连下载到解析,稍微修改就能应用到你的工程中Private Declare Function StrStrIA Lib "shlwapi.dll" (ByVal lpString As String, ByVal lpSch As String) As LongPrivate Sub CommandButton2_Click()
Dim pStream As IStream
Dim pPersist As IPersistStreamInit
Dim objHttp As New XMLHTTP
Dim objDoc As HTMLDocument
Dim pLink As IHTMLElement
On Error GoTo Err_Load
objHttp.Open "get", "http://www.csdn.net"
objHttp.send
Set pStream = objHttp.responseStream
If (ObjPtr(pStream)) Then
Set objDoc = New HTMLDocument
Set pPersist = objDoc
With pPersist
.InitNew
.Load pStream
End With
While objDoc.readyState <> "complete"
DoEvents
Wend
For Each pLink In objDoc.Links
If StrStrIA(pLink.toString, "http://www.csdn.net/") Then
Debug.Print pLink.toString
End If
Next
End If
Exit Sub
Err_Load:
MsgBox Err.Description
End Sub