如何提取出网页源码里面的超链接地址
2个回答
展开全部
Private Sub Command1_Click()
Dim s As String
s = Text1.Text
s = Replace(Text1.Text, vbCrLf, "") '移除所有回车换行符
'Dim oRegEx As RegExp
'Set oRegEx = New RegExp
'Dim oMatches As MatchCollection
'Dim oMatch As Match
Dim oRegEx As Object
Set oRegEx = CreateObject("VBScript.RegExp")
Dim oMatches As Object
Dim oMatch As Object
With oRegEx
.Global = True '全局匹配
.IgnoreCase = True '忽略大小写
.Pattern = "<a[^>]*?href=[""' ]?(.*?)(?:""|'| ).[^> ]*?>([\s\S]*?)</a>"
'提取所有A标签的正则式,小括号中是子匹配引用组第一个是 (.*?) 第二个是([\s\S]*?)
Set oMatches = .Execute(s)
If oMatches.Count >= 1 Then
Text2.Text = ""
Dim sHref As String, sInnerText As String
Dim i As Integer
Dim sLink As String
'Dim colLinks As Scripting.Dictionary
'Set colLinks = New Scripting.Dictionary
Dim colLinks As Object
Set colLinks = CreateObject("Scripting.Dictionary")
For Each oMatch In oMatches
sHref = oMatch.SubMatches(0) '(.*?)
sInnerText = oMatch.SubMatches(1) '([\s\S]*?)
sInnerText = RemoveTags(sInnerText) '移除A标签(内容)中的多余标签
sInnerText = Replace(sInnerText, " ", "") '移除A标签(内容)中的所有空格
sLink = "<A href=""" & sHref & """>" & sInnerText & "</A>"
If Not colLinks.Exists(sLink) Then
colLinks.Add sLink, sLink
Text2.Text = Text2.Text & sLink & vbNewLine
End If
Next
End If
End With
Set oMatches = Nothing
Set oMatch = Nothing
Set oRegEx = Nothing
Set colLinks = Nothing
End Sub
'这个函数可以去除HTML代码中的标签
Function RemoveTags(ByVal html As String)
'Dim oRegEx As RegExp
'Set oRegEx = New RegExp
Dim oRegEx As Object
Set oRegEx = CreateObject("VBScript.RegExp")
With oRegEx
.Global = True
.IgnoreCase = True
.Pattern = "<[^>]*>"
RemoveTags = .Replace(html, "")
End With
Set oRegEx = Nothing
End Function
Dim s As String
s = Text1.Text
s = Replace(Text1.Text, vbCrLf, "") '移除所有回车换行符
'Dim oRegEx As RegExp
'Set oRegEx = New RegExp
'Dim oMatches As MatchCollection
'Dim oMatch As Match
Dim oRegEx As Object
Set oRegEx = CreateObject("VBScript.RegExp")
Dim oMatches As Object
Dim oMatch As Object
With oRegEx
.Global = True '全局匹配
.IgnoreCase = True '忽略大小写
.Pattern = "<a[^>]*?href=[""' ]?(.*?)(?:""|'| ).[^> ]*?>([\s\S]*?)</a>"
'提取所有A标签的正则式,小括号中是子匹配引用组第一个是 (.*?) 第二个是([\s\S]*?)
Set oMatches = .Execute(s)
If oMatches.Count >= 1 Then
Text2.Text = ""
Dim sHref As String, sInnerText As String
Dim i As Integer
Dim sLink As String
'Dim colLinks As Scripting.Dictionary
'Set colLinks = New Scripting.Dictionary
Dim colLinks As Object
Set colLinks = CreateObject("Scripting.Dictionary")
For Each oMatch In oMatches
sHref = oMatch.SubMatches(0) '(.*?)
sInnerText = oMatch.SubMatches(1) '([\s\S]*?)
sInnerText = RemoveTags(sInnerText) '移除A标签(内容)中的多余标签
sInnerText = Replace(sInnerText, " ", "") '移除A标签(内容)中的所有空格
sLink = "<A href=""" & sHref & """>" & sInnerText & "</A>"
If Not colLinks.Exists(sLink) Then
colLinks.Add sLink, sLink
Text2.Text = Text2.Text & sLink & vbNewLine
End If
Next
End If
End With
Set oMatches = Nothing
Set oMatch = Nothing
Set oRegEx = Nothing
Set colLinks = Nothing
End Sub
'这个函数可以去除HTML代码中的标签
Function RemoveTags(ByVal html As String)
'Dim oRegEx As RegExp
'Set oRegEx = New RegExp
Dim oRegEx As Object
Set oRegEx = CreateObject("VBScript.RegExp")
With oRegEx
.Global = True
.IgnoreCase = True
.Pattern = "<[^>]*>"
RemoveTags = .Replace(html, "")
End With
Set oRegEx = Nothing
End Function
推荐律师服务:
若未解决您的问题,请您详细描述您的问题,通过百度律临进行免费专业咨询