如何提取出网页源码里面的超链接地址

 我来答

2个回答

#热议# 应届生在签三方时要注意什么？

雪V歌
2016-12-18 · 知道合伙人数码行家

雪V歌
知道合伙人数码行家

采纳数：78698 获赞数：222937

泉州兴瑞发公司2015-2017最佳优秀员工。

向TA提问私信TA

关注

展开全部

Private Sub Command1_Click()

Dim s As String

s = Text1.Text
s = Replace(Text1.Text, vbCrLf, "") '移除所有回车换行符

'Dim oRegEx As RegExp
'Set oRegEx = New RegExp
'Dim oMatches As MatchCollection
'Dim oMatch As Match

Dim oRegEx As Object
Set oRegEx = CreateObject("VBScript.RegExp")
Dim oMatches As Object
Dim oMatch As Object

With oRegEx
.Global = True '全局匹配
.IgnoreCase = True '忽略大小写
.Pattern = "<a[^>]*?href=[""' ]?(.*?)(?:""|'| ).[^> ]*?>([\s\S]*?)</a>"
'提取所有A标签的正则式,小括号中是子匹配引用组第一个是 (.*?) 第二个是([\s\S]*?)
Set oMatches = .Execute(s)

If oMatches.Count >= 1 Then
Text2.Text = ""

Dim sHref As String, sInnerText As String

Dim i As Integer

Dim sLink As String

'Dim colLinks As Scripting.Dictionary
'Set colLinks = New Scripting.Dictionary

Dim colLinks As Object
Set colLinks = CreateObject("Scripting.Dictionary")

For Each oMatch In oMatches

sHref = oMatch.SubMatches(0) '(.*?)
sInnerText = oMatch.SubMatches(1) '([\s\S]*?)
sInnerText = RemoveTags(sInnerText) '移除A标签(内容)中的多余标签
sInnerText = Replace(sInnerText, " ", "") '移除A标签(内容)中的所有空格
sLink = "<A href=""" & sHref & """>" & sInnerText & "</A>"

If Not colLinks.Exists(sLink) Then
colLinks.Add sLink, sLink
Text2.Text = Text2.Text & sLink & vbNewLine
End If

Next

End If

End With

Set oMatches = Nothing
Set oMatch = Nothing
Set oRegEx = Nothing
Set colLinks = Nothing
End Sub

'这个函数可以去除HTML代码中的标签
Function RemoveTags(ByVal html As String)

'Dim oRegEx As RegExp
'Set oRegEx = New RegExp
Dim oRegEx As Object

Set oRegEx = CreateObject("VBScript.RegExp")

With oRegEx
.Global = True
.IgnoreCase = True
.Pattern = "<[^>]*>"
RemoveTags = .Replace(html, "")
End With

Set oRegEx = Nothing
End Function

已赞过 已踩过<

评论收起

财经365站
2020-06-29

知道答主

回答量：51

采纳率：0%

帮助的人：3.7万

我也去答题访问个人页

关注

展开全部

可以百度，lmcjl在线工具，里面就有一个全站链接抓取工具。很多人都在用。

已赞过 已踩过<

评论收起

推荐律师服务：若未解决您的问题，请您详细描述您的问题，通过百度律临进行免费专业咨询

您可能关注的内容

自动网页链接采集软件-数据采集就用后羿网页链接采集器-小白必备

后羿采集器是新一代智能网页网页链接采集工具，不需要配置采集规则，为技术小白设计量身打造。导出数量无限制，可导出多种文件格式/网站/数据库，支持Win/Mac/Linux系统。

如何提取出网页源码里面的超链接地址

您可能关注的内容

其他类似问题

为你推荐：