加入收藏 | 设为首页 | 会员中心 | 我要投稿 拼字网 - 核心网 (https://www.hexinwang.cn/)- 科技、建站、经验、云计算、5G、大数据,站长网!
当前位置: 首页 > 教程 > 正文

正则获取网页的超链接和链接文本(a href),适用于asp vb vbscipt

发布时间:2020-12-05 20:28:17 所属栏目:教程 来源:佚名
导读:正则获取网页的超链接和链接文本(a href),适用于asp vb vbscipt,正则获取网页的超链接和链接文本(a href),适用于asp vb vbscipt
 

以vb6为例,获取给定的网址的外链的链接地址和链接文本,这里过滤了网站本身的网址,还有一些客服代码
也可以以常见的搜索引擎蜘蛛模拟访问
VB6需要引用 :在“项目”菜单上单击“引用“Microsoft VBScript Regular Expressions 5.5


Public Function HtmlGetLinks(url, zhizu) ' 从网址内获取外链地址 参数:网址,蜘蛛类型
    'On Error Resume Next
    Dim str, reg, objMatches, key1
    url2 = Replace(url, "http://", "")
    url = "http://" & url2
    str = GetUrlHtmlUTF(url, "auto", zhizu)
    Set reg = New RegExp
    reg.IgnoreCase = True
    reg.Global = True
    reg.Pattern = "href=[""'s]?http://([^s]+)(.*?)b[""'s]?(.*?)>(.*?)</a>"
    Set objMatches = reg.Execute(str)
    If objMatches.Count > 0 Then
    For i = 0 To objMatches.Count - 1
       url1 = objMatches(i).SubMatches(0)
       url1 = Replace(url1, """", "")
       url1 = Replace(url1, "'", "")
       Keyword = objMatches(i).SubMatches(3)
       Keyword = LCase(Keyword)
       Keyword = Replace(Keyword, "'", """")
       If InStr(Keyword, "src") > 0 Then
        Keyword = FindStrMulti(Keyword, "src=""", """", "")
        Keyword = "图片:" & Keyword
       Else
        Keyword = RemoveHTML(Keyword)
       End If
       If InStr(url1, url2) = 0 And InStr(Keyword, "wpa.qq.com") = 0 And InStr(url1, "51.la") = 0 And InStr(url1, "cnzz.com") = 0 And InStr(url1, "taobao.com") = 0 And InStr(url1, "beian.gov.cn") = 0 Then
          key1 = key1 & (Keyword & "|" & url1) & vbCrLf
       End If
       DoEvents
    Next
    End If
    HtmlGetLinks = key1
End Function

 

Public Function GetUrlHtmlUTF(url, CodeBase, zhizu)
    Dim xmlHTTP1, GetCode, GetCode1
    Set xmlHTTP1 = CreateObject("Microsoft.XMLHTTP")
    xmlHTTP1.Open "get", url, True
    If zhizu <> "" Then
      Select Case zhizu
      Case "baidu"
        xmlHTTP1.setRequestHeader "UserAgent:", "Baiduspider+(+http://www.baidu.com/search/spider.htm)"
      Case "google"
        xmlHTTP1.setRequestHeader "UserAgent:", "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
      Case "yahoo"
        xmlHTTP1.setRequestHeader "UserAgent:", "Mozilla/5.0 (compatible; Yahoo! Slurp China; http://misc.yahoo.com.cn/help.html)"
      Case "yahoo"
        xmlHTTP1.setRequestHeader "UserAgent:", "Mozilla/5.0 (compatible; Yahoo! Slurp China; http://misc.yahoo.com.cn/help.html)"
      Case "youdao"
        xmlHTTP1.setRequestHeader "UserAgent:", "Mozilla/5.0 (compatible; YodaoBot/1.0; http://www.yodao.com/help/webmaster/spider/; )"
      Case "soso"
        xmlHTTP1.setRequestHeader "UserAgent:", "Sosospider+(+http://help.soso.com/webspider.htm)"
      Case "sogou"
        xmlHTTP1.setRequestHeader "UserAgent:", "Sogou web spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)"
     End Select
    End If
    xmlHTTP1.send
    While xmlHTTP1.ReadyState <> 4
        DoEvents
    Wend
    GetCode = xmlHTTP1.ResponseBody
    If CodeBase = "auto" Then
        GetCode1 = StrConv(Left(GetCode, 500), vbUnicode)
        If InStr(GetCode1, "charset=gb2312") > 0 Then
             GetCode = StrConv(GetCode, vbUnicode)
        Else
              If CStr(GetCode) <> "" Then GetCode = BytesToBstr(GetCode, "UTF-8")
        End If
    Else
        If CStr(GetCode) <> "" Then GetCode = BytesToBstr(GetCode, CodeBase)
    End If
   
    GetUrlHtmlUTF = GetCode
    Set ObjXML = Nothing
End Function

(编辑:拼字网 - 核心网)

【声明】本站内容均来自网络,其相关言论仅代表作者个人观点,不代表本站立场。若无意侵犯到您的权利,请及时与联系站长删除相关内容!