| 网站首页 | 业界新闻 | 小组 | 威客 | 人才 | 下载频道 | 博客 | 代码贴 | 在线编程 | 编程论坛
欢迎加入我们,一同切磋技术
用户名:   
 
密 码:  
共有 565 人关注过本帖
标题:获取远程主机网页内容,并分析
只看楼主 加入收藏
tml327
Rank: 1
等 级:新手上路
帖 子:510
专家分:0
注 册:2007-10-30
收藏
 问题点数:0 回复次数:3 
获取远程主机网页内容,并分析

VERSION 5.00
Begin VB.Form Form1
BorderStyle = 3 'Fixed Dialog
Caption = "GetPage"
ClientHeight = 5550
ClientLeft = 45
ClientTop = 330
ClientWidth = 6900
Icon = "Form1.frx":0000
LinkTopic = "Form1"
MaxButton = 0 'False
MinButton = 0 'False
ScaleHeight = 5550
ScaleWidth = 6900
StartUpPosition = 2 '屏幕中心
Begin VB.CommandButton Command2
Caption = "Analyses"
Height = 315
Left = 1020
TabIndex = 5
Top = 3480
Width = 945
End
Begin VB.ListBox List1
Columns = 1
Height = 1680
ItemData = "Form1.frx":1472
Left = 30
List = "Form1.frx":1474
Sorted = -1 'True
TabIndex = 4
Top = 3840
Width = 6825
End
Begin VB.CommandButton Command1
Caption = "GetPage"
Height = 315
Left = 0
TabIndex = 3
Top = 3480
Width = 945
End
Begin VB.TextBox Text2
BackColor = &H00004000&
ForeColor = &H00FFFFFF&
Height = 2445
Left = 30
MultiLine = -1 'True
ScrollBars = 2 'Vertical
TabIndex = 1
Top = 990
Width = 6825
End
Begin VB.TextBox Text1
Height = 705
Left = 0
MultiLine = -1 'True
ScrollBars = 2 'Vertical
TabIndex = 0
Text = "Form1.frx":1476
Top = 240
Width = 6855
End
Begin VB.Label Label1
AutoSize = -1 'True
Caption = "HTTP 主机地址:"
Height = 180
Left = 30
TabIndex = 2
Top = 30
Width = 1260
End
End
Attribute VB_Name = "Form1"
Attribute VB_GlobalNameSpace = False
Attribute VB_Creatable = False
Attribute VB_PredeclaredId = True
Attribute VB_Exposed = False
Option Explicit
'// 获取远程主机网页内容,并分析;
'// 使用 XML 的HTTP引擎实现的分割HTML中的超级链接标记代码

Private Sub Command1_Click()
Text2.Text = GetURL(Text1.Text) '//抓取页
End Sub

Function GetURL(url)
On Error GoTo Exittag:
Dim Retrieval As Object
Set Retrieval = CreateObject("Microsoft.XMLHTTP") '//建立对象
With Retrieval
.open "GET", url, False, "", "" '//GET 方式获取页,还有比如POST 形式提交数据
.send
GetURL = bytes2BSTR(.responseBody) '//转换byet到string
End With
Set Retrieval = Nothing '//释放
Exit Function
Exittag:
MsgBox Err.Description & "::. GetUrl" //获取错误信息,产生错误后,错误信息会放入vb.Err对象
Err.Clear
End Function

Private Function bytes2BSTR(vIn)
Dim i As Long
Dim ThischrCode As Integer
Dim NextchrCode As Integer
Dim strReturn As String
Dim timeOut As Long
timeOut = Timer
strReturn = ""
For i = 1 To LenB(vIn) '//按字节处理
ThischrCode = AscB(MidB(vIn, i, 1))
If ThischrCode < &H80 Then '//128 以下;0~127范围,ASCII 码
strReturn = strReturn & Chr(ThischrCode)
Else
NextchrCode = AscB(MidB(vIn, i + 1, 1)) '//127以上,当中文。
strReturn = strReturn & Chr(CLng(ThischrCode) * &H100 + CInt(NextchrCode))
i = i + 1
End If

If Int(Timer - timeOut) > 5 Then
MsgBox "TimeOut!" '//防止超时
Exit For
End If
Next
bytes2BSTR = strReturn
End Function

Private Sub Command2_Click() '//以下这个函数功能缺乏灵活性,我只针对我自己的站所编写
Dim pageStr As String
Dim urlStr As String
Dim tmpStr As String
Dim pos1 As Long, pos2 As Long
Dim i As Long
Dim serverURL As String

pageStr = Text2.Text
urlStr = "": tmpStr = ""
pos2 = 1

List1.Clear

Do
pos1 = InStr(pos2, LCase(pageStr), Chr(32) & "href=", 1)
If pos1 = 0 Then Exit Do
pageStr = Mid(pageStr, pos1)
pos1 = InStr(pos2, LCase(pageStr), "</a>", 1)
If pos1 = 0 Then Exit Do
tmpStr = Mid(pageStr, pos2, pos1)
pageStr = Mid(pageStr, pos1)
pageStr = Replace(pageStr, vbCr, "")
pageStr = Replace(pageStr, vbLf, "")
pageStr = Replace(pageStr, Chr(9), " ")
List1.AddItem tmpStr, 0 '//添加目标链接地址到分析路径
Loop
For i = 0 To List1.ListCount - 1
pos2 = 1
pageStr = List1.List(i)

serverURL = Text1.Text
If LCase(Left(Trim(serverURL), Len("http://"))) = "http://" Then '//检查头7位是否合法
serverURL = Mid(serverURL, Len("http://"))
End If
serverURL = "http:/" & Mid(serverURL, 1, InStr(3, serverURL, "/", 1)) '//这里的组合比较牵强;其实应该更灵活

Do
'<a href='playm3u.asp?sPath=f:\Mp3\MusicData'>音乐库<
pos1 = InStr(pos2, LCase(pageStr), "href=", 1) '//这个是可能一种情况的链接路径
If pos1 = 0 Then Exit Do
pos1 = pos1 + Len("href=")
pageStr = Mid(pageStr, pos1)
tmpStr = ""
If Left(Trim(pageStr), 1) = "'" Or Left(Trim(pageStr), 1) = """" Then
tmpStr = Left(Trim(pageStr), 1)
'MsgBox pageStr
'MsgBox tmpStr
pageStr = Mid(pageStr, 2)
' MsgBox pageStr
End If
pos1 = 0
If tmpStr <> "" Then
pos1 = InStr(pos2, LCase(pageStr), tmpStr, 1)
If pos1 = 0 Then Exit Do
pageStr = Mid(pageStr, pos2, pos1 - 1)
'MsgBox "new:" & vbCrLf & pageStr
Else
pos1 = InStr(pos2, LCase(pageStr), Chr(32), 1)
If pos1 = 0 Then
pos1 = InStr(pos2, LCase(pageStr), ">", 1)
If pos1 = 0 Then Exit Do
End If
pageStr = Mid(pageStr, pos2, pos1 - 1)
'playm3u.asp?sPath=f:\Mp3\MusicData'>音乐库<
End If
Loop

If Left(Trim(pageStr), 1) = "'" Or Left(Trim(pageStr), 1) = """" Then pageStr = Mid(pageStr, 2)
If Right(Trim(pageStr), 1) = "'" Or Right(Trim(pageStr), 1) = """" Then pageStr = Mid(pageStr, 1, Len(pageStr) - 1)

If Left(Trim(pageStr), 1) = "/" Then pageStr = Mid(pageStr, 2)

If LCase(Left(Trim(pageStr), 3)) = "mp3" Then '//这个是我强制加上的"/Mp3",可以删除;
serverURL = serverURL & pageStr
Else

搜索更多相关主题的帖子: 网页 主机 获取 
2007-10-31 14:22
永夜的极光
Rank: 6Rank: 6
等 级:贵宾
威 望:27
帖 子:2721
专家分:1
注 册:2007-10-9
收藏
得分:0 
这个是什么?VB?asp.net?而且还没写完

不过要分析内容还是用正则表达式比较方便

[此贴子已经被作者于2007-10-31 15:08:38编辑过]


从BFS(Breadth First Study)到DFS(Depth First Study)
2007-10-31 15:06
yms123
Rank: 16Rank: 16Rank: 16Rank: 16
等 级:版主
威 望:209
帖 子:12488
专家分:19042
注 册:2004-7-17
收藏
得分:0 

想起来一些网页转贴工具就是分析网页源代码。

2007-10-31 17:30
tml327
Rank: 1
等 级:新手上路
帖 子:510
专家分:0
注 册:2007-10-30
收藏
得分:0 
我 也是知道 一部分哦!让大家见笑了~

轻狂如我,心伤谁知!
2007-11-01 12:12
快速回复:获取远程主机网页内容,并分析
数据加载中...
 
   



关于我们 | 广告合作 | 编程中国 | 清除Cookies | TOP | 手机版

编程中国 版权所有,并保留所有权利。
Powered by Discuz, Processed in 0.021879 second(s), 8 queries.
Copyright©2004-2024, BCCN.NET, All Rights Reserved