asp 新闻数据采集的问题~
以下是我自己写的一段简单的数据采集代码,写得比较粗糙,发上来的目的在于抛砖引玉。希望对数据采集这一块比较了解的朋友能给我上上课:我以前没有接触过这方面,现在想学,希望大家能帮帮小弟,在此先谢过啦!
<style>
/* Page-wide default: small red text. */
body {
    color: #FF0000;
    font-size: 9pt;
}

/* Links are black 9pt text whether or not they have been visited. */
a:link,
a:visited {
    font-size: 9pt;
    color: #000000;
}
</style>
<%
' Main page script: downloads an index page, splits it at every "<h1>", and for
' each piece that contains a closing "</h1>" writes out the text between
' <table cellspacing="0"> and the first </table>, after rewriting the
' "/health...html" links it finds in that text.
'
' NOTE(review): the url= assignment and the bb= assignment below are truncated
' in this copy (the text after "http://news." is missing, leaving unterminated
' string literals). The original host name must be restored before this page
' can compile.

' Script-level run-time errors are skipped from here on instead of aborting the page.
On Error Resume Next
' Raise the ASP script timeout (the value is in seconds).
Server.ScriptTimeOut=9999999
dim url,html
' NOTE(review): truncated string literal - the rest of the URL is missing.
url="http://news.
Html=GetURL(url) ' pass the address in url to the GetURL function
Html=Bytes2BStr(Html) ' convert the binary response into a string
xx=split(html,"<h1>") ' split the page into pieces at each <h1>
for j=0 to ubound(xx)
' Pieces without a closing </h1> are ignored (empty Then branch).
if instr(xx(j),"</h1>")=0 then
else
start=instr(xx(j),"</h1>")
' "</h1>" is 5 characters long, so start+4 characters cover everything up to and including it.
stratco=mid(xx(j),1,start+4)
' Remove that heading text from the piece (Replace removes every occurrence of it).
xx(j)=replace(xx(j),stratco,"")
starcode=instr(xx(j),"<table cellspacing=""0"">")
' Position of the first </table> in the piece; nothing here checks that it comes after starcode.
endcode=instr(xx(j),"</table>")
' Both copies hold the fragment from the table's opening tag up to, but not including, </table>.
' contenta is only referenced by the commented-out html=replace(...) line further down.
contenta=mid(xx(j),starcode,endcode-starcode)
content=mid(xx(j),starcode,endcode-starcode)
' Split on the literal text "html"; the loop below stops one short of the last piece.
aa=split(content,"html")
for i=0 to ubound(aa)-1
' NOTE(review): if "/health" is not present, InStr returns 0 and Mid fails; with
' On Error Resume Next the statement is skipped and b keeps its previous value - confirm intent.
b=mid(aa(i),instr(aa(i),"/health"),len(aa(i)))&"html" ' the link as it appears in the index page content
' NOTE(review): this line is truncated; it presumably prefixed the site host to the
' same Mid(...) expression to build the absolute URL of the linked page - confirm against the original.
bb="http://news.(aa(i),instr(aa(i),"/health"),len(aa(i)))&"html" '每个链接的二级页内容
'bbhtml=GetURL(bb)' pass the address to the GetURL function
'bbhtml=Bytes2BStr(bbhtml) ' convert the binary response into a string
'response.Write(bbhtml&"<br>")
' Swap every occurrence of the link text b in the fragment for bb.
content=replace(content,b,bb)
next
' response.Write(j&"<br>")
' Emit the rewritten table fragment for this piece.
response.Write(content)
'html=replace(html,contenta,content)
'html=replace(html,"http://i3.,"http://www.baidu.com/img/logo.gif")
end if
next
' Fetches a remote resource with a synchronous HTTP GET.
'
' Parameters:
'   url - absolute URL to request.
' Returns:
'   The raw response body as returned by the HTTP component's responseBody
'   property (a byte array; decode it with a helper such as bytes2bstr).
' Side effects:
'   If the server does not answer with a 2xx status, or Len() of the body is
'   below 100, an error message is written to the response and the page ends.
Function GetURL(url)
    ' Declared locally so the object and buffers do not leak into script scope.
    Dim http, body, statusCode, safeUrl

    ' ServerXMLHTTP is the MSXML component intended for server-side code;
    ' Microsoft.XMLHTTP is built on the client-side WinInet stack.
    Set http = CreateObject("MSXML2.ServerXMLHTTP")
    http.Open "GET", url, False
    http.Send

    ' Read the body and status once, then release the COM object.
    body = http.responseBody
    statusCode = http.Status
    Set http = Nothing

    If statusCode < 200 Or statusCode >= 300 Or Len(body) < 100 Then
        ' The URL is echoed into HTML, so escape it and quote the attribute.
        safeUrl = Server.HTMLEncode(url)
        Response.Write "获取远程文件 <a href=""" & safeUrl & """ target=""_blank"">" & safeUrl & "</a> 失败。"
        Response.End
    End If

    GetURL = body
End Function
' Converts a byte string (for example an XMLHTTP responseBody) to a VBScript string.
'
' Bytes below &H80 are taken as single-byte characters. Any other byte is
' treated as the lead byte of a two-byte character and combined with the
' following byte before being passed to Chr(), so the result depends on the
' server's ANSI/DBCS code page.
'
' Parameters:
'   vin - byte data readable with LenB/MidB.
' Returns:
'   The decoded string; "" when vin has no bytes. A lead byte that is the very
'   last byte of the input has no trail byte to pair with and is dropped.
Function bytes2bstr(vin)
    ' All working variables are declared here so they cannot bind to
    ' same-named script-level variables (callers commonly use i as a loop counter).
    Dim result, pos, total, leadByte, trailByte

    result = ""
    total = LenB(vin)
    pos = 1

    Do While pos <= total
        leadByte = AscB(MidB(vin, pos, 1))
        If leadByte < &H80 Then
            result = result & Chr(leadByte)
            pos = pos + 1
        ElseIf pos < total Then
            trailByte = AscB(MidB(vin, pos + 1, 1))
            result = result & Chr(CLng(leadByte) * &H100 + CInt(trailByte))
            ' Two bytes were consumed for this character.
            pos = pos + 2
        Else
            ' Dangling lead byte at the end of the data: nothing to pair it with.
            Exit Do
        End If
    Loop

    bytes2bstr = result
End Function
'Response.write html&"<br>"
%>