在一篇介绍抓取百度贴吧信息的帖子(https://)中遇到了无法解释的问题。可能是版本的原因,他提供的源码问题比较多(适用于2.7,这里用的是3.8),
我做了一定改动后可以运行,bdtb.getTitle()结果是对的,可是bdtb.getPageNum()结果和网页源码对不上。
只有本站会员才能查看附件,请 登录
只有本站会员才能查看附件,请 登录
我的问题是明明网页源码中回复贴数是4685,页数是34,怎么程序返回的是130和5,非常奇怪,请大神帮忙看看什么情况。
程序代码:
# -*- coding:utf-8 -*-
import urllib
import re
import requests
#百度贴吧爬虫类
class BDTB:
#初始化,传入基地址,是否只看楼主的参数
def __init__(self,baseUrl,seeLZ):
self.baseURL = baseUrl
self.seeLZ = '?see_lz='+str(seeLZ)
#传入页码,获取该页帖子的代码
def getPage(self,pageNum):
try:
url = self.baseURL+ self.seeLZ + '&pn=' + str(pageNum)
#response = urllib.request.urlopen(url)
response=requests.get(url)
#response = urllib(request)
#print(response.text)
return response
except urllib.error.URLError as e:
if hasattr(e,"reason"):
print(u"连接百度贴吧失败,错误原因",e.reason)
return None
#获取帖子标题
def getTitle(self):
page = self.getPage(1)
pattern = ('<h3 class="core_title_txt.*?>(.*?)</h3>',re.S)
#page=str(page)
result = re.search(pattern,page.text)
#print(result)
if result:
print(result.group(1)) #测试输出
return result.group(1).strip()
else:
return None
#获取帖子一共有多少页
def getPageNum(self):
page = self.getPage(1)
pattern = ('<li class="l_reply_num".*?>(.*?)</span>页</li>',re.S)
#pattern = ('<h3 class="core_title_txt.*?>(.*?)</h3>',re.S)
result = re.search(pattern,page.text)
print(result)
if result:
print(result.group(1)) #测试输出
return result.group(1).strip()
else:
return None
def getContent(self,page):
pattern = ('<div id="post_content_.*?>(.*?)</div>',re.S)
items = re.findall(pattern,page.text)
#print(items.group(1))
#for item in items:
#print item
#print(self.tool.replace(items[1]))
baseURL = 'http://tieba.baidu.com/p/3138733512'
bdtb = BDTB(baseURL,1)
bdtb.getPageNum()
#bdtb.getTitle()
#bdtb.getContent(bdtb.getPage(1))
import urllib
import re
import requests
#百度贴吧爬虫类
class BDTB:
#初始化,传入基地址,是否只看楼主的参数
def __init__(self,baseUrl,seeLZ):
self.baseURL = baseUrl
self.seeLZ = '?see_lz='+str(seeLZ)
#传入页码,获取该页帖子的代码
def getPage(self,pageNum):
try:
url = self.baseURL+ self.seeLZ + '&pn=' + str(pageNum)
#response = urllib.request.urlopen(url)
response=requests.get(url)
#response = urllib(request)
#print(response.text)
return response
except urllib.error.URLError as e:
if hasattr(e,"reason"):
print(u"连接百度贴吧失败,错误原因",e.reason)
return None
#获取帖子标题
def getTitle(self):
page = self.getPage(1)
pattern = ('<h3 class="core_title_txt.*?>(.*?)</h3>',re.S)
#page=str(page)
result = re.search(pattern,page.text)
#print(result)
if result:
print(result.group(1)) #测试输出
return result.group(1).strip()
else:
return None
#获取帖子一共有多少页
def getPageNum(self):
page = self.getPage(1)
pattern = ('<li class="l_reply_num".*?>(.*?)</span>页</li>',re.S)
#pattern = ('<h3 class="core_title_txt.*?>(.*?)</h3>',re.S)
result = re.search(pattern,page.text)
print(result)
if result:
print(result.group(1)) #测试输出
return result.group(1).strip()
else:
return None
def getContent(self,page):
pattern = ('<div id="post_content_.*?>(.*?)</div>',re.S)
items = re.findall(pattern,page.text)
#print(items.group(1))
#for item in items:
#print item
#print(self.tool.replace(items[1]))
baseURL = 'http://tieba.baidu.com/p/3138733512'
bdtb = BDTB(baseURL,1)
bdtb.getPageNum()
#bdtb.getTitle()
#bdtb.getContent(bdtb.getPage(1))
[此贴子已经被作者于2020-2-2 21:13编辑过]