Help: how do I crawl Dongqiudi news with Python + MySQL?
The code below raises the following error. Can anyone help me figure out where it goes wrong?
Traceback (most recent call last):
  File "C:\Users\Zero\PycharmProjects\pythonProject\dqd.py", line 187, in <module>
    spider.work_on()
  File "C:\Users\Zero\PycharmProjects\pythonProject\dqd.py", line 171, in work_on
    info = self.detail_to_mysql(news_detail, label, news_type)  # try to store the news detail
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Zero\PycharmProjects\pythonProject\dqd.py", line 124, in detail_to_mysql
    detail['videos'], create_time, datetime.now()))
                      ^^^^^^^^^^^
NameError: name 'create_time' is not defined
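Looking at the traceback, the failing statement is the execute call in detail_to_mysql: the bare name create_time is never defined inside that method, while the scraped publish time sits in the detail dict under the key 'create_time'. My best guess is that the call should read something like the sketch below (assuming the time string scraped from the page is in a format the DATETIME column will accept, which I have not verified) — is that the right fix?

# guessed fix: take the publish time from the detail dict instead of an undefined name
self.cur.execute(
    "insert into news_details"
    "(label, news_type, title, author, content, images, videos, create_time, add_time) "
    "values (%s, %s, %s, %s, %s, %s, %s, %s, %s)",
    (label, news_type, detail['title'], detail['author'], detail['content'], detail['images'],
     detail['videos'], detail['create_time'], datetime.now()))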
The code is as follows:
import json
import re
from lxml import etree
import pymysql
import requests
from datetime import datetime
from config import host, port, user, password, charset, database
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64'
                  ') AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36',
}
host = "localhost"
class NewsSpider:
    """Dongqiudi news spider."""
    def __init__(self):
        """Initializer."""
        self.base_url = "https://www.{}.json"  # base URL template (filled in with a label number later)
        self.label_num = [3, 4, 5, 6]  # news label numbers (combined with base_url)
        self.start_urls = []  # list-page URLs that are about to be crawled
        # build the initial list-page URLs and add them to start_urls
        for num in self.label_num:
            url = self.base_url.format(num)
            self.start_urls.append(url)
        self.headers = headers
        self.db = pymysql.connect(host=host, port=port, user=user, password=password,
                                  database=database, charset=charset)  # open the database connection
        self.cur = self.db.cursor()  # create a cursor
        self.create_db_table()  # create the table
    def create_db_table(self):
        """Create the news_urls table."""
        sql = """CREATE TABLE IF NOT EXISTS news_urls(
        id INT NOT NULL AUTO_INCREMENT,
        news_urls VARCHAR(500) NOT NULL,
        add_time DATETIME NOT NULL,
        PRIMARY KEY (id)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4"""
        self.cur.execute(sql)
        print("Database table created successfully")
    def get_news_page(self, url):
        """Fetch one page of the news list."""
        try:
            news_page = requests.get(url, headers=self.headers, timeout=5)  # request the URL
        except requests.exceptions.ReadTimeout:
            return None
        news_dict = json.loads(news_page.text)  # parse the JSON into a dict
        label = news_dict.get('label', '')  # news label name
        next_page = news_dict.get('next', '')  # URL of the next list page
        articles = news_dict.get('articles', [])  # all articles on the current list page
        result = {'label': label, 'next_page': next_page, 'articles': articles}
        return result
    def url_to_mysql(self, url):
        """Store a news URL; a unique index in the database deduplicates, so crawled news is not crawled again."""
        try:
            self.cur.execute("insert into news_urls(news_urls, add_time) values (%s, %s)", (url, datetime.now()))
            self.db.commit()  # commit the insert
            return 'ok'
        except pymysql.err.IntegrityError:
            return None
    def get_news_detail(self, url):
        """Fetch the news detail page and parse it with XPath."""
        try:
            detail = requests.get(url, headers=headers, timeout=5)
        except requests.exceptions.ReadTimeout:
            return None
        html = etree.HTML(detail.text)
        title = html.xpath('//h1[@class="news-title"]/text()')  # title
        author = html.xpath('//p[@class="tips-writer"]/span/text()')  # author
        create_time = html.xpath('//p[@class="tips"]/text()')  # publish time of the news
        content = html.xpath('//div[@class="con"]/div//text()')  # body text
        images = html.xpath('//div[@class="con"]/div//img/@src')  # all image URLs
        videos = html.xpath('//div[@class="video"]/a/@href')  # all video URLs
        news_detail = {
            'title': title[0],
            'author': author[0],
            'create_time': create_time[0],
            'content': re.escape(''.join(content)),
            'images': '##'.join(images),
            'videos': '##'.join(videos),
        }
        return news_detail
    def detail_to_mysql(self, detail, label, news_type):
        """Store the news detail in the database."""
        # check whether the table already exists
        self.cur.execute("SHOW TABLES")
        tables = [self.cur.fetchone()[0] for i in range(self.cur.rowcount)]
        if 'news_details' not in tables:
            # create the table if it does not exist
            self.cur.execute("""
                CREATE TABLE news_details (
                    id INT PRIMARY KEY AUTO_INCREMENT,
                    title VARCHAR(255) NOT NULL,
                    author VARCHAR(255) NOT NULL,
                    create_time DATETIME NOT NULL,
                    content LONGTEXT NOT NULL,
                    images LONGTEXT,
                    videos LONGTEXT,
                    label VARCHAR(255) NOT NULL,
                    news_type VARCHAR(255) NOT NULL
                )
            """)
        self.cur.execute("DESC news_details")
        columns = self.cur.fetchall()
        columns = [column[0] for column in columns]
        if "add_time" not in columns:
            # add the add_time column to news_details if it is missing
            self.cur.execute(
                "ALTER TABLE news_details ADD COLUMN add_time DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP")
        try:
            self.cur.execute(
                "insert into news_details"
                "(label, news_type, title, author, content, images, videos, create_time, add_time) "
                "values (%s, %s, %s, %s, %s, %s, %s, %s, %s)",
                (label, news_type, detail['title'], detail['author'], detail['content'], detail['images'],
                 detail['videos'], create_time, datetime.now()))
            self.db.commit()  # commit the insert
            return 'ok'
        except pymysql.err.IntegrityError:
            return None
    def select_url(self, url):
        """Filter the news URLs that should be crawled."""
        if 'https://www.' in url:
            return 'ok'
        else:
            return None
    def close_db(self):
        """Close the database connection."""
        self.cur.close()
        self.db.close()
    def work_on(self):
        """Main crawl loop."""
        for i in range(1, 11):  # how many pages to crawl
            if len(set(self.start_urls)) == 1:  # if everything in start_urls is None there are no list pages left to request, so stop
                break
            for j in range(len(self.start_urls)):  # walk start_urls to get the next list page to request
                url = self.start_urls[j]
                if url is None:  # None means this label has no list pages left to request
                    continue
                a_page = self.get_news_page(url)  # fetch one list page
                if a_page is None:  # None means the request failed (timed out); give up on this label, because without the current page we cannot get the next-page URL
                    self.start_urls[j] = None
                    continue
                articles = a_page.get('articles')  # all articles on this list page
                label = a_page.get('label')  # which label this list page belongs to
                if articles:  # just in case, make sure articles were actually returned
                    for news in articles:  # iterate over the articles
                        is_video = news.get('is_video')  # type of the current article
                        if is_video is True:  # video news or text news
                            news_type = 'video'
                        else:
                            news_type = 'text'
                        news_url = news.get('share')  # URL of the news detail page
                        if news_url:  # just in case, make sure the detail URL exists
                            if self.select_url(news_url) == 'ok':  # filter the detail URL
                                result = self.url_to_mysql(news_url)  # try to store the filtered detail URL
                                if result == 'ok':  # stored successfully, i.e. not crawled before
                                    news_detail = self.get_news_detail(news_url)  # fetch the detail page
                                    if news_detail:  # fetched successfully
                                        info = self.detail_to_mysql(news_detail, label, news_type)  # try to store the news detail
                                        if info == 'ok':  # stored successfully?
                                            print('Stored %s news: %s' % (label, news.get('title')))
                                        else:
                                            print('Failed to store %s news: %s' % (label, news.get('title')))
                next_page = a_page.get('next_page')  # next-page URL of the current list page
                if next_page:  # is there a next page?
                    self.start_urls[j] = next_page  # if so, replace this entry in start_urls with the next-page URL
                else:
                    self.start_urls[j] = None  # otherwise set this entry in start_urls to None
        self.close_db()  # close the database connection
if __name__ == "__main__":
    spider = NewsSpider()
    spider.work_on()
    print("Finished!!!")