Title: Help needed: how to crawl Dongqiudi (懂球帝) news with Python + MySQL
Evy旧了
Thread resolved | Question points: 20 | Replies: 1
The following code throws an error. Can anyone help me figure out where the problem is?
Traceback (most recent call last):
  File "C:\Users\Zero\PycharmProjects\pythonProject\dqd.py", line 187, in <module>
    spider.work_on()
  File "C:\Users\Zero\PycharmProjects\pythonProject\dqd.py", line 171, in work_on
    info = self.detail_to_mysql(news_detail, label, news_type)  # 将新闻详情尝试存入数据库
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Zero\PycharmProjects\pythonProject\dqd.py", line 124, in detail_to_mysql
    detail['videos'], create_time, datetime.now()))
                      ^^^^^^^^^^^
NameError: name 'create_time' is not defined



The code is as follows:
import json
import re
from lxml import etree
import pymysql
import requests
from datetime import datetime
from config import host, port, user, password, charset, database

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64'
                  ') AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36',
}
host = "localhost"

class NewsSpider:
    """懂球帝新闻爬虫类"""

    def __init__(self):
        """初始化函数"""
        self.base_url = "https://www.{}.json"  # 基础url(后期处理)
        self.label_num = [3, 4, 5, 6]  # 新闻标签数字(与base_url组合)
        self.start_urls = []  # 即将爬取的新闻列表页链接
        # 生成最初的即将爬取的新闻列表页链接并添加到start_urls中
        for num in self.label_num:
            url = self.base_url.format(num)
            self.start_urls.append(url)
        self.headers = headers
        self.db = pymysql.connect(host= host, port=port, user=user, password=password,
                                  database=database, charset=charset)  # 建立数据库连接
        self.cur = self.db.cursor()  # 创建游标
        self.create_db_table()  # 创建表

    def create_db_table(self):
        """创建表"""
        sql = """CREATE TABLE IF NOT EXISTS news_urls(
                  id INT NOT NULL AUTO_INCREMENT,
                  news_urls VARCHAR(500) NOT NULL,
                  add_time DATETIME NOT NULL,
                  PRIMARY KEY (id),
                  UNIQUE KEY uk_news_urls (news_urls)
                  ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4"""
        self.cur.execute(sql)
        print("数据库表创建成功")

    def get_news_page(self, url):
        """获取一页新闻列表页"""
        try:
            news_page = requests.get(url, headers=self.headers, timeout=5)  # 请求链接
        except requests.exceptions.ReadTimeout:
            return None
        news_dict = json.loads(news_page.text)  # 将json解析成dict
        label = news_dict.get('label', '')  # 获取新闻标签名
        next_page = news_dict.get('next', '')  # 获取下一页新闻列表页链接
        articles = news_dict.get('articles', [])  # 获取当前新闻列表页的所有新闻
        result = {'label': label, 'next_page': next_page, 'articles': articles}
        return result

    def url_to_mysql(self, url):
        """新闻链接存入数据库,数据库设置了唯一索引,用来去重,爬过的新闻不再爬取"""
        try:
            self.cur.execute("insert into news_urls(news_urls, add_time) values ('%s', '%s')" % (url, datetime.now()))
            self.db.commit()
            return 'ok'
        except pymysql.err.IntegrityError:
            return None

    def get_news_detail(self, url):
        """获取新闻详情,使用xpath解析技术"""
        try:
            detail = requests.get(url, headers=headers, timeout=5)
        except requests.exceptions.ReadTimeout:
            return None
        html = etree.HTML(detail.text)
        title = html.xpath('//h1[@class="news-title"]/text()')  # title
        author = html.xpath('//p[@class="tips-writer"]/span/text()')  # author
        create_time = html.xpath('//p[@class="tips"]/text()')  # publish time
        content = html.xpath('//div[@class="con"]/div//text()')  # body text
        images = html.xpath('//div[@class="con"]/div//img/@src')  # all image URLs
        videos = html.xpath('//div[@class="video"]/a/@href')  # all video URLs

        news_detail = {
            'title': title[0],
            'author': author[0],
            'create_time': create_time[0],
            'content': re.escape(''.join(content)),
            'images': '##'.join(images),
            'videos': '##'.join(videos),

        }
        return news_detail

    def detail_to_mysql(self, detail, label, news_type):
        """新闻详情存入数据库"""
        # 判断表格是否存在
        self.cur.execute("SHOW TABLES")
        tables = [self.cur.fetchone()[0] for i in range(self.cur.rowcount)]
        if 'news_details' not in tables:
            # Create the table if it does not exist
            self.cur.execute("""
                CREATE TABLE news_details (
                    id INT PRIMARY KEY AUTO_INCREMENT,
                    title VARCHAR(255) NOT NULL,
                    author VARCHAR(255) NOT NULL,
                    create_time DATETIME NOT NULL,
                    content LONGTEXT NOT NULL,
                    images LONGTEXT,
                    videos LONGTEXT,
                    label VARCHAR(255) NOT NULL,
                    news_type VARCHAR(255) NOT NULL
                )
            """)
        self.cur.execute("DESC news_details")
        columns = self.cur.fetchall()
        columns = [column[0] for column in columns]
        if "add_time" not in columns:
            # Add the add_time column to news_details if it is missing
            self.cur.execute(
                "ALTER TABLE news_details ADD COLUMN add_time DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP")
        try:
            self.cur.execute(
                "insert into news_details"
                "(label, news_type, title, author, content, images, videos, create_time, add_time) "
                "values (%s, %s, %s, %s, %s, %s, %s, %s, %s)",
                (label, news_type, detail['title'], detail['author'], detail['content'], detail['images'],
                 detail['videos'], create_time, datetime.now()))
            self.db.commit()
            return 'ok'
        except pymysql.err.IntegrityError:
            return None

    def select_url(self, url):
        """筛选需要爬取的新闻链接"""
        if 'https://www.' in url:
            return 'ok'
        else:
            return None

    def close_db(self):
        """关闭数据库连接"""
        self.cur.close()
        self.db.close()

    def work_on(self):
        """逻辑执行函数"""
        for i in range(1, 11):  # 爬取多少页
            if len(set(self.start_urls)) == 1:  # 判断start_urls里面如果全是None,说明就没有要请求的新闻列表页了,就结束循环
                break
            for j in range(len(self.start_urls)):  # 遍历start_urls来获取即将请求的列表页
                url = self.start_urls[j]
                if url is None:  # 如果该url已经变成None,说明该类标签的新闻没有可以请求的列表页了
                    continue
                a_page = self.get_news_page(url)  # 获取一页新闻列表页
                if a_page is None:  # 如果返回是None,说明请求新闻列表页失败(请求超时),这时直接放弃该类标签新闻的所有列表页,因为当前页请求失败意味着没有办法获取到下一页的链接
                    self.start_urls[j] = None
                    continue
                articles = a_page.get('articles')  # 获取当前列表页的所有新闻信息
                label = a_page.get('label')  # 获取当前列表页属于哪种标签新闻
                if articles:  # 保险起见,判断下是否成功获取到新闻信息
                    for news in articles:  # 遍历新闻信息列表
                        is_video = news.get('is_video')  # 获取当前新闻类型
                        if is_video is True:  # 判断当前新闻类型是视频还是文本
                            news_type = 'video'
                        else:
                            news_type = 'text'
                        news_url = news.get('share')  # 获取当前新闻详情页的链接
                        if news_url:  # 保险起见,判断下是否获取到当前新闻详情页的链接
                            if self.select_url(news_url) == 'ok':  # 新闻详情页链接进行筛选
                                result = self.url_to_mysql(news_url)  # 将筛选后的新闻详情页链接尝试存入数据库
                                if result == 'ok':  # 如果成功存入数据库
                                    news_detail = self.get_news_detail(news_url)  # 获取当前新闻详情页信息
                                    if news_detail:  # 判断是否获取成功
                                        info = self.detail_to_mysql(news_detail, label, news_type)  # 将新闻详情尝试存入数据库
                                        if info == 'ok':  # 判断是否成功存入
                                            print('存入%s新闻成功: %s' % (label, news.get('title')))
                                        else:
                                            print('存入%s新闻失败: %s' % (label, news.get('title')))
                next_page = a_page.get('next_page')  # 获取当前新闻列表页的下一页的链接
                if next_page:  # 判断是否有下一页
                    self.start_urls[j] = next_page  # 如果有,就将start_urls里面当前列表页链接替换成下一页的链接
                else:
                    self.start_urls[j] = None  # 如果没有,就将start_urls里面当前列表页链接替换成None

        self.close_db()  # 关闭数据库连接


if __name__ == "__main__":
    spider = NewsSpider()
    spider.work_on()
    print("执行完毕!!!")
2023-06-01 16:29
sheeboard
Score: 20
Use detail['create_time'] to access the dictionary. And try sqlalchemy; combined with pandas it is more convenient.
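
A minimal sketch of that suggestion, assuming the same news_detail dict shape that get_news_detail builds; the connection URL and sample values below are placeholders, not the thread's real configuration:

# Sketch of the reply's suggestion: read create_time from the dict (fixing the NameError)
# and let pandas + SQLAlchemy handle the INSERT instead of hand-written SQL.
from datetime import datetime

import pandas as pd
from sqlalchemy import create_engine

# placeholder connection URL; substitute your own user, password and database
engine = create_engine("mysql+pymysql://user:password@localhost:3306/database?charset=utf8mb4")

news_detail = {  # same shape as the dict returned by get_news_detail (sample values)
    'title': 'sample title',
    'author': 'sample author',
    'create_time': '2023-06-01 12:00:00',
    'content': 'sample content',
    'images': '',
    'videos': '',
}
row = {**news_detail,  # create_time is taken from the dict itself, so no NameError
       'label': 'sample label',
       'news_type': 'text',
       'add_time': datetime.now()}

# to_sql creates news_details if it does not exist and appends the row otherwise
pd.DataFrame([row]).to_sql('news_details', con=engine, if_exists='append', index=False)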
2023-06-02 08:54