| 网站首页 | 业界新闻 | 小组 | 威客 | 人才 | 下载频道 | 博客 | 代码贴 | 在线编程 | 编程论坛
共有 428 人关注过本帖
标题:求助:下列代码不能爬取每章小说的下一页
只看楼主 加入收藏
王咸美
Rank: 1
等 级:新手上路
帖 子:909
专家分:3
注 册:2018-1-4
收藏
已结贴  问题点数:30 回复次数:3 
求助:下列代码不能爬取每章小说的下一页
下列代码不能爬取每章小说的下一页(部分章节有下一页),请高手赐教,万分感谢!!!

import os
import requests
from bs4 import BeautifulSoup
# Target site and the index page that lists every chapter of the novel.
base_url='https://www.'
chapter_list_url = f'{base_url}/ml/116168/'

# Download the index page. A timeout keeps the script from hanging forever and
# raise_for_status() stops it from parsing an HTTP error page as if it were
# the chapter list.
response = requests.get(chapter_list_url, timeout=10)
response.raise_for_status()
response.encoding = 'utf-8'

# Parse the HTML with BeautifulSoup.
soup = BeautifulSoup(response.text, 'html.parser')

# Every <dd> inside <div id="list"> holds one chapter link.
list_div = soup.find("div", id="list")
if list_div is None:
    # Fail with a readable message instead of an AttributeError on None.
    raise SystemExit(f'Chapter list (div#list) not found at {chapter_list_url}')
chapter_list = list_div.find_all("dd")

# Directory the chapter files are written to; exist_ok avoids the
# check-then-create race of a separate os.path.exists() test.
save_dir = 'novel_chapters'
os.makedirs(save_dir, exist_ok=True)

def get_full_chapter_content(url):
    """
    Download every page of one chapter and return the combined text.

    Starting at ``url``, the text of ``<div id="content">`` is collected and
    the "下一页" (next page) link is followed until no such link is left or a
    page would be visited a second time.

    :param url: URL of the chapter's first page
    :return: text of all pages, each page terminated by a newline
    """
    from urllib.parse import urljoin

    pages = []
    visited = set()  # guards against pages that link back to each other
    while url and url not in visited:
        visited.add(url)
        response = requests.get(url, timeout=10)
        response.encoding = 'utf-8'
        chapter_soup = BeautifulSoup(response.text, 'html.parser')

        # The chapter text lives in <div id="content">.
        chapter_content_tag = chapter_soup.find('div', id='content')
        if chapter_content_tag:
            pages.append(chapter_content_tag.get_text("\n", strip=True))

        # Look for the "next page" link by substring: an exact-text match
        # misses links whose label carries whitespace or nested tags.
        next_page_tag = chapter_soup.find(
            lambda tag: tag.name == 'a'
            and tag.has_attr('href')
            and '下一页' in tag.get_text()
        )
        if next_page_tag is None:
            break
        # Resolve against the current page so absolute, root-relative and
        # page-relative hrefs all work.
        url = urljoin(url, next_page_tag['href'])
        if not url.startswith(('http://', 'https://')):
            break  # e.g. "javascript:" pseudo links
    return "".join(page + "\n" for page in pages)

# Walk the <dd> entries and save every chapter from "第1章" onwards.
from urllib.parse import urljoin

start_collecting = False
# Characters that are not allowed in file names on common file systems.
_unsafe_filename_chars = str.maketrans({c: '_' for c in '\\/:*?"<>|'})

for chapter in chapter_list:
    anchor = chapter.a
    if anchor is None or not anchor.get('href'):
        continue  # <dd> without a usable link
    title = anchor.get_text().strip()
    if "第1章" in title:
        start_collecting = True
    if not start_collecting:
        continue

    # Resolve the link against the index page so relative hrefs work too.
    link = urljoin(chapter_list_url, anchor['href'])
    full_chapter_content = get_full_chapter_content(link)

    # Save the chapter text to its own .txt file.
    safe_title = title.translate(_unsafe_filename_chars)
    file_name = os.path.join(save_dir, f"{safe_title}.txt")
    with open(file_name, 'w', encoding='utf-8') as file:
        file.write(full_chapter_content)

    print(f'Saved:{file_name}')





搜索更多相关主题的帖子: 下一页 url if 内容 response 
2026-05-06 12:45
yiyanxiyin
Rank: 20Rank: 20Rank: 20Rank: 20Rank: 20
等 级:版主
威 望:9
帖 子:378
专家分:2417
注 册:2023-6-29
收藏
得分:0 
你代码没问题啊, 能取到第一章的前2页, 是网站本身有问题吧, 好像第一章内容不完整
2026-05-08 13:03
wzxc
Rank: 9Rank: 9Rank: 9
来 自:齐鲁大地
等 级:贵宾
威 望:39
帖 子:990
专家分:1336
注 册:2006-4-25
收藏
得分:30 
程序代码:
import tkinter as tk
from tkinter import ttk, scrolledtext,messagebox
import os
import requests
from bs4 import BeautifulSoup
import re
from urllib.parse import urlparse
import json


class NovelReader:
    def __init__(self, rt):
        self.root = rt
        self.root.title("小说阅读器")

        # 获取屏幕宽度和高度
        screen_width = rt.winfo_screenwidth()
        screen_height = rt.winfo_screenheight()
        self.width = 1440
        self.height = 900
        # 计算窗口左上角的位置,使得窗口居中
        position_top = int(screen_height / 2 - self.height / 2)
        position_right = int(screen_width / 2 - self.width / 2)
        # 设置窗口位置和大小
        self.root.geometry(f'{self.width}x{self.height}+{position_right}+{position_top}')

        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

        self.url_prefix_p = []
        self.soup = None
         = {}
        self.url_prefix_show_txt = []

        # 创建表格
        self.tree_scroll = ttk.Scrollbar(self.root, orient=tk.VERTICAL)
        self.tree_dictionary = ttk.Treeview(root, columns=('Key', 'Value'), show='headings', height=10,
                                            yscrollcommand=self.tree_scroll.set)

        self.create_text()
        self.create_treeview()
        self.load_data()
        self.create_widgets()
        self.create_context_menu()
        # 绑定全局左键点击事件以取消菜单
        self.root.bind("<Button-1>", self.hide_context_menu)
        ()
        self.refresh_path()
        self.root.protocol("WM_DELETE_WINDOW", self.quit_ip)   # 绑定窗口的关闭事件


    def net_path_dictionary(self):
        self.tree_scroll.configure(command=self.tree_dictionary.yview)
        self.tree_dictionary.heading('Key', text='路径')
        self.tree_dictionary.heading('Value', text='域名')
        self.tree_dictionary.column("Key", width=250)
        self.tree_dictionary.column("Value", width=353)
        self.tree_dictionary.place(x=810, y=360)
        self.tree_scroll.place(x=1416, y=360, height=222, anchor='nw')

    def refresh_path(self):
        # 清空表格
        for row in self.tree_dictionary.get_children():
            self.tree_dictionary.delete(row)

        # 填充数据
        for key, value in ():
            self.tree_dictionary.insert('', "end", values=(key, value))

    def add_path(self):
        self.add_path_button.configure(state="disabled")
        # 弹出窗口添加新条目
        self.add_window2 = tk.Toplevel(self.root)
        self.add_window2.title("添加路径")
        self.add_window2.geometry("520x200+290+180")
        # 创建Entry组件
        self.path_entry1 = tk.Entry(self.add_window2, width=60)
        self.domain_name_entry2 = tk.Entry(self.add_window2, width=60)
        path_label1 = tk.Label(self.add_window2, text="路径:")
        path_label1.grid(row=0, column=0, padx=5, pady=5)
        domain_name_label2 = tk.Label(self.add_window2, text="域名:")
        domain_name_label2.grid(row=1, column=0, padx=5, pady=5)

        self.path_entry1.grid(row=0, column=1, padx=5, pady=5)
        self.domain_name_entry2.grid(row=1, column=1, padx=5, pady=5)
        # 为右键菜单绑定事件(这里以entry1为例,其他entry类似)
        self.path_entry1.bind("<Button-3>", self.show_context_menu)
        self.domain_name_entry2.bind("<Button-3>", self.show_context_menu)

        def add_entry():
            key = self.path_entry1.get()
            value = self.domain_name_entry2.get()
            if key and value:
                [key] = value
                self.tree_dictionary.insert('', 'end', values=(key, value))
                self.add_path_button.configure(state="normal")
                self.add_window2.destroy()

        def cancel():
            self.add_path_button.configure(state="normal")
            self.add_window2.destroy()

        # 绑定窗口关闭事件
        self.add_window2.protocol("WM_DELETE_WINDOW", cancel)
        ttk.Button(self.add_window2, text="确定", command=add_entry, cursor='hand2').place(x=300,y=150)
        ttk.Button(self.add_window2, text="取消", command=cancel, cursor='hand2').place(x=400,y=150)

    def save_path(self):
        # 保存数据到文件
        with open('net_dress.json', 'w', encoding='utf-8') as f:
            json.dump(, f, ensure_ascii=False, indent=4)
        tk.messagebox.showinfo("Info", "数据已保存.")

    def deleted_path(self):
        # 删除选中的条目
        selected_item = self.tree_dictionary.selection()
        if not selected_item:
            return

        key = self.tree_dictionary.item(selected_item, 'values')[0]
        del [key]
        self.refresh_path()

    def add_url_p(self):
        self.add_url_button.configure(state="disabled")
        # 弹出窗口添加新条目
        self.add_window3 = tk.Toplevel(self.root)
        self.add_window3.title("添加域名")
        self.add_window3.geometry("520x200+290+180")
        # 创建Entry组件
        url_entry1 = tk.Entry(self.add_window3, width=60)
        url_entry1.grid(row=0, column=1, padx=5, pady=5)
        url_entry1.bind("<Button-3>", self.show_context_menu)
        def add_entry():
            url = url_entry1.get()
            if url:
                self.url_prefix_p.append(url)
                self.listbox_p.insert(tk.END, url)
                url_entry1.delete(0, tk.END)
                self.add_window3.destroy()
                self.add_url_button.configure(state="normal")
            else:
                messagebox.showwarning("提示", "请输入有效的 URL!")

        def cancel():
            self.add_url_button.configure(state="normal")
            self.add_window3.destroy()

        # 绑定窗口关闭事件
        self.add_window3.protocol("WM_DELETE_WINDOW", cancel)
        ttk.Button(self.add_window3, text="确定", command=add_entry, cursor='hand2').place(x=300,y=150)
        ttk.Button(self.add_window3, text="取消", command=cancel, cursor='hand2').place(x=400,y=150)

    def save_url_p(self):
        # 保存数据到文件
        with open('url_prefix_p.json', 'w', encoding='utf-8') as f:
            json.dump(self.url_prefix_p, f, ensure_ascii=False, indent=4)
        tk.messagebox.showinfo("Info", "数据已保存.")

    def delete_url_p(self):
        selected_index = self.listbox_p.curselection()
        if selected_index:
            self.url_prefix_p.clear()
            self.listbox_p.delete(selected_index[0])
            for idx in range(self.listbox_p.size()):
                self.url_prefix_p.append(self.listbox_p.get(idx))
        else:
            messagebox.showwarning("提示", "请选择要删除的 URL!")

    def add_url_show_txt(self):
        self.add_url_button.configure(state="disabled")
        # 弹出窗口添加新条目
        self.add_window4 = tk.Toplevel(self.root)
        self.add_window4.title("添加域名")
        self.add_window4.geometry("520x200+290+180")
        # 创建Entry组件
        url_entry1 = tk.Entry(self.add_window4, width=60)
        url_entry1.grid(row=0, column=1, padx=5, pady=5)
        url_entry1.bind("<Button-3>", self.show_context_menu)
        def add_entry():
            url = url_entry1.get()
            if url:
                self.url_prefix_show_txt.append(url)
                self.listbox_show_txt.insert(tk.END, url)
                url_entry1.delete(0, tk.END)
                self.add_window4.destroy()
                self.add_url_button.configure(state="normal")
            else:
                messagebox.showwarning("提示", "请输入有效的 URL!")

        def cancel():
            self.add_url_button.configure(state="normal")
            self.add_window4.destroy()

        # 绑定窗口关闭事件
        self.add_window4.protocol("WM_DELETE_WINDOW", cancel)
        ttk.Button(self.add_window4, text="确定", command=add_entry, cursor='hand2').place(x=300,y=150)
        ttk.Button(self.add_window4, text="取消", command=cancel, cursor='hand2').place(x=400,y=150)

    def save_url_show_txt(self):
        # 保存数据到文件
        with open('url_prefix_show_txt.json', 'w', encoding='utf-8') as f:
            json.dump(self.url_prefix_show_txt, f, ensure_ascii=False, indent=4)
        tk.messagebox.showinfo("Info", "数据已保存.")

    def delete_url_show_txt(self):
        selected_index = self.listbox_show_txt.curselection()
        if selected_index:
            self.url_prefix_show_txt.clear()
            self.listbox_show_txt.delete(selected_index[0])
            for idx in range(self.listbox_show_txt.size()):
                self.url_prefix_show_txt.append(self.listbox_p.get(idx))
        else:
            messagebox.showwarning("提示", "请选择要删除的 URL!")

    def add_bookmark(self):
        self.add_button.configure(state="disabled")
        # 弹出窗口添加新条目
        self.add_window = tk.Toplevel(self.root)
        self.add_window.title("添加书签")
        self.add_window.geometry("520x200+290+180")
        # 创建Entry组件
        self.title_entry1 = tk.Entry(self.add_window, width=60)
        self.chapter_entry2 = tk.Entry(self.add_window, width=60)
        self.dress_entry3 = tk.Entry(self.add_window, width=60)
        title_label1 = tk.Label(self.add_window, text="小说名称:")
        title_label1.grid(row=0, column=0, padx=5, pady=5)
        chapter_label2 = tk.Label(self.add_window, text="章节:")
        chapter_label2.grid(row=1, column=0, padx=5, pady=5)
        dress_label3 = tk.Label(self.add_window, text="URL:")
        dress_label3.grid(row=2, column=0, padx=5, pady=5)
        self.title_entry1.grid(row=0, column=1, padx=5, pady=5)
        self.chapter_entry2.grid(row=1, column=1, padx=5, pady=5)
        self.dress_entry3.grid(row=2, column=1, padx=5, pady=5)
        # 为右键菜单绑定事件(这里以entry1为例,其他entry类似)
        self.title_entry1.bind("<Button-3>", self.show_context_menu)
        self.chapter_entry2.bind("<Button-3>", self.show_context_menu)
        self.dress_entry3.bind("<Button-3>", self.show_context_menu)

        def cancel():
            self.add_button.configure(state="normal")
            self.add_window.destroy()

        # 绑定窗口关闭事件
        self.add_window.protocol("WM_DELETE_WINDOW", cancel)
        ttk.Button(self.add_window, text="确定", command=self.add_reading, cursor='hand2').place(x=300,y=150)
        ttk.Button(self.add_window, text="取消", command=cancel, cursor='hand2').place(x=400,y=150)

    def create_widgets(self):
        # 创建Entry组件
        # 书签
        self.add_button = ttk.Button(self.root, text="添加", command=self.add_bookmark, cursor='hand2')
        save_button = ttk.Button(self.root, text="保存", command=lambda:self.save_reading(sv=1), cursor='hand2')
        delete_button = ttk.Button(self.root, text="删除", command=self.delete, cursor='hand2')
        open_button = ttk.Button(self.root, text="打开", command=self.open_reading, cursor='hand2')
        self.add_button.place(x=920, y=320)
        save_button.place(x=1000, y=320)
        delete_button.place(x=1080, y=320)
        open_button.place(x=1160, y=320)

        self.previous_chapter_button = ttk.Button(self.root, text="上一章", command=self.previous_chapter, cursor='hand2')
        self.next_chapter_button = ttk.Button(self.root, text="下一章", command=self.next_chapter, cursor='hand2')
        self.bookmark_button = ttk.Button(self.root, text="添加书签", command=self.bookmark,cursor='hand2')
        self.previous_chapter_button.place(x=200,y=792)
        self.next_chapter_button.place(x=400,y=792)
        self.bookmark_button.place(x=600,y=792)
        # 路径
        self.add_path_button = ttk.Button(self.root, text="添加", command=self.add_path, cursor='hand2')
        save_path_button = ttk.Button(self.root, text="保存", command=self.save_path, cursor='hand2')
        delete_path_button = ttk.Button(self.root, text="删除", command=self.deleted_path, cursor='hand2')
        self.add_path_button.place(x=920, y=600)
        save_path_button.place(x=1000, y=600)
        delete_path_button.place(x=1080, y=600)
        # url
        # 列表框显示 URL_p
        self.listbox_p = tk.Listbox(self.root, width=40, height=6)
        scrollbar_p = tk.Scrollbar(self.root, orient="vertical", command=self.listbox_p.yview)
        self.listbox_p.config(yscrollcommand=scrollbar_p.set)
        # 布局
        self.listbox_p.place(x=810,y=660)
        scrollbar_p.place(x=1092,y=660, height=125)

        # 填充列表框
        for url in self.url_prefix_p:
            self.listbox_p.insert(tk.END, url)

        # 列表框显示 URL_show_txt
        self.listbox_show_txt = tk.Listbox(self.root, width=40, height=6)
        scrollbar_show_txt = tk.Scrollbar(self.root, orient="vertical", command=self.listbox_show_txt.yview)
        self.listbox_show_txt.config(yscrollcommand=scrollbar_p.set)
        # 布局
        self.listbox_show_txt.place(x=1109,y=660)
        scrollbar_show_txt.place(x=1380,y=660, height=125)

        # 填充列表框
        for url in self.url_prefix_show_txt:
            self.listbox_show_txt.insert(tk.END, url)

        self.add_url_button = ttk.Button(self.root, text="添加", command=lambda: self.on_button_click("add"), cursor='hand2')
        save_url_button = ttk.Button(self.root, text="保存", command=lambda: self.on_button_click("save"), cursor='hand2')
        delete_url_button = ttk.Button(self.root, text="删除", command=lambda: self.on_button_click("delete"), cursor='hand2')
        self.add_url_button.place(x=920, y=800)
        save_url_button.place(x=1000, y=800)
        delete_url_button.place(x=1080, y=800)
        ttk.Label(self.root, text="章节标签为p的域名:").place(x=810,y=639)
        ttk.Label(self.root, text="章节标签为show_txt的域名:").place(x=1109,y=639)

    def on_button_click(self, action):
        focused_widget = self.root.focus_get()
        print(focused_widget)
        if self.listbox_p.curselection():
            if action == "add":
                self.add_url_p()
            elif action == "save":
                self.save_url_p()
            elif action == "delete":
                self.delete_url_p()
        elif self.listbox_show_txt.curselection():
            if action == "add":
                self.add_url_show_txt()
            elif action == "save":
                self.save_url_show_txt()
            elif action == "delete":
                self.delete_url_show_txt()
        else:
            messagebox.showwarning("提示", "请先选中一个列表框!")


    def create_context_menu(self):
        # 创建右键菜单
        self.context_menu = tk.Menu(self.root, tearoff=0)
        self.context_menu.add_command(label="剪切", command=self.cut_text)
        self.context_menu.add_command(label="复制", command=self.copy_text)
        self.context_menu.add_command(label="粘贴", command=self.paste_text)
        self.context_menu.add_command(label="全选", command=self.select_all_text)
        # 添加“取消菜单”选项
        self.context_menu.add_separator()
        self.context_menu.add_command(label="取消菜单", command=lambda: self.context_menu.unpost())


    def show_context_menu(self, event):
        self.context_menu.post(event.x_root, event.y_root)

    def cut_text(self):
        try:
            self.root.focus_get().event_generate("<<Cut>>")
        except Exception as e:
            print(f"剪切错误: {e}")

    def copy_text(self):
        try:
            self.root.focus_get().event_generate("<<Copy>>")
        except Exception as e:
            print(f"复制错误: {e}")

    def paste_text(self):
        try:
            self.root.focus_get().event_generate("<<Paste>>")
        except Exception as e:
            print(f"粘贴错误: {e}")

    def select_all_text(self):
        try:
            self.root.focus_get().select_range(0, tk.END)
        except Exception as e:
            print(f"全选错误: {e}")

    def hide_context_menu(self, event):
        # 如果当前有菜单显示,并且点击的不是菜单项,则取消它
        if hasattr(self, 'context_menu') and self.context_menu.winfo_exists():
            # 检查点击是否发生在菜单上(可选,根据需求决定是否保留)
            # 如果不保留此检查,则任何左键点击都会取消菜单
            self.context_menu.unpost()

    def load_data(self):
        if os.path.isfile("reading.txt"):
            with open('reading.txt', 'r', encoding='utf-8', newline='') as fread:
                for line in fread:
                    original_string = line.strip()
                    parts = original_string.split(',')
                    first_part = parts[0].strip("()'")
                    second_part = parts[1].strip("'()' ")
                    third_part = parts[2].strip("'()' ")
                    self.gride.insert('', 'end', values=(first_part, second_part, third_part))

        if os.path.isfile("net_dress.json"):
            with open('net_dress.json', 'r', encoding='utf-8') as f:
                = json.load(f)

        if os.path.isfile("url_prefix_p.json"):
            with open('url_prefix_p.json', 'r', encoding='utf-8') as f:
                self.url_prefix_p = json.load(f)
        else:
            self.url_prefix_p = [
                "https://www., "https://www.,
                "https://www., "https://www.,
                "https://350.ooo/", "https://www., "https://www.
            ]

        if os.path.isfile("url_prefix_show_txt.json"):
            with open('url_prefix_show_txt.json', 'r', encoding='utf-8') as f:
                self.url_prefix_show_txt = json.load(f)
        else:
            self.url_prefix_show_txt = [
                "https://www.skjvvx.cc/", "https://www., "https://www.balshuzhai.cc/"
        ]


    def create_text(self):
        # 创建多行文本
        self.text = scrolledtext.ScrolledText(
            self.root, width=60, height=25, font=("Arial", 18), fg="white", bg="black",wrap=tk.WORD)
        self.text.place(x=10, y=74)
        # text.config(state=tk.DISABLED)
        self.label1 = tk.Label(self.root, text='  ')
        self.label1.place(x=60, y=10, anchor='nw')
        self.label2 = tk.Label(self.root, text='      ', width=53, font=("Arial", 20), fg="red", relief='groove')   #, bg="black"
        self.label2.place(x=12, y=40, anchor='nw')
        self.label_dress = tk.Label(self.root, text='地址:', width=60, anchor='nw')
        self.label_dress.place(x=700, y=10, anchor='nw')


    def create_treeview(self):
        # 创建Treeview组件和Scrollbar
        self.fscroll = ttk.Scrollbar(self.root, orient=tk.VERTICAL)
        self.gride = ttk.Treeview(self.root, columns=('c1', 'c2', 'c3'), show='headings', height=12, yscrollcommand=self.fscroll.set)
        self.gride.place(x=810, y=40, anchor='nw')
        self.fscroll.configure(command=self.gride.yview)
        self.fscroll.place(x=1420, y=40, height=320, anchor='nw')

        # 设置Treeview列宽和标题
        self.gride.column("c1", width=180)
        self.gride.column("c2", width=190)
        self.gride.column("c3", width=230)
        self.gride.heading("c1", text="文章名称")
        self.gride.heading("c2", text="书签")
        self.gride.heading("c3", text="URL")

        self.gride.bind("<Double-1>", self.open_reading)

    def add_reading(self):
        r1 = self.title_entry1.get().rstrip()
        r2 = self.chapter_entry2.get().rstrip()
        r3 = self.dress_entry3.get().rstrip()

        if r1 and r2 and len(r1) > 1 and len(r2) > 1:
            self.gride.insert('', 'end', values=(r1, r2, r3))
            self.title_entry1.delete(0, tk.END)
            self.chapter_entry2.delete(0, tk.END)
            self.dress_entry3.delete(0, tk.END)
            self.add_window.destroy()
            self.add_button.configure(state="normal")
        else:
            messagebox.showwarning("警告", "请输入内容!")


    def save_reading(self,sv):
        with open('reading.txt', 'w', encoding='utf-8', newline='') as f_read:
            item_ids = self.gride.get_children()
            for item_id in item_ids:
                values = self.gride.item(item_id, "values")
                s = ','.join(map(str, values))  # 将列表转换为逗号分隔的字符串
                f_read.write(s + "\r\n")
        if sv == 1:
            messagebox.showinfo("提示", "数据已保存!")

    def delete(self):
        for selected_item in self.gride.selection():
            self.gride.delete(selected_item)

    def open_reading(self, event=None):
        selected_items = self.gride.selection()
        if selected_items:
            values = self.gride.item(selected_items[0], "values")
            if len(values) >= 3:  # 确保有足够的列
                url = values[2].strip()  # 直接获取第三列的值
                # self.get_chapter(url)
                self.get_chapter(values[2].strip())




认真看书学习,弄通Fox主义。
5 天前 16:19
wzxc
Rank: 9Rank: 9Rank: 9
来 自:齐鲁大地
等 级:贵宾
威 望:39
帖 子:990
专家分:1336
注 册:2006-4-25
收藏
得分:0 
程序代码:
    def get_chapter(self,url):
        """Fetch a chapter page, show its title and heading, then render its text.

        The page is requested without automatic redirects; a 301/302 answer is
        reported to the user and then followed once by hand. The parsed page is
        stored in ``self.soup`` and handed to ``get_chapter_p`` or
        ``get_chapter_show_txt`` depending on which prefix list contains the
        page's ``scheme://host/``. If the request or the parsing fails, the
        error is printed and nothing is rendered.

        :param url: address of the chapter page
        """
        from urllib.parse import urljoin  # resolves relative redirect targets

        self.label_dress.config(text='地址:' + url)
        try:
            session = requests.Session()
            session.headers.update(self.headers)
            response = session.get(url, allow_redirects=False)

            # Report a redirect and follow it once by hand.
            if response.status_code in (301, 302):
                location = response.headers.get('Location')
                print(f"发生了重定向,重定向到的URL是: {location}")
                tk.messagebox.showinfo("我的标题", f"发生了重定向,重定向到的URL是: {location}")
                if location:
                    # Location may be relative to the requested page.
                    url = urljoin(url, location)
                    response = session.get(url, allow_redirects=False)

            # Parse the HTML and show the <title>.
            self.soup = BeautifulSoup(response.text, 'html.parser')
            title_tag = self.soup.find('title')
            title = title_tag.text if title_tag else "未找到标题"
            print(f"标题: {title}\n")
            self.label1.config(text=title)

            # Chapter heading: a later tag in this order overrides an earlier
            # one, so <h2> wins over <h3>, which wins over <h1>.
            for tag_name in ('h1', 'h3', 'h2'):
                heading = self.soup.find(tag_name)
                if heading:
                    self.label2.config(text=heading.get_text().strip())
                    if tag_name == 'h2':
                        print(heading.get_text().strip())
        except requests.RequestException as e:
            print(f"请求发生错误: {e}")
            return  # do not render a stale or missing page
        except Exception as e:
            print(f"3发生未知错误: {e}")
            return

        # Reduce the URL to "scheme://host/" and pick the matching renderer.
        # NOTE(review): the attribute after "parsed_" was cut off in the posted
        # source; ``netloc`` is the urlparse field that holds the host - confirm.
        parsed_url = urlparse(url)
        result = f"{parsed_url.scheme}://{parsed_url.netloc}/"
        print(result)
        if result in self.url_prefix_p:
            self.get_chapter_p()
        elif result in self.url_prefix_show_txt:
            self.get_chapter_show_txt()
        else:
            print("未能打开当前选择的网页:", url)

    def get_chapter_show_txt(self):
        # 提取小说正文部分
        content_div = self.soup.find('div', {'class': 'showtxt'})
        if content_div:
            # 移除<script>标签及其内容
            for script in content_div.find_all('script'):
                script.decompose()
            # 获取纯文本内容并分段
            content = content_div.get_text(separator='\n', strip=True)
            # 移除广告部分(假设广告在最后)
            ad_pattern = (r'先定个小目标.*$', re.DOTALL)
            cleaned_content = re.sub(ad_pattern, '', content)
            # 更新文本控件
            self.text.config(state=tk.NORMAL)
            self.text.delete(1.0, tk.END)
            self.text.insert('end', "\n")
            self.text.insert('end', "\n")
            self.text.insert('end', cleaned_content)
            self.text.config(state=tk.DISABLED)
        else:
            book_text_div = self.soup.find('div', id='book_text')

            # 提取文本并替换换行符和空格
            text = book_text_div.get_text(separator='\n', strip=True)
            text = text.replace('\n', ' ')  # 替换换行符
            text = re.sub(r'\s+', ' ', text)  # 替换连续的空格为单个空格

            # 移除分页标记
            text = re.sub(r'第$\d+/\d+$页', '', text)
            cleaned_content = re.sub(r'$本章未完,请翻页$', '', text)

            # 更新文本控件
            self.text.config(state=tk.NORMAL)
            self.text.delete(1.0, tk.END)
            self.text.insert('end', "\n")
            # self.text.insert('end', "\n")
            self.text.insert('end', cleaned_content)
            self.text.config(state=tk.DISABLED)
            # print("未找到正文内容")
            self.bookmark()

    def get_chapter_p(self):
       # 精准定位正文容器(class="word_read")
        content_div = self.soup.find('div', class_="word_read")
        if not content_div:
            content_div = self.soup.find('div', id="content")  # 备用方案
            if not content_div:
                content_div = self.soup.find('div', id="neirong")  # 备用方案
                if not content_div:
                    content_div = self.soup.find('div', id="container")  # 备用方案
        if content_div:
            # 提取所有 <p> 标签内容,并过滤广告
            paragraphs = content_div.find_all('p')
            filtered_content = []
            unwanted_websites = [
                'https://www.', 'https://www.', 'https://www.',
                '', '', '', 'kenshu.tw', '',
                'quanshu.la', 'tlxsw.cc', '', '', '',
                'baquge.cc', 'kenshuge.cc', '', '', ''
                '喜欢天师下山:', '我只想退婚请大家收藏:', '(www。aiquwx。com)', '天师下山:我只想退婚'
            ]

            for p in paragraphs:
                text = p.get_text().strip()
                # 过滤广告和无关内容
                if text and not any(site in text for site in unwanted_websites):
                    filtered_content.append(text)

            cleaned_content = '\n\n'.join(filtered_content)
            # 更新文本控件
            self.text.config(state=tk.NORMAL)
            self.text.delete(1.0, tk.END)
            self.text.insert('end', "\n")
            # self.text.insert('end', "\n")
            self.text.insert('end', cleaned_content)
            self.text.config(state=tk.DISABLED)
            self.bookmark()
        else:
            cleaned_content = "未找到正文内容。"

    def previous_chapter(self):
        # 找到下一章的链接
        prev_chapter_link = None
        prev_chapter_url = ""
        # 尝试通过id='next_url'提取(新网站结构)
        prev_chapter_link = self.soup.find('a', id='prev_url')
        if prev_chapter_link:
            prev_chapter_url = prev_chapter_link['href']
        prev_chapter_link = self.soup.find('a', id='pager_prev')
        if prev_chapter_link:
            prev_chapter_url = prev_chapter_link['href']
        else:
            # 尝试通过字符串内容提取(旧网站结构)
            prev_chapter_link = self.soup.find('a', string='上一章')
            if not prev_chapter_link:
                prev_chapter_link = self.soup.find('a', string='上一页')
        if prev_chapter_link:
            prev_chapter_url = prev_chapter_link['href']
            for key in  if prev_chapter_url.startswith(key):
                    # 获取对应的域名
                    base_url = [key]
                    # 补全地址
                    full_prev_chapter_url = f"{base_url}{prev_chapter_url}"
                    # 调用get_chapter函数显示下一章内容
                    if base_url in self.url_prefix_p:
                        self.get_chapter(full_prev_chapter_url)
                    elif base_url in self.url_prefix_show_txt:
                        self.get_chapter(full_prev_chapter_url)
                    else:
                        self.get_chapter(full_prev_chapter_url)
                    break
                else:
                    print("未找到匹配的域名")
            else:
                print("没有找到上一章的链接")

    def next_chapter(self):
        # 找到下一章的链接
        next_chapter_link = None
        next_chapter_url = ""
        # 尝试通过id='next_url'提取(新网站结构)
        next_chapter_link = self.soup.find('a', id='next_url')
        print("1",next_chapter_link)
        if next_chapter_link:
            next_chapter_url = next_chapter_link['href']
        else:
            next_chapter_link = self.soup.find('a', id='pager_next')
            if next_chapter_link:
                next_chapter_url = next_chapter_link['href']
            else:
                # 尝试通过字符串内容提取(旧网站结构)
                next_chapter_link = self.soup.find('a', string='下一章')
                if not next_chapter_link:
                    next_chapter_link = self.soup.find('a', string='下一页')
                if next_chapter_link:
                    next_chapter_url = next_chapter_link['href']
        # 如果成功提取到链接,补全地址并处理
        print(next_chapter_link)
        if next_chapter_link:
            # 从字典中查找匹配的键
            for key in  if next_chapter_url.startswith(key):
                    # 获取对应的域名
                    base_url = [key]
                    # 补全地址
                    full_next_chapter_url = f"{base_url}{next_chapter_url}"
                    print('full',full_next_chapter_url)
                    # 调用get_chapter函数显示下一章内容
                    if base_url in self.url_prefix_p:
                        self.get_chapter(full_next_chapter_url)
                    if base_url in self.url_prefix_show_txt:
                        self.get_chapter(full_next_chapter_url)
                    break
            else:
                print("未找到匹配的域名")
        else:
            print("没有找到下一章的链接")

    def bookmark(self):
        try:
            rr = self.label1.cget("text")  # 小说名称
            r2 = self.label2.cget("text")  # 章节名称
            r3 = self.label_dress.cget("text").split("")[1]  # URL
            r1 = self.extract_novel_name(rr, r3)

            selected_item = self.gride.selection()[0]  # 获取当前选中的行
            self.gride.item(selected_item, values=(r1, r2, r3))
        except IndexError:
            print("请先选中要修改的行!")

    def extract_novel_name(self, title, url):
        """Derive the novel name from a page title using a per-site rule.

        :param title: page title as shown in the title label
        :param url: address of the page; selects the rule by its prefix
        :return: result of the first rule whose site prefix matches ``url``,
            or ``title`` unchanged when none matches
        """
        # NOTE(review): as posted, all five keys are the identical string
        # "https://www.", so the dict keeps a single entry holding the last
        # rule - the site names need to be restored from the original.
        rules = {
            "https://www.": lambda t: t.split("_")[1].strip(),
            "https://www.": lambda t: t.split("_")[1].strip(),
            "https://www.": lambda t: t.split("_")[1].strip(),
            "https://www.": lambda t: t.replace("阅读最新章节", "").split("_")[0].strip(),
            "https://www.": lambda t: t.split("_")[0].strip(),
        }
        matching = (rule for site, rule in rules.items() if url.startswith(site))
        rule = next(matching, None)
        if rule is None:
            return title  # no rule for this site
        return rule(title)


    def quit_ip(self):        ##命令按钮退
        self.save_reading(sv=2)
        self.root.destroy()

# Entry point: build the root window, attach the reader and run the event loop.
if __name__ == "__main__":
    root = tk.Tk()
    win = NovelReader(root)
    # win.root is the window passed in above.
    win.root.mainloop()


"小说阅读器"
交流一下.并不是所有小说网站都能正确打开.只有符合代码所列条件的才能正确显示.
第一章 五行补火针(1/2)_绝世强医 - 神品屋,第一章 五行补火针,https://www.shenplnwu.cc/56122/56122265/132795994.html
天师下山我只想退婚,乐文小说网>天师下山我只想退婚阅读 > 第273章 要么还钱要么死(第1页),https://www.


认真看书学习,弄通Fox主义。
5 天前 16:20
快速回复:求助:下列代码不能爬取每章小说的下一页
数据加载中...
 
   
关于我们 | 广告合作 | 编程中国 | 清除Cookies | TOP | 手机版

编程中国 版权所有,并保留所有权利。
Powered by Discuz, Processed in 0.033097 second(s), 9 queries.
Copyright©2004-2026, BCCN.NET, All Rights Reserved