关于 multiprocessing.Pool 进程池的问题
本人写了个爬虫,使用了 multiprocessing.Pool(注意:它是进程池,而不是线程池)。程序运行时没有报错,但我发现了一个问题:如图所示,爬取(打印)的顺序是乱的。请问这个问题该如何解决?谢谢。
代码如下:
from multiprocessing import Pool
import re
from lxml import etree
import requests
# Request headers sent with every page fetch.
# The HTTP header name is "User-Agent" (with a hyphen). The previous key,
# "UserAgent", is not a header servers look at, so the browser string below
# was never used as the client's user agent.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0',
}
def get_html(url="http://www."):
    """Fetch the index page and return the chapter links found on it.

    Args:
        url: Address of the index page.
            NOTE(review): the URL literal in the original source is cut off
            after "http://www." (and its closing quote is missing), so the
            default here is only that visible fragment. Replace it with the
            real index URL, or pass the URL explicitly, before running.

    Returns:
        The list of ``href`` attribute values selected by the index XPath
        expression, in document order.

    Raises:
        requests.RequestException: if the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # Without a timeout a stalled server would block this call forever.
    response = requests.get(url, headers=headers, timeout=10)
    # Only report success once the server has actually returned a 2xx/3xx.
    response.raise_for_status()
    print('%s 请求成功' %(url))
    response.encoding = 'utf-8'
    page = etree.HTML(response.text)
    return page.xpath('/html/body/div[7]/div/ul//li/span/a/@href')
def save_html(url, save_dir='C:/Users/Administrator/Desktop/斗罗大陆'):
    """Download one chapter page and write its text to ``<save_dir>/<title>.txt``.

    Args:
        url: Address of the chapter page to download.
        save_dir: Directory the text file is written into. It is created if
            it does not exist. Defaults to the folder the script originally
            hard-coded.

    Returns:
        The page title (with ``?`` removed), which is also the first line of
        the written file. Returning it lets a caller that uses ``Pool.map``
        report results in input order.

    Raises:
        requests.RequestException: if the request fails, times out, or the
            server answers with an HTTP error status.
        IndexError: if the page has no ``<title>`` text.
        OSError: if the output file cannot be created or written.
    """
    import os

    # Without a timeout a stalled server would block this worker forever.
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()
    response.encoding = 'utf-8'
    page = etree.HTML(response.text)
    title = page.xpath('//title/text()')[0].replace('?', '')
    text_list = page.xpath('//div[@class="bg"]/div[@class="content"]//p/text()')

    # '?' is not the only character Windows rejects in file names; strip the
    # rest of the reserved set so an unusual title cannot make open() fail.
    file_name = re.sub(r'[\\/:*?"<>|]', '', title)
    # exist_ok avoids a race when several workers create the folder at once.
    os.makedirs(save_dir, exist_ok=True)

    with open('%s/%s.txt' %(save_dir, file_name), 'w', encoding='utf-8') as file:
        file.write(title + '\n')
        file.writelines('\t%s\n' %(text) for text in text_list)
    print('《%s》爬取成功' %(title))
    return title
if __name__ == "__main__":
    # Collect the chapter links first, then download them with 4 worker
    # processes.
    url_list = get_html()
    # Managing the pool with ``with`` guarantees the worker processes are
    # shut down when the work is done; the pool was previously never closed.
    # map() blocks until every URL has been processed and returns its results
    # in the same order as url_list. The workers themselves run concurrently,
    # so the per-page messages they print appear in completion order, not in
    # list order -- that is expected and does not affect the saved files.
    with Pool(4) as pool:
        pool.map(save_html, url_list)