Crawler source code ------- batch-crawling photo galleries
import requests
from bs4 import BeautifulSoup
import os
import re
import time
import PySimpleGUI as sg
import random
# Get the URL and title of every album on a listing page
def zongwangzhi(url):
    data = paqu(url)
    data = BeautifulSoup(data, "html.parser", from_encoding="gb18030")  # avoids garbled Chinese text
    d = data.find_all('div', class_="listMeinuT")
    # print(d)
    d = str(d[0])
    findlink = (r'<a class="MMPic"(.*?)</li><li>')
    link = re.findall(findlink, d)
    return link
# Get the number of pages in an album
def yeshu(url):
    sj = paqu(url)
    data = BeautifulSoup(sj, "html.parser", from_encoding="gb18030")
    y = data.find_all("ul", class_="articleV4Page l")
    y = str(y[0])
    findy = (r'共(.*?)页:')  # matches the site's pager text "共N页:", i.e. "N pages in total"
    linky = re.findall(findy, y)
    return int(linky[0])
# Build the URL of every page in an album and download the images on each page
def zhuanjidiz(url, cs, wenjiandz):
    for i in range(1, cs + 1):
        if i == 1:
            url1 = url
            print("This album has " + str(cs) + " pages; now downloading page 1.")
            sj = paqu(url1)
            data = BeautifulSoup(sj, "html.parser", from_encoding="gb18030")
            d = data.find_all('div', class_="content")
            d = str(d[0])
            findlink = (r' src="(.*?)"')
            link = re.findall(findlink, d)
        else:
            url1 = url[:-5] + "_" + str(i) + ".html"
            print("This album has " + str(cs) + " pages; now downloading page " + str(i) + ".")
            sj = paqu(url1)
            data = BeautifulSoup(sj, "html.parser", from_encoding="gb18030")
            d = data.find_all('div', class_="content")
            d = str(d[0])
            findlink = (r' src="(.*?)"')
            link = re.findall(findlink, d)
        for y in link:
            baocuntupian(y, wenjiandz)
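# The two branches above differ only in how url1 is built. A condensed sketch of the
# same loop (an alternative for reference, not part of the original script):
def zhuanjidiz_compact(url, cs, wenjiandz):
    for i in range(1, cs + 1):
        url1 = url if i == 1 else url[:-5] + "_" + str(i) + ".html"
        print("This album has " + str(cs) + " pages; now downloading page " + str(i) + ".")
        data = BeautifulSoup(paqu(url1), "html.parser", from_encoding="gb18030")
        d = str(data.find_all('div', class_="content")[0])
        for y in re.findall(r' src="(.*?)"', d):
            baocuntupian(y, wenjiandz)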
# Fetch the raw bytes of a URL, retrying twice on failure
def paqu(url):
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
    }
    try:
        urldate = requests.get(url, headers=header, timeout=12).content
        return urldate
    except:
        try:
            print("\n", "*************** Network timeout, reconnecting (attempt 1) ***************")
            urldate = requests.get(url, headers=header, timeout=12).content
            return urldate
        except:
            try:
                print("\n", "*************** Network timeout, reconnecting (attempt 2) ***************")
                urldate = requests.get(url, headers=header, timeout=30).content
                return urldate
            except:
                print("\n", "--------------- Connection failed, please try again ---------------")
                # print(time.sleep(86400))
                pass
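# The nested try/except blocks above can also be written as a single retry loop.
# A minimal alternative sketch (not part of the original script): it reuses the same
# User-Agent and timeouts, and returns None once every attempt has failed.
def paqu_retry(url, attempts=3):
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
    }
    for n in range(1, attempts + 1):
        try:
            timeout = 30 if n == attempts else 12  # longer timeout on the last try, as above
            return requests.get(url, headers=header, timeout=timeout).content
        except requests.RequestException:
            print("Attempt " + str(n) + " of " + str(attempts) + " failed, retrying...")
    print("Connection failed, please try again.")
    return None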
# Save one image to disk under a random file name
def baocuntupian(url, wenjiandizi):
    tupan = paqu(url)
    name = random.randint(0, 10000000)
    print("Downloading......")
    file = open(wenjiandizi + "\\" + str(name) + ".jpg", "wb")
    file.write(tupan)
    file.close()
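# Note that random.randint names can collide and silently overwrite earlier images, and
# file.write() will raise if paqu() returned None after all retries. A sketch of an
# alternative saver (hypothetical helper, not part of the original script) that names
# the file after the last segment of the image URL:
def baocun_by_urlname(url, wenjiandizi):
    tupan = paqu(url)
    if tupan is None:  # every retry inside paqu() failed
        return
    name = url.rsplit("/", 1)[-1] or str(random.randint(0, 10000000)) + ".jpg"
    with open(os.path.join(wenjiandizi, name), "wb") as file:
        file.write(tupan)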
def main(dz, ks, js):
    kaishi = ks
    jieshu = js
    # Walk the listing pages; each one yields the URL and title of every album on it
    for i in range(kaishi, jieshu + 1):
        url = "https://www.tu963.cc/y/2/list_2_"
        list1 = []
        url = url + str(i) + ".html"
        print(url)
        list1 = zongwangzhi(url)
        for o in list1:
            find1 = (r' href="(.*?)" target=')
            dizi_href = re.findall(find1, o)
            find2 = (r'title="(.*?)">')
            dizi_title = re.findall(find2, o)
            # print(dizi_href[0], dizi_title[0])
            ys = yeshu(dizi_href[0])
            ml = dz + dizi_title[0]
            if not os.path.exists(ml):
                print("\n", "Created a new folder!")
                os.mkdir(ml)
                print("----------------------------------------------------------------------------")
                zhuanjidiz(dizi_href[0], ys, ml)
            else:
                print("The folder already exists, skipping this album!")
                print("-----------------------------------------------------------")
    print("--------------- Done ----------------------------")
def denglu_windows():
    font_ = ("黑体", 20)
    layout = [
        [sg.InputText("Choose a folder to save into......", font=font_, size=(30, 1), key="-dizhi-"), sg.FolderBrowse("Browse", font=font_)],
        [sg.T("Start page:", font=font_), sg.In("1", enable_events=True, font=font_, key="-kaishi-", size=(3, 1))],
        [sg.T("End page:", font=font_), sg.In("1", enable_events=True, font=font_, key="-jieshu-", size=(3, 1))],
        [sg.T("Page progress:", font=font_), sg.ProgressBar(max_value=5, orientation="h", size=(47, 20), key="-bar-")],
        [sg.T("Album progress:", font=font_), sg.ProgressBar(5, orientation="h", size=(47, 20), key="-bar2-")],
        [sg.T("Image progress:", font=font_), sg.ProgressBar(5, orientation="h", size=(47, 20), key="-bar3-")],
        [sg.B("Crawl", font=font_)],
    ]
    window = sg.Window("Crawler", layout)
    list1 = []
    while True:
        event, values = window.read()
        if event == "Crawl":
            main(values["-dizhi-"] + "\\", int(values["-kaishi-"]), int(values["-jieshu-"]))
        if event is None:
            break
    window.close()
denglu_windows()
# Copy the code into your editor and run it. If it reports errors, check which libraries are missing and install them. When it starts, first choose the folder where the files will be saved, then set the page range to download; the start page must be greater than or equal to 1.
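# The libraries used above come from the pip packages requests, beautifulsoup4 and PySimpleGUI.
# To run without the GUI, a minimal sketch (the save path below is only an example and must
# already exist; pass it with a trailing backslash, as denglu_windows() does):
# main("D:\\pictures\\", 1, 1)  # crawl listing pages 1 through 1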