#2
lxlsf 2018-01-21 17:01
1. Implementation
1.1 Several processes each create a crawler object; every crawler consists of four threads: fetch page content, parse page content, download images, download music (a minimal sketch of this layering follows the list).
1.2 Set the process and thread counts according to your network conditions.
1.3 Images are saved under img, music under music in folders named after the issue (vol), and the parsed text goes into result.txt.
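As promised above, here is a minimal sketch of that layering: one process per URL block and, inside each process, a small pipeline of threads connected by queues. The stage names (fetch, parse) and the None-sentinel shutdown are illustrative stubs, not taken from the crawler in section 3:

#!/usr/bin/python3
# -*- coding: UTF-8 -*-
# Sketch only: each process owns its own queues and runs one thread per
# pipeline stage; a None sentinel shuts the pipeline down.
import multiprocessing
import queue
import threading

def stage(name, src, dst):
    # pull from the source queue until the sentinel arrives
    while True:
        item = src.get()
        if item is None:
            if dst is not None:
                dst.put(None)  # pass the sentinel downstream
            break
        out = "%s(%s)" % (name, item)
        if dst is not None:
            dst.put(out)
        else:
            print(out)  # last stage: just show what arrived

def worker(block):
    qUrl = queue.Queue()
    qPage = queue.Queue()
    # seed this process's block of urls (three dummy items here)
    for i in range(block * 3, block * 3 + 3):
        qUrl.put(i)
    qUrl.put(None)
    # one thread per stage; the real crawler uses four stages
    threads = [
        threading.Thread(target=stage, args=("fetch", qUrl, qPage)),
        threading.Thread(target=stage, args=("parse", qPage, None)),
    ]
    for t in threads:
        t.start()
    for t in threads:
        t.join()

if __name__ == '__main__':
    # two processes here; tune process and thread counts to the network
    for i in range(2):
        multiprocessing.Process(target=worker, args=(i,)).start()

The real spider below swaps the stubs for requests.get, a regex parser and two downloaders, and lets the download threads free-run as daemons instead of using sentinels.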
2. Results
(Result screenshots were attached to the post; forum login is required to view them.)
3. Code
#!/usr/bin/python3
# -*- coding: UTF-8 -*-
import re
import os
import threading
import requests
import queue
import time
import multiprocessing
class spider:
    def __init__(self, path):
        # url queue
        self.queUrl = queue.Queue()
        # page content queue
        self.quePageInfo = queue.Queue()
        # file holding the crawl results
        self.f = open(path, "w+")
        # worker threads
        self.threads = []
        # lock guarding file writes and directory creation
        self.mu = threading.Lock()
        # image download queue
        self.queImg = queue.Queue()
        # music download queue
        self.queMusic = queue.Queue()
        # create the download directories
        self.DownLoadPath()
    # extract the links contained in a page
    def GetUrl(self, page):
        regular = "href=\"([^\"]*)"
        pattern = re.compile(regular)
        result = pattern.findall(page)
        for i in result:
            self.queUrl.put(i)
    # put a given url into the crawl queue
    def SetCapUrlQueue(self, url):
        self.queUrl.put(url)
    # create the image and music download directories
    def DownLoadPath(self):
        # image directory
        self.pathImg = "./img"
        # music directory
        self.pathMusic = "./music"
        # base url for music downloads
        self.musicUrl = "http://mp3-cdn2."
        # create the directories if they do not exist yet
        if not os.path.exists(self.pathImg):
            try:
                os.makedirs(self.pathImg)
            except Exception:
                print("create", self.pathImg, "err")
        if not os.path.exists(self.pathMusic):
            try:
                os.makedirs(self.pathMusic)
            except Exception:
                print("create", self.pathMusic, "err")
    # fetch pages and queue their content
    def GetPage(self):
        while not self.queUrl.empty():
            url = self.queUrl.get()
            try:
                r = requests.get(url)
            except Exception:
                print("get", url, "err")
                continue
            self.quePageInfo.put([url, r.text])
    # download images
    def DownLoadImg(self):
        while True:
            while not self.queImg.empty():
                img = self.queImg.get()
                # skip files that already exist
                path = self.pathImg + "/" + img[1]
                if not os.path.exists(path):
                    try:
                        r = requests.get(img[0])
                    except Exception:
                        print("get err")
                        continue
                    else:
                        if r.status_code == 200:
                            open(path, 'wb').write(r.content)
                        else:
                            # on a failed GET retry up to five times
                            for i in range(0, 5):
                                r = requests.get(img[0])
                                if r.status_code == 200:
                                    open(path, 'wb').write(r.content)
                                    break
                                time.sleep(2)
            # idle wait while the queue is empty
            time.sleep(2)
    # download music
    def DownLoadMusic(self):
        while True:
            while not self.queMusic.empty():
                music = self.queMusic.get()
                path = self.pathMusic + "/" + music[0] + "/"
                if '/' in music[1]:
                    tmp = music[1].split('/')
                    filePath = self.pathMusic + "/" + music[0] + "/" + tmp[0] + tmp[1] + ".mp3"
                else:
                    filePath = self.pathMusic + "/" + music[0] + "/" + music[1] + ".mp3"
                # create one folder per issue (vol), under the lock
                if self.mu.acquire(True):
                    if not os.path.exists(path):
                        os.makedirs(path)
                    self.mu.release()
                # download only if the track is not there yet
                if not os.path.exists(filePath):
                    try:
                        r = requests.get(music[2])
                    except Exception:
                        print("get err")
                        continue
                    else:
                        if r.status_code == 200:
                            print("downLoad", music[2])
                            open(filePath, 'wb').write(r.content)
                        else:
                            # on a failed GET retry up to five times with
                            # the leading zero stripped from the path
                            for i in range(0, 5):
                                newUrl = music[2].replace('/0', '/')
                                print("redownLoad", newUrl)
                                r = requests.get(newUrl)
                                if r.status_code == 200:
                                    open(filePath, 'wb').write(r.content)
                                    break
                                time.sleep(2)
            # idle wait while the queue is empty
            time.sleep(2)
    # create music download threads
    def CreateDownLoadMusicThread(self, num):
        for i in range(0, num):
            t = threading.Thread(target=self.DownLoadMusic, args=())
            self.threads.append(t)
    # create page fetching threads
    def CreateGetPageThread(self, num):
        for i in range(0, num):
            t = threading.Thread(target=self.GetPage, args=())
            self.threads.append(t)
    # create page parsing threads
    def CreatePsrPageThread(self, num):
        for i in range(0, num):
            t = threading.Thread(target=self.PsrPage, args=())
            self.threads.append(t)
    # create image download threads
    def CreateDownLoadImgThread(self, num):
        for i in range(0, num):
            t = threading.Thread(target=self.DownLoadImg, args=())
            self.threads.append(t)
    # start the threads
    def Run(self):
        # start everything first, then wait; joining inside the start
        # loop would run the threads one after another instead of in
        # parallel, and the download threads would never get to run
        for t in self.threads:
            t.daemon = True
            t.start()
        for t in self.threads:
            t.join()
    # pull page content off the queue and parse it
    def PsrPage(self):
        while True:
            while not self.quePageInfo.empty():
                a = self.quePageInfo.get()
                # image links
                regular = "(\<img src=\"(http:\/\/img-cdn2.\/pics\/vol\/([^\!]*)![^\"]*))|"
                # description
                regular += "(<meta name=\"description\" content=\"([^\"]*))|"
                # theme of the issue
                regular += "(<meta name=\"keywords\" content=\"([^\"]*))|"
                # issue (vol) number
                regular += "(vol-number rounded\"\>([^\<]*))|"
                # issue title
                regular += "(vol-title\"\>([^\<]*))|"
                # track names
                regular += "(trackname btn-play\"\>([^\<]*))"
                pattern = re.compile(regular)
                result = pattern.findall(a[1])
                if len(result) < 10:
                    continue
                i = 0
                first = 0
                content = a[0] + '\n'
                imgName = ""
                music = ""
                for tmp in result:
                    if i == 0:
                        # description
                        content += tmp[4] + '\n'
                    elif i == 1:
                        # theme of the issue
                        content += "@mark " + tmp[6] + '\n'
                    elif i == 2:
                        # issue number
                        music = str(int(tmp[8]))
                        content += "@vol " + tmp[8] + '\n'
                        imgName = tmp[8] + ".jpg"
                    elif i == 3:
                        # issue title
                        content += "@tip " + tmp[10] + '\n'
                    elif tmp[0] != '':
                        first = first + 1
                        # the first image is the cover
                        if first == 1:
                            # image link and image name
                            img = [tmp[1], imgName]
                            self.queImg.put(img)
                            content += "@img " + imgName + '\n'
                            content += "@music\n"
                    else:
                        # track name
                        content += " " + tmp[12] + '\n'
                        # queue the music download link
                        s = tmp[12].split('.')
                        path = self.musicUrl + music + "/" + s[0] + ".mp3"
                        info = [music, tmp[12], path]
                        self.queMusic.put(info)
                    i = i + 1
                # write the result under the lock
                if self.mu.acquire(True):
                    self.f.write(content)
                    self.mu.release()
            time.sleep(2)
    # close the result file on exit
    def Quit(self):
        self.f.close()
# each process crawls its own block of 250 issues
def worker(num):
    path = 'result' + str(num) + '.txt'
    Luo = spider(path)
    avg = 250
    start = num * avg
    for i in range(start, start + avg):
        # issue numbers are zero-padded to three digits in the url
        Luo.SetCapUrlQueue("http://www." + str(i).zfill(3))
    Luo.CreateGetPageThread(1)
    Luo.CreatePsrPageThread(1)
    Luo.CreateDownLoadImgThread(1)
    Luo.CreateDownLoadMusicThread(1)
    Luo.Run()
# spawn the crawler processes
def RunSpider(num):
    for i in range(0, num):
        p = multiprocessing.Process(target=worker, args=(i,))
        p.start()

if __name__ == '__main__':
    RunSpider(1)
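A note on running it: PsrPage, DownLoadImg and DownLoadMusic loop forever, so a worker process never exits on its own once its queues drain; stop it manually when the downloads are done. RunSpider(1) covers issues 0-249 in a single process (avg = 250); RunSpider(4), for example, would cover 0-999 in four blocks of 250, which is where the per-network tuning from 1.2 comes in.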