A Python crawler for Luoo (落网): grabbing the music plus each journal issue's content and cover image
After two days of learning Python it still hadn't clicked, so I figured the best way in was to write something real.

1. Implementation plan
1.1 Use multiple processes, each owning one spider object; a spider consists of four threads: fetching page content, parsing page content, downloading images, and downloading music (the sketch after this list shows the layout).
1.2 Tune the number of processes and threads to your network conditions.
1.3 Images are saved under img/; music goes under music/ in folders named after the journal issue; the parsed text content is written to result.txt.
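Before the full program, here is a minimal sketch of that layout. The fetch/parse stage functions and the example.com URLs are hypothetical placeholders, not the real crawler code: each process owns its own queue.Queue pipeline, the fetch thread feeds the parse thread through it, and a sentinel tells the parser when the fetcher is done.

#!/usr/bin/python3
# Minimal sketch of the process/thread layout; fetch/parse are
# hypothetical placeholders, not the crawler's real stage functions.
import multiprocessing
import queue
import threading

def fetch(urls, pages):
    # stage 1: drain the URL queue and push (url, html) pairs downstream
    while not urls.empty():
        url = urls.get()
        pages.put((url, "<html>stub for %s</html>" % url))
    pages.put((None, None))  # sentinel: tell the parser we are done

def parse(pages):
    # stage 2: consume pages until the sentinel arrives; the real crawler
    # would enqueue image and music download jobs at this point
    while True:
        url, html = pages.get()
        if url is None:
            break
        print("parsed", url)

def worker(n):
    urls, pages = queue.Queue(), queue.Queue()
    for i in range(3):
        urls.put("http://example.com/%d" % (n * 3 + i))  # placeholder URLs
    threads = [threading.Thread(target=fetch, args=(urls, pages)),
               threading.Thread(target=parse, args=(pages,))]
    for t in threads:
        t.start()
    for t in threads:
        t.join()

if __name__ == '__main__':
    # one spider process per URL block; scale the count to your network
    for n in range(2):
        multiprocessing.Process(target=worker, args=(n,)).start()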
2. Results
3. Code
The full program:
#!/usr/bin/python3
# -*- coding: UTF-8 -*-
import re
import string
import sys
import os
import threading
import requests
import queue
import time
import multiprocessing


class spider:
    def __init__(self, path):
        # URL queue
        self.queUrl = queue.Queue()
        # page-content queue
        self.quePageInfo = queue.Queue()
        # file that collects the crawl results
        self.f = open(path, "w+")
        # crawler threads
        self.threads = []
        # lock guarding file writes and directory creation
        self.mu = threading.Lock()
        # image download queue
        self.queImg = queue.Queue()
        # music download queue
        self.queMusic = queue.Queue()
        # initialize the download directories
        self.DownLoadPath()

    # extract the links contained in a page
    def GetUrl(self, page):
        regular = 'href="([^"]*)'
        pattern = re.compile(regular)
        for i in pattern.findall(page):
            self.queUrl.put(i)

    # put a given URL into the crawl queue
    def SetCapUrlQueue(self, url):
        self.queUrl.put(url)

    # create the directories for saving images and music
    def DownLoadPath(self):
        # image download directory
        self.pathImg = "./img"
        # music download directory
        self.pathMusic = "./music"
        # base URL for music downloads (domain truncated in the source)
        self.musicUrl = "http://mp3-cdn2."
        # create the directories if they do not exist
        if not os.path.exists(self.pathImg):
            try:
                os.makedirs(self.pathImg)
            except Exception:
                print("create", self.pathImg, "err")
        if not os.path.exists(self.pathMusic):
            try:
                os.makedirs(self.pathMusic)
            except Exception:
                print("create", self.pathMusic, "err")

    # fetch pages and put them into the page-content queue
    def GetPage(self):
        while not self.queUrl.empty():
            url = self.queUrl.get()
            try:
                r = requests.get(url)
            except Exception:
                print("get", url, "err")
                continue
            self.quePageInfo.put([url, r.text])

    # download images
    def DownLoadImg(self):
        while True:
            while not self.queImg.empty():
                img = self.queImg.get()
                path = self.pathImg + "/" + img[1]
                # skip files that already exist
                if os.path.exists(path):
                    continue
                try:
                    r = requests.get(img[0])
                except Exception:
                    print("get err")
                    continue
                if r.status_code == 200:
                    with open(path, 'wb') as fp:
                        fp.write(r.content)
                else:
                    # on failure, retry up to five times
                    for i in range(0, 5):
                        r = requests.get(img[0])
                        if r.status_code == 200:
                            with open(path, 'wb') as fp:
                                fp.write(r.content)
                            break
                        time.sleep(2)
            time.sleep(2)

    # download music
    def DownLoadMusic(self):
        while True:
            while not self.queMusic.empty():
                music = self.queMusic.get()
                path = self.pathMusic + "/" + music[0] + "/"
                # strip '/' from track names so they form a valid file name
                if '/' in music[1]:
                    tmp = music[1].split('/')
                    filePath = path + tmp[0] + tmp[1] + ".mp3"
                else:
                    filePath = path + music[1] + ".mp3"
                # create one folder per journal issue
                if self.mu.acquire(True):
                    if not os.path.exists(path):
                        os.makedirs(path)
                    self.mu.release()
                # download only tracks that do not exist yet
                if os.path.exists(filePath):
                    continue
                try:
                    r = requests.get(music[2])
                except Exception:
                    print("get err")
                    continue
                if r.status_code == 200:
                    print("downLoad", music[2])
                    with open(filePath, 'wb') as fp:
                        fp.write(r.content)
                else:
                    # on failure, retry up to five times with the fixed-up URL
                    for i in range(0, 5):
                        newUrl = music[2].replace('/0', '/')
                        print("redownLoad", newUrl)
                        r = requests.get(newUrl)
                        if r.status_code == 200:
                            with open(filePath, 'wb') as fp:
                                fp.write(r.content)
                            break
                        time.sleep(2)
            time.sleep(2)

    # create the music download threads
    def CreateDownLoadMusicThread(self, num):
        for i in range(0, num):
            self.threads.append(threading.Thread(target=self.DownLoadMusic))

    # create the page-fetching threads
    def CreateGetPageThread(self, num):
        for i in range(0, num):
            self.threads.append(threading.Thread(target=self.GetPage))

    # create the page-parsing threads
    def CreatePsrPageThread(self, num):
        for i in range(0, num):
            self.threads.append(threading.Thread(target=self.PsrPage))

    # create the image download threads
    def CreateDownLoadImgThread(self, num):
        for i in range(0, num):
            self.threads.append(threading.Thread(target=self.DownLoadImg))

    # start all threads, then wait on them; the parse/download threads
    # loop forever, so the join keeps the worker process alive
    def Run(self):
        for t in self.threads:
            t.daemon = True
            t.start()
        for t in self.threads:
            t.join()

    # take page content off the queue and parse it
    def PsrPage(self):
        while True:
            while not self.quePageInfo.empty():
                a = self.quePageInfo.get()
                # extract the image links
                regular = r'(<img src="(http://img-cdn2./pics/vol/([^!]*)![^"]*))|'
                # extract the description
                regular += r'(<meta name="description" content="([^"]*))|'
                # extract the keywords (the issue's theme)
                regular += r'(<meta name="keywords" content="([^"]*))|'
                # extract the issue number
                regular += r'(vol-number rounded">([^<]*))|'
                # extract the issue title
                regular += r'(vol-title">([^<]*))|'
                # extract the track names
                regular += r'(trackname btn-play">([^<]*))'
                pattern = re.compile(regular)
                result = pattern.findall(a[1])
                if len(result) < 10:
                    continue
                i = 0
                first = 0
                content = a[0] + '\n'
                imgName = ""
                music = ""
                for tmp in result:
                    if i == 0:
                        # description
                        content += tmp[4] + '\n'
                    elif i == 1:
                        # theme keywords
                        content += "@mark " + tmp[6] + '\n'
                    elif i == 2:
                        # issue number
                        music = str(int(tmp[8]))
                        content += "@vol " + tmp[8] + '\n'
                        imgName = tmp[8] + ".jpg"
                    elif i == 3:
                        # issue title
                        content += "@tip " + tmp[10] + '\n'
                    elif tmp[0] != '':
                        first = first + 1
                        # the first image is the cover
                        if first == 1:
                            # queue the image link together with its file name
                            self.queImg.put([tmp[1], imgName])
                            content += "@img " + imgName + '\n'
                            content += "@music\n"
                    else:
                        # track name, indented under @music
                        content += "    " + tmp[12] + '\n'
                        # build and queue the music download link
                        s = tmp[12].split('.')
                        path = self.musicUrl + music + "/" + s[0] + ".mp3"
                        self.queMusic.put([music, tmp[12], path])
                    i = i + 1
                # take the lock and append to the result file
                if self.mu.acquire(True):
                    self.f.write(content)
                    self.mu.release()
            time.sleep(2)

    # close the result file on exit
    def Quit(self):
        self.f.close()


def worker(num):
    path = 'result' + str(num) + '.txt'
    Luo = spider(path)
    # each process crawls its own block of 250 consecutive issues
    avg = 250
    num = num * avg + avg
    for i in range(num - avg, num):
        # journal page URL (domain truncated in the source)
        content = "http://www."
        # zero-pad the issue number to three digits
        if i < 10:
            url = content + "00" + str(i)
        elif i < 100:
            url = content + "0" + str(i)
        else:
            url = content + str(i)
        Luo.SetCapUrlQueue(url)
    Luo.CreateGetPageThread(1)
    Luo.CreatePsrPageThread(1)
    Luo.CreateDownLoadImgThread(1)
    Luo.CreateDownLoadMusicThread(1)
    Luo.Run()


# spawn one spider process per block
def RunSpider(num):
    for i in range(0, num):
        p = multiprocessing.Process(target=worker, args=(i,))
        p.start()


if __name__ == '__main__':
    RunSpider(1)
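One small note on worker: the if/elif chain there only zero-pads the issue number to three digits before appending it to the base URL. str.zfill does the same thing in one call, so an optional (not in the original) one-line equivalent of that loop body would be:

# equivalent zero-padding with str.zfill; an optional refactor of worker's if/elif chain
url = content + str(i).zfill(3)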