请问:运行报错FileNotFoundError: [Errno 2] No such file or directory: 'cache.csv'怎么解决
程序代码:
import requests from pyquery import PyQuery as pq from prettyprinter import cpprint import json from urllib.parse import urlencode from selenium import webdriver from selenium.webdriver.support.wait import WebDriverWait from selenium. import By from selenium.webdriver.support import expected_conditions as EC import time import csv import datetime import sys def get_ajax(url): headers = { 'referer': 'https://item.', # referer: https://item.'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36', } response = requests.get(url, headers=headers) return json.loads(response.text[26:-2]) def make_url(baseurl, page=0, score=0, productId='3756271'): data1 = { 'callback': 'fetchJSON_comment98vv7490', 'productId': productId, 'score': score, 'sortType': '6', 'page': page, 'pageSize': '10', 'isShadowSku': '0', # 'fold': '1', # } url = baseurl + urlencode(data1) return url def parse_json(rjson, url=None): for comment in rjson.get('comments'): item = {} item['url'] = url item['评论星级'] = comment.get('score') item['评论长度'] = len(comment.get('content')) item['评论点赞数量'] = comment.get('usefulVoteCount') item['评论回复数量'] = comment.get('replyCount') item['评论文本内容'] = comment.get('content') item['评论者等级'] = comment.get('userLevelId') try: date1 = time.strptime(comment.get('creationTime'), "%Y-%m-%d %H:%M:%S") date2 = time.localtime(time.time()) date1 = datetime.datetime(date1[0], date1[1], date1[2]) date2 = datetime.datetime(date2[0], date2[1], date2[2]) item['评论发表距抓取的天数(days)'] = str((date2 - date1).days) except Exception as error: print('error is >>>', error) item['评论发表距抓取的天数(days)'] = '' if comment.get('afterUserComment', {}).get('hAfterUserComment', {}).get('content', '') == '此用户未填写评价内容': item['追评文本内容'] = '' else: item['追评文本内容'] = comment.get('afterUserComment', {}).get('hAfterUserComment', {}).get('content', '') try: date1 = time.strptime(comment.get('afterUserComment', {}).get('created', ''), "%Y-%m-%d %H:%M:%S") date2 = time.localtime(time.time()) date1 = datetime.datetime(date1[0], date1[1], date1[2]) date2 = datetime.datetime(date2[0], date2[1], date2[2]) item['追评与初评相距时间'] = str((date2 - date1).days) except Exception: item['追评与初评相距时间'] = '' if item['追评文本内容'] == '': item['追评与初评相距时间'] = '' yield item def save_csv_merinfo(item): with open(FILENAME_MER, 'a', encoding=ENCODING, newline='') as f: writer = csv.DictWriter(f, fieldnames=fieldnames_merinfo) # writer.writeheader() writer.writerow(item) def save_csv_cominfo(item): with open(FILENAME_COM, 'a', encoding=ENCODING, newline='') as f: writer = csv.DictWriter(f, fieldnames=fieldnames_cominfo) # writer.writeheader() writer.writerow(item) def get_page(url): browser.get(url) submit = wait.until(EC.presence_of_element_located((By.XPATH, '//div[contains(@class,"tab-main")]/ul/li[5]'))) time.sleep(2) for i in range(30): browser.execute_script("window.scrollBy(0,50)") time.sleep(0.1) submit.click() time.sleep(3) return browser.page_source def parse_page(html, url): page_item = {} doc = pq(html, parser='html') page_item['url'] = url page_item['商品小分类名称'] = doc('#crumb-wrap > div > div.crumb.fl.clearfix > div:nth-child(5) > a').text() page_item['商品名称'] = doc('div.itemInfo-wrap div.sku-name').text() page_item['商品总评论数量'] = doc('#detail > div.tab-main.large > ul > li.current > s').text().replace('(', '').replace( ')', '') page_item['商品好评率'] = doc('#comment > div.mc > > > div').text() ##comment > div.mc > > > div return page_item def csv_create(): with open(FILENAME_MER, 'w', encoding=ENCODING, newline='') as f: writer = csv.writer(f) writer.writerow(fieldnames_merinfo) with open(FILENAME_COM, 'w', encoding=ENCODING, newline='') as f: writer = csv.writer(f) writer.writerow(fieldnames_cominfo) def crawl_all_page_url(): global ALL_PAGE_URL browser = webdriver.Chrome() wait = WebDriverWait(browser, 20) browser.get('https://www.') wait.until(EC.presence_of_element_located( (By.XPATH, '/html/body/div[5]/div[2]/div[1]/div[2]/div[2]/div[9]/div[2]/div[3]'))) CASE = [] for i in range(10): # 水果 initcase = '/html/body/div[5]/div[2]/div[1]/div[2]/div[2]/div[9]/div[2]/div[3]/dl[2]/dd/a[{}]'.format(i + 1) CASE.append(initcase) for i in range(4): # 猪肉羊肉 initcase = '/html/body/div[5]/div[2]/div[1]/div[2]/div[2]/div[9]/div[2]/div[3]/dl[3]/dd/a[{}]'.format(i + 1) CASE.append(initcase) for i in range(8): # 海鲜水产 initcase = '/html/body/div[5]/div[2]/div[1]/div[2]/div[2]/div[9]/div[2]/div[3]/dl[4]/dd/a[{}]'.format(i + 1) CASE.append(initcase) for i in range(4): # 禽肉蛋白 initcase = '/html/body/div[5]/div[2]/div[1]/div[2]/div[2]/div[9]/div[2]/div[3]/dl[5]/dd/a[{}]'.format(i + 1) CASE.append(initcase) for i in range(6): # 冷冻食品 initcase = '/html/body/div[5]/div[2]/div[1]/div[2]/div[2]/div[9]/div[2]/div[3]/dl[6]/dd/a[{}]'.format(i + 1) CASE.append(initcase) # 规则只要更改range里面的值和dl[]里面的值,可高度扩展 for case in CASE: print('>>>>>>>>>') submit = wait.until(EC.element_to_be_clickable( (By.XPATH, case))) submit.click() print(browser.current_url) handle = browser.current_window_handle handles = browser.window_handles for newhandle in handles: if newhandle != handle: browser.switch_to.window(newhandle) time.sleep(1.5) wait.until(EC.presence_of_element_located((By.XPATH, '//div[@id="plist"]/ul[contains(@class,"gl-warp")]'))) doc = pq(browser.page_source, parser='html') for li in list(doc('div#plist ul.gl-warp li').items())[:10]: res = 'https:' + str(li('div div.p-commit-n strong a').attr('href')).replace('#comment', '') print(res) ALL_PAGE_URL.append(res) time.sleep(1.5) browser.close() browser.switch_to.window(handle) def load_all_page_url(): global ALL_PAGE_URL with open("cache.csv", 'r', encoding="utf-8") as f: reader = csv.reader(f) for item in reader: ALL_PAGE_URL.append(item[0]) if __name__ == '__main__': # 前期准备>>>>>>>>>> browser = webdriver.Chrome() # selenium模拟浏览器 wait = WebDriverWait(browser, 20) MAXINDEX = 7 # 最大请求评论页数,为了控制评论数量在500条左右,应该设置为35左右,35时略大于500(网页评论非无限下拉) # 用户自定义配置区******************************** TIMESLEEP = 2 # 睡眠间隔 FILENAME_MER = 'merinfo_test.csv' # 商品信息的文件名 FILENAME_COM = 'cominfo_test.csv' # 评论信息的文件名 #FILENAME_CACHE = 'cache.csv' ENCODING = 'UTF-8' # 保存的CSV的编码 # ********************************************** # csv文件的字段 fieldnames_merinfo = ['url', '商品小分类名称', '商品名称', '商品总评论数量', '商品好评率'] fieldnames_cominfo = ['url', '评论星级', '评论长度', '评论点赞数量', '评论回复数量', '评论文本内容', '评论者等级', '评论发表距抓取的天数(days)', '追评文本内容', '追评与初评相距时间'] # <<<<<<<<<<<<<<<<< start = time.time() # csv_create() # 重置 # 去重模块>>> URLSET = [] # 已存在的url的集合 with open(FILENAME_MER, 'r', encoding=ENCODING) as f: reader = csv.reader(f) for res in reader: URLSET.append(res[0]) print('URLSET is', URLSET) # 爬取商品信息 ALL_PAGE_URL = [] # 所有的网页链接 load_all_page_url() # 这两个函数要二选一,load_all_page_url会从本地的cache.csv载入,速度更快,脱机工作,不占用网络 # crawl_all_page_url() # 这两个函数要二选一,load_all_page_url会从本地的cache.csv载入,速度更快,脱机工作,不占用网络 for page_url in ALL_PAGE_URL: if page_url not in URLSET: URLSET.append(page_url) # 动态去重 try: html = get_page(page_url) # 请求网页,selenium动态渲染 item_mer = parse_page(html, url=page_url) # 解析网页,pyquery cpprint(item_mer) # 爬取评论信息,ajax Flag = 0 # 计数器 ITEMS = [] baseurl = 'https://sclub.' for score in [5, 3, 2, 1]: # 0全部评论,5追评,3好评,2中评,1差评 if score == 5: MAXINDEX_TEMP = MAXINDEX else: MAXINDEX_TEMP = int(MAXINDEX / 7) # 控制比例为7:1:1:1 for index in range(MAXINDEX_TEMP): time.sleep(TIMESLEEP) url = make_url(baseurl, page=index, score=score, productId=''.join(list(filter(str.isdigit, page_url)))) # 构造url try: json_ = get_ajax(url) # 进行ajax请求 if len(json_.get('comments')) != 0: for item in parse_json(json_, url=page_url): # 解析json cpprint(item) ITEMS.append(item) Flag += 1 else: break except Exception as error: print('AJAX请求发生错误{}>>>'.format(error)) print('url is {}'.format(url)) print(str(datetime.datetime.now())) sys.exit(0) # ajax请求出错时退出程序,确保数据完整性 # 一个网页的商品信息和评论信息都爬取完毕时,保存数据 save_csv_merinfo(item_mer) # 保存商品信息 for item in ITEMS: # 保存评论信息 try: save_csv_cominfo(item) except Exception as error: print(error) print("保存了{}条评论".format(Flag)) except Exception as error: print('网页请求发生错误{}>>>'.format(error)) print('一个网页请求已经结束>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>') # time.sleep(TIMESLEEP) end = time.time() print('总共用时{}秒'.format(end - start))