股票数据Scrapy爬虫未成功
错误:File "D:\papachong\BaiduStocks\BaiduStocks\spiders\stocks.py", line 22, in parse_stock
name = stockInfo.css('.bets-name').extract()[0]
IndexError: list index out of range
该程序来自北理的嵩天老师网络爬虫课。
# -*- coding: utf-8 -*- import scrapy import re class StocksSpider(scrapy.Spider): name = "stocks" start_urls = ['https://quote.'] def parse(self, response): for href in response.css('a::attr(href)').extract(): try: stock = re.findall(r"[s][hz]\d{6}", href)[0] url = 'https://gupiao.baidu.com/stock/' + stock + '.html' yield scrapy.Request(url, callback=self.parse_stock) except: continue def parse_stock(self, response): infoDict = {} stockInfo = response.css('.stock-bets') name = stockInfo.css('.bets-name').extract()[0] keyList = stockInfo.css('dt').extract() valueList = stockInfo.css('dd').extract() for i in range(len(keyList)): key = re.findall(r'>.*</dt>', keyList[i])[0][1:-5] try: val = re.findall(r'\d+\.?.*</dd>', valueList[i])[0][0:-5] except: val = '--' infoDict[key]=val infoDict.update( {'股票名称': re.findall('\s.*\(',name)[0].split()[0] + \ re.findall('\>.*\<', name)[0][1:-1]}) yield infoDict
# -*- coding: utf-8 -*- # Define your item pipelines here # # Don't forget to add your pipeline to the ITEM_PIPELINES setting # See: https://doc. class BaidustocksPipeline(object): def process_item(self, item, spider): return item class BaidustocksInfoPipeline(object): def open_spider(self, spider): self.f = open('BaiduStockInfo.txt', 'w') def close_spider(self, spider): self.f.close() def process_item(self, item, spider): try: line = str(dict(item)) + '\n' self.f.write(line) except: pass return item
# Configure item pipelines # See https://scrapy. ITEM_PIPELINES = { 'BaiduStocks.pipelines.BaidustocksInfoPipeline': 300, }