#2
zjd873721519 2018-07-18 10:52
|
程序代码:
import requests as r
import re
import time as t
# HTTP headers sent with every request; the User-Agent is a desktop Chrome 65
# string, assembled from adjacent literals instead of a backslash continuation.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.3; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/65.0.3325.181 Safari/537.36'
    ),
}

# Paragraphs collected by get_info() accumulate in this module-level list.
a = []

# NOTE(review): `pmg` is not imported anywhere in the visible source --
# presumably `import pymongo as pmg`; confirm and add it to the imports.
client = pmg.MongoClient('localhost', 27017)
mydb = client['mydb']
d = mydb['doupochangqiong']  # collection that receives the scraped text
# Body of every <p>...</p> element; DOTALL lets a paragraph span several lines.
_PARAGRAPH_RE = re.compile('<p>(.*?)</p>', re.S)
# NOTE(review): the '.' is unescaped (matches any character) and the pattern
# looks truncated by the forum paste -- confirm the intended watermark text.
_WATERMARK_RE = re.compile(' www. ')
# Named HTML entities such as &ldquo; / &rdquo;.  The original pattern '&\\w;'
# only matched a one-character entity name, so real entities were never
# replaced.  Every named entity is still mapped to '"', as the original intended.
_ENTITY_RE = re.compile(r'&\w+;')

# Seconds to wait for the server before giving up on one page; without a
# timeout a single stalled connection would block the whole run.
REQUEST_TIMEOUT = 10

# Pages are numbered FIRST_PAGE .. LAST_PAGE inclusive.
FIRST_PAGE = 1
LAST_PAGE = 1664


def _clean_paragraphs(html):
    """Return the cleaned text of every ``<p>`` element found in *html*."""
    cleaned = []
    for paragraph in _PARAGRAPH_RE.findall(html):
        paragraph = _WATERMARK_RE.sub('', paragraph)
        cleaned.append(_ENTITY_RE.sub('"', paragraph))
    return cleaned


def get_info(url):
    """Download *url* and append its cleaned paragraphs to the global list ``a``.

    A response whose status code is not 200 is reported and skipped; nothing
    is appended in that case.

    Args:
        url: Address of the page to fetch.

    Raises:
        requests.RequestException: On a network failure or timeout.
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    res = r.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    if res.status_code != 200:
        print('skipped {} (HTTP {})'.format(url, res.status_code))
        return
    # The list is mutated in place, so no `global` declaration is needed.
    a.extend(_clean_paragraphs(res.content.decode('utf-8')))


def main():
    """Fetch every page in order, then store the joined text as one document."""
    for i in range(FIRST_PAGE, LAST_PAGE + 1):
        # NOTE(review): the host part of this URL appears to have been
        # stripped by the forum paste -- restore the real address before use.
        url = 'http://www.{}.html'.format(i)
        try:
            get_info(url=url)
        except (r.RequestException, UnicodeDecodeError) as exc:
            # One bad page must not discard the pages already collected.
            print('failed {}: {}'.format(url, exc))
        t.sleep(1)  # pause between requests
    # Database errors are deliberately not swallowed: a failed insert after a
    # long scrape has to be visible rather than silently ignored.
    d.insert_one({'dou': ''.join(a)})


if __name__ == '__main__':
    main()
[此贴子已经被作者于2018-7-12 17:51编辑过]