#2
vbck2023-07-04 19:43
|
查看网页元素后发现:有些页面是正常的,有些页面的第一张和最后一张图片是对调了的。
高手们有办法解决吗?谢谢!
http://2.
http://2.
http://2.
------------------------------------------------------
程序代码:
# Scrape text and gallery images for every row of the workbook.
#
# For each row of '下载网址.xlsx' (read without a header row):
#   * column 8 is split on '=' and the second piece is appended to BASE_URL
#     to form the page address;
#   * column 0 names the output folder created under OUTPUT_ROOT;
#   * column 1 receives the space-joined text of the matched <span> elements.
# Images found on the page are saved as 1.jpg, 2.jpg, ... in DOM order.
import os
import time

import pandas as pd
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By

EXCEL_PATH = '下载网址.xlsx'
# NOTE(review): this prefix looks truncated in the pasted source - restore the
# real URL prefix before running. TODO confirm.
BASE_URL = 'https://market.m.'
OUTPUT_ROOT = "D:/python/"
TEXT_XPATH = '//*[@id="root"]/div/div[3]/div[1]/div[2]/div[2]//span'
IMAGE_XPATH = '//*[@id="root"]/div/div[3]/div[1]/div[1]/div//img'
PAGE_SETTLE_SECONDS = 5    # fixed pause after driver.get() so the page can render
REQUEST_TIMEOUT = 30       # seconds; without it a stalled download blocks forever


def collect_text(driver):
    """Return the text of every element matching TEXT_XPATH, space-joined."""
    spans = driver.find_elements(By.XPATH, TEXT_XPATH)
    return ' '.join(span.text for span in spans)


def collect_image_urls(driver):
    """Return the distinct http(s) ``src`` values of the gallery images.

    Order follows the DOM; the first occurrence of each URL wins.

    NOTE(review): if the gallery is a looping carousel it may insert a clone
    of the last slide before the first one, which would make the last picture
    come out as 1.jpg here - confirm against the page DOM before changing the
    ordering logic.
    """
    urls = []
    for image in driver.find_elements(By.XPATH, IMAGE_XPATH):
        src = image.get_attribute("src")
        # get_attribute returns None when the attribute is absent, and
        # requests cannot fetch data:/blob: URIs - skip both.
        if not src or not src.startswith(("http://", "https://")):
            continue
        if src not in urls:
            urls.append(src)
    return urls


def download_images(urls, folder_path):
    """Save each URL in *urls* as ``<position>.jpg`` inside *folder_path*.

    A failed download is reported and skipped; its number is left unused so
    the remaining file names still match the image positions on the page.
    """
    for number, image_url in enumerate(urls, start=1):
        try:
            response = requests.get(image_url, timeout=REQUEST_TIMEOUT)
            response.raise_for_status()
        except requests.RequestException as exc:
            print('skip {}: {}'.format(image_url, exc))
            continue
        target = os.path.join(folder_path, '{}.jpg'.format(number))
        with open(target, 'wb') as f:
            f.write(response.content)


def main():
    """Visit every row's page, store its text in column 1, download its images."""
    df = pd.read_excel(EXCEL_PATH, header=None)
    driver = webdriver.Chrome()
    try:
        driver.implicitly_wait(5)
        driver.maximize_window()
        for row in range(df.shape[0]):
            item_id = str(df.at[row, 8]).split('=')[1]
            folder_path = OUTPUT_ROOT + str(df.at[row, 0])
            os.makedirs(folder_path, exist_ok=True)

            driver.get(BASE_URL + item_id)
            time.sleep(PAGE_SETTLE_SECONDS)

            df.iloc[row, 1] = collect_text(driver)
            download_images(collect_image_urls(driver), folder_path)
    finally:
        # Always close the browser, even if a page or download fails.
        driver.quit()
    # Written back over the input workbook, still without header or index.
    df.to_excel(EXCEL_PATH, index=False, header=False)


if __name__ == "__main__":
    main()
# Batch scraper: for each row of '下载网址.xlsx' (no header row) open the page
# built from column 8, write the page's <span> text into column 1, and save
# the page's images as 1.jpg, 2.jpg, ... under "D:/python/<column 0>".
from selenium import webdriver
from selenium.webdriver.common.by import By
import requests
import time
import pandas as pd
import os

driver = webdriver.Chrome()
driver.implicitly_wait(5)
driver.maximize_window()
df = pd.read_excel('下载网址.xlsx', header=None)
# NOTE(review): this prefix looks truncated in the pasted source - restore the
# real URL prefix before running. TODO confirm.
base_url = 'https://market.m.'

try:
    for row in range(df.shape[0]):
        # The second '='-separated piece of column 8 is used as the item id.
        item_id = df[8][row].split('=')[1]
        url = base_url + str(item_id)
        folder_path = "D:/python/" + df[0][row]
        os.makedirs(folder_path, exist_ok=True)

        driver.get(url)
        time.sleep(5)  # wait 5 seconds so the page can finish loading

        spans = driver.find_elements(
            By.XPATH, '//*[@id="root"]/div/div[3]/div[1]/div[2]/div[2]//span')
        df.iloc[row, 1] = ' '.join(span.text for span in spans)

        images = driver.find_elements(
            By.XPATH, '//*[@id="root"]/div/div[3]/div[1]/div[1]/div//img')
        # Distinct sources in DOM order (first occurrence wins).
        # NOTE(review): a looping carousel may place a clone of the last slide
        # before the first one, which would explain the first/last picture
        # swap seen on some pages - confirm against the page DOM.
        img_list = []
        for image in images:
            src = image.get_attribute("src")
            # get_attribute returns None for an <img> without src, which
            # would make requests.get() raise.
            if src and src not in img_list:
                img_list.append(src)

        for j, img_url in enumerate(img_list, start=1):
            try:
                # A timeout keeps one stalled download from hanging the run.
                response = requests.get(img_url, timeout=30)
            except requests.RequestException as exc:
                print('skip {}: {}'.format(img_url, exc))
                continue
            if not response.ok:
                # Do not store an HTTP error page as if it were a picture.
                print('skip {}: HTTP {}'.format(img_url, response.status_code))
                continue
            with open(folder_path + '/{}.jpg'.format(j), 'wb') as f:
                f.write(response.content)
finally:
    # Close the browser even when a page or download fails.
    driver.quit()

# Written back over the input workbook, still without header or index.
df.to_excel('下载网址.xlsx', index=False, header=False)