# Forum post by 小天管理, posted 2024-09-15. "My code:"
"""Scrape the product list of the bjidex.com trading market into a CSV file.

The script drives headless Chrome through the paginated list at ``URL``,
reads the six text fields of every list entry, and writes all rows to
``OUTPUT_CSV`` (UTF-8 with BOM) before printing the resulting DataFrame.
"""

import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

URL = "https://webs.bjidex.com/sys-bsc-home/#/bscConsole/tradingMarket"
CHROMEDRIVER_PATH = "D:/lab/chromedriver-win64/chromedriver.exe"
OUTPUT_CSV = "bjidex.com_data.csv"

# No surrounding quotes: Selenium hands the argument to Chrome without a
# shell, so quotes inside the value would become part of the user-agent.
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36"
)

TOTAL_PAGES = 184
ITEMS_PER_PAGE = 10
WAIT_SECONDS = 10        # upper bound for explicit waits
PAGE_SETTLE_SECONDS = 1  # pause that lets the list re-render after a page flip

# Absolute XPath of the container holding both the item list and the pager.
LIST_ROOT_XPATH = (
    "/html/body/div[2]/div[2]/div/div/div[2]/div/div/section"
    "/div/div/div/div[7]/div"
)
ITEM_XPATH_TEMPLATE = LIST_ROOT_XPATH + "/div[1]/div/ul/li[{index}]"
PAGER_BUTTON_XPATH_TEMPLATE = LIST_ROOT_XPATH + "/div[2]/ul/li[{index}]/button"

# (CSV column, XPath relative to one list item), in output column order.
ITEM_FIELDS = (
    ("供应商提供商品名称", "/div[1]/div/div[1]/div/span[1]"),
    ("数据供应商名单", "/div[1]/div/div[3]/span"),
    ("商品类型", "/div[1]/div/div[1]/div/span[2]"),
    ("应用场景", "/div[1]/div/div[3]/div"),
    ("商品描述", "/div[1]/div/div[2]"),
    ("价格", "/div[2]/div[1]"),
)

# Page numbers (1-based) at which the pager layout changes.
# NOTE(review): the positions 10/11/12 come from the original script; they
# were applied to the 0-based loop index there, and the posted run stopped
# after page 4, which is consistent with them being meant for 1-based page
# numbers. Verify against the live pager.
FIRST_WINDOW_END = 4
LAST_WINDOW_START = TOTAL_PAGES - 3


def _next_button_xpath(current_page):
    """Return the XPath of the "next page" button while on *current_page*.

    *current_page* is 1-based. The button's position inside the pager list
    depends on how many page links and ellipsis entries are displayed.
    """
    if current_page < FIRST_WINDOW_END or current_page > LAST_WINDOW_START:
        index = 10
    elif current_page in (FIRST_WINDOW_END, LAST_WINDOW_START):
        index = 11
    else:
        index = 12
    return PAGER_BUTTON_XPATH_TEMPLATE.format(index=index)


def _build_driver():
    """Create a headless Chrome driver with the custom user-agent."""
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument(f"--user-agent={USER_AGENT}")
    # Selenium 4 takes the driver path through a Service object; the
    # ``executable_path`` keyword of ``webdriver.Chrome`` was removed.
    service = Service(executable_path=CHROMEDRIVER_PATH)
    return webdriver.Chrome(service=service, options=chrome_options)


def _scrape_page(driver, current_page):
    """Return one dict per readable list entry on the displayed page.

    Entries whose fields cannot be read are reported on stdout and skipped,
    so one broken entry does not abort the whole run.
    """
    # Wait once per page (not once per item) for the list to render.
    time.sleep(PAGE_SETTLE_SECONDS)
    first_field_xpath = ITEM_XPATH_TEMPLATE.format(index=1) + ITEM_FIELDS[0][1]
    try:
        WebDriverWait(driver, WAIT_SECONDS).until(
            EC.presence_of_element_located((By.XPATH, first_field_xpath))
        )
    except TimeoutException:
        # Fall through: the per-item reads below report what is missing.
        pass

    rows = []
    for i in range(1, ITEMS_PER_PAGE + 1):
        item_xpath = ITEM_XPATH_TEMPLATE.format(index=i)
        row = {"页数": current_page}
        try:
            for column, relative_xpath in ITEM_FIELDS:
                element = driver.find_element(By.XPATH, item_xpath + relative_xpath)
                row[column] = element.text
        except WebDriverException as e:
            print(f"Error on page {current_page}, item {i}: {e}")
        else:
            rows.append(row)
    return rows


def _go_to_next_page(driver, current_page):
    """Click the "next page" button; return False when that is not possible."""
    try:
        next_button = WebDriverWait(driver, WAIT_SECONDS).until(
            EC.element_to_be_clickable((By.XPATH, _next_button_xpath(current_page)))
        )
        next_button.click()
    except WebDriverException as e:
        print("翻页出错或已经是最后一页:", e)
        return False
    return True


def main():
    """Run the scrape, save the CSV file, and print the collected table."""
    data_list = []
    driver = _build_driver()
    try:
        driver.get(URL)
        for current_page in range(1, TOTAL_PAGES + 1):
            data_list.extend(_scrape_page(driver, current_page))
            if current_page == TOTAL_PAGES:
                break  # nothing left to flip to
            if not _go_to_next_page(driver, current_page):
                break
    finally:
        # Always release the browser, even when scraping raises.
        driver.quit()

    data_df = pd.DataFrame(data_list)
    data_df.to_csv(OUTPUT_CSV, index=False, encoding="utf_8_sig")
    print(data_df)


if __name__ == "__main__":
    main()

# Output of the originally posted script, kept for reference. It stops after
# four pages (40 rows) because the next-page button could not be located.
#
# PS D:\lab\bigdata24.9.9> & C:/tools/miniconda3/python.exe d:/lab/bigdata24.9.9/bjidex.com.py
# DevTools listening on ws://127.0.0.1:60057/devtools/browser/66a61aa5-3598-4069-94bd-d4f10be20d96
# [42892:8184:0915/232611.717:ERROR:ssl_client_socket_impl.cc(882)] handshake failed; returned -1, SSL error code 1, net_error -101
# [42892:8184:0915/232611.834:ERROR:ssl_client_socket_impl.cc(882)] handshake failed; returned -1, SSL error code 1, net_error -101
# [42892:8184:0915/232630.213:ERROR:ssl_client_socket_impl.cc(882)] handshake failed; returned -1, SSL error code 1, net_error -101
# 翻页出错或已经是最后一页: Message:
#     页数 ... 价格
# 0   1 ... 0.5 元/次
# 1   1 ... 0 元/次
# 2   1 ... 0 元/次
# 3   1 ... 2.5 元/次
# ...
# 37  4 ... 0.2 元/次
# 38  4 ... 0.1 元/次
# 39  4 ... 0.15 元/次
# [40 rows x 7 columns]
已推荐帖子