Asked by: hrocj2n · Asked: 6/2/2023 · Last edited by: Ajeet Verma · Updated: 6/2/2023 · Views: 26
Why are By.CSS and By.CLASS_NAME selectors not working efficiently when using Selenium and Python to scrape data from a website?
Q:
So I am trying to use Selenium to find specific items on a web page, and I want the code to work on similar pages as well. My current code is below.
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd
import requests
driver = webdriver.Chrome(executable_path='/Applications/chromedriver_mac_arm64 (1)/chromedriver')
driver.get(
    "https://www.usgbc.org/projects/?Country=%5B%22United+States%22%5D&Rating+System=%5B%22New+Construction%22%5D&Rating+Version=%5B%22v2009%22%5D&Certification=%5B%22Platinum%22%5D&State=%5B%22Texas%22%5D")
# check this is the right website
# print(driver.title)
# list of building names to quickly check what has been added
buildings = []
locations = []
# lists for items on project page
sqft_amount = []
# dataframe to collect all building information
df_main = pd.DataFrame()
# dataframe to collect building profile data on page
df_profile_data = pd.DataFrame()
# dataframe to collect scorecard data
df_scorecard = pd.DataFrame()
# loop over result pages via the Next button
while True:
    try:
        project_profiles = driver.find_elements(By.CLASS_NAME, "grid-item--title")
        for i in range(len(project_profiles)):
            # Wait for the page to load, as the order of elements will be incorrect otherwise
            time.sleep(1)
            project_profiles = driver.find_elements(By.CLASS_NAME, "grid-item--title")  # Find the list again
            # append and add to df
            building_name = project_profiles[i].text
            buildings.append(building_name)
            print(building_name)
            # enable if checking all

            # building profile page information grab ##############################
            # load building profile page
            building_profile_link = driver.find_element(By.XPATH, f"//div[@id='result-grid']//h1[text()='{building_name}']")
            building_profile_link.click()
            time.sleep(1)

            # address
            address = driver.find_elements(By.CLASS_NAME, 'projectAddress')
            for address_element in address:
                building_address = address_element.text
                locations.append(building_address)
            print(locations)

            # get values from tables on page
            row_data = []
            col_data = []
            # the row typically starts with td[] ##
            # copy the XPath of the first row, then make it end in /td (copy XPath, not the full path)
            rows = WebDriverWait(driver, 3).until(EC.visibility_of_all_elements_located((By.XPATH, "//*[@id='certificates']/tbody/tr/td")))
            # the col typically starts with th[] ##
            # copy the XPath of the first row, then make it end in /th (copy XPath, not the full path)
            columns = WebDriverWait(driver, 3).until(EC.visibility_of_all_elements_located((By.XPATH, "//*[@id='certificates']/thead/tr/th")))
            for row in rows:
                row_data.append(row.text)
            for cols in columns:
                col_data.append(cols.text)
            # time.sleep(5)
            print(row_data, "row")
            print(col_data, "col")
            # DataFrame.append is deprecated and returns a new frame (the original
            # call discarded its result), so concatenate the row instead
            df_profile_data = pd.concat([df_profile_data, pd.DataFrame([row_data])], ignore_index=True)

            ################################ SQFT ##########################################
            get_url = driver.current_url
            print("The current url is: " + str(get_url))
            URL = get_url
            html = requests.get(URL).content
            df_list = pd.read_html(html)
            df = df_list[-1]
            SQFT = df.iloc[0, 1]
            SQFT = SQFT.replace('sq ft', '')
            sqft_amount.append(SQFT)
            # print(SQFT)

            ######### load credit scorecard page ##########################################
            # building_scorecard_link = driver.find_element(By.XPATH, "//div[@id='project-details--wrapper']//h1[text()='Scorecard']")
            building_scorecard_link = driver.find_element(By.PARTIAL_LINK_TEXT, 'Scorecard')
            # /html/body/div[1]/div/div[2]/div/div/div[2]/div/div[2]/div[1]/table/tbody/tr/td[2]
            building_scorecard_link.click()
            time.sleep(2)

            # grab data on category point totals
            point_data = []
            point_total = driver.find_elements(By.CLASS_NAME, 'category-score')
            for points in point_total:
                point_data.append(points.text)
            print('cat scores', point_data)

            # category names
            cat_names = []
            # expand credit areas
            sus_link = driver.find_elements(By.CLASS_NAME, 'category-title')
            # sus_link = WebDriverWait(driver, 15).until(EC.visibility_of_all_elements_located((By.CLASS_NAME, 'category-title')))
            for category in sus_link:
                category.click()
                print('cat_names:', category.text)
                cat_names.append(category.text)
            # if category.text == 'SUSTAINABLE SITES':

            # grab specific credit name
            # rows = WebDriverWait(driver, 3).until(EC.visibility_of_all_elements_located((By.XPATH, "//*[@id='certificates']/tbody/tr/td")))
            # so far the css selector works but also times out
            # tag_name 'span' also works, but it pulls everything, so data cleaning is needed
            # class name not working
            credit_names = []
            content = driver.find_elements(By.CSS_SELECTOR, 'span.credit-name')
            # content = WebDriverWait(driver, 30).until(EC.visibility_of_all_elements_located((By.CLASS_NAME, 'cred-name')))
            # content = driver.find_elements(By.CLASS_NAME, 'credit-name')
            for credit in content:
                # print(credit.text)
                credit_names.append(credit.text)
            print('cred_name', credit_names)

            # grab data on category points
            sus_data = []
            content = driver.find_elements(By.CLASS_NAME, 'num')
            for points in content:
                sus_data.append(points.text)
            print('sus_scores', sus_data)

            # add all these things into a df
            df_scorecard = pd.DataFrame()

            # exit scorecard page
            driver.back()
            # exit building profile page
            driver.back()

        # move onto the next page #############################################
        # Perform your desired actions on each page here
        # Add some wait here for the next page to load
        # Check if the next button is disabled
        next_button = WebDriverWait(driver, 5).until(EC.presence_of_element_located(
            (By.XPATH, '//div[@id="result-grid"]//a[text()="Next"]')))
        if next_button.get_attribute('disabled'):  # HTML attribute names are lowercase
            time.sleep(1)
            break  # Exit the loop if the next button is disabled
        else:
            # Click the next button to navigate to the next page
            time.sleep(1)
            next_button.click()
    except IndexError:
        break

# quit the chrome driver
driver.quit()
Basically, when the code reaches the "# grab specific credit name" block and the "# grab data on category points" block, the selectors misbehave. If I use By.CSS_SELECTOR without a WebDriverWait, it finds most of the items but not all of them, and it can still time out after going through a certain number of buildings. When I try a WebDriverWait, it usually times out. If I use By.CLASS_NAME, Selenium finds no elements at all, even though the class name is correct. If I use By.TAG_NAME, it finds every "span" element on the page, which is not ideal when I am trying to locate the specific "credit-name" or "num" elements.
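For reference, here is a minimal, self-contained sketch of the variants described above, pulled out of the main loop (it assumes the driver is sitting on one of the scorecard pages; the placeholder URL and the 10-second timeout are arbitrary choices, not values from my real code):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get('https://www.usgbc.org/projects/example')  # placeholder: any scorecard page

# 1. CSS selector, no explicit wait: finds most but not all of the spans
css_spans = driver.find_elements(By.CSS_SELECTOR, 'span.credit-name')

# 2. CSS selector behind an explicit wait: usually times out. Note that
#    visibility_of_all_elements_located only succeeds once every matching
#    element is displayed, so a span hidden inside a still-collapsed category
#    section would make the wait expire (an assumption about this page).
visible_spans = WebDriverWait(driver, 10).until(
    EC.visibility_of_all_elements_located((By.CSS_SELECTOR, 'span.credit-name')))

# 3. class name: returns an empty list for me
class_spans = driver.find_elements(By.CLASS_NAME, 'credit-name')

# 4. tag name: matches every span on the page, so the results need cleaning
all_spans = driver.find_elements(By.TAG_NAME, 'span')

driver.quit()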
Overall, I need help understanding why the approaches above are not working, plus any other suggestions anyone might have. I am comfortable coding in Python but new to Selenium.
Thanks!
A: No answers yet