Why are By.CSS and By.CLASS_NAME selectors not working efficiently when using Selenium and Python to scrape data from a website?

Asked by: hrocj2n  Asked: 6/2/2023  Last edited by: Ajeet Verma  Updated: 6/2/2023  Views: 26

Q:

So I'm trying to use Selenium to find specific items on a web page, and I want it to work for similar pages as well. My current code is below.

import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

from bs4 import BeautifulSoup

import pandas as pd

import requests

driver = webdriver.Chrome(executable_path='/Applications/chromedriver_mac_arm64 (1)/chromedriver')
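# (note: Selenium 4 deprecates executable_path in favor of passing a
#  webdriver.chrome.service.Service object; this form runs on Selenium 3.x
#  and early 4.x releases)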
driver.get(
    "https://www.usgbc.org/projects/?Country=%5B%22United+States%22%5D&Rating+System=%5B%22New+Construction%22%5D&Rating+Version=%5B%22v2009%22%5D&Certification=%5B%22Platinum%22%5D&State=%5B%22Texas%22%5D")
# check this is the right website
# print(driver.title)

# list of building names to quickly check what has been added
buildings = []
locations = []

# lists for items on project page
sqft_amount = []

# dataframe to collect all building information
df_main = pd.DataFrame()
# dataframe to collect building profile data on page
df_profile_data = pd.DataFrame()
# dataframe to collect scorecard data
df_scorecard = pd.DataFrame()

# make this happen for next button

while True:
    
    try:
        
        project_profiles = driver.find_elements(By.CLASS_NAME, "grid-item--title")
    
        for i in range(len(project_profiles)):
            # Wait for page to load as order of elements will be incorrect otherwise
            time.sleep(1)
            
            project_profiles = driver.find_elements(By.CLASS_NAME, "grid-item--title")  # Find the list again
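            # (re-finding the list also avoids StaleElementReferenceException,
            #  since the old references go stale once driver.back() reloads the page)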
            # append and add to df
            building_name = project_profiles[i].text
            buildings.append(building_name)
            print(building_name)

            # enable if checking all
            # building profile page information grab##############################
            # load building profile page
            building_profile_link = driver.find_element(By.XPATH, f"//div[@id='result-grid']//h1[text()='{building_name}']")
            building_profile_link.click()
            time.sleep(1)
            
            # address
            address = driver.find_elements(By.CLASS_NAME, 'projectAddress')
            for element in address:
                building_address = element.text
                locations.append(building_address)
            print(locations)
            
            # get values from tables on page
            row_data = []
            col_data = []
            
            # the row typically starts with td[]##
            # copy path of first row then make it end in /td, copy Xpath not entire path
            rows = WebDriverWait(driver,3).until(EC.visibility_of_all_elements_located((By.XPATH, "//*[@id='certificates']/tbody/tr/td")))
            # the col typically starts with th[]##
            # copy path of first row then make it end in /th, copy Xpath not entire path
            columns = WebDriverWait(driver,3).until(EC.visibility_of_all_elements_located((By.XPATH, "//*[@id='certificates']/thead/tr/th")))
            
            for row in rows:
                row_data.append(row.text)
                
            for cols in columns:
                col_data.append(cols.text)
            
            #time.sleep(5)
            
            print(row_data, "row")
            print(col_data, "col")
            
            # DataFrame.append returns a new frame (and is removed in pandas 2.x),
            # so the result has to be assigned back; pd.concat is the current idiom
            df_profile_data = pd.concat([df_profile_data, pd.DataFrame([row_data])], ignore_index=True)
            
            ################################SQFT##########################################
            
            #import requests
            get_url = driver.current_url
            print("The current url is:"+str(get_url))
            
            URL = get_url
            html = requests.get(URL).content
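            # note: requests downloads the page again in a separate session from
            # the Selenium browser, so it sees the page as an anonymous visitor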
            df_list = pd.read_html(html)
            df = df_list[-1]
            SQFT = df.iloc[0,1]
            SQFT = SQFT.replace('sq ft', '')
            sqft_amount.append(SQFT)
            #print(SQFT)
            
            ######### load credit score card page##########################################
            #building_scorecard_link = driver.find_element(By.XPATH, f"//div[@id='project-details--wrapper']//h1[text()='Scorecard']")
            building_scorecard_link = driver.find_element(By.PARTIAL_LINK_TEXT, 'Scorecard')
            #/html/body/div[1]/div/div[2]/div/div/div[2]/div/div[2]/div[1]/table/tbody/tr/td[2]
            building_scorecard_link.click()
            time.sleep(2)
            
            # grab data on categories point totals
            point_data = []
            point_total = driver.find_elements(By.CLASS_NAME, 'category-score')
            for points in point_total:
                point_data.append(points.text)
            print('cat scores', point_data)
            
            # category names
            cat_names = []
            #expand credit areas
            sus_link = driver.find_elements(By.CLASS_NAME, 'category-title')
            #sus_link = WebDriverWait(driver, 15).until(EC.visibility_of_all_elements_located((By.CLASS_NAME, 'category-title')))
            for link in sus_link:
                link.click()
                print('cat_names:', link.text)
                cat_names.append(link.text)
                #if link.text == 'SUSTAINABLE SITES':
                    
            # grab specific credit name
            #rows = WebDriverWait(driver,3).until(EC.visibility_of_all_elements_located((By.XPATH, "//*[@id='certificates']/tbody/tr/td")))
            
            # so far css selector works but also times out
            # tag_name span also works but it pulls everything, data cleaning needed
            # class name not working
            credit_names = []
            content = driver.find_elements(By.CSS_SELECTOR, 'span.credit-name')
            #content = WebDriverWait(driver, 30).until(EC.visibility_of_all_elements_located((By.CLASS_NAME, 'cred-name')))
            #content = driver.find_elements(By.CLASS_NAME, 'credit-name')
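            # note: WebElement.text returns only the text that is currently visible,
            # so spans inside a collapsed category come back as empty strings;
            # get_attribute('textContent') reads the text regardless of visibility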
            
            for span in content:
                #print(span.text)
                credit_names.append(span.text)
            print('cred_name', credit_names)
            
            # grab data on category points
            sus_data = []
            content = driver.find_elements(By.CLASS_NAME, 'num')

            for points in content:
                sus_data.append(points.text)
            print('sus_scores', sus_data)


            # add all these things into df
            df_scorecard = pd.DataFrame()
            
            # exit scorecard page
            driver.back()
            
            # exit building profile page
            driver.back()
            
           
        # move onto next page#############################################
    
        # Perform your desired actions on each page here
        # Add some wait here for next page load
        # Check if the next button is disabled
        next_button = WebDriverWait(driver, 5).until(EC.presence_of_element_located(
            (By.XPATH, '//div[@id="result-grid"]//a[text()="Next"]')))
        if next_button.get_attribute('disabled'):  # the HTML attribute name is lowercase
            time.sleep(1)
            break  # Exit the loop if the next button is disabled
        
        else:
            # Click the next button to navigate to the next page
            time.sleep(1)
            next_button.click()
            
    except (IndexError, TimeoutException):
        # TimeoutException covers the last page, where the Next button never appears
        break

# quit the Chrome driver
driver.quit()

Basically, when the code reaches the "# grab specific credit name" and "# grab data on category points" blocks: if I use By.CSS without a WebDriverWait, it finds most of the items but not all of them, and on top of that it can time out after going through a certain number of buildings. When I try to use a WebDriverWait, it usually times out. If I use By.CLASS_NAME, Selenium finds no elements at all, even though the class name is correct. And if I use By.TAG_NAME, it finds every "span" element on the page, which is not ideal when I'm trying to find just the "credit-name" or "num" elements.
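
To make this concrete, here is a stripped-down sketch of the three variants on the scorecard page, reusing the driver and the 'credit-name' selector from the code above (the explicit wait here uses presence_of_all_elements_located rather than visibility_of_all_elements_located, on the assumption that some spans sit inside collapsed categories and so never count as "visible"):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# variant 1: CSS selector with no wait -- returns whatever has rendered so far,
# which is presumably why late-loading items get missed
credit_names = [e.text for e in driver.find_elements(By.CSS_SELECTOR, 'span.credit-name')]

# variant 2: CSS selector with an explicit wait on presence rather than
# visibility, so elements in collapsed sections should not cause a timeout
elements = WebDriverWait(driver, 10).until(
    EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'span.credit-name')))
credit_names = [e.text for e in elements]

# variant 3: By.CLASS_NAME expects a bare class name (a single token, no tag
# prefix, no spaces), so only 'credit-name' itself is a valid argument here
elements = driver.find_elements(By.CLASS_NAME, 'credit-name')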

Overall, I need help understanding why the approaches above aren't working, along with any other suggestions people might have. I'm comfortable coding in Python but very new to Selenium.
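
Since BeautifulSoup is already imported above, one alternative I have been considering is parsing driver.page_source instead, which works on the raw HTML and therefore does not care whether an element is currently visible (a minimal sketch, assuming the same 'credit-name' and 'num' classes as above):

from bs4 import BeautifulSoup

# parse the rendered page once, then select with the same CSS classes
soup = BeautifulSoup(driver.page_source, 'html.parser')
credit_names = [s.get_text(strip=True) for s in soup.select('span.credit-name')]
sus_scores = [s.get_text(strip=True) for s in soup.select('.num')]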

Thanks!

python selenium-webdriver css-selectors webdriverwait getelementsbyclassname

Comments


A: No answers yet