为什么 selenium 无法解析站点并抛出错误?

Why can't selenium parse the site and throws an error?

提问人:Игорь Кудряшов 提问时间:1/17/2023 最后编辑:Игорь Кудряшов 更新时间:1/18/2023 访问量:155

问:

我最初用 bs4 编写了解析代码,但后来不得不将其改写为 selenium 版本。运行代码时 chrome-driver 会打开,但随后关闭并报错。解析开始时 chrome-driver 打开,控制台中打印出许多“word word word word ...”,最后只输出了一个链接

# aiogram
from aiogram import types
from aiogram.types.message import ParseMode
from bot import dp
from bot import db
from aiogram.utils.markdown import hbold, hlink
import cfscrape

import fake_headers

# python
import re
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium_stealth import stealth
from selenium.webdriver.common.by import By
import time

# Default city slugs for building Avito search URLs (MVC-style config).
city = "belgorod"
city2 = "voronezh"
city3 = "kursk"

# Search radius (km) and the Russian words for "hour(s)" — presumably used
# to detect freshly-posted ads; not referenced in this snippet, verify
# against the rest of the file.
radius = 300
allowed_data = ['часов', 'часа','час']

# Chrome options tuned to make the automated browser less detectable.
options = webdriver.ChromeOptions()
options.add_argument("start-maximized")
# Hide the "Chrome is being controlled by automated software" banner/flag.
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
# Single module-level driver instance shared by every handler below.
driver = webdriver.Chrome(options=options)

# selenium-stealth patches navigator/WebGL fingerprints so the driver is
# harder to identify as a bot.
stealth(
        driver,
        languages=["en-US", "en"],
        vendor="Google Inc.",
        platform="Win32",
        webgl_vendor="Intel Inc.",
        renderer="Intel Iris OpenGL Engine",
        fix_hairline=True,
)

  @dp.message_handler(lambda message: message.text == "Воронеж")
    async def avito_list(message: types.Message):
        try:
            # Проверим есть ли вообще подписки
            if db.follows_exists(message.from_user.id):
    
                # Выводим все объявления по подпискам
                follows = db.show_subs(message.from_user.id)
    
                for follow in follows:
                    line = follow[1][0][0]
    
                    with db.connection:
                        marka = db.cursor.execute(f"SELECT `avito_mark_name` FROM `marks` WHERE `name` = ?", (line,)).fetchall()
                        marka = marka[0][0]
    
                    min_price = follow[2][0][0]
                    if follow[3][0][0] == None:
                        max_price = follow[2][0][0]*1000
                    else:
                        max_price = follow[3][0][0]
                    #model = "2114_samara"
                    #url = f"https://www.avito.ru/{city}/avtomobili/{marka}/{model}?radius={radius}"
                    url = f"https://www.avito.ru/valuyki/avtomobili/{marka}-ASgBAgICAUTgtg3GmSg?cd=1&radius=200"
                    driver.get(url)
                    time.sleep(5)
                    print(url)
    
                    main_container = driver.find_elements(By.CSS_SELECTOR,".iva-item-content")
    
                    for index, content in enumerate(main_container):
                        contaier_of_content = content.find_element(By.CSS_SELECTOR, ".iva-item-body")
    
    
    
                        ad_post = contaier_of_content.find_element(By.CLASS_NAME, "item-line")
    
    
                        #fresh_car = contaier_of_content.find_element(By.CLASS_NAME,{"data-marker":"item-date"}).text.split()[1]
                        price = content.find_element(By.CSS_SELECTOR, ".price-price").find_element(By.CLASS_NAME, "price").get_attribute("content")
    
                        if int(price) <= max_price and int(price) >= min_price: # проверка на сегодняшний день и на подхождение по цене
    
    
    
                            title_info = contaier_of_content.find_element(By.CLASS_NAME,'.iva-item-title').get_attribute("title").split(',')
                            town_info = contaier_of_content.find_element(By.CLASS_NAME,'.geo-root').find_element(By.TAG_NAME,"span").text
                            datePost_info = contaier_of_content.find_element(By.CLASS_NAME,'.iva-item-dateInfo').find_element(By.TAG_NAME,"div").text
                            #link = content.find('a', class_=re.compile('iva-item-sliderLink*'))['href']
    
                            currency = content.find_element(By.CSS_SELECTOR,'.price-price').find_element(By.TAG_NAME,'meta').get_attribute("content")
                            car_info = title_info[0]
                            was_created = title_info[1]
                            city_on_sale = town_info
    
                            card = f'{hlink(car_info+" - "+was_created,"https://avito.ru"+link)}\n' \
                                f'{hbold("Город: ", city_on_sale)}\n' \
                                f'{hbold("Цена: ", price, currency)}'
    
    
    
                            await message.answer(card)
        except Exception as ex:
            print(ex)
        finally:
            driver.close()
            driver.quit()

这将显示在控制台中

https://www.avito.ru/valuyki/avtomobili/volkswagen-ASgBAgICAUTgtg3GmSg?cd=1&radius=200

https://www.avito.ru/valuyki/avtomobili/vaz_lada-ASgBAgICAUTgtg3GmSg?cd=1&radius=200

python-3.x selenium selenium-webdriver html 解析

评论

0赞 iohans 1/17/2023
你对这一行的意图是什么:iva-item-content*
0赞 Игорь Кудряшов 1/17/2023
@iohans想解析网站上的银行卡数据
0赞 Игорь Кудряшов 1/17/2023
@iohans 我可以把对应的 BS4 代码发给您吗?我真的需要把它改写成 Selenium 版本
0赞 iohans 1/17/2023
您是否希望main_container是所有 ID 或类(包含 iva-item-content)的列表?是的,您可以提供 BS4 代码。
0赞 Игорь Кудряшов 1/17/2023
@iohans更新了问题,并将那里的所有内容更改为 bs4

答:

0赞 Игорь Кудряшов 1/18/2023 #1

我通过将 bs4 与 selenium 结合使用解决了这个问题,解析器因此开始正常工作

# aiogram
from aiogram import types
from aiogram.types.message import ParseMode
from selenium.webdriver.chrome.service import Service
from bot import dp
from bot import db
from aiogram.utils.markdown import hbold, hlink
import cfscrape

import fake_headers

# python
import re
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium_stealth import stealth
from selenium.webdriver.common.by import By
import time

# Default city slugs for building Avito search URLs (MVC-style config).
city = "belgorod"
city2 = "voronezh"
city3 = "kursk"

# Search radius (km) and the Russian words for "hour(s)" — presumably used
# to detect freshly-posted ads; not referenced in this snippet, verify
# against the rest of the file.
radius = 300
allowed_data = ['часов', 'часа','час']

# Chrome options: headless + no-sandbox so the bot can run on a server,
# plus flags that make the automated browser less detectable.
options = webdriver.ChromeOptions()
options.add_argument("headless")
options.add_argument("--no-sandbox")
options.add_argument("start-maximized")
# Hide the "Chrome is being controlled by automated software" banner/flag.
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
# Single module-level driver instance shared by every handler below.
driver = webdriver.Chrome(options=options)

# selenium-stealth patches navigator/WebGL fingerprints so the driver is
# harder to identify as a bot.
stealth(
        driver,
        languages=["en-US", "en"],
        vendor="Google Inc.",
        platform="Win32",
        webgl_vendor="Intel Inc.",
        renderer="Intel Iris OpenGL Engine",
        fix_hairline=True,
)


# Dead code: an earlier cfscrape-based HTTP session factory, superseded by
# the selenium driver above.
#def get_session(url):
    #session = requests.Session()
    #session.headers = {
    #    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36',
    #    'Accept-Language':'ru,en-US;q=0.5',
   # }
    #return cfscrape.create_scraper(sess=session)

@dp.message_handler(lambda message: message.text == "Белгород")
async def avito_list(message: types.Message):
    """Handle the "Белгород" button: render each subscription's Avito page
    with selenium, parse it with BeautifulSoup, and message matching ads.

    NOTE(review): the three city handlers are near-identical copies that all
    share the name `avito_list`; consider extracting a common helper.
    """
    try:
        # Nothing to do if the user has no subscriptions at all.
        if db.follows_exists(message.from_user.id):

            # All subscription rows for this user.
            follows = db.show_subs(message.from_user.id)

            for follow in follows:
                line = follow[1][0][0]

                with db.connection:
                    # Parameterized query — the SQL text needs no f-string.
                    marka = db.cursor.execute(
                        "SELECT `avito_mark_name` FROM `marks` WHERE `name` = ?",
                        (line,),
                    ).fetchall()
                    marka = marka[0][0]

                min_price = follow[2][0][0]
                # `is None` instead of `== None` (PEP 8); fall back to
                # min_price * 1000 when no max price was stored.
                if follow[3][0][0] is None:
                    max_price = follow[2][0][0] * 1000
                else:
                    max_price = follow[3][0][0]
                url = "your url"  # TODO: build the real Avito search URL
                driver.get(url)
                # BUGFIX: sleep BEFORE capturing page_source — the original
                # grabbed and parsed the HTML first and slept afterwards, so
                # the JS-rendered listings could be missing from the page.
                time.sleep(5)
                source_data = driver.page_source
                soup = BeautifulSoup(source_data, 'lxml')

                print('Начинаю собирать информацию')

                # Avito class names carry hashed suffixes — match by prefix.
                main_container = soup.find_all('div', class_=re.compile('iva-item-content*'))

                for index, content in enumerate(main_container):
                    contaier_of_content = content.find("div", class_=re.compile("iva-item-body*"))

                    ad_post = contaier_of_content.find("div", {"data-marker": "item-line"})

                    # Second token of the date line, e.g. "5 часов назад" -> "часов".
                    fresh_car = contaier_of_content.find("div", {"data-marker": "item-date"}).text.split()[1]
                    price = content.find('span', class_=re.compile('price-price-*')).find('meta', itemprop="price")['content']

                    # Keep only ads whose price is inside the subscription range.
                    if min_price <= int(price) <= max_price:
                        title_info = contaier_of_content.find('a', class_=re.compile('iva-item-title*'))['title'].split(',')
                        town_info = contaier_of_content.find('div', class_=re.compile('geo-root*')).find('span').text
                        datePost_info = contaier_of_content.find('div', class_=re.compile('iva-item-dateInfo*')).find('div').text
                        link = content.find('a', class_=re.compile('iva-item-sliderLink*'))['href']

                        currency = content.find('span', class_=re.compile('price-price-*')).find('meta')['content']
                        car_info = title_info[0]
                        was_created = title_info[1]
                        city_on_sale = town_info

                        card = f'{hlink(car_info + " - " + was_created, "https://avito.ru" + link)}\n' \
                               f'{hbold("Город: ", city_on_sale)}\n' \
                               f'{hbold("Цена: ", price, currency)}'

                        await message.answer(card)
    except Exception as ex:
        # Broad catch keeps the bot responsive; at minimum surface the error.
        print(ex)
    finally:
        print('Миссия выполнена, сэр!')

@dp.message_handler(lambda message: message.text == "Воронеж")
async def avito_list(message: types.Message):
    """Handle the "Воронеж" button: render each subscription's Avito page
    with selenium, parse it with BeautifulSoup, and message matching ads.

    NOTE(review): the three city handlers are near-identical copies that all
    share the name `avito_list`; consider extracting a common helper.
    """
    try:
        # Nothing to do if the user has no subscriptions at all.
        if db.follows_exists(message.from_user.id):

            # All subscription rows for this user.
            follows = db.show_subs(message.from_user.id)

            for follow in follows:
                line = follow[1][0][0]

                with db.connection:
                    # Parameterized query — the SQL text needs no f-string.
                    marka = db.cursor.execute(
                        "SELECT `avito_mark_name` FROM `marks` WHERE `name` = ?",
                        (line,),
                    ).fetchall()
                    marka = marka[0][0]

                min_price = follow[2][0][0]
                # `is None` instead of `== None` (PEP 8); fall back to
                # min_price * 1000 when no max price was stored.
                if follow[3][0][0] is None:
                    max_price = follow[2][0][0] * 1000
                else:
                    max_price = follow[3][0][0]
                url = "your url"  # TODO: build the real Avito search URL
                driver.get(url)
                # BUGFIX: sleep BEFORE capturing page_source — the original
                # grabbed and parsed the HTML first and slept afterwards, so
                # the JS-rendered listings could be missing from the page.
                time.sleep(5)
                source_data = driver.page_source
                soup = BeautifulSoup(source_data, 'lxml')

                print('Начинаю собирать информацию')

                # Avito class names carry hashed suffixes — match by prefix.
                main_container = soup.find_all('div', class_=re.compile('iva-item-content*'))

                for index, content in enumerate(main_container):
                    contaier_of_content = content.find("div", class_=re.compile("iva-item-body*"))

                    ad_post = contaier_of_content.find("div", {"data-marker": "item-line"})

                    # Second token of the date line, e.g. "5 часов назад" -> "часов".
                    fresh_car = contaier_of_content.find("div", {"data-marker": "item-date"}).text.split()[1]
                    price = content.find('span', class_=re.compile('price-price-*')).find('meta', itemprop="price")['content']

                    # Keep only ads whose price is inside the subscription range.
                    if min_price <= int(price) <= max_price:
                        title_info = contaier_of_content.find('a', class_=re.compile('iva-item-title*'))['title'].split(',')
                        town_info = contaier_of_content.find('div', class_=re.compile('geo-root*')).find('span').text
                        datePost_info = contaier_of_content.find('div', class_=re.compile('iva-item-dateInfo*')).find('div').text
                        link = content.find('a', class_=re.compile('iva-item-sliderLink*'))['href']

                        currency = content.find('span', class_=re.compile('price-price-*')).find('meta')['content']
                        car_info = title_info[0]
                        was_created = title_info[1]
                        city_on_sale = town_info

                        card = f'{hlink(car_info + " - " + was_created, "https://avito.ru" + link)}\n' \
                               f'{hbold("Город: ", city_on_sale)}\n' \
                               f'{hbold("Цена: ", price, currency)}'

                        await message.answer(card)
    except Exception as ex:
        # Broad catch keeps the bot responsive; at minimum surface the error.
        print(ex)
    finally:
        print('Миссия выполнена, сэр!')

@dp.message_handler(lambda message: message.text == "Курск")
async def avito_list(message: types.Message):
    """Handle the "Курск" button: render each subscription's Avito page
    with selenium, parse it with BeautifulSoup, and message matching ads.

    NOTE(review): the three city handlers are near-identical copies that all
    share the name `avito_list`; consider extracting a common helper.
    """
    try:
        # Nothing to do if the user has no subscriptions at all.
        if db.follows_exists(message.from_user.id):

            # All subscription rows for this user.
            follows = db.show_subs(message.from_user.id)

            for follow in follows:
                line = follow[1][0][0]

                with db.connection:
                    # Parameterized query — the SQL text needs no f-string.
                    marka = db.cursor.execute(
                        "SELECT `avito_mark_name` FROM `marks` WHERE `name` = ?",
                        (line,),
                    ).fetchall()
                    marka = marka[0][0]

                min_price = follow[2][0][0]
                # `is None` instead of `== None` (PEP 8); fall back to
                # min_price * 1000 when no max price was stored.
                if follow[3][0][0] is None:
                    max_price = follow[2][0][0] * 1000
                else:
                    max_price = follow[3][0][0]
                url = "your url"  # TODO: build the real Avito search URL
                driver.get(url)
                # BUGFIX: sleep BEFORE capturing page_source — the original
                # grabbed and parsed the HTML first and slept afterwards, so
                # the JS-rendered listings could be missing from the page.
                time.sleep(5)
                source_data = driver.page_source
                soup = BeautifulSoup(source_data, 'lxml')

                print('Начинаю собирать информацию')

                # Avito class names carry hashed suffixes — match by prefix.
                main_container = soup.find_all('div', class_=re.compile('iva-item-content*'))

                for index, content in enumerate(main_container):
                    contaier_of_content = content.find("div", class_=re.compile("iva-item-body*"))

                    ad_post = contaier_of_content.find("div", {"data-marker": "item-line"})

                    # Second token of the date line, e.g. "5 часов назад" -> "часов".
                    fresh_car = contaier_of_content.find("div", {"data-marker": "item-date"}).text.split()[1]
                    price = content.find('span', class_=re.compile('price-price-*')).find('meta', itemprop="price")['content']

                    # Keep only ads whose price is inside the subscription range.
                    if min_price <= int(price) <= max_price:
                        title_info = contaier_of_content.find('a', class_=re.compile('iva-item-title*'))['title'].split(',')
                        town_info = contaier_of_content.find('div', class_=re.compile('geo-root*')).find('span').text
                        datePost_info = contaier_of_content.find('div', class_=re.compile('iva-item-dateInfo*')).find('div').text
                        link = content.find('a', class_=re.compile('iva-item-sliderLink*'))['href']

                        currency = content.find('span', class_=re.compile('price-price-*')).find('meta')['content']
                        car_info = title_info[0]
                        was_created = title_info[1]
                        city_on_sale = town_info

                        card = f'{hlink(car_info + " - " + was_created, "https://avito.ru" + link)}\n' \
                               f'{hbold("Город: ", city_on_sale)}\n' \
                               f'{hbold("Цена: ", price, currency)}'

                        await message.answer(card)
    except Exception as ex:
        # Broad catch keeps the bot responsive; at minimum surface the error.
        print(ex)
    finally:
        print('Миссия выполнена, сэр!')