网页抓取 TimeOutException：消息：-解网

问：

我正在尝试抓取电子商务网站。我想从搜索结果中抓取每个产品的产品描述。我成功地从搜索结果中抓取了所有产品链接，并得到了一个产品的产品描述。但是，当我尝试循环产品链接以从我从搜索结果中获得的所有产品中获取产品描述时，即将到来。TimeOutException: Message

我已经尝试更改 WebDriverWait 的时间，但它没有修复错误。

知道我该怎么办吗？

这是我的代码：

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options       # to customize chrome display
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC 
from time import sleep
from collections import Counter
import json
from turtle import delay
import time

# create object for chrome options
chrome_options = Options()

# Customize chrome display
chrome_options.add_argument('start-maximized')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--headless')
chrome_options.add_argument('disable-notifications')
chrome_options.add_argument("window-size=1365,4597.190")
chrome_options.add_argument('--disable-infobars')      

# create webdriver object
path = '/Applications/chromedriver'
webdriver_service = Service(path)
driver = webdriver.Chrome(executable_path=path, options=chrome_options)

baseurl = 'https://shopee.co.id/search?keyword=obat%20kanker'

product_links = []

for page in range(0,6):
    search_link = 'https://shopee.co.id/search?keyword=obat%20kanker&page={}'.format(page)
    driver.get(search_link)
    WebDriverWait(driver, 20).until(EC.presence_of_all_elements_located((By.CLASS_NAME, "shopee-search-item-result__item")))

    driver.execute_script("""
            var scroll = document.body.scrollHeight / 10;
            var i = 0;
            function scrollit(i) {
            window.scrollBy({top: scroll, left: 0, behavior: 'smooth'});
            i++;
            if (i < 10) {
                setTimeout(scrollit, 500, i);
                }
            }
            scrollit(i);
            """)
    sleep(5)
    html = driver.execute_script("return document.getElementsByTagName('html')[0].innerHTML")
    soup = BeautifulSoup(html, "html.parser")

    product_list = soup.find_all('div',class_='col-xs-2-4 shopee-search-item-result__item' )
    for item in product_list:
        for link in item.find_all('a', href=True):
            product_links.append(baseurl + link['href'])



for link in product_links:
        driver.get(link)
        WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.CLASS_NAME, "_2VZg1J")))

        driver.execute_script("""
                var scroll = document.body.scrollHeight / 10;
                var i = 0;
                function scrollit(i) {
                window.scrollBy({top: scroll, left: 0, behavior: 'smooth'});
                i++;
                if (i < 10) {
                setTimeout(scrollit, 500, i);
                }
                }
                scrollit(i);
                """)

        sleep(20)
        html = driver.execute_script("return document.getElementsByTagName('html')[0].innerHTML")
        soup = BeautifulSoup(html, "html.parser")

        name = soup.find('div', class_='_2rQP1z').text.replace('Star+','')
        price = soup.find('div', class_='_2Shl1j').text.replace('Rp','')
        sold = soup.find('div', class_ = 'HmRxgn').text.strip()
        rate = soup.find('div', class_ = '_3y5XOB _14izon').text.strip()
        city = soup.find('span', class_ = '_2fJrvA').text.strip()
        specification = soup.find('div', class_ = '_2jz573').text.strip()

        herbcancer = {
                'name': name,
                'price': price,
                'sold': sold,
                'rate': rate,
                'city': city,
                'specification': specification
        }

        print(herbcancer)

python selenium 网页抓取 beautifulsoup chrome-web-driver

答：

0赞 Amen Aziz 8/26/2022 #1

基本 url 不正确，为什么他们向您显示：TimeOutException

https://shopee.co.id/search?keyword=obat%20kanker

正确的是：base url

https://shopee.co.id

完整的代码是：

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options       # to customize chrome display
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC 
from time import sleep
from collections import Counter
import json
from turtle import delay
import time

# create object for chrome options
chrome_options = Options()

# Customize chrome display
chrome_options.add_argument('start-maximized')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--headless')
chrome_options.add_argument('disable-notifications')
chrome_options.add_argument("window-size=1365,4597.190")
chrome_options.add_argument('--disable-infobars')      

# create webdriver object
path = ''
webdriver_service = Service(path)
driver = webdriver.Chrome(executable_path=path, options=chrome_options)

baseurl = 'https://shopee.co.id'

product_links = []

for page in range(0,6):
    search_link = 'https://shopee.co.id/search?keyword=obat%20kanker&page={}'.format(page)
    driver.get(search_link)
    WebDriverWait(driver, 20).until(EC.presence_of_all_elements_located((By.CLASS_NAME, "shopee-search-item-result__item")))

    driver.execute_script("""
            var scroll = document.body.scrollHeight / 10;
            var i = 0;
            function scrollit(i) {
            window.scrollBy({top: scroll, left: 0, behavior: 'smooth'});
            i++;
            if (i < 10) {
                setTimeout(scrollit, 500, i);
                }
            }
            scrollit(i);
            """)
    sleep(5)
    html = driver.execute_script("return document.getElementsByTagName('html')[0].innerHTML")
    soup = BeautifulSoup(html, "html.parser")

    product_list = soup.find_all('div',class_='col-xs-2-4 shopee-search-item-result__item' )
    for item in product_list:
        for link in item.find_all('a', href=True):
            comp=baseurl + link['href']
            product_links.append(comp)
            
            
    for link in product_links:
        driver.get(link)
        WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.CLASS_NAME, "_2VZg1J")))

        driver.execute_script("""
                var scroll = document.body.scrollHeight / 10;
                var i = 0;
                function scrollit(i) {
                window.scrollBy({top: scroll, left: 0, behavior: 'smooth'});
                i++;
                if (i < 10) {
                setTimeout(scrollit, 500, i);
                }
                }
                scrollit(i);
                """)

        sleep(3)
        html = driver.execute_script("return document.getElementsByTagName('html')[0].innerHTML")
        soup = BeautifulSoup(html, "html.parser")

        name = soup.find('div', class_='_2rQP1z').text.replace('Star+','')
        price = soup.find('div', class_='_2Shl1j').text.replace('Rp','')
        sold = soup.find('div', class_ = 'HmRxgn').text.strip()
        rate = soup.find('div', class_ = '_3y5XOB _14izon').text.strip()
        
        try:
            city = soup.find('span', class_ = '_2fJrvA').text.strip()
        except:
            city=''
            
        try:
            specification = soup.find('div', class_ = '_2jz573').text.strip()
        except:
            specification=''

        herbcancer = {
                'name': name,
                'price': price,
                'sold': sold,
                'rate': rate,
                'city': city,
                'specification': specification
        }

        print(herbcancer)

上一个：使用 Selenium Python 获取所有链接

下一个：Selenium 复选框找不到 - Web 驱动程序预期条件失败：等待 By.xpath 找到的元素的可见性：

网页抓取 TimeOutException：消息：

Web Scraping TimeOutException: Message:

评论