提问人:puyocode 提问时间:8/26/2022 最后编辑:blackravenpuyocode 更新时间:8/28/2022 访问量:191
网页抓取 TimeOutException:消息:
Web Scraping TimeOutException: Message:
问:
我正在尝试抓取电子商务网站。我想从搜索结果中抓取每个产品的产品描述。我成功地从搜索结果中抓取了所有产品链接,并得到了一个产品的产品描述。但是,当我尝试循环产品链接以从我从搜索结果中获得的所有产品中获取产品描述时,即将到来。TimeOutException: Message
我已经尝试更改 WebDriverWait 的时间,但它没有修复错误。
知道我该怎么办吗?
这是我的代码:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options # to customize chrome display
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from time import sleep
from collections import Counter
import json
from turtle import delay
import time
# create object for chrome options
chrome_options = Options()
# Customize chrome display
chrome_options.add_argument('start-maximized')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--headless')
chrome_options.add_argument('disable-notifications')
chrome_options.add_argument("window-size=1365,4597.190")
chrome_options.add_argument('--disable-infobars')
# create webdriver object
path = '/Applications/chromedriver'
webdriver_service = Service(path)
driver = webdriver.Chrome(executable_path=path, options=chrome_options)
baseurl = 'https://shopee.co.id/search?keyword=obat%20kanker'
product_links = []
for page in range(0,6):
search_link = 'https://shopee.co.id/search?keyword=obat%20kanker&page={}'.format(page)
driver.get(search_link)
WebDriverWait(driver, 20).until(EC.presence_of_all_elements_located((By.CLASS_NAME, "shopee-search-item-result__item")))
driver.execute_script("""
var scroll = document.body.scrollHeight / 10;
var i = 0;
function scrollit(i) {
window.scrollBy({top: scroll, left: 0, behavior: 'smooth'});
i++;
if (i < 10) {
setTimeout(scrollit, 500, i);
}
}
scrollit(i);
""")
sleep(5)
html = driver.execute_script("return document.getElementsByTagName('html')[0].innerHTML")
soup = BeautifulSoup(html, "html.parser")
product_list = soup.find_all('div',class_='col-xs-2-4 shopee-search-item-result__item' )
for item in product_list:
for link in item.find_all('a', href=True):
product_links.append(baseurl + link['href'])
for link in product_links:
driver.get(link)
WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.CLASS_NAME, "_2VZg1J")))
driver.execute_script("""
var scroll = document.body.scrollHeight / 10;
var i = 0;
function scrollit(i) {
window.scrollBy({top: scroll, left: 0, behavior: 'smooth'});
i++;
if (i < 10) {
setTimeout(scrollit, 500, i);
}
}
scrollit(i);
""")
sleep(20)
html = driver.execute_script("return document.getElementsByTagName('html')[0].innerHTML")
soup = BeautifulSoup(html, "html.parser")
name = soup.find('div', class_='_2rQP1z').text.replace('Star+','')
price = soup.find('div', class_='_2Shl1j').text.replace('Rp','')
sold = soup.find('div', class_ = 'HmRxgn').text.strip()
rate = soup.find('div', class_ = '_3y5XOB _14izon').text.strip()
city = soup.find('span', class_ = '_2fJrvA').text.strip()
specification = soup.find('div', class_ = '_2jz573').text.strip()
herbcancer = {
'name': name,
'price': price,
'sold': sold,
'rate': rate,
'city': city,
'specification': specification
}
print(herbcancer)
答:
0赞
Amen Aziz
8/26/2022
#1
基本 url 不正确,为什么他们向您显示:TimeOutException
https://shopee.co.id/search?keyword=obat%20kanker
正确的是:base url
https://shopee.co.id
完整的代码是:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options # to customize chrome display
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from time import sleep
from collections import Counter
import json
from turtle import delay
import time
# create object for chrome options
chrome_options = Options()
# Customize chrome display
chrome_options.add_argument('start-maximized')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--headless')
chrome_options.add_argument('disable-notifications')
chrome_options.add_argument("window-size=1365,4597.190")
chrome_options.add_argument('--disable-infobars')
# create webdriver object
path = ''
webdriver_service = Service(path)
driver = webdriver.Chrome(executable_path=path, options=chrome_options)
baseurl = 'https://shopee.co.id'
product_links = []
for page in range(0,6):
search_link = 'https://shopee.co.id/search?keyword=obat%20kanker&page={}'.format(page)
driver.get(search_link)
WebDriverWait(driver, 20).until(EC.presence_of_all_elements_located((By.CLASS_NAME, "shopee-search-item-result__item")))
driver.execute_script("""
var scroll = document.body.scrollHeight / 10;
var i = 0;
function scrollit(i) {
window.scrollBy({top: scroll, left: 0, behavior: 'smooth'});
i++;
if (i < 10) {
setTimeout(scrollit, 500, i);
}
}
scrollit(i);
""")
sleep(5)
html = driver.execute_script("return document.getElementsByTagName('html')[0].innerHTML")
soup = BeautifulSoup(html, "html.parser")
product_list = soup.find_all('div',class_='col-xs-2-4 shopee-search-item-result__item' )
for item in product_list:
for link in item.find_all('a', href=True):
comp=baseurl + link['href']
product_links.append(comp)
for link in product_links:
driver.get(link)
WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.CLASS_NAME, "_2VZg1J")))
driver.execute_script("""
var scroll = document.body.scrollHeight / 10;
var i = 0;
function scrollit(i) {
window.scrollBy({top: scroll, left: 0, behavior: 'smooth'});
i++;
if (i < 10) {
setTimeout(scrollit, 500, i);
}
}
scrollit(i);
""")
sleep(3)
html = driver.execute_script("return document.getElementsByTagName('html')[0].innerHTML")
soup = BeautifulSoup(html, "html.parser")
name = soup.find('div', class_='_2rQP1z').text.replace('Star+','')
price = soup.find('div', class_='_2Shl1j').text.replace('Rp','')
sold = soup.find('div', class_ = 'HmRxgn').text.strip()
rate = soup.find('div', class_ = '_3y5XOB _14izon').text.strip()
try:
city = soup.find('span', class_ = '_2fJrvA').text.strip()
except:
city=''
try:
specification = soup.find('div', class_ = '_2jz573').text.strip()
except:
specification=''
herbcancer = {
'name': name,
'price': price,
'sold': sold,
'rate': rate,
'city': city,
'specification': specification
}
print(herbcancer)
评论