Asked by: Outdoor_Cat    Asked: 5/19/2023    Updated: 5/19/2023    Views: 27
How can I scrape a specific URL from a webpage using BeautifulSoup?
Q:
I am working on a Python script that parses HTML from a classifieds website and emails me a notification about specific products and price points. Everything here works except the "listing_url" capture, which I want included in the email so I can click the URL to go to the product page. Following the site's inspector, I tried scraping the corresponding tag using "class=listing-card__inner", but that does not work.
Here is my full code:
import requests
from bs4 import BeautifulSoup
import smtplib
from email.mime.text import MIMEText

# Email details
sender_email = "[email protected]"
receiver_email = "[email protected]"
smtp_server = "smtp.gmail.com"
smtp_port = 587
smtp_username = "[email protected]"
smtp_password = "xxxxxxxx"

# Send email function
def send_email(subject, body):
    message = MIMEText(body)
    message['Subject'] = subject
    message['From'] = sender_email
    message['To'] = receiver_email

    with smtplib.SMTP(smtp_server, smtp_port) as server:
        server.starttls()
        server.login(smtp_username, smtp_password)
        server.sendmail(sender_email, receiver_email, message.as_string())

# Scrape listings function
def scrape_listings(url):
    # Make a GET request to the website
    response = requests.get(url)

    # Parse the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find all the laptop listings
    listings = soup.find_all('div', class_='listing-card__content')

    # List to store qualifying listings
    qualifying_listings = []

    # Iterate through listings and check conditions
    for listing in listings:
        title = listing.find('div', class_='listing-card__header__title').text.strip()
        date = listing.find('div', class_='listing-card__header__date').text.strip()
        price = listing.find('span', class_='listing-card__price__value 1').text.strip()
        price = price.replace('Br', '').replace(',', '').strip()
        price = int(price)
        listing_url = listing.find('a', class_='listing-card__inner')['href']

        if price < 80000:
            qualifying_listings.append((title, date, price, listing_url))

    return qualifying_listings

# Main function
def main():
    base_url = "https://www.qefira.com/classifieds?q=gaming%20laptops&sort=latest"
    qualifying_listings = []

    # Scrape first page
    qualifying_listings += scrape_listings(base_url)

    # Scrape remaining pages, limited to 8
    page = 2  # Start from the second page
    while page <= 9:  # Limit to 8 pages (2 to 9)
        url = base_url + f"&page={page}"
        qualifying_listings += scrape_listings(url)
        page += 1

    # Prepare email subject and body
    subject = "Gaming Laptops under 80,000"
    total_count = len(qualifying_listings)
    body = f"Total Qualifying Listings: {total_count}\n\n"
    for listing in qualifying_listings:
        title, date, price, listing_url = listing
        body += f"Title: {title}\n"
        body += f"Date Posted: {date}\n"
        body += f"Price: {price}\n"
        body += f"URL: {listing_url}\n"
        body += "\n"

    # Send the email
    send_email(subject, body)

# Run the script
if __name__ == '__main__':
    main()
Here is a shortened example of the tag on the website I am trying to scrape; the URL I want is the href in the first line:
<a href="https://www.qefira.com/listings/hpelitebook-core-i5-4th-laptops-5366113" class="listing-card__inner" id="listing-5366113" data-t-listing="" data-t-listing_context="search" data-t-listing_id="5366113" data-t-listing_title="HpElitebook core i5 4th Laptops" data-t-listing_type="classified" data-t-listing_category_title="Laptops" data-t-listing_category_slug="laptops" data-t-listing_slug="hpelitebook-core-i5-4th-laptops" data-t-listing_price="19500.00" data-t-listing_currency="ETB" data-t-listing_location_title="Bole" data-t-listing_source="qe_et" data-t-listing_product_slugs="listing">...</a>
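As a sanity check, if I feed just this snippet to BeautifulSoup on its own, the same class lookup does find the anchor and return the href, so the selector string itself seems fine whenever the anchor is actually inside the tree being searched (a minimal standalone sketch, not the real page):
from bs4 import BeautifulSoup

# The shortened anchor tag copied from the page source above
snippet = '<a href="https://www.qefira.com/listings/hpelitebook-core-i5-4th-laptops-5366113" class="listing-card__inner" id="listing-5366113">...</a>'

soup = BeautifulSoup(snippet, 'html.parser')
a = soup.find('a', class_='listing-card__inner')
print(a['href'] if a else "not found")  # prints the listing URL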
Any guidance on how to successfully scrape the URL so it can be shown as a clickable link in the email body?
I have tried different variations of the .find method, for example:
listing_url_element = listing.find('a', class_='listing-card__inner')
listing_url = listing_url_element['href'] if listing_url_element else "N/A"
or
listing_url = listing.find('a', class_='listing-card__title')['href']
or
listing_url_element = listing.find('a', class_='listing-card__inner')
listing_url = listing_url_element['href'] if listing_url_element and 'href' in listing_url_element.attrs else "N/A"
or even:
import re
listing_url_element = listing.find('a', class_='listing-card__inner')
listing_url_match = re.search(r'href="(.*?)"', str(listing_url_element))
listing_url = listing_url_match.group(1) if listing_url_match else "N/A"
Still nothing (or "N/A" is shown instead of the actual URL). The usual error is:
listing_url = listing.find('a', class_='listing-card__inner')['href']
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^
TypeError: 'NoneType' object is not subscriptable
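For context, the error itself is easy to reproduce with any element that has no matching <a> inside it, which suggests find() is simply coming back empty rather than the href being malformed (a minimal reproduction, not the real page):
from bs4 import BeautifulSoup

# A card <div> with no <a> inside it: find() returns None, and
# subscripting None with ['href'] raises exactly this TypeError
listing = BeautifulSoup('<div class="listing-card__content">text</div>', 'html.parser').div
print(listing.find('a', class_='listing-card__inner'))      # None
# listing.find('a', class_='listing-card__inner')['href']   # TypeError: 'NoneType' object is not subscriptable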
A:
0 votes
Spawin
5/19/2023
#1
I suggest you use the class "listing-card--has-content" instead of "listing-card__content". I have also added a check so that code execution does not stop.
Here is the full code:
import requests
from bs4 import BeautifulSoup
import smtplib
from email.mime.text import MIMEText

# Email details
sender_email = "[email protected]"
receiver_email = "[email protected]"
smtp_server = "smtp.gmail.com"
smtp_port = 587
smtp_username = "[email protected]"
smtp_password = "xxxxxxxx"

# Send email function
def send_email(subject, body):
    message = MIMEText(body)
    message['Subject'] = subject
    message['From'] = sender_email
    message['To'] = receiver_email

    with smtplib.SMTP(smtp_server, smtp_port) as server:
        server.starttls()
        server.login(smtp_username, smtp_password)
        server.sendmail(sender_email, receiver_email, message.as_string())

# Scrape listings function
def scrape_listings(url):
    # Make a GET request to the website
    response = requests.get(url)

    # Parse the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find all the laptop listings
    # listings = soup.find_all('div', class_='listing-card__content')
    listings = soup.find_all('div', class_='listing-card--has-content')

    # List to store qualifying listings
    qualifying_listings = []

    # Iterate through listings and check conditions
    for listing in listings:
        title = listing.find('div', class_='listing-card__header__title').text.strip()
        date = listing.find('div', class_='listing-card__header__date').text.strip()
        price = listing.find('span', class_='listing-card__price__value 1').text.strip()
        price = price.replace('Br', '').replace(',', '').strip()
        price = int(price)

        a = listing.find('a', class_='listing-card__inner')
        if a:
            listing_url = a['href']
            print(listing_url)
            if price < 80000:
                qualifying_listings.append((title, date, price, listing_url))

    return qualifying_listings

# Main function
def main():
    base_url = "https://www.qefira.com/classifieds?q=gaming%20laptops&sort=latest"
    qualifying_listings = []

    # Scrape first page
    qualifying_listings += scrape_listings(base_url)

    # Scrape remaining pages, limited to 8
    page = 2  # Start from the second page
    while page <= 9:  # Limit to 8 pages (2 to 9)
        url = base_url + f"&page={page}"
        qualifying_listings += scrape_listings(url)
        page += 1

    # Prepare email subject and body
    subject = "Gaming Laptops under 80,000"
    total_count = len(qualifying_listings)
    body = f"Total Qualifying Listings: {total_count}\n\n"
    for listing in qualifying_listings:
        title, date, price, listing_url = listing
        body += f"Title: {title}\n"
        body += f"Date Posted: {date}\n"
        body += f"Price: {price}\n"
        body += f"URL: {listing_url}\n"
        body += "\n"

    # Send the email
    send_email(subject, body)

# Run the script
if __name__ == '__main__':
    main()
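A possible variation, offered only as a sketch (not tested against the live site): since the <a class="listing-card__inner"> element wraps the whole card, you could also loop over those anchors directly and read the title, date, and price from inside each one, so every listing you keep is guaranteed to carry an href. The inner class names below are copied from the question and assumed unchanged:
import requests
from bs4 import BeautifulSoup

def scrape_listings(url):
    soup = BeautifulSoup(requests.get(url).content, 'html.parser')
    qualifying = []

    # Anchor the loop on the <a> tags themselves, so the href is always present
    for a in soup.select('a.listing-card__inner'):
        title = a.find('div', class_='listing-card__header__title')
        date = a.find('div', class_='listing-card__header__date')
        price = a.find('span', class_='listing-card__price__value 1')
        if not (title and date and price):
            continue  # skip cards missing any of the expected fields

        price_value = int(price.text.replace('Br', '').replace(',', '').strip())
        if price_value < 80000:
            qualifying.append((title.text.strip(), date.text.strip(), price_value, a['href']))

    return qualifying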
Comments