Asked by: Outdoor_Cat    Asked: 5/19/2023    Updated: 5/19/2023    Views: 27
How can I scrape a specific URL from a webpage using BeautifulSoup?
Q:
I am working on a Python script that parses HTML from a classifieds website and emails me a notification about specific products and price points. Everything here works except the "listing_url" capture, which I want included in the email so I can click the URL to go to the product page. Following the site's inspector, I tried scraping the corresponding tag using "class=listing-card__inner", but that does not work.
Here is my full code:
import requests
from bs4 import BeautifulSoup
import smtplib
from email.mime.text import MIMEText

# Email details
sender_email = "[email protected]"
receiver_email = "[email protected]"
smtp_server = "smtp.gmail.com"
smtp_port = 587
smtp_username = "[email protected]"
smtp_password = "xxxxxxxx"

# Send email function
def send_email(subject, body):
    message = MIMEText(body)
    message['Subject'] = subject
    message['From'] = sender_email
    message['To'] = receiver_email

    with smtplib.SMTP(smtp_server, smtp_port) as server:
        server.starttls()
        server.login(smtp_username, smtp_password)
        server.sendmail(sender_email, receiver_email, message.as_string())

# Scrape listings function
def scrape_listings(url):
    # Make a GET request to the website
    response = requests.get(url)

    # Parse the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find all the laptop listings
    listings = soup.find_all('div', class_='listing-card__content')

    # List to store qualifying listings
    qualifying_listings = []

    # Iterate through listings and check conditions
    for listing in listings:
        title = listing.find('div', class_='listing-card__header__title').text.strip()
        date = listing.find('div', class_='listing-card__header__date').text.strip()
        price = listing.find('span', class_='listing-card__price__value 1').text.strip()
        price = price.replace('Br', '').replace(',', '').strip()
        price = int(price)
        listing_url = listing.find('a', class_='listing-card__inner')['href']

        if price < 80000:
            qualifying_listings.append((title, date, price, listing_url))

    return qualifying_listings

# Main function
def main():
    base_url = "https://www.qefira.com/classifieds?q=gaming%20laptops&sort=latest"
    qualifying_listings = []

    # Scrape first page
    qualifying_listings += scrape_listings(base_url)

    # Scrape remaining pages, limited to 8
    page = 2  # Start from the second page
    while page <= 9:  # Limit to 8 pages (2 to 9)
        url = base_url + f"&page={page}"
        qualifying_listings += scrape_listings(url)
        page += 1

    # Prepare email subject and body
    subject = "Gaming Laptops under 80,000"
    total_count = len(qualifying_listings)
    body = f"Total Qualifying Listings: {total_count}\n\n"
    for listing in qualifying_listings:
        title, date, price, listing_url = listing
        body += f"Title: {title}\n"
        body += f"Date Posted: {date}\n"
        body += f"Price: {price}\n"
        body += f"URL: {listing_url}\n"
        body += "\n"

    # Send the email
    send_email(subject, body)

# Run the script
if __name__ == '__main__':
    main()
Here is a shortened example of the tag on the website I am trying to scrape; the URL I want is the href in the first line:
<a href="https://www.qefira.com/listings/hpelitebook-core-i5-4th-laptops-5366113" class="listing-card__inner" id="listing-5366113" data-t-listing="" data-t-listing_context="search" data-t-listing_id="5366113" data-t-listing_title="HpElitebook core i5 4th Laptops" data-t-listing_type="classified" data-t-listing_category_title="Laptops" data-t-listing_category_slug="laptops" data-t-listing_slug="hpelitebook-core-i5-4th-laptops" data-t-listing_price="19500.00" data-t-listing_currency="ETB" data-t-listing_location_title="Bole" data-t-listing_source="qe_et" data-t-listing_product_slugs="listing">...</a>
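As a sanity check, if I feed just this snippet to BeautifulSoup on its own, the same class lookup does find the anchor and return the href, so the selector string itself seems fine whenever the anchor is actually inside the tree being searched (a minimal standalone sketch, not the real page):
from bs4 import BeautifulSoup

# The shortened anchor tag copied from the page source above
snippet = '<a href="https://www.qefira.com/listings/hpelitebook-core-i5-4th-laptops-5366113" class="listing-card__inner" id="listing-5366113">...</a>'

soup = BeautifulSoup(snippet, 'html.parser')
a = soup.find('a', class_='listing-card__inner')
print(a['href'] if a else "not found")  # prints the listing URL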
Any guidance on how to successfully scrape the URL so it can be shown as a clickable link in the email body?
I have tried different variations of the .find method, for example:
listing_url_element = listing.find('a', class_='listing-card__inner')
listing_url = listing_url_element['href'] if listing_url_element else "N/A"
or
listing_url = listing.find('a', class_='listing-card__title')['href']
or
listing_url_element = listing.find('a', class_='listing-card__inner')
listing_url = listing_url_element['href'] if listing_url_element and 'href' in listing_url_element.attrs else "N/A"
or even:
import re
listing_url_element = listing.find('a', class_='listing-card__inner')
listing_url_match = re.search(r'href="(.*?)"', str(listing_url_element))
listing_url = listing_url_match.group(1) if listing_url_match else "N/A"
Still nothing (or "N/A" is shown instead of the actual URL). The usual error is:
listing_url = listing.find('a', class_='listing-card__inner')['href']
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^
TypeError: 'NoneType' object is not subscriptable
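For context, the error itself is easy to reproduce with any element that has no matching <a> inside it, which suggests find() is simply coming back empty rather than the href being malformed (a minimal reproduction, not the real page):
from bs4 import BeautifulSoup

# A card <div> with no <a> inside it: find() returns None, and
# subscripting None with ['href'] raises exactly this TypeError
listing = BeautifulSoup('<div class="listing-card__content">text</div>', 'html.parser').div
print(listing.find('a', class_='listing-card__inner'))      # None
# listing.find('a', class_='listing-card__inner')['href']   # TypeError: 'NoneType' object is not subscriptable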
A:
0 votes
Spawin
5/19/2023
#1
I suggest you use the class "listing-card--has-content" instead of "listing-card__content". I have also added a check so that code execution does not stop.
Here is the full code:
import requests
from bs4 import BeautifulSoup
import smtplib
from email.mime.text import MIMEText

# Email details
sender_email = "[email protected]"
receiver_email = "[email protected]"
smtp_server = "smtp.gmail.com"
smtp_port = 587
smtp_username = "[email protected]"
smtp_password = "xxxxxxxx"

# Send email function
def send_email(subject, body):
    message = MIMEText(body)
    message['Subject'] = subject
    message['From'] = sender_email
    message['To'] = receiver_email

    with smtplib.SMTP(smtp_server, smtp_port) as server:
        server.starttls()
        server.login(smtp_username, smtp_password)
        server.sendmail(sender_email, receiver_email, message.as_string())

# Scrape listings function
def scrape_listings(url):
    # Make a GET request to the website
    response = requests.get(url)

    # Parse the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find all the laptop listings
    # listings = soup.find_all('div', class_='listing-card__content')
    listings = soup.find_all('div', class_='listing-card--has-content')

    # List to store qualifying listings
    qualifying_listings = []

    # Iterate through listings and check conditions
    for listing in listings:
        title = listing.find('div', class_='listing-card__header__title').text.strip()
        date = listing.find('div', class_='listing-card__header__date').text.strip()
        price = listing.find('span', class_='listing-card__price__value 1').text.strip()
        price = price.replace('Br', '').replace(',', '').strip()
        price = int(price)

        a = listing.find('a', class_='listing-card__inner')
        if a:
            listing_url = a['href']
            print(listing_url)
            if price < 80000:
                qualifying_listings.append((title, date, price, listing_url))

    return qualifying_listings

# Main function
def main():
    base_url = "https://www.qefira.com/classifieds?q=gaming%20laptops&sort=latest"
    qualifying_listings = []

    # Scrape first page
    qualifying_listings += scrape_listings(base_url)

    # Scrape remaining pages, limited to 8
    page = 2  # Start from the second page
    while page <= 9:  # Limit to 8 pages (2 to 9)
        url = base_url + f"&page={page}"
        qualifying_listings += scrape_listings(url)
        page += 1

    # Prepare email subject and body
    subject = "Gaming Laptops under 80,000"
    total_count = len(qualifying_listings)
    body = f"Total Qualifying Listings: {total_count}\n\n"
    for listing in qualifying_listings:
        title, date, price, listing_url = listing
        body += f"Title: {title}\n"
        body += f"Date Posted: {date}\n"
        body += f"Price: {price}\n"
        body += f"URL: {listing_url}\n"
        body += "\n"

    # Send the email
    send_email(subject, body)

# Run the script
if __name__ == '__main__':
    main()
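A possible variation, offered only as a sketch (not tested against the live site): since the <a class="listing-card__inner"> element wraps the whole card, you could also loop over those anchors directly and read the title, date, and price from inside each one, so every listing you keep is guaranteed to carry an href. The inner class names below are copied from the question and assumed unchanged:
import requests
from bs4 import BeautifulSoup

def scrape_listings(url):
    soup = BeautifulSoup(requests.get(url).content, 'html.parser')
    qualifying = []

    # Anchor the loop on the <a> tags themselves, so the href is always present
    for a in soup.select('a.listing-card__inner'):
        title = a.find('div', class_='listing-card__header__title')
        date = a.find('div', class_='listing-card__header__date')
        price = a.find('span', class_='listing-card__price__value 1')
        if not (title and date and price):
            continue  # skip cards missing any of the expected fields

        price_value = int(price.text.replace('Br', '').replace(',', '').strip())
        if price_value < 80000:
            qualifying.append((title.text.strip(), date.text.strip(), price_value, a['href']))

    return qualifying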
Comments