From 681dde9b4618c1b1f4b5f74afdfc5a60348f30e7 Mon Sep 17 00:00:00 2001 From: Jakub Knetl Date: Thu, 8 Feb 2024 11:21:57 +0100 Subject: [PATCH] WIP parse by xpath --- main.py | 8 ++++++++ requirements.txt | 3 ++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/main.py b/main.py index 5910a40..9be106e 100644 --- a/main.py +++ b/main.py @@ -6,6 +6,7 @@ import requests from bs4 import BeautifulSoup from mail_sender import EmailSender from utils import format_product_table +from lxml import etree def fetch_product_info(urls): product_info = [] @@ -14,10 +15,17 @@ def fetch_product_info(urls): response = requests.get(url) if response.status_code == 200: soup = BeautifulSoup(response.text, 'html.parser') + html_str = str(soup) + root_element = etree.fromstring(html_str, parser=etree.HTMLParser()) product_name = soup.find('h1', class_='keyfacts__title').text.strip() current_price = soup.find('div', class_='m-price__price').text.strip() original_price_element = soup.find('span', class_='m-price__rrp') original_price = original_price_element.text.strip() if original_price_element else "-" + + + discount_xpath = '//div[@class="m-price__label" and not(ancestor::*[@style[contains(., "display: none")]])]' + discount_xpath_results = root_element.xpath(discount_xpath) + discount_elements = [BeautifulSoup(etree.tostring(elem), 'html.parser') for elem in discount_xpath_results] discount_element = soup.find('div', class_='m-price__label') discount = int(re.findall(r'\d+', discount_element.text.strip())[0]) if discount_element else 0 diff --git a/requirements.txt b/requirements.txt index 9c214ad..4c4bb38 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ requests beautifulsoup4 -tabulate \ No newline at end of file +tabulate +lxml \ No newline at end of file