From 681dde9b4618c1b1f4b5f74afdfc5a60348f30e7 Mon Sep 17 00:00:00 2001
From: Jakub Knetl <knetl.j@gmail.com>
Date: Thu, 8 Feb 2024 11:21:57 +0100
Subject: [PATCH] WIP: parse discount label by XPath

---
 main.py          | 8 ++++++++
 requirements.txt | 3 ++-
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/main.py b/main.py
index 5910a40..9be106e 100644
--- a/main.py
+++ b/main.py
@@ -6,6 +6,7 @@ import requests
 from bs4 import BeautifulSoup
 from mail_sender import EmailSender
 from utils import format_product_table
+from lxml import etree
 
 def fetch_product_info(urls):
     product_info = []
@@ -14,10 +15,17 @@ def fetch_product_info(urls):
         response = requests.get(url)
         if response.status_code == 200:
             soup = BeautifulSoup(response.text, 'html.parser')
+            html_str = str(soup)
+            root_element = etree.fromstring(html_str,  parser=etree.HTMLParser())
             product_name = soup.find('h1', class_='keyfacts__title').text.strip()
             current_price = soup.find('div', class_='m-price__price').text.strip()
             original_price_element = soup.find('span', class_='m-price__rrp')
             original_price = original_price_element.text.strip() if original_price_element else "-"
+
+
+            discount_xpath = '//div[@class="m-price__label" and not(ancestor::*[@style[contains(., "display: none")]])]'
+            discount_xpath_results = root_element.xpath(discount_xpath)
+            discount_elements = [BeautifulSoup(etree.tostring(elem), 'html.parser') for elem in discount_xpath_results]
             discount_element = soup.find('div', class_='m-price__label')
             discount = int(re.findall(r'\d+', discount_element.text.strip())[0]) if discount_element else 0
 
diff --git a/requirements.txt b/requirements.txt
index 9c214ad..4c4bb38 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,4 @@
 requests
 beautifulsoup4
-tabulate
\ No newline at end of file
+tabulate
+lxml
\ No newline at end of file