WIP parse by xpath

Print datetime
Fix readme
2024-02-08 11:21:57 +01:00 · 2024-02-03 09:40:52 +01:00 · 2024-02-02 20:52:37 +01:00 · 2024-02-02 20:50:18 +01:00
4 changed files with 22 additions and 3 deletions
--- a/README.md
+++ b/README.md
@@ -2,7 +2,10 @@

 Simple app which checks a price of selected items and prints their current price and discount.

-It also sends a notification of items via email any items is on sale and following env variables are defined:
+## How to build and run
+
+1. Create a file with the URLs to be watched: `cp urls.txt.example urls.txt`
+2. Configure SMTP and the mail destination (for mail notification) by defining the following env variables:

 ```
 SCRAPER_SMTP_USER
@@ -24,5 +27,5 @@ or build docker image and run it using the docker:

 ```
 docker build -t lidl-price-scraper .
-docker run -it --rm  --env-file ./.env lidl-price-scraper
+docker run -it --rm -v $(pwd)/urls.txt:/app/urls.txt  --env-file ./.env lidl-price-scraper
 ```
--- a/main.py
+++ b/main.py
@@ -1,3 +1,4 @@
+import datetime
 import os
 import re
 import json
@@ -5,6 +6,7 @@ import requests
 from bs4 import BeautifulSoup
 from mail_sender import EmailSender
 from utils import format_product_table
+from lxml import etree

 def fetch_product_info(urls):
    product_info = []
@@ -13,10 +15,17 @@ def fetch_product_info(urls):
        response = requests.get(url)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
+            html_str = str(soup)
+            root_element = etree.fromstring(html_str,  parser=etree.HTMLParser())
            product_name = soup.find('h1', class_='keyfacts__title').text.strip()
            current_price = soup.find('div', class_='m-price__price').text.strip()
            original_price_element = soup.find('span', class_='m-price__rrp')
            original_price = original_price_element.text.strip() if original_price_element else "-"
+
+
+            discount_xpath = '//div[@class="m-price__label" and not(ancestor::*[@style[contains(., "display: none")]])]'
+            discount_xpath_results = root_element.xpath(discount_xpath)
+            discount_elements = [BeautifulSoup(etree.tostring(elem), 'html.parser') for elem in discount_xpath_results]
            discount_element = soup.find('div', class_='m-price__label')
            discount = int(re.findall(r'\d+', discount_element.text.strip())[0]) if discount_element else 0

@@ -44,6 +53,9 @@ if __name__ == "__main__":
    with open(input_file, 'r') as f:
        urls = [line.strip() for line in f.readlines()]

+
+    print(f'Fetching prices at {datetime.datetime.now()}')
+
    product_info = fetch_product_info(urls)
    print(format_product_table(product_info))
    save_to_json(product_info, output_file)
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,4 @@
 requests
 beautifulsoup4
-tabulate
+tabulate
+lxml
--- a/urls.txt.example
+++ b/urls.txt.example
@@ -0,0 +1,3 @@
+https://www.lidl.cz/p/p100370600
+https://www.lidl.cz/p/p100358513
+https://www.lidl.cz/p/p100336045
Author	SHA1	Message	Date
Jakub Knetl	681dde9b46	WIP parse by xpath	2024-02-08 11:21:57 +01:00
Jakub Knetl	c7fcec7c56	Print datetime	2024-02-03 09:40:52 +01:00
Jakub Knetl	3505a86416	Fix readme	2024-02-02 20:52:37 +01:00
Jakub Knetl	d4302240e7	Update readme	2024-02-02 20:50:18 +01:00