diff --git a/.gitignore b/.gitignore index 65dc4f0..0e7999c 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ .env urls.txt +products.txt # Created by https://www.toptal.com/developers/gitignore/api/python,intellij+all # Edit at https://www.toptal.com/developers/gitignore?templates=python,intellij+all diff --git a/README.md b/README.md index a4b950a..d700718 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ Simple app which checks a price of selected items and prints their current price ## How to build and run -1. create file with urls to be watched `cp urls.txt.example urls.txt` +1. Create the file with URLs and the file with products to be watched: `cp urls.txt.example urls.txt ; cp products.txt.example products.txt` 2. Configure SMTP and mail destination (for mail notification) ``` @@ -27,5 +27,5 @@ or build docker image and run it using the docker: ``` docker build -t lidl-price-scraper . -docker run -it --rm -v $(pwd)/urls.txt:/app/urls.txt --env-file ./.env lidl-price-scraper +docker run -it --rm -v $(pwd)/urls.txt:/app/urls.txt -v $(pwd)/products.txt:/app/products.txt --env-file ./.env lidl-price-scraper ``` \ No newline at end of file diff --git a/main.py b/main.py index 5910a40..8b9151e 100644 --- a/main.py +++ b/main.py @@ -6,6 +6,10 @@ import requests from bs4 import BeautifulSoup from mail_sender import EmailSender from utils import format_product_table +from urllib.parse import quote + +LIDL_PRODUCT_URL_PREFIX = "https://www.lidl.cz/p/" + def fetch_product_info(urls): product_info = [] @@ -34,21 +38,61 @@ def fetch_product_info(urls): return product_info +def find_urls(products): + all_links = set() + for product in products: + url = f'https://www.lidl.cz/q/search?q={quote(product)}' + response = requests.get(url) + + if response.status_code == 200: + # this might potentially get invalid products (relies on the fact the product number is 9 digits surrounded by parentheses + # and there are no other 9 digits strings in the whole returned HTML (but it 
could be improved to target only the