1 Commit

16144fc428 Support product search by name (2024-02-08 13:14:43 +01:00)
5 changed files with 53 additions and 16 deletions

.gitignore

@@ -1,5 +1,6 @@
.env
urls.txt
products.txt
# Created by https://www.toptal.com/developers/gitignore/api/python,intellij+all
# Edit at https://www.toptal.com/developers/gitignore?templates=python,intellij+all

README.md

@@ -4,7 +4,7 @@ Simple app which checks a price of selected items and prints their current price
## How to build and run
1. create file with urls to be watched `cp urls.txt.example urls.txt`
1. create file with urls and file with products to be watched `cp urls.txt.example urls.txt ; cp products.txt.example products.txt`
2. Configure SMTP and mail destination (for mail notification)
```
@@ -27,5 +27,5 @@ or build docker image and run it using the docker:
```
docker build -t lidl-price-scraper .
docker run -it --rm -v $(pwd)/urls.txt:/app/urls.txt --env-file ./.env lidl-price-scraper
docker run -it --rm -v $(pwd)/urls.txt:/app/urls.txt -v $(pwd)/products.txt:/app/products.txt --env-file ./.env lidl-price-scraper
```

main.py

@@ -6,7 +6,10 @@ import requests
from bs4 import BeautifulSoup
from mail_sender import EmailSender
from utils import format_product_table
from lxml import etree
from urllib.parse import quote
LIDL_PRODUCT_URL_PREFIX = "https://www.lidl.cz/p/"

def fetch_product_info(urls):
    product_info = []
@@ -15,17 +18,10 @@ def fetch_product_info(urls):
        response = requests.get(url)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            html_str = str(soup)
            root_element = etree.fromstring(html_str, parser=etree.HTMLParser())
            product_name = soup.find('h1', class_='keyfacts__title').text.strip()
            current_price = soup.find('div', class_='m-price__price').text.strip()
            original_price_element = soup.find('span', class_='m-price__rrp')
            original_price = original_price_element.text.strip() if original_price_element else "-"
            discount_xpath = '//div[@class="m-price__label" and not(ancestor::*[@style[contains(., "display: none")]])]'
            discount_xpath_results = root_element.xpath(discount_xpath)
            discount_elements = [BeautifulSoup(etree.tostring(elem), 'html.parser') for elem in discount_xpath_results]
            discount_element = soup.find('div', class_='m-price__label')
            discount = int(re.findall(r'\d+', discount_element.text.strip())[0]) if discount_element else 0
@@ -42,21 +38,61 @@ def fetch_product_info(urls):
    return product_info

def find_urls(products):
    all_links = set()
    for product in products:
        url = f'https://www.lidl.cz/q/search?q={quote(product)}'
        response = requests.get(url)
        if response.status_code == 200:
            # This might pick up invalid products: it relies on the product number being a 9-digit string
            # in double quotes and on there being no other such strings in the whole returned HTML. It could
            # be improved by targeting only the <script> element containing the results (still big, but an
            # easy win) - currently this works.
            matches = re.findall(r'"([0-9]{9})"', response.text)
            for match in matches:
                all_links.add(LIDL_PRODUCT_URL_PREFIX + "p" + match)
            print("OK")
        else:
            print(f"Failed to search for product by URL: {url}")
    return all_links
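
The comment in find_urls already points at the easy win it mentions: scope the quoted-9-digit search to the <script> element that embeds the search results instead of the whole page. Below is a minimal sketch of that idea, not part of this commit; the helper name and the heuristic of keeping the largest matching <script> are assumptions about how lidl.cz embeds its result data, not verified behaviour.

import re
from bs4 import BeautifulSoup

def find_product_ids_in_scripts(html):
    # Hypothetical helper (illustration only): collect quoted 9-digit IDs per <script> element
    # and keep the IDs from the largest such script, assuming that is the embedded result payload.
    soup = BeautifulSoup(html, 'html.parser')
    candidates = []
    for script in soup.find_all('script'):
        text = script.get_text()
        ids = re.findall(r'"([0-9]{9})"', text)
        if ids:
            candidates.append((len(text), ids))
    if not candidates:
        return set()
    _, ids = max(candidates, key=lambda pair: pair[0])
    return set(ids)

find_urls could then map each returned ID to a URL exactly as it does now, via LIDL_PRODUCT_URL_PREFIX + "p" + product_id.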

def save_to_json(product_info, output_file):
    with open(output_file, 'w') as f:
        json.dump(product_info, f, indent=4)

def unify_product_url(url):
    # Optionally, a product URL can have an extra path segment (probably for SEO purposes), so the
    # following URLs are semantically the same:
    # - https://www.lidl.cz/p/parkside-uhlova-bruska-pws-230-d4/p100346196
    # - https://www.lidl.cz/p/p100346196
    pattern = r"%s[^/]+/" % LIDL_PRODUCT_URL_PREFIX
    return re.sub(pattern, "%s" % LIDL_PRODUCT_URL_PREFIX, url)
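
As a quick illustration (not part of the commit), the two URL forms from the comment above collapse to the same canonical product URL:

# Example only: both forms normalise to the short product URL.
assert unify_product_url("https://www.lidl.cz/p/parkside-uhlova-bruska-pws-230-d4/p100346196") == "https://www.lidl.cz/p/p100346196"
assert unify_product_url("https://www.lidl.cz/p/p100346196") == "https://www.lidl.cz/p/p100346196"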

if __name__ == "__main__":
    input_file = "urls.txt"
    input_urls_file = "urls.txt"
    input_products_file = "products.txt"
    output_file = "product_info.json"
    with open(input_file, 'r') as f:
        urls = [line.strip() for line in f.readlines()]
    with open(input_products_file, 'r') as f:
        products_input = set([unify_product_url(line.strip()) for line in f.readlines() if line.strip() != ''])
    with open(input_urls_file, 'r') as f:
        urls_input = set([unify_product_url(line.strip()) for line in f.readlines() if line.strip() != ''])
    print(f'Getting urls to products at {datetime.datetime.now()}')
    product_urls = find_urls(products_input)
    print(f'Found {len(product_urls)} products by search')
    print(f'Found {len(urls_input)} products by url input')
    urls = product_urls.union(urls_input)
    print(f'Total products to verify {len(urls)}')
    print(f'Fetching prices at {datetime.datetime.now()}')
    product_info = fetch_product_info(urls)
    product_info = fetch_product_info(urls_input.union(product_urls))
    print(format_product_table(product_info))
    save_to_json(product_info, output_file)

products.txt.example (new file)

@@ -0,0 +1 @@
Kapovací a pokosová pila

requirements.txt

@@ -1,4 +1,3 @@
requests
beautifulsoup4
tabulate
lxml