lidl-price-scraper/main.py

import datetime
import os
import re
import json
import requests
from bs4 import BeautifulSoup
from mail_sender import EmailSender
from utils import format_product_table
from urllib.parse import quote

LIDL_PRODUCT_URL_PREFIX = "https://www.lidl.cz/p/"


def fetch_product_info(urls):
    """Scrape name, current price, original price and discount for each product URL."""
    product_info = []
    for url in urls:
        response = requests.get(url, timeout=30)  # requests has no default timeout
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            product_name = soup.find('h1', class_='keyfacts__title').text.strip()
            current_price = soup.find('div', class_='m-price__price').text.strip()
            original_price_element = soup.find('span', class_='m-price__rrp')
            original_price = original_price_element.text.strip() if original_price_element else "-"
            discount_element = soup.find('div', class_='m-price__label')
            # Take the first number in the discount label (a percentage), if any.
            discount = int(re.findall(r'\d+', discount_element.text.strip())[0]) if discount_element else 0
            product_info.append({
                "name": product_name,
                "price": current_price,
                "discount": discount,
                "originalPrice": original_price,
                "url": url
            })
        else:
            print(f"Failed to fetch URL: {url}")
    return product_info
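

# The lookups above raise AttributeError if Lidl renames a CSS class. A small
# defensive helper would degrade gracefully instead; this is a sketch, not part
# of the original script, and the class names themselves remain assumptions
# about Lidl's current markup.
def find_text(soup, tag, class_name, default=""):
    element = soup.find(tag, class_=class_name)
    return element.text.strip() if element else default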


def find_urls(products):
    all_links = set()
    for product in products:
        url = f'https://www.lidl.cz/q/search?q={quote(product)}'
        response = requests.get(url, timeout=30)
        if response.status_code == 200:
            # This may pick up invalid products: it relies on product numbers being
            # quoted 9-digit strings and on no other quoted 9-digit strings appearing
            # anywhere in the returned HTML. It could be tightened to scan only the
            # <script> element containing the results (still big, but an easy win);
            # see the sketch after this function. For now this works.
            matches = re.findall(r'"([0-9]{9})"', response.text)
            for match in matches:
                all_links.add(LIDL_PRODUCT_URL_PREFIX + "p" + match)
            print("OK")
        else:
            print(f"Failed to search for product by URL: {url}")
    return all_links
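

# A tighter variant of the extraction above, as the comment suggests: scan only
# <script> tags instead of the whole page. This is a sketch; which tag actually
# holds the search results is an assumption about Lidl's markup, so it simply
# unions matches across all scripts.
def find_product_numbers_in_scripts(html):
    numbers = set()
    soup = BeautifulSoup(html, 'html.parser')
    for script in soup.find_all('script'):
        numbers.update(re.findall(r'"([0-9]{9})"', script.get_text()))
    return numbers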


def save_to_json(product_info, output_file):
    with open(output_file, 'w') as f:
        json.dump(product_info, f, indent=4)
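

# json.dump escapes non-ASCII by default, so Czech product names end up as \u
# escapes in product_info.json. If readable output is preferred (an alternative,
# not the original behaviour), UTF-8 plus ensure_ascii=False keeps the diacritics:
def save_to_json_utf8(product_info, output_file):
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(product_info, f, indent=4, ensure_ascii=False)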


def unify_product_url(url):
    # Optionally a product URL can carry an extra slug segment (probably for SEO),
    # so the following URLs are semantically the same:
    # - https://www.lidl.cz/p/parkside-uhlova-bruska-pws-230-d4/p100346196
    # - https://www.lidl.cz/p/p100346196
    pattern = re.escape(LIDL_PRODUCT_URL_PREFIX) + r"[^/]+/"
    return re.sub(pattern, LIDL_PRODUCT_URL_PREFIX, url)
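

# Example of the canonicalisation above (URLs taken from the comment):
# assert unify_product_url(
#     "https://www.lidl.cz/p/parkside-uhlova-bruska-pws-230-d4/p100346196"
# ) == "https://www.lidl.cz/p/p100346196"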


if __name__ == "__main__":
    input_urls_file = "urls.txt"
    input_products_file = "products.txt"
    output_file = "product_info.json"
    # products.txt holds search terms and urls.txt holds product URLs, one per
    # line; only the URLs need canonicalising.
    with open(input_products_file, 'r') as f:
        products_input = {line.strip() for line in f if line.strip() != ''}
    with open(input_urls_file, 'r') as f:
        urls_input = {unify_product_url(line.strip()) for line in f if line.strip() != ''}
    print(f'Getting urls to products at {datetime.datetime.now()}')
    product_urls = find_urls(products_input)
    print(f'Found {len(product_urls)} products by search')
    print(f'Found {len(urls_input)} products by url input')
    urls = product_urls.union(urls_input)
    print(f'Total products to verify {len(urls)}')
    print(f'Fetching prices at {datetime.datetime.now()}')
    product_info = fetch_product_info(urls)
    print(format_product_table(product_info))
    save_to_json(product_info, output_file)
    products_on_sale = [product for product in product_info if product["discount"] > 0]
    if len(products_on_sale) > 0:
        # Mail is sent only when something is discounted; SCRAPER_SMTP_USER,
        # SCRAPER_SMTP_PASSWORD and SCRAPER_TO_MAIL must be set in the environment.
        sender = EmailSender(os.environ["SCRAPER_SMTP_USER"], os.environ["SCRAPER_SMTP_PASSWORD"])
        sender.send_email(os.environ["SCRAPER_TO_MAIL"], products_on_sale)