lidl-price-scraper/main.py

import datetime
import os
import re
import json
import requests
from bs4 import BeautifulSoup
from mail_sender import EmailSender
from utils import format_product_table
from urllib.parse import quote

LIDL_PRODUCT_URL_PREFIX = "https://www.lidl.cz/p/"


def fetch_product_info(urls):
    """Scrape name, current price, original price and discount for each product URL."""
    product_info = []
    for url in urls:
        response = requests.get(url, timeout=30)  # requests has no default timeout
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            product_name = soup.find('h1', class_='keyfacts__title').text.strip()
            current_price = soup.find('div', class_='m-price__price').text.strip()
            original_price_element = soup.find('span', class_='m-price__rrp')
            original_price = original_price_element.text.strip() if original_price_element else "-"
            discount_element = soup.find('div', class_='m-price__label')
            # Take the first number in the discount label (a percentage), if any.
            discount = int(re.findall(r'\d+', discount_element.text.strip())[0]) if discount_element else 0
            product_info.append({
                "name": product_name,
                "price": current_price,
                "discount": discount,
                "originalPrice": original_price,
                "url": url
            })
        else:
            print(f"Failed to fetch URL: {url}")
    return product_info
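

# The lookups above raise AttributeError if Lidl renames a CSS class. A small
# defensive helper would degrade gracefully instead; this is a sketch, not part
# of the original script, and the class names themselves remain assumptions
# about Lidl's current markup.
def find_text(soup, tag, class_name, default=""):
    element = soup.find(tag, class_=class_name)
    return element.text.strip() if element else default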


def find_urls(products):
    all_links = set()
    for product in products:
        url = f'https://www.lidl.cz/q/search?q={quote(product)}'
        response = requests.get(url, timeout=30)
        if response.status_code == 200:
            # This may pick up invalid products: it relies on product numbers being
            # quoted 9-digit strings and on no other quoted 9-digit strings appearing
            # anywhere in the returned HTML. It could be tightened to scan only the
            # <script> element containing the results (still big, but an easy win);
            # see the sketch after this function. For now this works.
            matches = re.findall(r'"([0-9]{9})"', response.text)
            for match in matches:
                all_links.add(LIDL_PRODUCT_URL_PREFIX + "p" + match)
            print("OK")
        else:
            print(f"Failed to search for product by URL: {url}")
    return all_links
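

# A tighter variant of the extraction above, as the comment suggests: scan only
# <script> tags instead of the whole page. This is a sketch; which tag actually
# holds the search results is an assumption about Lidl's markup, so it simply
# unions matches across all scripts.
def find_product_numbers_in_scripts(html):
    numbers = set()
    soup = BeautifulSoup(html, 'html.parser')
    for script in soup.find_all('script'):
        numbers.update(re.findall(r'"([0-9]{9})"', script.get_text()))
    return numbers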


def save_to_json(product_info, output_file):
    with open(output_file, 'w') as f:
        json.dump(product_info, f, indent=4)
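

# json.dump escapes non-ASCII by default, so Czech product names end up as \u
# escapes in product_info.json. If readable output is preferred (an alternative,
# not the original behaviour), UTF-8 plus ensure_ascii=False keeps the diacritics:
def save_to_json_utf8(product_info, output_file):
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(product_info, f, indent=4, ensure_ascii=False)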


def unify_product_url(url):
    # Optionally a product URL can carry an extra slug segment (probably for SEO),
    # so the following URLs are semantically the same:
    # - https://www.lidl.cz/p/parkside-uhlova-bruska-pws-230-d4/p100346196
    # - https://www.lidl.cz/p/p100346196
    pattern = re.escape(LIDL_PRODUCT_URL_PREFIX) + r"[^/]+/"
    return re.sub(pattern, LIDL_PRODUCT_URL_PREFIX, url)
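

# Example of the canonicalisation above (URLs taken from the comment):
# assert unify_product_url(
#     "https://www.lidl.cz/p/parkside-uhlova-bruska-pws-230-d4/p100346196"
# ) == "https://www.lidl.cz/p/p100346196"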


if __name__ == "__main__":
    input_urls_file = "urls.txt"
    input_products_file = "products.txt"
    output_file = "product_info.json"
    # products.txt holds search terms and urls.txt holds product URLs, one per
    # line; only the URLs need canonicalising.
    with open(input_products_file, 'r') as f:
        products_input = {line.strip() for line in f if line.strip() != ''}
    with open(input_urls_file, 'r') as f:
        urls_input = {unify_product_url(line.strip()) for line in f if line.strip() != ''}
    print(f'Getting urls to products at {datetime.datetime.now()}')
    product_urls = find_urls(products_input)
    print(f'Found {len(product_urls)} products by search')
    print(f'Found {len(urls_input)} products by url input')
    urls = product_urls.union(urls_input)
    print(f'Total products to verify {len(urls)}')
    print(f'Fetching prices at {datetime.datetime.now()}')
    product_info = fetch_product_info(urls)
    print(format_product_table(product_info))
    save_to_json(product_info, output_file)
    products_on_sale = [product for product in product_info if product["discount"] > 0]
    if len(products_on_sale) > 0:
        # Mail is sent only when something is discounted; SCRAPER_SMTP_USER,
        # SCRAPER_SMTP_PASSWORD and SCRAPER_TO_MAIL must be set in the environment.
        sender = EmailSender(os.environ["SCRAPER_SMTP_USER"], os.environ["SCRAPER_SMTP_PASSWORD"])
        sender.send_email(os.environ["SCRAPER_TO_MAIL"], products_on_sale)