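"""Lidl.cz price watcher.

Reads product search terms from products.txt and direct product URLs from
urls.txt, scrapes each matching product page for its current price, original
price, and discount, saves the results to product_info.json, and e-mails the
products that are currently discounted.

Requires the SCRAPER_SMTP_USER, SCRAPER_SMTP_PASSWORD, and SCRAPER_TO_MAIL
environment variables when any product is on sale.
"""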
import datetime
import json
import os
import re
from urllib.parse import quote

import requests
from bs4 import BeautifulSoup

from mail_sender import EmailSender
from utils import format_product_table

LIDL_PRODUCT_URL_PREFIX = "https://www.lidl.cz/p/"


def fetch_product_info(urls):
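    """Scrape each product page and return a list of dicts with the product
    name, current price, discount (first number in the discount label, 0 when
    there is none), original price, and URL. URLs that fail to load are
    reported and skipped."""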
    product_info = []

    for url in urls:
        response = requests.get(url)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            product_name = soup.find('h1', class_='keyfacts__title').text.strip()
            current_price = soup.find('div', class_='m-price__price').text.strip()
            original_price_element = soup.find('span', class_='m-price__rrp')
            original_price = original_price_element.text.strip() if original_price_element else "-"
            discount_element = soup.find('div', class_='m-price__label')
            discount = int(re.findall(r'\d+', discount_element.text.strip())[0]) if discount_element else 0

            product_info.append({
                "name": product_name,
                "price": current_price,
                "discount": discount,
                "originalPrice": original_price,
                "url": url
            })
        else:
            print(f"Failed to fetch URL: {url}")

    return product_info


def find_urls(products):
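    """Search Lidl.cz for each product term and return a set of canonical
    product URLs built from the 9-digit product numbers found in the
    search results."""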
    all_links = set()
    for product in products:
        url = f'https://www.lidl.cz/q/search?q={quote(product)}'
        response = requests.get(url)

        if response.status_code == 200:
            # This may occasionally pick up invalid products: it relies on product numbers being
            # quoted 9-digit strings and on there being no other quoted 9-digit strings in the
            # returned HTML. It could be improved by targeting only the <script> element that
            # contains the results (still big, but an easy win) - for now this works.
            matches = re.findall(r'"([0-9]{9})"', response.text)
            for match in matches:
                all_links.add(LIDL_PRODUCT_URL_PREFIX + "p" + match)
            print("OK")
        else:
            print(f"Failed to search for product by URL: {url}")
    return all_links


def save_to_json(product_info, output_file):
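    """Write the scraped product info to output_file as indented JSON."""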
    with open(output_file, 'w') as f:
        json.dump(product_info, f, indent=4)


def unify_product_url(url):
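    """Normalise a product URL by stripping the optional SEO slug segment."""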
    # A product URL can optionally contain an extra slug segment (probably for SEO purposes),
    # so the following URLs are semantically the same:
    # - https://www.lidl.cz/p/parkside-uhlova-bruska-pws-230-d4/p100346196
    # - https://www.lidl.cz/p/p100346196
    pattern = re.escape(LIDL_PRODUCT_URL_PREFIX) + r"[^/]+/"
    return re.sub(pattern, LIDL_PRODUCT_URL_PREFIX, url)


if __name__ == "__main__":
    input_urls_file = "urls.txt"
    input_products_file = "products.txt"
    output_file = "product_info.json"

    with open(input_products_file, 'r') as f:
        products_input = set([unify_product_url(line.strip()) for line in f.readlines() if line.strip() != ''])

    with open(input_urls_file, 'r') as f:
        urls_input = set([unify_product_url(line.strip()) for line in f.readlines() if line.strip() != ''])

    print(f'Getting urls to products at {datetime.datetime.now()}')
    product_urls = find_urls(products_input)

    print(f'Found {len(product_urls)} products by search')
    print(f'Found {len(urls_input)} products by url input')
    urls = product_urls.union(urls_input)
    print(f'Total products to verify {len(urls)}')

    print(f'Fetching prices at {datetime.datetime.now()}')

    product_info = fetch_product_info(urls)
    print(format_product_table(product_info))
    save_to_json(product_info, output_file)

    products_on_sale = [product for product in product_info if product["discount"] > 0]

    if len(products_on_sale) > 0:
        sender = EmailSender(os.environ["SCRAPER_SMTP_USER"], os.environ["SCRAPER_SMTP_PASSWORD"])
        sender.send_email(os.environ["SCRAPER_TO_MAIL"], products_on_sale)