Support product search by name
main.py (52 lines changed)
@@ -6,6 +6,10 @@ import requests
 from bs4 import BeautifulSoup
 from mail_sender import EmailSender
 from utils import format_product_table
+from urllib.parse import quote
 
+LIDL_PRODUCT_URL_PREFIX = "https://www.lidl.cz/p/"
+
+
 def fetch_product_info(urls):
     product_info = []
@@ -34,21 +38,61 @@ def fetch_product_info(urls):
     return product_info
 
 
+def find_urls(products):
+    all_links = set()
+    for product in products:
+        url = f'https://www.lidl.cz/q/search?q={quote(product)}'
+        response = requests.get(url)
+
+        if response.status_code == 200:
+            # This may match invalid products: it relies on the product number
+            # being a 9-digit string in double quotes, with no other 9-digit
+            # strings anywhere in the returned HTML. It could be narrowed to the
+            # <script> element holding the results (still big, but an easy win;
+            # a sketch follows the diff) - for now this works.
+            matches = re.findall(r'"([0-9]{9})"', response.text)
+            for match in matches:
+                all_links.add(LIDL_PRODUCT_URL_PREFIX + "p" + match)
+            print("OK")
+        else:
+            print(f"Failed to search for product by URL: {url}")
+    return all_links
+
+
 def save_to_json(product_info, output_file):
     with open(output_file, 'w') as f:
         json.dump(product_info, f, indent=4)
 
 
+def unify_product_url(url):
+    # A product URL can optionally include an extra segment (probably for SEO
+    # purposes), so the following URLs are semantically the same:
+    # - https://www.lidl.cz/p/parkside-uhlova-bruska-pws-230-d4/p100346196
+    # - https://www.lidl.cz/p/p100346196
+    pattern = r"%s[^/]+/" % LIDL_PRODUCT_URL_PREFIX
+    return re.sub(pattern, LIDL_PRODUCT_URL_PREFIX, url)
+
+
 if __name__ == "__main__":
-    input_file = "urls.txt"
+    input_urls_file = "urls.txt"
+    input_products_file = "products.txt"
     output_file = "product_info.json"
 
-    with open(input_file, 'r') as f:
-        urls = [line.strip() for line in f.readlines()]
+    with open(input_products_file, 'r') as f:
+        products_input = set(line.strip() for line in f if line.strip() != '')
+
+    with open(input_urls_file, 'r') as f:
+        urls_input = set(unify_product_url(line.strip()) for line in f if line.strip() != '')
+
+    print(f'Getting urls to products at {datetime.datetime.now()}')
+    product_urls = find_urls(products_input)
+
+    print(f'Found {len(product_urls)} products by search')
+    print(f'Found {len(urls_input)} products by url input')
+    urls = product_urls.union(urls_input)
+    print(f'Total products to verify {len(urls)}')
 
     print(f'Fetching prices at {datetime.datetime.now()}')
     product_info = fetch_product_info(urls)
     print(format_product_table(product_info))
     save_to_json(product_info, output_file)
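
The comment in find_urls notes that the 9-digit regex scans the entire response and could be narrowed to the <script> element carrying the search results. A minimal sketch of that narrowing, reusing the BeautifulSoup import the file already has; the __NEXT_DATA__ id is an assumption about the Lidl search page, not something this commit confirms:

import re

from bs4 import BeautifulSoup


def extract_product_ids(html):
    # Narrow the haystack before applying the 9-digit regex. The selector is a
    # guess: shops often embed search results as JSON in one <script> element.
    soup = BeautifulSoup(html, 'html.parser')
    script = soup.find('script', id='__NEXT_DATA__')  # hypothetical results element
    haystack = script.string if script and script.string else html  # fall back to the full page
    return set(re.findall(r'"([0-9]{9})"', haystack))

With the fallback in place, find_urls could call extract_product_ids(response.text) without changing behaviour when the selector misses.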
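
unify_product_url only strips the optional slug segment; URLs already in the short form pass through untouched. A quick check with the two forms from the comment:

assert unify_product_url(
    'https://www.lidl.cz/p/parkside-uhlova-bruska-pws-230-d4/p100346196'
) == 'https://www.lidl.cz/p/p100346196'
assert unify_product_url('https://www.lidl.cz/p/p100346196') == 'https://www.lidl.cz/p/p100346196'

Since LIDL_PRODUCT_URL_PREFIX is interpolated into the pattern unescaped, its dots act as regex wildcards; re.escape(LIDL_PRODUCT_URL_PREFIX) would make the match exact, though it makes no practical difference for these inputs.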
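
The main block now reads two inputs: products.txt with one search term per line, and urls.txt with one product URL per line (long or short form, since each line goes through unify_product_url). Hypothetical contents, reusing the product from the comment above:

# products.txt - search terms, one per line
parkside uhlova bruska

# urls.txt - product URLs, one per line
https://www.lidl.cz/p/parkside-uhlova-bruska-pws-230-d4/p100346196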