Compare commits: fix-multip...master (1 commit)

| Author | SHA1 | Date |
|---|---|---|
|  | 16144fc428 |  |
.gitignore (vendored, 1 change)

```diff
@@ -1,5 +1,6 @@
 .env
 urls.txt
+products.txt
 
 # Created by https://www.toptal.com/developers/gitignore/api/python,intellij+all
 # Edit at https://www.toptal.com/developers/gitignore?templates=python,intellij+all
```
README.md

````diff
@@ -4,7 +4,7 @@ Simple app which checks a price of selected items and prints their current price
 
 ## How to build and run
 
-1. create file with urls to be watched `cp urls.txt.example urls.txt`
+1. create file with urls and file with products to be watched `cp urls.txt.example urls.txt ; cp products.txt.example products.txt`
 2. Configure SMTP and mail destination (for mail notification)
 
 ```
````
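For reference, a minimal sketch of what the two input files from step 1 could contain; the URL is taken from the `unify_product_url` comment in this diff and the search term from `products.txt.example`, so only the pairing is assumed. `main.py` reads both files line by line and skips blank lines.

urls.txt (one product URL per line):

```
https://www.lidl.cz/p/parkside-uhlova-bruska-pws-230-d4/p100346196
```

products.txt (one search term per line):

```
Kapovací a pokosová pila
```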
````diff
@@ -27,5 +27,5 @@ or build docker image and run it using the docker:
 
 ```
 docker build -t lidl-price-scraper .
-docker run -it --rm -v $(pwd)/urls.txt:/app/urls.txt --env-file ./.env lidl-price-scraper
+docker run -it --rm -v $(pwd)/urls.txt:/app/urls.txt -v $(pwd)/products.txt:/app/products.txt --env-file ./.env lidl-price-scraper
 ```
````
main.py (60 changes)

```diff
@@ -6,7 +6,10 @@ import requests
 from bs4 import BeautifulSoup
 from mail_sender import EmailSender
 from utils import format_product_table
-from lxml import etree
+from urllib.parse import quote
 
+LIDL_PRODUCT_URL_PREFIX = "https://www.lidl.cz/p/"
+
+
 def fetch_product_info(urls):
     product_info = []
```
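The import swap replaces `lxml` with `urllib.parse.quote`, which the new search code uses to percent-encode product terms into the query string. A quick standard-library illustration (not project code):

```python
from urllib.parse import quote

# quote() percent-encodes the UTF-8 bytes, so Czech diacritics survive the URL
print(quote("Kapovací a pokosová pila"))
# Kapovac%C3%AD%20a%20pokosov%C3%A1%20pila
```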
```diff
@@ -15,17 +18,10 @@ def fetch_product_info(urls):
         response = requests.get(url)
         if response.status_code == 200:
             soup = BeautifulSoup(response.text, 'html.parser')
-            html_str = str(soup)
-            root_element = etree.fromstring(html_str, parser=etree.HTMLParser())
             product_name = soup.find('h1', class_='keyfacts__title').text.strip()
             current_price = soup.find('div', class_='m-price__price').text.strip()
             original_price_element = soup.find('span', class_='m-price__rrp')
             original_price = original_price_element.text.strip() if original_price_element else "-"
 
-
-            discount_xpath = '//div[@class="m-price__label" and not(ancestor::*[@style[contains(., "display: none")]])]'
-            discount_xpath_results = root_element.xpath(discount_xpath)
-            discount_elements = [BeautifulSoup(etree.tostring(elem), 'html.parser') for elem in discount_xpath_results]
             discount_element = soup.find('div', class_='m-price__label')
             discount = int(re.findall(r'\d+', discount_element.text.strip())[0]) if discount_element else 0
 
```
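With the XPath visibility check removed, discount detection rests entirely on `soup.find` plus the digit regex that was already in place. A minimal sketch of the extraction, with the label text assumed for illustration:

```python
import re

label_text = "-30%"  # assumed example of an m-price__label value
# pull the first run of digits out of the label; no label means no discount
discount = int(re.findall(r'\d+', label_text)[0]) if label_text else 0
print(discount)  # 30
```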
```diff
@@ -42,21 +38,61 @@ def fetch_product_info(urls):
     return product_info
 
 
+def find_urls(products):
+    all_links = set()
+    for product in products:
+        url = f'https://www.lidl.cz/q/search?q={quote(product)}'
+        response = requests.get(url)
+
+        if response.status_code == 200:
+            # this might potentially pick up invalid products: it relies on the product number being 9 digits
+            # surrounded by quotes, and on there being no other 9-digit strings in the whole returned HTML;
+            # it could be improved to target only the <script> element containing the results (still big, but an easy win) - currently this works
+            matches = re.findall(r'"([0-9]{9})"', response.text)
+            for match in matches:
+                all_links.add(LIDL_PRODUCT_URL_PREFIX + "p" + match)
+            print("OK")
+        else:
+            print(f"Failed to search for product by URL: {url}")
+    return all_links
+
+
 def save_to_json(product_info, output_file):
     with open(output_file, 'w') as f:
         json.dump(product_info, f, indent=4)
 
 
+def unify_product_url(url):
+    # Optionally a product can have an extra url segment (probably for SEO purposes), so the following urls
+    # are semantically the same:
+    # - https://www.lidl.cz/p/parkside-uhlova-bruska-pws-230-d4/p100346196
+    # - https://www.lidl.cz/p/p100346196
+    pattern = r"%s[^/]+/" % LIDL_PRODUCT_URL_PREFIX
+    return re.sub(pattern, "%s" % LIDL_PRODUCT_URL_PREFIX, url)
+
+
 if __name__ == "__main__":
-    input_file = "urls.txt"
+    input_urls_file = "urls.txt"
+    input_products_file = "products.txt"
     output_file = "product_info.json"
 
-    with open(input_file, 'r') as f:
-        urls = [line.strip() for line in f.readlines()]
+    with open(input_products_file, 'r') as f:
+        products_input = set([unify_product_url(line.strip()) for line in f.readlines() if line.strip() != ''])
+
+    with open(input_urls_file, 'r') as f:
+        urls_input = set([unify_product_url(line.strip()) for line in f.readlines() if line.strip() != ''])
+
+    print(f'Getting urls to products at {datetime.datetime.now()}')
+    product_urls = find_urls(products_input)
+
+    print(f'Found {len(product_urls)} products by search')
+    print(f'Found {len(urls_input)} products by url input')
+    urls = product_urls.union(urls_input)
+    print(f'Total products to verify {len(urls)}')
 
     print(f'Fetching prices at {datetime.datetime.now()}')
 
-    product_info = fetch_product_info(urls)
+    product_info = fetch_product_info(urls_input.union(product_urls))
     print(format_product_table(product_info))
     save_to_json(product_info, output_file)
```
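A self-contained sketch of how the two new helpers behave, reusing the URL from the code comment; the JSON key in the search-HTML snippet is an assumption for illustration:

```python
import re

LIDL_PRODUCT_URL_PREFIX = "https://www.lidl.cz/p/"

def unify_product_url(url):
    # strip the optional SEO segment so both URL forms normalize to one key
    pattern = r"%s[^/]+/" % LIDL_PRODUCT_URL_PREFIX
    return re.sub(pattern, "%s" % LIDL_PRODUCT_URL_PREFIX, url)

print(unify_product_url("https://www.lidl.cz/p/parkside-uhlova-bruska-pws-230-d4/p100346196"))
# https://www.lidl.cz/p/p100346196

# find_urls() keys on any quoted 9-digit number in the search response
html = '{"productId": "100346196"}'  # hypothetical fragment of search-result HTML
matches = re.findall(r'"([0-9]{9})"', html)
print([LIDL_PRODUCT_URL_PREFIX + "p" + m for m in matches])
# ['https://www.lidl.cz/p/p100346196']
```

Because urls.txt entries are normalized with `unify_product_url` and both sources are kept in sets, a product found via search and via urls.txt is unioned down to a single URL and fetched only once.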
products.txt.example (new file, 1 line)

```diff
@@ -0,0 +1 @@
+Kapovací a pokosová pila
```

("Kapovací a pokosová pila" is Czech for "chop and miter saw".)
requirements.txt

```diff
@@ -1,4 +1,3 @@
 requests
 beautifulsoup4
 tabulate
-lxml
```