From 716acad6db20328ebc98880c6244c7bf9ee03245 Mon Sep 17 00:00:00 2001
From: Simeon Wallrath
Date: Fri, 31 Oct 2025 10:26:05 +0100
Subject: [PATCH] Create Scraper.py

---
 Scraper.py | 166 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 166 insertions(+)
 create mode 100644 Scraper.py

diff --git a/Scraper.py b/Scraper.py
new file mode 100644
index 0000000..951a78f
--- /dev/null
+++ b/Scraper.py
@@ -0,0 +1,166 @@
+import csv
+import os
+import re
+import time
+import errno
+from datetime import datetime
+from selenium import webdriver
+from selenium.webdriver.chrome.options import Options
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.common.exceptions import TimeoutException
+from bs4 import BeautifulSoup
+
+BASE_URL = "https://www.lebensmittel-sonderposten.de/Suessigkeiten/?p={page}"
+
+def parse_price(price_str):
+    """Extract the first number from a price string, keeping the decimal comma."""
+    match = re.search(r"[\d,.]+", price_str or "")
+    if not match:
+        return "0,0"
+    cleaned = match.group(0)
+    if "," in cleaned and "." in cleaned:
+        cleaned = cleaned.replace(".", "")  # remove thousands separator
+    return cleaned
+
+# Selenium setup
+opts = Options()
+opts.add_argument("--headless")
+opts.add_argument("--disable-gpu")
+opts.add_argument("--no-sandbox")
+opts.add_argument("--disable-blink-features=AutomationControlled")
+opts.add_argument("start-maximized")
+opts.add_argument(
+    "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+    "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
+)
+
+driver = webdriver.Chrome(options=opts)
+wait = WebDriverWait(driver, 10)
+
+def get_products(page, retries=3):
+    url = BASE_URL.format(page=page)
+    for attempt in range(retries):
+        driver.get(url)
+        try:
+            wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, ".product-box")))
+            time.sleep(1)
+            break
+        except TimeoutException:
+            print(f"⚠️ Timeout loading page {page}, retry {attempt+1}/{retries}")
+            if attempt == retries - 1:
+                return []
+
+    soup = BeautifulSoup(driver.page_source, "html.parser")
+    products = []
+
+    for product in soup.select(".product-box"):
+        name_el = product.select_one("a.product-name")
+        price_el = product.select_one(".product-price.with-list-price, .product-price")
+        list_price_el = product.select_one(".list-price-price")
+        saving_el = product.select_one(".list-price-percentage")
+        base_price_el = product.select_one(".product-base-price")
+        unit_el = product.select_one(".price-unit-content")
+
+        if not name_el or not price_el:
+            continue
+
+        name = name_el.text.strip()
+        product_url = name_el.get("href")
+
+        # Extract prices, keeping the German decimal comma
+        price = parse_price(price_el.get_text(" ", strip=True))
+        list_price = parse_price(list_price_el.text) if list_price_el else price
+
+        saving_percent = "0,0"
+        if saving_el:
+            match = re.search(r"[\d,.]+", saving_el.text)
+            if match:
+                saving_percent = match.group(0).replace(".", ",")
+
+        base_price = None
+        if base_price_el:
+            match = re.search(r"[\d,.]+", base_price_el.text)
+            if match:
+                base_price = match.group(0).replace(".", ",")
+
+        unit = unit_el.text.strip() if unit_el else None
+
+        products.append({
+            "name": name,
+            "url": product_url,
+            "price": price,
+            "list_price": list_price,
+            "saving": saving_percent,
+            "price_per_kg": base_price,
+            "unit": unit
+        })
+
+    return products
+
+def safe_write_csv(filename, data):
+    """Write the CSV, retrying under a new name if permission is denied."""
+    max_tries = 100
+    attempt = 0
+    base, ext = os.path.splitext(filename)
+    csv_file = None
+
+    while attempt < max_tries:
+        suffix = f"_{attempt}" if attempt > 0 else ""
+        candidate = f"{base}{suffix}{ext}"
+        try:
+            with open(candidate, mode="w", newline="", encoding="utf-8") as f:
+                writer = csv.DictWriter(
+                    f,
+                    fieldnames=["name", "url", "price", "list_price", "saving", "price_per_kg", "unit"],
+                    delimiter=";"
+                )
+                writer.writeheader()
+                writer.writerows(data)
+            csv_file = candidate
+            break
+        except PermissionError:
+            attempt += 1
+        except OSError as e:
+            if e.errno in (errno.EACCES, errno.EPERM):
+                attempt += 1
+                continue
+            raise
+
+    if csv_file:
+        print(f"✅ Saved {len(data)} products to {csv_file}")
+    else:
+        print("❌ Failed to save CSV file after multiple attempts.")
+
+# Scraping loop
+all_products = []
+page = 1
+current_datetime = datetime.now().strftime("%Y-%m-%d_%H-%M")
+csv_filename = f"sonderposten_{current_datetime}.csv"
+
+try:
+    while True:
+        print(f"Scraping page {page}...")
+        products = get_products(page)
+
+        soup = BeautifulSoup(driver.page_source, "html.parser")
+        alert = soup.select_one("div.alert.alert-info[role='alert']")
+        if alert and "Keine Produkte gefunden" in alert.text:
+            print("✅ No more products found, stopping.")
+            break
+
+        if not products:
+            print("⚠️ No products found on this page, stopping.")
+            break
+
+        all_products.extend(products)
+        page += 1
+
+except KeyboardInterrupt:
+    print("\n⏹️ Scraping cancelled, saving current progress...")
+finally:
+    driver.quit()
+    # Always sort by saving, highest first
+    all_products.sort(key=lambda x: float(x["saving"].replace(",", ".")), reverse=True)
+    safe_write_csv(csv_filename, all_products)