From 716acad6db20328ebc98880c6244c7bf9ee03245 Mon Sep 17 00:00:00 2001
From: Simeon Wallrath
Date: Fri, 31 Oct 2025 10:26:05 +0100
Subject: [PATCH] Create Scraper.py

---
 Scraper.py | 166 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 166 insertions(+)
 create mode 100644 Scraper.py

diff --git a/Scraper.py b/Scraper.py
new file mode 100644
index 0000000..951a78f
--- /dev/null
+++ b/Scraper.py
@@ -0,0 +1,166 @@
+import csv
+import os
+import re
+import time
+import errno
+from datetime import datetime
+from selenium import webdriver
+from selenium.webdriver.chrome.options import Options
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.common.exceptions import TimeoutException
+from bs4 import BeautifulSoup
+
+BASE_URL = "https://www.lebensmittel-sonderposten.de/Suessigkeiten/?p={page}"
+
+def parse_price(price_str):
+    """Extract the first number from a price string, keeping the decimal comma."""
+    match = re.search(r"[\d,.]+", price_str or "")
+    if not match:
+        return "0,0"
+    cleaned = match.group(0)
+    if "," in cleaned and "." in cleaned:
+        cleaned = cleaned.replace(".", "")  # remove thousands separator
+    return cleaned
+
+# Selenium setup
+opts = Options()
+opts.add_argument("--headless")
+opts.add_argument("--disable-gpu")
+opts.add_argument("--no-sandbox")
+opts.add_argument("--disable-blink-features=AutomationControlled")
+opts.add_argument("start-maximized")
+opts.add_argument(
+    "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+    "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
+)
+
+driver = webdriver.Chrome(options=opts)
+wait = WebDriverWait(driver, 10)
+
+def get_products(page, retries=3):
+    url = BASE_URL.format(page=page)
+    for attempt in range(retries):
+        driver.get(url)
+        try:
+            wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, ".product-box")))
+            time.sleep(1)
+            break
+        except TimeoutException:
+            print(f"⚠️ Timeout loading page {page}, retry {attempt+1}/{retries}")
+            if attempt == retries - 1:
+                return []
+
+    soup = BeautifulSoup(driver.page_source, "html.parser")
+    products = []
+
+    for product in soup.select(".product-box"):
+        name_el = product.select_one("a.product-name")
+        price_el = product.select_one(".product-price.with-list-price, .product-price")
+        list_price_el = product.select_one(".list-price-price")
+        saving_el = product.select_one(".list-price-percentage")
+        base_price_el = product.select_one(".product-base-price")
+        unit_el = product.select_one(".price-unit-content")
+
+        if not name_el or not price_el:
+            continue
+
+        name = name_el.text.strip()
+        product_url = name_el.get("href")
+
+        # Extract prices, keeping the German decimal comma
+        price = parse_price(price_el.get_text(" ", strip=True))
+        list_price = parse_price(list_price_el.text) if list_price_el else price
+
+        saving_percent = "0,0"
+        if saving_el:
+            match = re.search(r"[\d,.]+", saving_el.text)
+            if match:
+                saving_percent = match.group(0).replace(".", ",")
+
+        base_price = None
+        if base_price_el:
+            match = re.search(r"[\d,.]+", base_price_el.text)
+            if match:
+                base_price = match.group(0).replace(".", ",")
+
+        unit = unit_el.text.strip() if unit_el else None
+
+        products.append({
+            "name": name,
+            "url": product_url,
+            "price": price,
+            "list_price": list_price,
+            "saving": saving_percent,
+            "price_per_kg": base_price,
+            "unit": unit
+        })
+
+    return products
+
+def safe_write_csv(filename, data):
+    """Write the CSV, retrying under a new name if permission is denied."""
+    max_tries = 100
+    attempt = 0
+    base, ext = os.path.splitext(filename)
+    csv_file = None
+
+    while attempt < max_tries:
+        suffix = f"_{attempt}" if attempt > 0 else ""
+        candidate = f"{base}{suffix}{ext}"
+        try:
+            with open(candidate, mode="w", newline="", encoding="utf-8") as f:
+                writer = csv.DictWriter(
+                    f,
+                    fieldnames=["name", "url", "price", "list_price", "saving", "price_per_kg", "unit"],
+                    delimiter=";"
+                )
+                writer.writeheader()
+                writer.writerows(data)
+            csv_file = candidate
+            break
+        except PermissionError:
+            attempt += 1
+        except OSError as e:
+            if e.errno in (errno.EACCES, errno.EPERM):
+                attempt += 1
+                continue
+            raise
+
+    if csv_file:
+        print(f"✅ Saved {len(data)} products to {csv_file}")
+    else:
+        print("❌ Failed to save CSV file after multiple attempts.")
+
+# Scraping loop
+all_products = []
+page = 1
+current_datetime = datetime.now().strftime("%Y-%m-%d_%H-%M")
+csv_filename = f"sonderposten_{current_datetime}.csv"
+
+try:
+    while True:
+        print(f"Scraping page {page}...")
+        products = get_products(page)
+
+        soup = BeautifulSoup(driver.page_source, "html.parser")
+        alert = soup.select_one("div.alert.alert-info[role='alert']")
+        if alert and "Keine Produkte gefunden" in alert.text:
+            print("✅ No more products found, stopping.")
+            break
+
+        if not products:
+            print("⚠️ No products found on this page, stopping.")
+            break
+
+        all_products.extend(products)
+        page += 1
+
+except KeyboardInterrupt:
+    print("\n⏹️ Scraping cancelled, saving current progress...")
+finally:
+    driver.quit()
+    # Always sort by saving, highest first
+    all_products.sort(key=lambda x: float(x["saving"].replace(",", ".")), reverse=True)
+    safe_write_csv(csv_filename, all_products)