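"""Scrape the Süssigkeiten category of lebensmittel-sonderposten.de.

Loads each listing page in headless Chrome, parses the product cards with
BeautifulSoup, and writes the results to a semicolon-delimited CSV sorted by
discount. Prices keep the decimal comma, presumably so the file opens
cleanly in German-locale spreadsheet software.
"""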

import csv
import errno
import os
import re
import time
from datetime import datetime

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

BASE_URL = "https://www.lebensmittel-sonderposten.de/Suessigkeiten/?p={page}"


def parse_price(price_str):
    """Reduce a price string to digits plus a decimal comma."""
    # Keep digits and both separators, drop currency symbols and whitespace
    cleaned = re.sub(r"[^\d,.]", "", price_str or "")
    if "," in cleaned and "." in cleaned:
        cleaned = cleaned.replace(".", "")  # remove thousands separator
    # Fall back to "0,0" when nothing numeric is left
    return cleaned if cleaned else "0,0"
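
# For example, parse_price("1.234,56 €") returns "1234,56",
# and parse_price("ab 0,99€") returns "0,99".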


# Selenium: headless Chrome with a desktop user agent; the UA override and
# the AutomationControlled flag make the session look less like a bot
opts = Options()
opts.add_argument("--headless")
opts.add_argument("--disable-gpu")
opts.add_argument("--no-sandbox")
opts.add_argument("--disable-blink-features=AutomationControlled")
opts.add_argument("--start-maximized")
opts.add_argument(
    "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
)
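# Note: depending on the installed Chrome version, "--headless=new" may be
# required instead of the legacy "--headless" flag.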

driver = webdriver.Chrome(options=opts)
wait = WebDriverWait(driver, 10)


def get_products(page, retries=3):
    """Load one listing page and return its product dicts (empty on timeout)."""
    url = BASE_URL.format(page=page)
    for attempt in range(retries):
        driver.get(url)
        try:
            wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, ".product-box")))
            time.sleep(1)  # give late-rendering content a moment
            break
        except TimeoutException:
            print(f"⚠️ Timeout loading page {page}, retry {attempt + 1}/{retries}")
            if attempt == retries - 1:
                return []

    soup = BeautifulSoup(driver.page_source, "html.parser")
    products = []

    for product in soup.select(".product-box"):
        name_el = product.select_one("a.product-name")
        price_el = product.select_one(".product-price.with-list-price, .product-price")
        list_price_el = product.select_one(".list-price-price")
        saving_el = product.select_one(".list-price-percentage")
        base_price_el = product.select_one(".product-base-price")
        unit_el = product.select_one(".price-unit-content")

        if not name_el or not price_el:
            continue

        name = name_el.text.strip()
        url = name_el.get("href")

        # Extract prices with regex, normalizing "1.234,56" to "1234,56"
        price = "0,0"
        match = re.search(r"[\d,.]+", price_el.get_text(" ", strip=True))
        if match:
            price = match.group(0).replace(".", "")

        list_price = price
        if list_price_el:
            match = re.search(r"[\d,.]+", list_price_el.text)
            if match:
                list_price = match.group(0).replace(".", "")

        saving_percent = "0,0"
        if saving_el:
            match = re.search(r"([\d,.]+)", saving_el.text)
            if match:
                saving_percent = match.group(1).replace(".", ",")

        base_price = None
        if base_price_el:
            match = re.search(r"[\d,.]+", base_price_el.text)
            if match:
                base_price = match.group(0).replace(".", ",")

        unit = unit_el.text.strip() if unit_el else None

        products.append({
            "name": name,
            "url": url,
            "price": price,
            "list_price": list_price,
            "saving": saving_percent,
            "price_per_kg": base_price,
            "unit": unit,
        })

    return products
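
# Each dict keeps prices as comma-decimal strings, e.g. (values illustrative):
# {"name": "...", "url": "...", "price": "0,99", "list_price": "1,99",
#  "saving": "50", "price_per_kg": "3,96", "unit": "..."}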


def safe_write_csv(filename, data):
    """Write CSV safely, retrying if permission denied, keeping decimal commas."""
    max_tries = 100
    attempt = 0
    base, ext = os.path.splitext(filename)
    csv_file = None

    while attempt < max_tries:
        # First try the requested name, then _1, _2, ... if it is locked
        suffix = f"_{attempt}" if attempt > 0 else ""
        candidate = f"{base}{suffix}{ext}"
        try:
            with open(candidate, mode="w", newline="", encoding="utf-8") as f:
                writer = csv.DictWriter(
                    f,
                    fieldnames=["name", "url", "price", "list_price", "saving", "price_per_kg", "unit"],
                    delimiter=";",
                )
                writer.writeheader()
                for p in data:
                    writer.writerow(p)
            csv_file = candidate
            break
        except PermissionError:
            attempt += 1
            continue
        except OSError as e:
            if e.errno in (errno.EACCES, errno.EPERM):
                attempt += 1
                continue
            raise

    if csv_file:
        print(f"✅ Saved {len(data)} products to {csv_file}")
    else:
        print("❌ Failed to save CSV file after multiple attempts.")
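
# A PermissionError usually means the target file is open elsewhere (e.g.
# Excel holding a lock on Windows), so the retry picks a fresh filename
# instead of crashing.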


# Scraping loop
all_products = []
page = 1
current_datetime = datetime.now().strftime("%Y-%m-%d_%H-%M")
csv_filename = f"sonderposten_{current_datetime}.csv"

try:
    while True:
        print(f"Scraping page {page}...")
        products = get_products(page)

        # Past the last page the shop shows an info alert
        # ("Keine Produkte gefunden" = "No products found")
        soup = BeautifulSoup(driver.page_source, "html.parser")
        alert = soup.select_one("div.alert.alert-info[role='alert']")
        if alert and "Keine Produkte gefunden" in alert.text:
            print("✅ No more products found, stopping.")
            break

        if not products:
            print("⚠️ No products found on this page, stopping.")
            break

        all_products.extend(products)
        page += 1

except KeyboardInterrupt:
    print("\n⏹️ Scraping cancelled — saving current progress...")
finally:
    driver.quit()
    # Always sort by saving descending ("30,5" -> 30.5 for the sort key)
    all_products.sort(key=lambda x: float(x["saving"].replace(",", ".")), reverse=True)
    safe_write_csv(csv_filename, all_products)
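
# Usage sketch, assuming Chrome is installed (Selenium 4.6+ fetches a matching
# chromedriver automatically):
#   python scraper.py          # script name is hypothetical
# Output lands in e.g. sonderposten_2024-01-31_14-05.csv; the semicolon
# delimiter and decimal commas open cleanly in German-locale Excel.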