Init
Commit 791398168c
3 changed files with 303 additions and 0 deletions
DOIScraper.py (new file, 277 lines)
@@ -0,0 +1,277 @@
import re
import io
import os
import sys
import csv
import signal
import requests
import tkinter as tk
from tkinter import filedialog
from bs4 import BeautifulSoup
from urllib.parse import urljoin

from pdfminer.high_level import extract_text_to_fp
from pdfminer.layout import LAParams
from openpyxl import load_workbook, Workbook

HEADERS = {
    "User-Agent": "Mozilla/5.0 (compatible; DOI-Email-Finder/1.0)"
}

EMAIL_REGEX = re.compile(
    r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
)

# Addresses on these publisher domains are discarded by filter_publisher_emails().
PUBLISHER_EMAIL_DOMAINS = {
    "elsevier.com", "springer.com", "springernature.com", "nature.com",
    "wiley.com", "tandfonline.com", "oxfordjournals.org", "oup.com",
    "cambridge.org", "cambridgejournals.org", "ieee.org", "acm.org",
    "sciencedirect.com", "mdpi.com", "frontiersin.org", "hindawi.com", "plos.org",
}

# Generic mailbox names (editorial offices, support desks) that are discarded as well.
LOCALPART_BLACKLIST = {
    "editor", "editors", "support", "info", "contact",
    "office", "help", "admin", "journal"
}


def normalize_doi(raw):
    if not raw:
        return ""
    raw = raw.strip()
    return re.sub(r"^https?://(dx\.)?doi\.org/", "", raw, flags=re.I)


def filter_publisher_emails(emails_with_source):
    filtered = set()
    for email, source in emails_with_source:
        try:
            local, domain = email.lower().split("@", 1)
        except ValueError:
            continue
        if local in LOCALPART_BLACKLIST:
            continue
        if any(domain.endswith(p) for p in PUBLISHER_EMAIL_DOMAINS):
            continue
        filtered.add((email, source))
    return filtered


def get_crossref_emails(doi):
    try:
        r = requests.get(
            f"https://api.crossref.org/works/{doi}",
            headers=HEADERS,
            timeout=10
        )
        r.raise_for_status()
    except Exception:
        return set()
    emails = set()
    for author in r.json().get("message", {}).get("author", []):
        if "email" in author:
            emails.add((author["email"], "crossref"))
    return emails


def extract_emails_from_html(url):
    try:
        r = requests.get(url, headers=HEADERS, timeout=10)
        r.raise_for_status()
    except Exception:
        return set()
    return {(e, "website") for e in EMAIL_REGEX.findall(r.text)}


def get_scihub_pdf(doi, mirror_url=None):
    # Ask a Sci-Hub mirror for the DOI and return a direct PDF URL, if one is embedded.
    if not mirror_url:
        return None
    try:
        url = mirror_url + doi
        r = requests.get(url, headers=HEADERS, timeout=10)
        r.raise_for_status()
        soup = BeautifulSoup(r.text, "html.parser")

        obj_tag = soup.find("object", type="application/pdf")
        if obj_tag and obj_tag.get("data"):
            return urljoin(mirror_url, obj_tag["data"])

        dl_tag = soup.select_one("div.download a[href]")
        if dl_tag:
            return urljoin(mirror_url, dl_tag["href"])
    except Exception:
        return None
    return None


def find_pdf_url(soup, base_url):
    if not soup:
        return None
    for a in soup.find_all("a", href=True):
        if ".pdf" in a["href"].lower():
            return urljoin(base_url, a["href"])
    return None


def extract_emails_from_pdf(pdf_url, source):
    try:
        r = requests.get(pdf_url, headers=HEADERS, timeout=15)
        r.raise_for_status()
    except Exception:
        return set()

    emails = set()
    output_string = io.StringIO()
    try:
        extract_text_to_fp(io.BytesIO(r.content), output_string, laparams=LAParams(), output_type='text')
        text = output_string.getvalue()
        for e in EMAIL_REGEX.findall(text):
            emails.add((e, source))
    except Exception:
        pass
    return emails


def find_author_emails(doi, scihub_mirror=None):
    emails = set()
    emails |= get_crossref_emails(doi)

    # DOI landing page
    try:
        r = requests.get(f"https://doi.org/{doi}", headers=HEADERS, allow_redirects=True, timeout=10)
        r.raise_for_status()
        soup = BeautifulSoup(r.text, "html.parser")
        emails |= extract_emails_from_html(r.url)
    except Exception:
        soup = None

    # Sci-Hub first, landing-page PDF as fallback
    scihub_pdf = get_scihub_pdf(doi, scihub_mirror)
    if scihub_pdf:
        emails |= extract_emails_from_pdf(scihub_pdf, "pdf (Sci-Hub)")
    else:
        pdf_url = find_pdf_url(soup, r.url) if soup else None
        if pdf_url:
            emails |= extract_emails_from_pdf(pdf_url, "pdf (landing page)")

    return filter_publisher_emails(emails)


def read_input_file(path):
    if path.lower().endswith(".csv"):
        with open(path, newline="", encoding="utf-8") as f:
            reader = csv.DictReader(f)
            return list(reader), "csv"

    if path.lower().endswith(".xlsx"):
        wb = load_workbook(path)
        ws = wb.active
        headers = [c.value for c in ws[1]]
        rows = []
        for row in ws.iter_rows(min_row=2, values_only=True):
            rows.append(dict(zip(headers, row)))
        return rows, "xlsx"

    raise ValueError("Unsupported file type")


def get_safe_output_path(input_path, filetype):
    base = input_path.rsplit(".", 1)[0] + "_with_emails"
    output_path = f"{base}.{filetype}"
    counter = 1
    while os.path.exists(output_path):
        output_path = f"{base}_{counter}.{filetype}"
        counter += 1
    return output_path


def write_output_file(rows, input_path, filetype):
    output_path = get_safe_output_path(input_path, filetype)
    try:
        if filetype == "csv":
            with open(output_path, "w", newline="", encoding="utf-8") as f:
                writer = csv.DictWriter(f, fieldnames=rows[0].keys())
                writer.writeheader()
                writer.writerows(rows)
        else:
            wb = Workbook()
            ws = wb.active
            ws.append(list(rows[0].keys()))
            for row in rows:
                ws.append(list(row.values()))
            wb.save(output_path)
        print(f"\nDone.\nOutput written to: {output_path}")
    except PermissionError:
        # Output file is locked (e.g. still open in Excel): keep trying
        # numbered alternatives until one can be written.
        counter = 1
        while True:
            try:
                alt_path = output_path.rsplit(".", 1)[0] + f"_{counter}.{filetype}"
                if filetype == "csv":
                    with open(alt_path, "w", newline="", encoding="utf-8") as f:
                        writer = csv.DictWriter(f, fieldnames=rows[0].keys())
                        writer.writeheader()
                        writer.writerows(rows)
                else:
                    wb.save(alt_path)
                print(f"\nDone.\nOutput written to: {alt_path}")
                break
            except PermissionError:
                counter += 1


def setup_ctrlc_handler(rows, input_path, filetype):
    def handler(signum, frame):
        print("\n[INFO] Ctrl+C detected. Saving progress...")
        write_output_file(rows, input_path, filetype)
        print("[INFO] Partial results saved. Exiting.")
        sys.exit(0)
    signal.signal(signal.SIGINT, handler)


def get_scihub_mirror_from_user():
    mirror = input("Enter Sci-Hub mirror URL (e.g., sci-hub.st or https://sci-hub.st/) or leave blank to skip: ").strip()
    if not mirror:
        return None
    if not mirror.startswith("http"):
        mirror = "https://" + mirror
    if not mirror.endswith("/"):
        mirror += "/"
    return mirror


if __name__ == "__main__":
    root = tk.Tk()
    root.withdraw()

    file_path = filedialog.askopenfilename(
        title="Select CSV or XLSX file",
        filetypes=[("CSV / Excel", "*.csv *.xlsx")]
    )

    if not file_path:
        print("No file selected.")
        raise SystemExit

    scihub_mirror = get_scihub_mirror_from_user()
    rows, filetype = read_input_file(file_path)
    print(f"Processing {len(rows)} DOI(s)...\n")

    setup_ctrlc_handler(rows, file_path, filetype)

    for i, row in enumerate(rows, start=1):
        doi = normalize_doi(row.get("DOI", ""))
        if doi:
            print(f"[{i}/{len(rows)}] Processing DOI: {doi}")
            emails = sorted({e for e, _ in find_author_emails(doi, scihub_mirror)})
            print(f"  Found {len(emails)} email(s): {', '.join(emails) if emails else 'None'}")
            row["author_emails"] = "; ".join(emails)
        else:
            print(f"[{i}/{len(rows)}] DOI missing or empty")
            row["author_emails"] = ""

    write_output_file(rows, file_path, filetype)

readme.md (new file, 22 lines)
@@ -0,0 +1,22 @@
This script needs a CSV or XLSX file with a column named DOI. It saves a new file in the same format, suffixed `_with_emails`, with an added `author_emails` column listing the addresses found for each DOI. Other columns are carried over unchanged.
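
A minimal input file might look like this (the DOIs and the extra Title column below are placeholders for illustration):

```
DOI,Title
10.1000/example-1,Some paper
https://doi.org/10.1000/example-2,Another paper
```

Both bare DOIs and full https://doi.org/ links are accepted; the prefix is stripped before lookup.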
Open a command line and run:
pip install -r requirements.txt
python DOIScraper.py
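
The script opens a file picker, then prompts for an optional Sci-Hub mirror. As a rough illustration, a run over a two-row file might print something like the following (file name, DOI, and address are placeholders; the messages themselves come from the script):

```
Enter Sci-Hub mirror URL (e.g., sci-hub.st or https://sci-hub.st/) or leave blank to skip:
Processing 2 DOI(s)...

[1/2] Processing DOI: 10.1000/example-1
  Found 1 email(s): author@example.org
[2/2] DOI missing or empty

Done.
Output written to: papers_with_emails.csv
```

If a run is interrupted with Ctrl+C, the rows processed so far are saved before the script exits.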
requirements.txt (new file, 4 lines)
@@ -0,0 +1,4 @@
requests
beautifulsoup4
pdfminer.six
openpyxl