# Source: PlantGuideScraper/backend/app/scrapers/eol.py
# Snapshot: 2026-04-12 09:54:27 -05:00 (227 lines, 8.7 KiB, Python)

import hashlib
import logging
import time
from typing import Dict, Optional

import httpx
from sqlalchemy.orm import Session

from app.models import Species, Image, ApiKey
from app.scrapers.base import BaseScraper
from app.workers.quality_tasks import download_and_process_image
class EOLScraper(BaseScraper):
    """Scraper for Encyclopedia of Life (EOL) images.

    EOL aggregates biodiversity data from many sources and provides
    a free API with no authentication required.

    For one species the scraper searches EOL by scientific name, fetches the
    media objects of the first matching page, keeps only images whose license
    maps to one of ``ALLOWED_LICENSES``, stores an ``Image`` row for every
    image not already recorded and queues that row for download.
    """

    name = "eol"
    requires_api_key = False

    BASE_URL = "https://eol.org/api"

    HEADERS = {
        "User-Agent": "PlantGuideScraper/1.0 (Plant image collection for ML training)",
        "Accept": "application/json",
    }

    # SECURITY NOTE: TLS certificate verification is disabled by default
    # because some Docker environments lack proper CA certificates. This
    # leaves requests open to interception; set to True (or override in a
    # subclass) wherever a CA bundle is available.
    VERIFY_SSL = False

    # Requests per second used when no ApiKey row (or no usable rate) exists.
    DEFAULT_RATE_LIMIT_PER_SEC = 0.5

    # Value of the "licenses" filter sent with the page request. Only licenses
    # that can pass ALLOWED_LICENSES are requested, so the per-page image
    # quota is not spent on non-commercial images that would be rejected.
    API_LICENSE_FILTER = "cc-by|cc-by-sa|pd"

    # Map EOL license URLs to short codes
    LICENSE_MAP = {
        "http://creativecommons.org/publicdomain/zero/1.0/": "CC0",
        "http://creativecommons.org/publicdomain/mark/1.0/": "CC0",
        "http://creativecommons.org/licenses/by/2.0/": "CC-BY",
        "http://creativecommons.org/licenses/by/3.0/": "CC-BY",
        "http://creativecommons.org/licenses/by/4.0/": "CC-BY",
        "http://creativecommons.org/licenses/by-sa/2.0/": "CC-BY-SA",
        "http://creativecommons.org/licenses/by-sa/3.0/": "CC-BY-SA",
        "http://creativecommons.org/licenses/by-sa/4.0/": "CC-BY-SA",
        "https://creativecommons.org/publicdomain/zero/1.0/": "CC0",
        "https://creativecommons.org/publicdomain/mark/1.0/": "CC0",
        "https://creativecommons.org/licenses/by/2.0/": "CC-BY",
        "https://creativecommons.org/licenses/by/3.0/": "CC-BY",
        "https://creativecommons.org/licenses/by/4.0/": "CC-BY",
        "https://creativecommons.org/licenses/by-sa/2.0/": "CC-BY-SA",
        "https://creativecommons.org/licenses/by-sa/3.0/": "CC-BY-SA",
        "https://creativecommons.org/licenses/by-sa/4.0/": "CC-BY-SA",
        "pd": "CC0",  # Public domain
        "public domain": "CC0",
    }

    # Commercial-safe licenses
    ALLOWED_LICENSES = {"CC0", "CC-BY", "CC-BY-SA"}

    @classmethod
    def _license_code(cls, raw_license: Optional[str]) -> Optional[str]:
        """Translate an EOL license value into a short code from LICENSE_MAP.

        URL keys are matched as substrings of the license value. Short
        non-URL keys ("pd", "public domain") must equal the whole value:
        a substring test on a two-letter token would classify any unrelated
        license string that happens to contain "pd" as public domain.

        Returns:
            The short license code, or None when nothing matches.
        """
        value = str(raw_license or "").strip().lower()
        if not value:
            return None
        for pattern, code in cls.LICENSE_MAP.items():
            if pattern.startswith("http"):
                if pattern in value:
                    return code
            elif pattern == value:
                return code
        return None

    @staticmethod
    def _source_id(obj: Dict, image_url: str) -> str:
        """Return an identifier for a media object that is stable across runs.

        Prefers the identifiers supplied by EOL. The fallback is a digest of
        the image URL rather than the built-in hash(), whose value for
        strings changes between interpreter processes and therefore cannot
        be used to detect images stored by an earlier run.
        """
        eol_id = obj.get("dataObjectVersionID") or obj.get("identifier")
        if eol_id:
            return str(eol_id)
        return hashlib.sha256(image_url.encode("utf-8")).hexdigest()[:32]

    @staticmethod
    def _build_attribution(obj: Dict, license_code: str) -> str:
        """Build the attribution string from the media object's agents.

        Agents with role "photographer" and "owner"/"rights holder" are
        credited; the rights holder is omitted when identical to the
        photographer. The result always ends with the EOL/license credit.
        """
        photographer = None
        rights_holder = None
        # "agents", "role" and "full_name" may be absent or null in the JSON.
        for agent in obj.get("agents") or []:
            role = (agent.get("role") or "").lower()
            agent_name = agent.get("full_name") or ""
            if role == "photographer":
                photographer = agent_name
            elif role in ("owner", "rights holder"):
                rights_holder = agent_name

        parts = []
        if photographer:
            parts.append(f"Photo by {photographer}")
        if rights_holder and rights_holder != photographer:
            parts.append(f"Rights: {rights_holder}")
        parts.append(f"via EOL ({license_code})")
        return " | ".join(parts)

    def scrape_species(
        self,
        species: Species,
        db: Session,
        logger: Optional[logging.Logger] = None
    ) -> Dict[str, int]:
        """Scrape images from EOL for a species.

        Args:
            species: Species to look up; its ``scientific_name`` is the
                search term and its ``id`` is stored on each new image.
            db: Session used to look up existing images and persist new ones.
            logger: Optional logger; when omitted nothing is logged.

        Returns:
            ``{"downloaded": n, "rejected": m}`` where ``downloaded`` counts
            images queued for download and ``rejected`` counts images skipped
            for a missing URL or an unknown/disallowed license. Errors are
            logged, not raised; the counts reached so far are returned.
        """
        api_key = self.get_api_key(db)
        rate_limit = api_key.rate_limit_per_sec if api_key else None
        # A missing, zero or negative stored rate would make the delay
        # computation fail, so fall back to the default instead.
        if not rate_limit or rate_limit <= 0:
            rate_limit = self.DEFAULT_RATE_LIMIT_PER_SEC
        delay = 1.0 / rate_limit

        downloaded = 0
        rejected = 0

        def log(level: str, msg: str) -> None:
            if logger:
                getattr(logger, level)(msg)

        try:
            with httpx.Client(
                timeout=30, headers=self.HEADERS, verify=self.VERIFY_SSL
            ) as client:
                # Step 1: Search for the species
                search_response = client.get(
                    f"{self.BASE_URL}/search/1.0.json",
                    params={
                        "q": species.scientific_name,
                        "page": 1,
                        "exact": "true",
                    },
                )
                search_response.raise_for_status()
                results = search_response.json().get("results") or []

                if not results:
                    log("info", f" Species not found in EOL: {species.scientific_name}")
                    return {"downloaded": 0, "rejected": 0}

                # Get the EOL page ID
                eol_page_id = results[0].get("id")
                if not eol_page_id:
                    return {"downloaded": 0, "rejected": 0}

                time.sleep(delay)

                # Step 2: Get page details with images
                page_response = client.get(
                    f"{self.BASE_URL}/pages/1.0/{eol_page_id}.json",
                    params={
                        "images_per_page": 75,
                        "images_page": 1,
                        "videos_per_page": 0,
                        "sounds_per_page": 0,
                        "maps_per_page": 0,
                        "texts_per_page": 0,
                        "details": "true",
                        "licenses": self.API_LICENSE_FILTER,
                    },
                )
                page_response.raise_for_status()
                data_objects = page_response.json().get("dataObjects") or []
                log("debug", f" Found {len(data_objects)} media objects")

                for obj in data_objects:
                    # Only process images ("StillImage" types contain "image")
                    if "image" not in (obj.get("dataType") or "").lower():
                        continue

                    # Get image URL
                    image_url = obj.get("eolMediaURL") or obj.get("mediaURL")
                    if not image_url:
                        rejected += 1
                        continue

                    # Check license
                    license_url = str(obj.get("license") or "").lower()
                    license_code = self._license_code(license_url)
                    if not license_code:
                        # Non-commercial licenses are rejected silently;
                        # anything else unrecognised is logged for review.
                        if "-nc" not in license_url:
                            log("debug", f" Rejected: unknown license {license_url}")
                        rejected += 1
                        continue
                    if license_code not in self.ALLOWED_LICENSES:
                        rejected += 1
                        continue

                    # Skip images already recorded for this source
                    source_id = self._source_id(obj, image_url)
                    existing = db.query(Image).filter(
                        Image.source == self.name,
                        Image.source_id == source_id,
                    ).first()
                    if existing:
                        continue

                    # Create image record
                    image = Image(
                        species_id=species.id,
                        source=self.name,
                        source_id=source_id,
                        url=image_url,
                        license=license_code,
                        attribution=self._build_attribution(obj, license_code),
                        status="pending",
                    )
                    db.add(image)
                    # Commit before queueing so image.id is assigned and the
                    # row exists when the download task looks it up.
                    db.commit()

                    # Queue for download
                    download_and_process_image.delay(image.id)
                    downloaded += 1

                # Throttle once after the page request; the loop above makes
                # no HTTP calls of its own.
                time.sleep(delay)

        except httpx.HTTPStatusError as e:
            log("error", f" HTTP error for {species.scientific_name}: {e.response.status_code}")
        except Exception as e:
            log("error", f" Error scraping EOL for {species.scientific_name}: {e}")
            # A failed flush/commit leaves the session unusable until it is
            # rolled back; reset it so the caller can keep using `db`.
            db.rollback()

        return {"downloaded": downloaded, "rejected": rejected}

    def test_connection(self, api_key: ApiKey) -> str:
        """Test EOL API connection.

        Runs a search for "Rosa" and reports the ``totalResults`` value of
        the response.

        Args:
            api_key: Accepted for interface compatibility; not used, since
                the request carries no credentials.

        Returns:
            A human-readable success message.

        Raises:
            httpx.HTTPStatusError: If the API answers with an error status.
        """
        with httpx.Client(
            timeout=10, headers=self.HEADERS, verify=self.VERIFY_SSL
        ) as client:
            response = client.get(
                f"{self.BASE_URL}/search/1.0.json",
                params={"q": "Rosa", "page": 1},
            )
            response.raise_for_status()
            data = response.json()

        total = data.get("totalResults", 0)
        return f"EOL API connection successful ({total} results for 'Rosa')"