Initial commit — PlantGuideScraper project
This commit is contained in:
226
backend/app/scrapers/eol.py
Normal file
226
backend/app/scrapers/eol.py
Normal file
@@ -0,0 +1,226 @@
|
||||
import hashlib
import logging
import time
from typing import Dict, Optional

import httpx
from sqlalchemy.orm import Session

from app.models import Species, Image, ApiKey
from app.scrapers.base import BaseScraper
from app.workers.quality_tasks import download_and_process_image
|
||||
|
||||
|
||||
class EOLScraper(BaseScraper):
    """Scraper for Encyclopedia of Life (EOL) images.

    EOL aggregates biodiversity data from many sources and provides
    a free API with no authentication required.
    """

    name = "eol"
    requires_api_key = False

    BASE_URL = "https://eol.org/api"

    HEADERS = {
        "User-Agent": "PlantGuideScraper/1.0 (Plant image collection for ML training)",
        "Accept": "application/json",
    }

    # Requests per second used when no usable rate limit is configured.
    DEFAULT_RATE_LIMIT = 0.5

    # SECURITY NOTE: TLS certificate verification is disabled because some
    # Docker environments lack proper CA certificates. This leaves requests
    # open to man-in-the-middle tampering; set to True (or a CA bundle path)
    # wherever certificates are available.
    VERIFY_TLS = False

    # Map EOL license URLs to short codes
    LICENSE_MAP = {
        "http://creativecommons.org/publicdomain/zero/1.0/": "CC0",
        "http://creativecommons.org/publicdomain/mark/1.0/": "CC0",
        "http://creativecommons.org/licenses/by/2.0/": "CC-BY",
        "http://creativecommons.org/licenses/by/3.0/": "CC-BY",
        "http://creativecommons.org/licenses/by/4.0/": "CC-BY",
        "http://creativecommons.org/licenses/by-sa/2.0/": "CC-BY-SA",
        "http://creativecommons.org/licenses/by-sa/3.0/": "CC-BY-SA",
        "http://creativecommons.org/licenses/by-sa/4.0/": "CC-BY-SA",
        "https://creativecommons.org/publicdomain/zero/1.0/": "CC0",
        "https://creativecommons.org/publicdomain/mark/1.0/": "CC0",
        "https://creativecommons.org/licenses/by/2.0/": "CC-BY",
        "https://creativecommons.org/licenses/by/3.0/": "CC-BY",
        "https://creativecommons.org/licenses/by/4.0/": "CC-BY",
        "https://creativecommons.org/licenses/by-sa/2.0/": "CC-BY-SA",
        "https://creativecommons.org/licenses/by-sa/3.0/": "CC-BY-SA",
        "https://creativecommons.org/licenses/by-sa/4.0/": "CC-BY-SA",
        "pd": "CC0",  # Public domain
        "public domain": "CC0",
    }

    # Commercial-safe licenses
    ALLOWED_LICENSES = {"CC0", "CC-BY", "CC-BY-SA"}

    @classmethod
    def _resolve_license(cls, license_url: str) -> Optional[str]:
        """Return the short license code whose pattern occurs in *license_url*.

        Patterns are tried in ``LICENSE_MAP`` order using substring matching;
        ``None`` is returned when nothing matches. *license_url* is expected
        to be lower-cased already.
        """
        for pattern, code in cls.LICENSE_MAP.items():
            if pattern in license_url:
                return code
        return None

    @staticmethod
    def _make_source_id(obj: dict, image_url: str) -> str:
        """Return an identifier for a media object that is stable across runs.

        Prefers the ``dataObjectVersionID`` or ``identifier`` fields of *obj*.
        When neither is present, falls back to a digest of the image URL.
        The built-in ``hash()`` must not be used here: string hashes are
        salted per process, so the ID would change on every run and the
        duplicate check would never match.
        """
        stable_id = obj.get("dataObjectVersionID") or obj.get("identifier")
        if stable_id:
            return str(stable_id)
        digest = hashlib.sha256(image_url.encode("utf-8")).hexdigest()
        # 16 hex chars (64 bits) keeps the ID no longer than the old
        # str(hash(...)) fallback could be.
        return digest[:16]

    @staticmethod
    def _build_attribution(agents: list, license_code: str) -> str:
        """Build the attribution string from an object's agent list.

        Uses the last agent with role ``photographer`` and the last agent
        with role ``owner`` / ``rights holder``; always ends with the EOL
        credit and license code.
        """
        photographer = None
        rights_holder = None

        for agent in agents:
            # Roles may be missing or null in the payload.
            role = (agent.get("role") or "").lower()
            full_name = agent.get("full_name", "")
            if role == "photographer":
                photographer = full_name
            elif role in ("owner", "rights holder"):
                rights_holder = full_name

        parts = []
        if photographer:
            parts.append(f"Photo by {photographer}")
        if rights_holder and rights_holder != photographer:
            parts.append(f"Rights: {rights_holder}")
        parts.append(f"via EOL ({license_code})")
        return " | ".join(parts)

    @staticmethod
    def _extract_data_objects(page_data: dict) -> list:
        """Return the list of media objects from a pages API response.

        Reads the top-level ``dataObjects`` key first. If that is absent or
        empty, falls back to ``taxonConcept.dataObjects``.
        NOTE(review): the nested layout is believed to be what current EOL
        responses use -- confirm against the live API.
        """
        data_objects = page_data.get("dataObjects")
        if not data_objects:
            taxon_concept = page_data.get("taxonConcept")
            if isinstance(taxon_concept, dict):
                data_objects = taxon_concept.get("dataObjects")
        return data_objects or []

    def scrape_species(
        self,
        species: Species,
        db: Session,
        logger: Optional[logging.Logger] = None
    ) -> Dict[str, int]:
        """Scrape images from EOL for a species.

        Searches EOL for the species' scientific name, fetches the first
        matching page's media objects, and for every image with an allowed
        license creates a pending ``Image`` row and queues its download.

        Args:
            species: Species whose ``scientific_name`` is searched and whose
                ``id`` is stored on new images.
            db: Session used for the duplicate check and to persist images.
            logger: Optional logger; when omitted nothing is logged.

        Returns:
            ``{"downloaded": <queued count>, "rejected": <rejected count>}``.
            HTTP and other errors are logged and swallowed; the counts
            accumulated so far are returned.
        """
        api_key = self.get_api_key(db)
        rate_limit = api_key.rate_limit_per_sec if api_key else None
        # Guard against a missing, zero or negative configured limit, which
        # would otherwise make the delay computation fail.
        if not rate_limit or rate_limit <= 0:
            rate_limit = self.DEFAULT_RATE_LIMIT
        delay = 1.0 / rate_limit

        downloaded = 0
        rejected = 0

        def log(level: str, msg: str):
            if logger:
                getattr(logger, level)(msg)

        try:
            with httpx.Client(
                timeout=30, headers=self.HEADERS, verify=self.VERIFY_TLS
            ) as client:
                # Step 1: Search for the species
                search_response = client.get(
                    f"{self.BASE_URL}/search/1.0.json",
                    params={
                        "q": species.scientific_name,
                        "page": 1,
                        "exact": "true",
                    },
                )
                search_response.raise_for_status()
                search_data = search_response.json()

                results = search_data.get("results", [])
                if not results:
                    log("info", f" Species not found in EOL: {species.scientific_name}")
                    return {"downloaded": 0, "rejected": 0}

                # Get the EOL page ID
                eol_page_id = results[0].get("id")
                if not eol_page_id:
                    return {"downloaded": 0, "rejected": 0}

                time.sleep(delay)

                # Step 2: Get page details with images
                # NOTE(review): "cc-by-nc" is requested here although NC
                # licenses are always rejected below -- confirm whether it
                # should be dropped from the filter.
                page_response = client.get(
                    f"{self.BASE_URL}/pages/1.0/{eol_page_id}.json",
                    params={
                        "images_per_page": 75,
                        "images_page": 1,
                        "videos_per_page": 0,
                        "sounds_per_page": 0,
                        "maps_per_page": 0,
                        "texts_per_page": 0,
                        "details": "true",
                        "licenses": "cc-by|cc-by-sa|pd|cc-by-nc",
                    },
                )
                page_response.raise_for_status()
                page_data = page_response.json()

                data_objects = self._extract_data_objects(page_data)
                log("debug", f" Found {len(data_objects)} media objects")

                for obj in data_objects:
                    # Only process images ("stillimage" also contains "image")
                    media_type = (obj.get("dataType") or "").lower()
                    if "image" not in media_type:
                        continue

                    # Get image URL
                    image_url = obj.get("eolMediaURL") or obj.get("mediaURL")
                    if not image_url:
                        rejected += 1
                        continue

                    # Check license (the field may be missing or null)
                    license_url = (obj.get("license") or "").lower()
                    license_code = self._resolve_license(license_url)

                    if not license_code:
                        # NC licenses are rejected silently; anything else
                        # unrecognised is rejected with a debug message.
                        if "-nc" not in license_url:
                            log("debug", f" Rejected: unknown license {license_url}")
                        rejected += 1
                        continue

                    if license_code not in self.ALLOWED_LICENSES:
                        rejected += 1
                        continue

                    # Create unique source ID
                    source_id = self._make_source_id(obj, image_url)

                    # Check if already exists
                    existing = db.query(Image).filter(
                        Image.source == self.name,
                        Image.source_id == source_id,
                    ).first()
                    if existing:
                        continue

                    # Build attribution
                    attribution = self._build_attribution(
                        obj.get("agents") or [], license_code
                    )

                    # Create image record
                    image = Image(
                        species_id=species.id,
                        source=self.name,
                        source_id=source_id,
                        url=image_url,
                        license=license_code,
                        attribution=attribution,
                        status="pending",
                    )
                    db.add(image)
                    db.commit()

                    # Queue for download
                    download_and_process_image.delay(image.id)
                    downloaded += 1

                    time.sleep(delay)

        except httpx.HTTPStatusError as e:
            log("error", f" HTTP error for {species.scientific_name}: {e.response.status_code}")
        except Exception as e:
            # Top-level boundary: one failing species must not abort the
            # caller, so the error is logged and partial counts returned.
            log("error", f" Error scraping EOL for {species.scientific_name}: {e}")

        return {"downloaded": downloaded, "rejected": rejected}

    def test_connection(self, api_key: ApiKey) -> str:
        """Test EOL API connection.

        Runs a search for "Rosa" and reports the total result count.

        Args:
            api_key: Not used by this method.

        Returns:
            A human-readable success message.

        Raises:
            httpx.HTTPStatusError: If the API responds with an error status.
        """
        with httpx.Client(
            timeout=10, headers=self.HEADERS, verify=self.VERIFY_TLS
        ) as client:
            response = client.get(
                f"{self.BASE_URL}/search/1.0.json",
                params={"q": "Rosa", "page": 1},
            )
            response.raise_for_status()
            data = response.json()

        total = data.get("totalResults", 0)
        return f"EOL API connection successful ({total} results for 'Rosa')"
|
||||
Reference in New Issue
Block a user