Files
PlantGuideScraper/backend/app/scrapers/bhl.py
2026-04-12 09:54:27 -05:00

229 lines
8.9 KiB
Python

import time
import logging
from typing import Dict, Optional
import httpx
from sqlalchemy.orm import Session
from app.scrapers.base import BaseScraper
from app.models import Species, Image, ApiKey
from app.workers.quality_tasks import download_and_process_image
class BHLScraper(BaseScraper):
    """Scraper for Biodiversity Heritage Library (BHL) images.

    BHL provides access to digitized biodiversity literature and
    illustrations. Most content is public domain (US works published
    95+ years ago) or CC-licensed.

    Note: BHL images are primarily historical botanical illustrations,
    which may differ from photographs but are valuable for training.
    """

    name = "bhl"
    requires_api_key = True  # BHL requires free API key

    BASE_URL = "https://www.biodiversitylibrary.org/api3"
    HEADERS = {
        "User-Agent": "PlantGuideScraper/1.0 (Plant image collection for ML training)",
        "Accept": "application/json",
    }

    # BHL content is mostly public domain
    ALLOWED_LICENSES = {"CC0", "CC-BY", "CC-BY-SA", "PD"}

    # Caps keeping a single species scrape bounded.
    MAX_IMAGES_PER_SPECIES = 50
    MAX_NAME_MATCHES = 5
    MAX_PAGES_PER_TITLE = 100

    # Page-type names considered illustration-like and worth harvesting.
    ILLUSTRATION_TYPES = frozenset({"illustration", "plate", "figure", "map"})

    def scrape_species(
        self,
        species: Species,
        db: Session,
        logger: Optional[logging.Logger] = None
    ) -> Dict[str, int]:
        """Scrape images from BHL for a species.

        Searches BHL by scientific name, walks the matching titles, and
        queues illustration pages for download (capped at
        MAX_IMAGES_PER_SPECIES per species).

        Returns a dict with "downloaded"/"rejected" counts and, when no
        API key is configured, an "error" message.
        """
        api_key = self.get_api_key(db)
        if not api_key:
            return {"downloaded": 0, "rejected": 0, "error": "No API key configured"}
        # api_key is guaranteed truthy here (early return above); guard
        # against rate_limit_per_sec being None or 0, which would raise
        # ZeroDivisionError when computing the inter-request delay.
        rate_limit = api_key.rate_limit_per_sec or 0.5
        delay = 1.0 / rate_limit
        downloaded = 0
        rejected = 0

        def log(level: str, msg: str):
            # Best-effort logging: silently no-op when no logger is supplied.
            if logger:
                getattr(logger, level)(msg)

        try:
            # SECURITY NOTE: SSL verification disabled because some Docker
            # environments lack proper CA certificates. Prefer installing
            # ca-certificates in the image and removing verify=False.
            with httpx.Client(timeout=30, headers=self.HEADERS, verify=False) as client:
                # Search for name in BHL
                search_response = client.get(
                    f"{self.BASE_URL}",
                    params={
                        "op": "NameSearch",
                        "name": species.scientific_name,
                        "format": "json",
                        "apikey": api_key.api_key,
                    },
                )
                search_response.raise_for_status()
                results = search_response.json().get("Result", [])
                if not results:
                    log("info", f" Species not found in BHL: {species.scientific_name}")
                    return {"downloaded": 0, "rejected": 0}
                time.sleep(delay)

                # Walk the top name matches; each yields a set of titles
                # (publications) whose pages may contain illustrations.
                for name_result in results[: self.MAX_NAME_MATCHES]:
                    name_bank_id = name_result.get("NameBankID")
                    if not name_bank_id:
                        continue
                    # Get publications with this name
                    pub_response = client.get(
                        f"{self.BASE_URL}",
                        params={
                            "op": "NameGetDetail",
                            "namebankid": name_bank_id,
                            "format": "json",
                            "apikey": api_key.api_key,
                        },
                    )
                    pub_response.raise_for_status()
                    pub_data = pub_response.json()
                    time.sleep(delay)

                    for title in pub_data.get("Result", []):
                        # Budget = remaining slots under the per-species cap.
                        downloaded += self._scrape_title(
                            client,
                            title,
                            species,
                            db,
                            api_key,
                            delay,
                            self.MAX_IMAGES_PER_SPECIES - downloaded,
                        )
                        if downloaded >= self.MAX_IMAGES_PER_SPECIES:
                            break
                    if downloaded >= self.MAX_IMAGES_PER_SPECIES:
                        break
        except httpx.HTTPStatusError as e:
            log("error", f" HTTP error for {species.scientific_name}: {e.response.status_code}")
        except Exception as e:
            # Broad catch is deliberate: a single species failure must not
            # abort a batch scrape; the error is logged and counts returned.
            log("error", f" Error scraping BHL for {species.scientific_name}: {e}")
        return {"downloaded": downloaded, "rejected": rejected}

    def _scrape_title(
        self,
        client: httpx.Client,
        title: dict,
        species: Species,
        db: Session,
        api_key: ApiKey,
        delay: float,
        budget: int,
    ) -> int:
        """Queue illustration pages from one BHL title.

        Fetches page metadata for *title*, filters to illustration-like
        pages, skips duplicates already in the DB, and queues at most
        *budget* new images for download. Returns the number queued.
        """
        title_id = title.get("TitleID")
        if not title_id or budget <= 0:
            return 0
        # Get pages for this title
        pages_response = client.get(
            f"{self.BASE_URL}",
            params={
                "op": "GetPageMetadata",
                "titleid": title_id,
                "format": "json",
                "apikey": api_key.api_key,
                "ocr": "false",
                "names": "false",
            },
        )
        if pages_response.status_code != 200:
            # Best-effort: skip titles whose metadata cannot be fetched.
            return 0
        pages = pages_response.json().get("Result", [])
        time.sleep(delay)

        queued = 0
        for page in pages[: self.MAX_PAGES_PER_TITLE]:
            if queued >= budget:
                break
            if not self._is_illustration(page):
                continue
            page_id = page.get("PageID")
            if not page_id:
                continue
            source_id = str(page_id)
            # Skip pages already recorded for this source.
            existing = db.query(Image).filter(
                Image.source == self.name,
                Image.source_id == source_id,
            ).first()
            if existing:
                continue

            license_code = self._license_for_page(page)
            # Build attribution from the title's short (or full) name.
            title_name = title.get("ShortTitle", title.get("FullTitle", "Unknown"))
            attribution = f"From '{title_name}' via Biodiversity Heritage Library ({license_code})"

            # Create image record and hand it off to the download worker.
            image = Image(
                species_id=species.id,
                source=self.name,
                source_id=source_id,
                url=f"https://www.biodiversitylibrary.org/pageimage/{page_id}",
                license=license_code,
                attribution=attribution,
                status="pending",
            )
            db.add(image)
            db.commit()
            download_and_process_image.delay(image.id)
            queued += 1
        return queued

    @classmethod
    def _is_illustration(cls, page: dict) -> bool:
        """Return True if *page* should be harvested.

        A page qualifies when any of its PageTypes names an
        illustration-like type, OR when it carries no PageTypes at all
        (untagged pages are deliberately kept — permissive behavior).
        """
        page_types = page.get("PageTypes", [])
        if not page_types:
            return True
        return any(
            pt.get("PageTypeName", "").lower() in cls.ILLUSTRATION_TYPES
            for pt in page_types
        )

    @staticmethod
    def _license_for_page(page: dict) -> str:
        """Map a page's publication year to a license code.

        US works published 95+ years ago are public domain; the cutoff
        is computed from the current year (rolling) rather than the
        previously hard-coded 1927, which goes stale each January.
        """
        year = None
        try:
            # Try to extract year from the page metadata when present.
            if "Year" in page:
                year = int(page.get("Year", 0))
        except (ValueError, TypeError):
            pass
        pd_cutoff = time.localtime().tm_year - 95
        if year and year < pd_cutoff:
            return "PD"
        return "CC0"  # BHL default for older works

    def test_connection(self, api_key: ApiKey) -> str:
        """Test BHL API connection with a trivial NameSearch query.

        Raises httpx.HTTPStatusError on a non-2xx response; returns a
        human-readable success message otherwise.
        """
        # SECURITY NOTE: verify=False kept for parity with scrape_species
        # (missing CA certs in some Docker environments) — fix upstream.
        with httpx.Client(timeout=10, headers=self.HEADERS, verify=False) as client:
            response = client.get(
                f"{self.BASE_URL}",
                params={
                    "op": "NameSearch",
                    "name": "Rosa",
                    "format": "json",
                    "apikey": api_key.api_key,
                },
            )
            response.raise_for_status()
            data = response.json()
            results = data.get("Result", [])
            return f"BHL API connection successful ({len(results)} results for 'Rosa')"