Initial commit — PlantGuideScraper project
This commit is contained in:
228
backend/app/scrapers/bhl.py
Normal file
228
backend/app/scrapers/bhl.py
Normal file
@@ -0,0 +1,228 @@
|
||||
import time
|
||||
import logging
|
||||
from typing import Dict, Optional
|
||||
|
||||
import httpx
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from app.scrapers.base import BaseScraper
|
||||
from app.models import Species, Image, ApiKey
|
||||
from app.workers.quality_tasks import download_and_process_image
|
||||
|
||||
|
||||
class BHLScraper(BaseScraper):
    """Scraper for Biodiversity Heritage Library (BHL) images.

    BHL provides access to digitized biodiversity literature and illustrations.
    Most content is public domain (pre-1927) or CC-licensed.

    Note: BHL images are primarily historical botanical illustrations,
    which may differ from photographs but are valuable for training.
    """

    name = "bhl"
    requires_api_key = True  # BHL requires free API key

    BASE_URL = "https://www.biodiversitylibrary.org/api3"

    HEADERS = {
        "User-Agent": "PlantGuideScraper/1.0 (Plant image collection for ML training)",
        "Accept": "application/json",
    }

    # BHL content is mostly public domain
    ALLOWED_LICENSES = {"CC0", "CC-BY", "CC-BY-SA", "PD"}

    # Page types that count as illustrations worth downloading.
    ILLUSTRATION_PAGE_TYPES = {"illustration", "plate", "figure", "map"}

    # Hard cap on images queued per species per run.
    MAX_IMAGES_PER_SPECIES = 50

    def scrape_species(
        self,
        species: Species,
        db: Session,
        logger: Optional[logging.Logger] = None
    ) -> Dict[str, int]:
        """Scrape images from BHL for a species.

        Flow: NameSearch -> NameGetDetail (per name match) ->
        GetPageMetadata (per title) -> queue each illustration page for
        asynchronous download, up to MAX_IMAGES_PER_SPECIES total.

        Args:
            species: Species row whose ``scientific_name`` is searched.
            db: Open SQLAlchemy session used for dedup checks and inserts.
            logger: Optional logger for progress/error messages.

        Returns:
            Dict with "downloaded" (images queued) and "rejected" counts;
            also an "error" message when no API key is configured.
        """
        api_key = self.get_api_key(db)
        if not api_key:
            return {"downloaded": 0, "rejected": 0, "error": "No API key configured"}

        # Guard against a missing/zero per-key rate limit, which would
        # otherwise raise ZeroDivisionError in the sleep computation.
        rate_limit = api_key.rate_limit_per_sec or 0.5
        delay = 1.0 / rate_limit

        downloaded = 0
        rejected = 0

        def log(level: str, msg: str):
            # No-op when no logger was supplied.
            if logger:
                getattr(logger, level)(msg)

        try:
            # NOTE(review): SSL verification is disabled because some Docker
            # environments lack proper CA certificates. This is a security
            # trade-off; prefer installing CA certs and using verify=True.
            with httpx.Client(timeout=30, headers=self.HEADERS, verify=False) as client:
                # Search for name in BHL
                search_response = client.get(
                    f"{self.BASE_URL}",
                    params={
                        "op": "NameSearch",
                        "name": species.scientific_name,
                        "format": "json",
                        "apikey": api_key.api_key,
                    },
                )
                search_response.raise_for_status()
                results = search_response.json().get("Result", [])
                if not results:
                    log("info", f" Species not found in BHL: {species.scientific_name}")
                    return {"downloaded": 0, "rejected": 0}

                time.sleep(delay)

                # Walk the top name matches until the per-species budget is spent.
                for name_result in results[:5]:  # Limit to top 5 matches
                    budget = self.MAX_IMAGES_PER_SPECIES - downloaded
                    if budget <= 0:
                        break
                    downloaded += self._scrape_name_result(
                        client, name_result, species, db, api_key, delay, budget
                    )

        except httpx.HTTPStatusError as e:
            log("error", f" HTTP error for {species.scientific_name}: {e.response.status_code}")
        except Exception as e:
            log("error", f" Error scraping BHL for {species.scientific_name}: {e}")

        return {"downloaded": downloaded, "rejected": rejected}

    def _scrape_name_result(self, client, name_result, species, db, api_key, delay, budget) -> int:
        """Queue up to *budget* illustration pages for one NameSearch match.

        Returns the number of images queued.
        """
        name_bank_id = name_result.get("NameBankID")
        if not name_bank_id:
            return 0

        # Get publications carrying this name.
        pub_response = client.get(
            f"{self.BASE_URL}",
            params={
                "op": "NameGetDetail",
                "namebankid": name_bank_id,
                "format": "json",
                "apikey": api_key.api_key,
            },
        )
        pub_response.raise_for_status()
        pub_data = pub_response.json()

        time.sleep(delay)

        queued = 0
        for title in pub_data.get("Result", []):
            if queued >= budget:
                break
            queued += self._scrape_title(
                client, title, species, db, api_key, delay, budget - queued
            )
        return queued

    def _scrape_title(self, client, title, species, db, api_key, delay, budget) -> int:
        """Queue up to *budget* illustration pages from one BHL title.

        Returns the number of images queued.
        """
        title_id = title.get("TitleID")
        if not title_id:
            return 0

        pages_response = client.get(
            f"{self.BASE_URL}",
            params={
                "op": "GetPageMetadata",
                "titleid": title_id,
                "format": "json",
                "apikey": api_key.api_key,
                "ocr": "false",
                "names": "false",
            },
        )
        # Best-effort: skip titles whose metadata cannot be fetched
        # instead of aborting the whole species.
        if pages_response.status_code != 200:
            return 0

        pages = pages_response.json().get("Result", [])

        time.sleep(delay)

        queued = 0
        for page in pages[:100]:  # Limit pages per title
            if queued >= budget:
                break
            queued += self._queue_page(page, title, species, db)
        return queued

    def _queue_page(self, page, title, species, db) -> int:
        """Record and queue one BHL page if it looks like an illustration.

        Returns 1 when an image record was created and queued, else 0.
        """
        page_types = page.get("PageTypes", [])
        # Keep only illustration-like pages. Pages with NO type metadata
        # are kept, matching BHL records that omit PageTypes entirely.
        if page_types and not any(
            pt.get("PageTypeName", "").lower() in self.ILLUSTRATION_PAGE_TYPES
            for pt in page_types
        ):
            return 0

        page_id = page.get("PageID")
        if not page_id:
            return 0

        # Dedup: skip pages already recorded for this source.
        source_id = str(page_id)
        existing = db.query(Image).filter(
            Image.source == self.name,
            Image.source_id == source_id,
        ).first()
        if existing:
            return 0

        license_code = self._license_for_page(page)

        # Build attribution from the title metadata.
        title_name = title.get("ShortTitle", title.get("FullTitle", "Unknown"))
        attribution = f"From '{title_name}' via Biodiversity Heritage Library ({license_code})"

        image = Image(
            species_id=species.id,
            source=self.name,
            source_id=source_id,
            # BHL serves the page scan directly at this stable URL
            # (multiple sizes are available server-side).
            url=f"https://www.biodiversitylibrary.org/pageimage/{page_id}",
            license=license_code,
            attribution=attribution,
            status="pending",
        )
        db.add(image)
        # Commit so image.id exists before handing it to the task queue.
        db.commit()

        download_and_process_image.delay(image.id)
        return 1

    @staticmethod
    def _license_for_page(page) -> str:
        """Best-effort license code for a BHL page.

        US copyright on published works lasts 95 years, so the
        public-domain cutoff rolls forward annually (the previous
        hardcoded 1927 cutoff went stale).
        """
        year = None
        try:
            # Try to extract the publication year from the page metadata.
            if "Year" in page:
                year = int(page.get("Year", 0))
        except (ValueError, TypeError):
            pass

        pd_cutoff = time.gmtime().tm_year - 95
        if year and year < pd_cutoff:
            return "PD"
        # Default for works without a usable year.
        # NOTE(review): confirm CC0 is the right default for undated works.
        return "CC0"

    def test_connection(self, api_key: ApiKey) -> str:
        """Test BHL API connection.

        Performs a NameSearch for "Rosa" and raises on HTTP failure.

        Args:
            api_key: ApiKey row whose ``api_key`` value is used.

        Returns:
            Human-readable success message including the result count.
        """
        # NOTE(review): verify=False mirrors scrape_species (Docker CA issue).
        with httpx.Client(timeout=10, headers=self.HEADERS, verify=False) as client:
            response = client.get(
                f"{self.BASE_URL}",
                params={
                    "op": "NameSearch",
                    "name": "Rosa",
                    "format": "json",
                    "apikey": api_key.api_key,
                },
            )
            response.raise_for_status()
            data = response.json()

        results = data.get("Result", [])
        return f"BHL API connection successful ({len(results)} results for 'Rosa')"
|
||||
Reference in New Issue
Block a user