import time
import logging
from typing import Dict, Optional

import httpx
from sqlalchemy.orm import Session

from app.scrapers.base import BaseScraper
from app.models import Species, Image, ApiKey
from app.workers.quality_tasks import download_and_process_image


class BHLScraper(BaseScraper):
    """Scraper for Biodiversity Heritage Library (BHL) images.

    BHL provides access to digitized biodiversity literature and illustrations.
    Most content is public domain (pre-1927) or CC-licensed.

    Note: BHL images are primarily historical botanical illustrations,
    which may differ from photographs but are valuable for training.
    """

    name = "bhl"
    requires_api_key = True  # BHL requires free API key

    BASE_URL = "https://www.biodiversitylibrary.org/api3"
    HEADERS = {
        "User-Agent": "PlantGuideScraper/1.0 (Plant image collection for ML training)",
        "Accept": "application/json",
    }

    # BHL content is mostly public domain
    ALLOWED_LICENSES = {"CC0", "CC-BY", "CC-BY-SA", "PD"}

    # Cap on images queued per species per run (was a thrice-repeated
    # hard-coded 50).
    MAX_IMAGES_PER_SPECIES = 50

    def scrape_species(
        self, species: Species, db: Session, logger: Optional[logging.Logger] = None
    ) -> Dict[str, int]:
        """Scrape images from BHL for a species.

        Searches BHL by scientific name, walks the matching names' titles and
        page metadata, records likely-illustration pages as pending ``Image``
        rows, and queues each one for asynchronous download/processing.

        Args:
            species: Species row providing ``scientific_name`` and ``id``.
            db: SQLAlchemy session used for dedup lookups and inserts.
            logger: Optional logger; when ``None``, logging is a no-op.

        Returns:
            Dict with ``downloaded`` and ``rejected`` counts; includes an
            ``error`` key when no API key is configured.
        """
        api_key = self.get_api_key(db)
        if not api_key:
            return {"downloaded": 0, "rejected": 0, "error": "No API key configured"}

        # Fall back to a conservative 0.5 req/s when the stored limit is
        # unset or zero — a zero would otherwise raise ZeroDivisionError in
        # the pause calculation below.  (The previous `if api_key else 0.5`
        # guard was dead code: we already returned when api_key is falsy.)
        rate_limit = api_key.rate_limit_per_sec or 0.5
        pause = 1.0 / rate_limit  # seconds between API calls

        downloaded = 0
        rejected = 0  # reserved for future quality filtering; never incremented here

        def log(level: str, msg: str):
            # Dispatch to logger.info / logger.error etc. only when a logger
            # was supplied.
            if logger:
                getattr(logger, level)(msg)

        try:
            # NOTE(security): SSL verification is disabled because some Docker
            # environments lack proper CA certificates.  Prefer installing
            # ca-certificates in the image and re-enabling verification.
            with httpx.Client(timeout=30, headers=self.HEADERS, verify=False) as client:
                # Search for name in BHL
                search_response = client.get(
                    f"{self.BASE_URL}",
                    params={
                        "op": "NameSearch",
                        "name": species.scientific_name,
                        "format": "json",
                        "apikey": api_key.api_key,
                    },
                )
                search_response.raise_for_status()
                search_data = search_response.json()

                results = search_data.get("Result", [])
                if not results:
                    log("info", f" Species not found in BHL: {species.scientific_name}")
                    return {"downloaded": 0, "rejected": 0}

                time.sleep(pause)

                # Get pages with illustrations for each name result
                for name_result in results[:5]:  # Limit to top 5 matches
                    name_bank_id = name_result.get("NameBankID")
                    if not name_bank_id:
                        continue

                    # Get publications with this name
                    pub_response = client.get(
                        f"{self.BASE_URL}",
                        params={
                            "op": "NameGetDetail",
                            "namebankid": name_bank_id,
                            "format": "json",
                            "apikey": api_key.api_key,
                        },
                    )
                    pub_response.raise_for_status()
                    pub_data = pub_response.json()
                    time.sleep(pause)

                    # Extract titles and get page images
                    for title in pub_data.get("Result", []):
                        title_id = title.get("TitleID")
                        if not title_id:
                            continue

                        # Get pages for this title
                        pages_response = client.get(
                            f"{self.BASE_URL}",
                            params={
                                "op": "GetPageMetadata",
                                "titleid": title_id,
                                "format": "json",
                                "apikey": api_key.api_key,
                                "ocr": "false",
                                "names": "false",
                            },
                        )
                        if pages_response.status_code != 200:
                            # Best-effort: skip titles whose page metadata is
                            # unavailable rather than aborting the species.
                            continue

                        pages_data = pages_response.json()
                        pages = pages_data.get("Result", [])
                        time.sleep(pause)

                        # Look for pages that are likely illustrations
                        for page in pages[:100]:  # Limit pages per title
                            page_types = page.get("PageTypes", [])
                            # any() over an empty iterable is False, so the
                            # former `... if page_types else False` wrapper
                            # was redundant.
                            is_illustration = any(
                                pt.get("PageTypeName", "").lower()
                                in {"illustration", "plate", "figure", "map"}
                                for pt in page_types
                            )
                            # Keep untyped pages (no metadata to judge by);
                            # skip typed pages that are not illustrations.
                            if page_types and not is_illustration:
                                continue

                            page_id = page.get("PageID")
                            if not page_id:
                                continue

                            # Construct image URL — BHL serves page scans at a
                            # stable per-page endpoint.
                            image_url = f"https://www.biodiversitylibrary.org/pageimage/{page_id}"

                            # Check if already exists
                            source_id = str(page_id)
                            existing = (
                                db.query(Image)
                                .filter(
                                    Image.source == self.name,
                                    Image.source_id == source_id,
                                )
                                .first()
                            )
                            if existing:
                                continue

                            # Determine license - BHL content is usually
                            # public domain.  Try to extract a publication
                            # year from the page metadata.
                            year = None
                            try:
                                if "Year" in page:
                                    year = int(page.get("Year", 0))
                            except (ValueError, TypeError):
                                pass

                            # Content before 1927 is public domain in US
                            if year and year < 1927:
                                license_code = "PD"
                            else:
                                license_code = "CC0"  # BHL default for older works

                            # Build attribution
                            title_name = title.get("ShortTitle", title.get("FullTitle", "Unknown"))
                            attribution = (
                                f"From '{title_name}' via Biodiversity Heritage Library ({license_code})"
                            )

                            # Create image record and commit immediately so the
                            # row has an id before queueing.
                            image = Image(
                                species_id=species.id,
                                source=self.name,
                                source_id=source_id,
                                url=image_url,
                                license=license_code,
                                attribution=attribution,
                                status="pending",
                            )
                            db.add(image)
                            db.commit()

                            # Queue for asynchronous download
                            download_and_process_image.delay(image.id)
                            downloaded += 1

                            # Limit total per species (break out of all three
                            # nested loops).
                            if downloaded >= self.MAX_IMAGES_PER_SPECIES:
                                break
                        if downloaded >= self.MAX_IMAGES_PER_SPECIES:
                            break
                    if downloaded >= self.MAX_IMAGES_PER_SPECIES:
                        break

        except httpx.HTTPStatusError as e:
            log("error", f" HTTP error for {species.scientific_name}: {e.response.status_code}")
        except Exception as e:
            # Top-level boundary: log and return partial counts rather than
            # crashing the scrape run.
            log("error", f" Error scraping BHL for {species.scientific_name}: {e}")

        return {"downloaded": downloaded, "rejected": rejected}

    def test_connection(self, api_key: ApiKey) -> str:
        """Test BHL API connection with a simple name search for 'Rosa'.

        Raises:
            httpx.HTTPStatusError: if the API rejects the request/key.
        """
        # NOTE(security): verify=False kept for parity with scrape_species —
        # see the note there.
        with httpx.Client(timeout=10, headers=self.HEADERS, verify=False) as client:
            response = client.get(
                f"{self.BASE_URL}",
                params={
                    "op": "NameSearch",
                    "name": "Rosa",
                    "format": "json",
                    "apikey": api_key.api_key,
                },
            )
            response.raise_for_status()
            data = response.json()
            results = data.get("Result", [])
            return f"BHL API connection successful ({len(results)} results for 'Rosa')"