Files
PlantGuideScraper/backend/app/scrapers/bhl.py
2026-04-12 09:54:27 -05:00

229 lines
8.9 KiB
Python

import time
import logging
from typing import Dict, Optional
import httpx
from sqlalchemy.orm import Session
from app.scrapers.base import BaseScraper
from app.models import Species, Image, ApiKey
from app.workers.quality_tasks import download_and_process_image
class BHLScraper(BaseScraper):
    """Scraper for Biodiversity Heritage Library (BHL) images.

    BHL provides access to digitized biodiversity literature and
    illustrations. Most content is public domain (US works published
    95+ years ago) or CC-licensed.

    Note: BHL images are primarily historical botanical illustrations,
    which may differ from photographs but are valuable for training.
    """

    name = "bhl"
    requires_api_key = True  # BHL requires free API key

    BASE_URL = "https://www.biodiversitylibrary.org/api3"
    HEADERS = {
        "User-Agent": "PlantGuideScraper/1.0 (Plant image collection for ML training)",
        "Accept": "application/json",
    }

    # BHL content is mostly public domain
    ALLOWED_LICENSES = {"CC0", "CC-BY", "CC-BY-SA", "PD"}

    # Caps keeping a single species scrape bounded.
    MAX_IMAGES_PER_SPECIES = 50
    MAX_NAME_MATCHES = 5
    MAX_PAGES_PER_TITLE = 100

    # Page-type names considered illustration-like and worth harvesting.
    ILLUSTRATION_TYPES = frozenset({"illustration", "plate", "figure", "map"})

    def scrape_species(
        self,
        species: Species,
        db: Session,
        logger: Optional[logging.Logger] = None
    ) -> Dict[str, int]:
        """Scrape images from BHL for a species.

        Searches BHL by scientific name, walks the matching titles, and
        queues illustration pages for download (capped at
        MAX_IMAGES_PER_SPECIES per species).

        Returns a dict with "downloaded"/"rejected" counts and, when no
        API key is configured, an "error" message.
        """
        api_key = self.get_api_key(db)
        if not api_key:
            return {"downloaded": 0, "rejected": 0, "error": "No API key configured"}
        # api_key is guaranteed truthy here (early return above); guard
        # against rate_limit_per_sec being None or 0, which would raise
        # ZeroDivisionError when computing the inter-request delay.
        rate_limit = api_key.rate_limit_per_sec or 0.5
        delay = 1.0 / rate_limit
        downloaded = 0
        rejected = 0

        def log(level: str, msg: str):
            # Best-effort logging: silently no-op when no logger is supplied.
            if logger:
                getattr(logger, level)(msg)

        try:
            # SECURITY NOTE: SSL verification disabled because some Docker
            # environments lack proper CA certificates. Prefer installing
            # ca-certificates in the image and removing verify=False.
            with httpx.Client(timeout=30, headers=self.HEADERS, verify=False) as client:
                # Search for name in BHL
                search_response = client.get(
                    f"{self.BASE_URL}",
                    params={
                        "op": "NameSearch",
                        "name": species.scientific_name,
                        "format": "json",
                        "apikey": api_key.api_key,
                    },
                )
                search_response.raise_for_status()
                results = search_response.json().get("Result", [])
                if not results:
                    log("info", f" Species not found in BHL: {species.scientific_name}")
                    return {"downloaded": 0, "rejected": 0}
                time.sleep(delay)

                # Walk the top name matches; each yields a set of titles
                # (publications) whose pages may contain illustrations.
                for name_result in results[: self.MAX_NAME_MATCHES]:
                    name_bank_id = name_result.get("NameBankID")
                    if not name_bank_id:
                        continue
                    # Get publications with this name
                    pub_response = client.get(
                        f"{self.BASE_URL}",
                        params={
                            "op": "NameGetDetail",
                            "namebankid": name_bank_id,
                            "format": "json",
                            "apikey": api_key.api_key,
                        },
                    )
                    pub_response.raise_for_status()
                    pub_data = pub_response.json()
                    time.sleep(delay)

                    for title in pub_data.get("Result", []):
                        # Budget = remaining slots under the per-species cap.
                        downloaded += self._scrape_title(
                            client,
                            title,
                            species,
                            db,
                            api_key,
                            delay,
                            self.MAX_IMAGES_PER_SPECIES - downloaded,
                        )
                        if downloaded >= self.MAX_IMAGES_PER_SPECIES:
                            break
                    if downloaded >= self.MAX_IMAGES_PER_SPECIES:
                        break
        except httpx.HTTPStatusError as e:
            log("error", f" HTTP error for {species.scientific_name}: {e.response.status_code}")
        except Exception as e:
            # Broad catch is deliberate: a single species failure must not
            # abort a batch scrape; the error is logged and counts returned.
            log("error", f" Error scraping BHL for {species.scientific_name}: {e}")
        return {"downloaded": downloaded, "rejected": rejected}

    def _scrape_title(
        self,
        client: httpx.Client,
        title: dict,
        species: Species,
        db: Session,
        api_key: ApiKey,
        delay: float,
        budget: int,
    ) -> int:
        """Queue illustration pages from one BHL title.

        Fetches page metadata for *title*, filters to illustration-like
        pages, skips duplicates already in the DB, and queues at most
        *budget* new images for download. Returns the number queued.
        """
        title_id = title.get("TitleID")
        if not title_id or budget <= 0:
            return 0
        # Get pages for this title
        pages_response = client.get(
            f"{self.BASE_URL}",
            params={
                "op": "GetPageMetadata",
                "titleid": title_id,
                "format": "json",
                "apikey": api_key.api_key,
                "ocr": "false",
                "names": "false",
            },
        )
        if pages_response.status_code != 200:
            # Best-effort: skip titles whose metadata cannot be fetched.
            return 0
        pages = pages_response.json().get("Result", [])
        time.sleep(delay)

        queued = 0
        for page in pages[: self.MAX_PAGES_PER_TITLE]:
            if queued >= budget:
                break
            if not self._is_illustration(page):
                continue
            page_id = page.get("PageID")
            if not page_id:
                continue
            source_id = str(page_id)
            # Skip pages already recorded for this source.
            existing = db.query(Image).filter(
                Image.source == self.name,
                Image.source_id == source_id,
            ).first()
            if existing:
                continue

            license_code = self._license_for_page(page)
            # Build attribution from the title's short (or full) name.
            title_name = title.get("ShortTitle", title.get("FullTitle", "Unknown"))
            attribution = f"From '{title_name}' via Biodiversity Heritage Library ({license_code})"

            # Create image record and hand it off to the download worker.
            image = Image(
                species_id=species.id,
                source=self.name,
                source_id=source_id,
                url=f"https://www.biodiversitylibrary.org/pageimage/{page_id}",
                license=license_code,
                attribution=attribution,
                status="pending",
            )
            db.add(image)
            db.commit()
            download_and_process_image.delay(image.id)
            queued += 1
        return queued

    @classmethod
    def _is_illustration(cls, page: dict) -> bool:
        """Return True if *page* should be harvested.

        A page qualifies when any of its PageTypes names an
        illustration-like type, OR when it carries no PageTypes at all
        (untagged pages are deliberately kept — permissive behavior).
        """
        page_types = page.get("PageTypes", [])
        if not page_types:
            return True
        return any(
            pt.get("PageTypeName", "").lower() in cls.ILLUSTRATION_TYPES
            for pt in page_types
        )

    @staticmethod
    def _license_for_page(page: dict) -> str:
        """Map a page's publication year to a license code.

        US works published 95+ years ago are public domain; the cutoff
        is computed from the current year (rolling) rather than the
        previously hard-coded 1927, which goes stale each January.
        """
        year = None
        try:
            # Try to extract year from the page metadata when present.
            if "Year" in page:
                year = int(page.get("Year", 0))
        except (ValueError, TypeError):
            pass
        pd_cutoff = time.localtime().tm_year - 95
        if year and year < pd_cutoff:
            return "PD"
        return "CC0"  # BHL default for older works

    def test_connection(self, api_key: ApiKey) -> str:
        """Test BHL API connection with a trivial NameSearch query.

        Raises httpx.HTTPStatusError on a non-2xx response; returns a
        human-readable success message otherwise.
        """
        # SECURITY NOTE: verify=False kept for parity with scrape_species
        # (missing CA certs in some Docker environments) — fix upstream.
        with httpx.Client(timeout=10, headers=self.HEADERS, verify=False) as client:
            response = client.get(
                f"{self.BASE_URL}",
                params={
                    "op": "NameSearch",
                    "name": "Rosa",
                    "format": "json",
                    "apikey": api_key.api_key,
                },
            )
            response.raise_for_status()
            data = response.json()
            results = data.get("Result", [])
            return f"BHL API connection successful ({len(results)} results for 'Rosa')"