Initial commit — PlantGuideScraper project

2026-04-12 09:54:27 -05:00
commit 6926f502c5
87 changed files with 29120 additions and 0 deletions
--- a/backend/app/scrapers/bhl.py
+++ b/backend/app/scrapers/bhl.py
@@ -0,0 +1,228 @@
+import time
+import logging
+from typing import Dict, Optional
+
+import httpx
+from sqlalchemy.orm import Session
+
+from app.scrapers.base import BaseScraper
+from app.models import Species, Image, ApiKey
+from app.workers.quality_tasks import download_and_process_image
+
+
+class BHLScraper(BaseScraper):
+    """Scraper for Biodiversity Heritage Library (BHL) images.
+
+    BHL provides access to digitized biodiversity literature and illustrations.
+    Most content is public domain (pre-1927) or CC-licensed.
+
+    Note: BHL images are primarily historical botanical illustrations,
+    which may differ from photographs but are valuable for training.
+    """
+
+    name = "bhl"
+    requires_api_key = True  # BHL requires free API key
+
+    BASE_URL = "https://www.biodiversitylibrary.org/api3"
+
+    HEADERS = {
+        "User-Agent": "PlantGuideScraper/1.0 (Plant image collection for ML training)",
+        "Accept": "application/json",
+    }
+
+    # BHL content is mostly public domain
+    ALLOWED_LICENSES = {"CC0", "CC-BY", "CC-BY-SA", "PD"}
+
+    def scrape_species(
+        self,
+        species: Species,
+        db: Session,
+        logger: Optional[logging.Logger] = None
+    ) -> Dict[str, int]:
+        """Scrape images from BHL for a species."""
+        api_key = self.get_api_key(db)
+        if not api_key:
+            return {"downloaded": 0, "rejected": 0, "error": "No API key configured"}
+
+        rate_limit = api_key.rate_limit_per_sec if api_key else 0.5
+
+        downloaded = 0
+        rejected = 0
+
+        def log(level: str, msg: str):
+            if logger:
+                getattr(logger, level)(msg)
+
+        try:
+            # Disable SSL verification - some Docker environments lack proper CA certificates
+            with httpx.Client(timeout=30, headers=self.HEADERS, verify=False) as client:
+                # Search for name in BHL
+                search_response = client.get(
+                    f"{self.BASE_URL}",
+                    params={
+                        "op": "NameSearch",
+                        "name": species.scientific_name,
+                        "format": "json",
+                        "apikey": api_key.api_key,
+                    },
+                )
+                search_response.raise_for_status()
+                search_data = search_response.json()
+
+                results = search_data.get("Result", [])
+                if not results:
+                    log("info", f"  Species not found in BHL: {species.scientific_name}")
+                    return {"downloaded": 0, "rejected": 0}
+
+                time.sleep(1.0 / rate_limit)
+
+                # Get pages with illustrations for each name result
+                for name_result in results[:5]:  # Limit to top 5 matches
+                    name_bank_id = name_result.get("NameBankID")
+                    if not name_bank_id:
+                        continue
+
+                    # Get publications with this name
+                    pub_response = client.get(
+                        f"{self.BASE_URL}",
+                        params={
+                            "op": "NameGetDetail",
+                            "namebankid": name_bank_id,
+                            "format": "json",
+                            "apikey": api_key.api_key,
+                        },
+                    )
+                    pub_response.raise_for_status()
+                    pub_data = pub_response.json()
+
+                    time.sleep(1.0 / rate_limit)
+
+                    # Extract titles and get page images
+                    for title in pub_data.get("Result", []):
+                        title_id = title.get("TitleID")
+                        if not title_id:
+                            continue
+
+                        # Get pages for this title
+                        pages_response = client.get(
+                            f"{self.BASE_URL}",
+                            params={
+                                "op": "GetPageMetadata",
+                                "titleid": title_id,
+                                "format": "json",
+                                "apikey": api_key.api_key,
+                                "ocr": "false",
+                                "names": "false",
+                            },
+                        )
+
+                        if pages_response.status_code != 200:
+                            continue
+
+                        pages_data = pages_response.json()
+                        pages = pages_data.get("Result", [])
+
+                        time.sleep(1.0 / rate_limit)
+
+                        # Look for pages that are likely illustrations
+                        for page in pages[:100]:  # Limit pages per title
+                            page_types = page.get("PageTypes", [])
+
+                            # Only get illustration/plate pages
+                            is_illustration = any(
+                                pt.get("PageTypeName", "").lower() in ["illustration", "plate", "figure", "map"]
+                                for pt in page_types
+                            ) if page_types else False
+
+                            if not is_illustration and page_types:
+                                continue
+
+                            page_id = page.get("PageID")
+                            if not page_id:
+                                continue
+
+                            # Construct image URL
+                            # BHL provides multiple image sizes
+                            image_url = f"https://www.biodiversitylibrary.org/pageimage/{page_id}"
+
+                            # Check if already exists
+                            source_id = str(page_id)
+                            existing = db.query(Image).filter(
+                                Image.source == self.name,
+                                Image.source_id == source_id,
+                            ).first()
+
+                            if existing:
+                                continue
+
+                            # Determine license - BHL content is usually public domain
+                            item_url = page.get("ItemUrl", "")
+                            year = None
+                            try:
+                                # Try to extract year from ItemUrl or other fields
+                                if "Year" in page:
+                                    year = int(page.get("Year", 0))
+                            except (ValueError, TypeError):
+                                pass
+
+                            # Content before 1927 is public domain in US
+                            if year and year < 1927:
+                                license_code = "PD"
+                            else:
+                                license_code = "CC0"  # BHL default for older works
+
+                            # Build attribution
+                            title_name = title.get("ShortTitle", title.get("FullTitle", "Unknown"))
+                            attribution = f"From '{title_name}' via Biodiversity Heritage Library ({license_code})"
+
+                            # Create image record
+                            image = Image(
+                                species_id=species.id,
+                                source=self.name,
+                                source_id=source_id,
+                                url=image_url,
+                                license=license_code,
+                                attribution=attribution,
+                                status="pending",
+                            )
+                            db.add(image)
+                            db.commit()
+
+                            # Queue for download
+                            download_and_process_image.delay(image.id)
+                            downloaded += 1
+
+                            # Limit total per species
+                            if downloaded >= 50:
+                                break
+
+                        if downloaded >= 50:
+                            break
+
+                    if downloaded >= 50:
+                        break
+
+        except httpx.HTTPStatusError as e:
+            log("error", f"  HTTP error for {species.scientific_name}: {e.response.status_code}")
+        except Exception as e:
+            log("error", f"  Error scraping BHL for {species.scientific_name}: {e}")
+
+        return {"downloaded": downloaded, "rejected": rejected}
+
+    def test_connection(self, api_key: ApiKey) -> str:
+        """Test BHL API connection."""
+        with httpx.Client(timeout=10, headers=self.HEADERS, verify=False) as client:
+            response = client.get(
+                f"{self.BASE_URL}",
+                params={
+                    "op": "NameSearch",
+                    "name": "Rosa",
+                    "format": "json",
+                    "apikey": api_key.api_key,
+                },
+            )
+            response.raise_for_status()
+            data = response.json()
+
+        results = data.get("Result", [])
+        return f"BHL API connection successful ({len(results)} results for 'Rosa')"