Initial commit — PlantGuideScraper project

2026-04-12 09:54:27 -05:00
commit 6926f502c5
87 changed files with 29120 additions and 0 deletions
--- a/backend/app/scrapers/bing.py
+++ b/backend/app/scrapers/bing.py
@@ -0,0 +1,135 @@
+import hashlib
+import time
+import logging
+from typing import Dict, Optional
+
+import httpx
+from sqlalchemy.orm import Session
+
+from app.scrapers.base import BaseScraper
+from app.models import Species, Image, ApiKey
+from app.workers.quality_tasks import download_and_process_image
+
+
+class BingScraper(BaseScraper):
+    """Scraper for Bing Image Search v7 API (Azure Cognitive Services)."""
+
+    name = "bing"
+    requires_api_key = True
+
+    BASE_URL = "https://api.bing.microsoft.com/v7.0/images/search"
+
+    NEGATIVE_TERMS = "-herbarium -specimen -illustration -drawing -diagram -dried -pressed"
+
+    LICENSE_MAP = {
+        "Public": "CC0",
+        "Share": "CC-BY-SA",
+        "ShareCommercially": "CC-BY",
+        "Modify": "CC-BY-SA",
+        "ModifyCommercially": "CC-BY",
+    }
+
+    def _build_queries(self, species: Species) -> list[str]:
+        queries = [f'"{species.scientific_name}" plant photo {self.NEGATIVE_TERMS}']
+        if species.common_name:
+            queries.append(f'"{species.common_name}" houseplant photo {self.NEGATIVE_TERMS}')
+        return queries
+
+    def scrape_species(
+        self,
+        species: Species,
+        db: Session,
+        logger: Optional[logging.Logger] = None,
+    ) -> Dict[str, int]:
+        api_key = self.get_api_key(db)
+        if not api_key:
+            return {"downloaded": 0, "rejected": 0}
+
+        rate_limit = api_key.rate_limit_per_sec or 3.0
+        downloaded = 0
+        rejected = 0
+        seen_urls = set()
+
+        headers = {
+            "Ocp-Apim-Subscription-Key": api_key.api_key,
+        }
+
+        try:
+            queries = self._build_queries(species)
+
+            with httpx.Client(timeout=30, headers=headers) as client:
+                for query in queries:
+                    params = {
+                        "q": query,
+                        "imageType": "Photo",
+                        "license": "ShareCommercially",
+                        "count": 50,
+                    }
+
+                    response = client.get(self.BASE_URL, params=params)
+                    response.raise_for_status()
+                    data = response.json()
+
+                    for result in data.get("value", []):
+                        url = result.get("contentUrl")
+                        if not url or url in seen_urls:
+                            continue
+                        seen_urls.add(url)
+
+                        # Use Bing's imageId, fall back to md5 hash
+                        source_id = result.get("imageId") or hashlib.md5(url.encode()).hexdigest()[:16]
+
+                        existing = db.query(Image).filter(
+                            Image.source == self.name,
+                            Image.source_id == source_id,
+                        ).first()
+
+                        if existing:
+                            continue
+
+                        # Map license
+                        bing_license = result.get("license", "")
+                        license_code = self.LICENSE_MAP.get(bing_license, "UNKNOWN")
+
+                        host = result.get("hostPageDisplayUrl", "")
+                        attribution = f"via Bing ({host})" if host else "via Bing Image Search"
+
+                        image = Image(
+                            species_id=species.id,
+                            source=self.name,
+                            source_id=source_id,
+                            url=url,
+                            width=result.get("width"),
+                            height=result.get("height"),
+                            license=license_code,
+                            attribution=attribution,
+                            status="pending",
+                        )
+                        db.add(image)
+                        db.commit()
+
+                        download_and_process_image.delay(image.id)
+                        downloaded += 1
+
+                    time.sleep(1.0 / rate_limit)
+
+        except Exception as e:
+            if logger:
+                logger.error(f"Error scraping Bing for {species.scientific_name}: {e}")
+            else:
+                print(f"Error scraping Bing for {species.scientific_name}: {e}")
+
+        return {"downloaded": downloaded, "rejected": rejected}
+
+    def test_connection(self, api_key: ApiKey) -> str:
+        headers = {"Ocp-Apim-Subscription-Key": api_key.api_key}
+        with httpx.Client(timeout=10, headers=headers) as client:
+            response = client.get(
+                self.BASE_URL,
+                params={"q": "Monstera deliciosa plant", "count": 1},
+            )
+            response.raise_for_status()
+            data = response.json()
+
+        count = data.get("totalEstimatedMatches", 0)
+        return f"Bing Image Search working ({count:,} estimated matches)"