136 lines
4.7 KiB
Python
136 lines
4.7 KiB
Python
import hashlib
|
|
import time
|
|
import logging
|
|
from typing import Dict, Optional
|
|
|
|
import httpx
|
|
from sqlalchemy.orm import Session
|
|
|
|
from app.scrapers.base import BaseScraper
|
|
from app.models import Species, Image, ApiKey
|
|
from app.workers.quality_tasks import download_and_process_image
|
|
|
|
|
|
class BingScraper(BaseScraper):
    """Scraper for Bing Image Search v7 API (Azure Cognitive Services)."""

    name = "bing"
    requires_api_key = True

    BASE_URL = "https://api.bing.microsoft.com/v7.0/images/search"

    # Appended to every query to suppress non-photographic botanical media.
    NEGATIVE_TERMS = "-herbarium -specimen -illustration -drawing -diagram -dried -pressed"

    # Bing "license" facet value -> internal license code.  Anything not
    # listed here is stored as "UNKNOWN".
    LICENSE_MAP = {
        "Public": "CC0",
        "Share": "CC-BY-SA",
        "ShareCommercially": "CC-BY",
        "Modify": "CC-BY-SA",
        "ModifyCommercially": "CC-BY",
    }

    def _build_queries(self, species: Species) -> list[str]:
        """Return search queries for *species*: the scientific name always,
        the common name additionally when one is set."""
        queries = [f'"{species.scientific_name}" plant photo {self.NEGATIVE_TERMS}']
        if species.common_name:
            queries.append(
                f'"{species.common_name}" houseplant photo {self.NEGATIVE_TERMS}'
            )
        return queries

    def scrape_species(
        self,
        species: Species,
        db: Session,
        logger: Optional[logging.Logger] = None,
    ) -> Dict[str, int]:
        """Search Bing for photos of *species*, store new results as pending
        Image rows, and enqueue each one for download/quality processing.

        Args:
            species: Species row providing the names to search for.
            db: Open SQLAlchemy session used for dedup checks and inserts.
            logger: Optional logger; falls back to a module-level logger.

        Returns:
            ``{"downloaded": n, "rejected": m}``.  ``rejected`` is always 0
            here — quality rejection happens in the downstream worker
            (``download_and_process_image``), not in this scraper.
        """
        log = logger or logging.getLogger(__name__)

        api_key = self.get_api_key(db)
        if not api_key:
            # No usable key configured: report zero work rather than raising.
            return {"downloaded": 0, "rejected": 0}

        rate_limit = api_key.rate_limit_per_sec or 3.0  # API requests per second
        downloaded = 0
        rejected = 0
        seen_urls = set()  # dedup across this species' multiple queries

        headers = {
            "Ocp-Apim-Subscription-Key": api_key.api_key,
        }

        try:
            queries = self._build_queries(species)

            with httpx.Client(timeout=30, headers=headers) as client:
                for query in queries:
                    params = {
                        "q": query,
                        "imageType": "Photo",
                        "license": "ShareCommercially",
                        "count": 50,
                    }

                    response = client.get(self.BASE_URL, params=params)
                    response.raise_for_status()
                    data = response.json()

                    for result in data.get("value", []):
                        url = result.get("contentUrl")
                        if not url or url in seen_urls:
                            continue
                        seen_urls.add(url)

                        # Prefer Bing's stable imageId; fall back to a short
                        # md5 of the URL (identifier only, not security).
                        source_id = (
                            result.get("imageId")
                            or hashlib.md5(url.encode()).hexdigest()[:16]
                        )

                        # Skip anything already ingested from this source.
                        existing = db.query(Image).filter(
                            Image.source == self.name,
                            Image.source_id == source_id,
                        ).first()
                        if existing:
                            continue

                        bing_license = result.get("license", "")
                        license_code = self.LICENSE_MAP.get(bing_license, "UNKNOWN")

                        host = result.get("hostPageDisplayUrl", "")
                        attribution = f"via Bing ({host})" if host else "via Bing Image Search"

                        image = Image(
                            species_id=species.id,
                            source=self.name,
                            source_id=source_id,
                            url=url,
                            width=result.get("width"),
                            height=result.get("height"),
                            license=license_code,
                            attribution=attribution,
                            status="pending",
                        )
                        db.add(image)
                        # Commit per image so image.id is assigned before the
                        # async worker task references it.
                        db.commit()

                        download_and_process_image.delay(image.id)
                        downloaded += 1

                    # Throttle between API requests: the per-second limit on
                    # the key applies to Bing calls, not to local DB inserts,
                    # so sleep once per request rather than once per result.
                    time.sleep(1.0 / rate_limit)

        except Exception:
            # Best-effort scraper: log (with traceback) and return whatever
            # was ingested so one failing species doesn't abort a batch run.
            log.exception(
                "Error scraping Bing for %s", species.scientific_name
            )

        return {"downloaded": downloaded, "rejected": rejected}

    def test_connection(self, api_key: ApiKey) -> str:
        """Issue a one-result probe query to verify *api_key* is accepted.

        Returns a human-readable success message; raises
        ``httpx.HTTPStatusError`` on a non-2xx response (e.g. bad key).
        """
        headers = {"Ocp-Apim-Subscription-Key": api_key.api_key}
        with httpx.Client(timeout=10, headers=headers) as client:
            response = client.get(
                self.BASE_URL,
                params={"q": "Monstera deliciosa plant", "count": 1},
            )
            response.raise_for_status()
            data = response.json()

        count = data.get("totalEstimatedMatches", 0)
        return f"Bing Image Search working ({count:,} estimated matches)"
|