import hashlib
import logging
import time
from typing import Dict, Optional

import httpx
from sqlalchemy.orm import Session

from app.scrapers.base import BaseScraper
from app.models import Species, Image, ApiKey
from app.workers.quality_tasks import download_and_process_image


class BingScraper(BaseScraper):
    """Scraper for Bing Image Search v7 API (Azure Cognitive Services)."""

    name = "bing"
    requires_api_key = True

    BASE_URL = "https://api.bing.microsoft.com/v7.0/images/search"

    # Exclusion terms appended to every search query (presumably to keep
    # herbarium sheets and artwork out of the results).
    NEGATIVE_TERMS = "-herbarium -specimen -illustration -drawing -diagram -dried -pressed"

    # Maps the ``license`` value of a search result to the license code stored
    # on ``Image``; values missing from this map are stored as "UNKNOWN".
    LICENSE_MAP = {
        "Public": "CC0",
        "Share": "CC-BY-SA",
        "ShareCommercially": "CC-BY",
        "Modify": "CC-BY-SA",
        "ModifyCommercially": "CC-BY",
    }

    def _build_queries(self, species: Species) -> list[str]:
        """Return the search queries to run for *species*.

        The first query uses the quoted scientific name; a second query using
        the quoted common name is added only when the species has one. Both
        end with ``NEGATIVE_TERMS``.
        """
        queries = [f'"{species.scientific_name}" plant photo {self.NEGATIVE_TERMS}']
        if species.common_name:
            queries.append(f'"{species.common_name}" houseplant photo {self.NEGATIVE_TERMS}')
        return queries

    @staticmethod
    def _report_error(logger: Optional[logging.Logger], message: str) -> None:
        """Send *message* to *logger* if given, otherwise print it."""
        if logger:
            logger.error(message)
        else:
            print(message)

    def scrape_species(
        self,
        species: Species,
        db: Session,
        logger: Optional[logging.Logger] = None,
    ) -> Dict[str, int]:
        """Search Bing for images of *species* and queue new ones for download.

        Each new result is stored as a ``pending`` ``Image`` row, committed,
        and handed to ``download_and_process_image``. Results whose URL was
        already seen in this run, or whose ``(source, source_id)`` already
        exists in the database, are skipped.

        Errors are reported (via *logger*, or ``print`` when no logger is
        given) and stop the scrape; they are not raised to the caller.

        Args:
            species: Species to search for.
            db: Database session used for lookups and inserts.
            logger: Optional logger for error reporting.

        Returns:
            ``{"downloaded": n, "rejected": 0}`` where ``n`` is the number of
            images queued. ``rejected`` is never incremented here.
        """
        api_key = self.get_api_key(db)
        if not api_key:
            return {"downloaded": 0, "rejected": 0}

        rate_limit = api_key.rate_limit_per_sec or 3.0
        downloaded = 0
        rejected = 0
        seen_urls = set()

        headers = {
            "Ocp-Apim-Subscription-Key": api_key.api_key,
        }

        # Read these before any commit: committing expires ORM instances, and
        # reloading them on a session that has just failed would raise inside
        # the error handler below.
        species_name = species.scientific_name
        species_id = species.id

        try:
            queries = self._build_queries(species)

            with httpx.Client(timeout=30, headers=headers) as client:
                for query in queries:
                    params = {
                        "q": query,
                        "imageType": "Photo",
                        "license": "ShareCommercially",
                        "count": 50,
                    }

                    response = client.get(self.BASE_URL, params=params)
                    response.raise_for_status()
                    data = response.json()

                    for result in data.get("value", []):
                        url = result.get("contentUrl")
                        if not url or url in seen_urls:
                            continue
                        seen_urls.add(url)

                        # Use Bing's imageId, fall back to a truncated md5 of
                        # the URL. The hash is only an identifier, so mark it
                        # as non-security use (keeps it working on
                        # FIPS-restricted builds; the digest is unchanged).
                        source_id = result.get("imageId") or hashlib.md5(
                            url.encode(), usedforsecurity=False
                        ).hexdigest()[:16]

                        existing = db.query(Image).filter(
                            Image.source == self.name,
                            Image.source_id == source_id,
                        ).first()
                        if existing:
                            continue

                        # Map license
                        bing_license = result.get("license", "")
                        license_code = self.LICENSE_MAP.get(bing_license, "UNKNOWN")

                        host = result.get("hostPageDisplayUrl", "")
                        attribution = f"via Bing ({host})" if host else "via Bing Image Search"

                        image = Image(
                            species_id=species_id,
                            source=self.name,
                            source_id=source_id,
                            url=url,
                            width=result.get("width"),
                            height=result.get("height"),
                            license=license_code,
                            attribution=attribution,
                            status="pending",
                        )
                        db.add(image)
                        db.commit()

                        download_and_process_image.delay(image.id)
                        downloaded += 1

                    # Throttle between API requests.
                    time.sleep(1.0 / rate_limit)

        except Exception as e:
            # A failed flush/commit leaves the session unusable until it is
            # rolled back; do that so the caller can keep using ``db``.
            try:
                db.rollback()
            except Exception as rollback_error:
                self._report_error(
                    logger,
                    f"Error rolling back session after Bing scrape failure: {rollback_error}",
                )
            self._report_error(logger, f"Error scraping Bing for {species_name}: {e}")

        return {"downloaded": downloaded, "rejected": rejected}

    def test_connection(self, api_key: ApiKey) -> str:
        """Run a one-result search to verify that *api_key* works.

        Args:
            api_key: Key record whose ``api_key`` value is sent as the
                subscription key header.

        Returns:
            A status message including the estimated match count reported by
            the API (0 when the field is absent).

        Raises:
            httpx.HTTPError: If the request fails or the API responds with an
                error status.
        """
        headers = {"Ocp-Apim-Subscription-Key": api_key.api_key}
        with httpx.Client(timeout=10, headers=headers) as client:
            response = client.get(
                self.BASE_URL,
                params={"q": "Monstera deliciosa plant", "count": 1},
            )
            response.raise_for_status()
            data = response.json()
            count = data.get("totalEstimatedMatches", 0)
            return f"Bing Image Search working ({count:,} estimated matches)"