Files
2026-04-12 09:54:27 -05:00

136 lines
4.7 KiB
Python

import hashlib
import time
import logging
from typing import Dict, Optional
import httpx
from sqlalchemy.orm import Session
from app.scrapers.base import BaseScraper
from app.models import Species, Image, ApiKey
from app.workers.quality_tasks import download_and_process_image
class BingScraper(BaseScraper):
    """Scraper for Bing Image Search v7 API (Azure Cognitive Services).

    For each species it issues one or two image-search queries, stores every
    previously unseen hit as a ``pending`` ``Image`` row and queues
    ``download_and_process_image`` for it.
    """

    name = "bing"
    requires_api_key = True

    BASE_URL = "https://api.bing.microsoft.com/v7.0/images/search"

    # Appended to every query as exclusion terms.
    NEGATIVE_TERMS = "-herbarium -specimen -illustration -drawing -diagram -dried -pressed"

    # Requests per second used when the key record has no positive rate limit.
    DEFAULT_RATE_LIMIT_PER_SEC = 3.0

    # Bing license label -> license code stored on the Image row.
    LICENSE_MAP = {
        "Public": "CC0",
        "Share": "CC-BY-SA",
        "ShareCommercially": "CC-BY",
        "Modify": "CC-BY-SA",
        "ModifyCommercially": "CC-BY",
    }

    def _build_queries(self, species: Species) -> list[str]:
        """Return the search queries for *species*.

        The scientific-name query is always present; a common-name query is
        added only when the species has a common name.
        """
        queries = [f'"{species.scientific_name}" plant photo {self.NEGATIVE_TERMS}']
        if species.common_name:
            queries.append(f'"{species.common_name}" houseplant photo {self.NEGATIVE_TERMS}')
        return queries

    @staticmethod
    def _report_error(logger: Optional[logging.Logger], message: str) -> None:
        """Send *message* to *logger*, or to stdout when no logger was given."""
        if logger:
            logger.error(message)
        else:
            print(message)

    def _store_result(self, result: dict, url: str, species: Species, db: Session) -> bool:
        """Persist one search hit and queue its download.

        Returns:
            ``True`` when a new ``Image`` row was committed and its processing
            task queued, ``False`` when a row with the same source and
            source id already exists.
        """
        # Use Bing's imageId, fall back to a short MD5 fingerprint of the URL.
        # The digest is only an identifier, so it is flagged as non-security
        # use; this keeps it working on FIPS-restricted Python builds.
        source_id = result.get("imageId") or hashlib.md5(
            url.encode(), usedforsecurity=False
        ).hexdigest()[:16]

        existing = db.query(Image).filter(
            Image.source == self.name,
            Image.source_id == source_id,
        ).first()
        if existing:
            return False

        # Map license
        bing_license = result.get("license", "")
        license_code = self.LICENSE_MAP.get(bing_license, "UNKNOWN")

        host = result.get("hostPageDisplayUrl", "")
        attribution = f"via Bing ({host})" if host else "via Bing Image Search"

        image = Image(
            species_id=species.id,
            source=self.name,
            source_id=source_id,
            url=url,
            width=result.get("width"),
            height=result.get("height"),
            license=license_code,
            attribution=attribution,
            status="pending",
        )
        db.add(image)
        # Commit before queueing: the task is handed image.id.
        db.commit()
        download_and_process_image.delay(image.id)
        return True

    def scrape_species(
        self,
        species: Species,
        db: Session,
        logger: Optional[logging.Logger] = None,
    ) -> Dict[str, int]:
        """Search Bing for images of *species* and queue new ones for download.

        Args:
            species: Species whose scientific (and optional common) name is
                searched.
            db: Session used for the API-key lookup, duplicate checks and
                inserts. It is rolled back if the scrape fails.
            logger: Optional logger for error reporting; errors are printed
                when it is omitted.

        Returns:
            ``{"downloaded": n, "rejected": 0}`` where ``n`` counts images
            queued for download. ``rejected`` is never incremented here.
            Both are 0 when no API key is available. Errors are reported and
            swallowed, so counts reflect work done before the failure.
        """
        api_key = self.get_api_key(db)
        if not api_key:
            return {"downloaded": 0, "rejected": 0}

        # A missing, zero or negative limit falls back to the default; a
        # negative value would otherwise make time.sleep() raise ValueError.
        configured_limit = api_key.rate_limit_per_sec
        if configured_limit and configured_limit > 0:
            rate_limit = configured_limit
        else:
            rate_limit = self.DEFAULT_RATE_LIMIT_PER_SEC

        downloaded = 0
        rejected = 0
        seen_urls: set[str] = set()

        headers = {
            "Ocp-Apim-Subscription-Key": api_key.api_key,
        }

        try:
            queries = self._build_queries(species)
            with httpx.Client(timeout=30, headers=headers) as client:
                for query in queries:
                    params = {
                        "q": query,
                        "imageType": "Photo",
                        "license": "ShareCommercially",
                        "count": 50,
                    }
                    response = client.get(self.BASE_URL, params=params)
                    response.raise_for_status()
                    data = response.json()

                    for result in data.get("value", []):
                        url = result.get("contentUrl")
                        # Both queries can return the same image.
                        if not url or url in seen_urls:
                            continue
                        seen_urls.add(url)

                        if self._store_result(result, url, species, db):
                            downloaded += 1

                    # Throttle between API requests to respect the key's
                    # per-second limit.
                    time.sleep(1.0 / rate_limit)
        except Exception as e:
            self._report_error(
                logger, f"Error scraping Bing for {species.scientific_name}: {e}"
            )
            # A failed flush/commit leaves the session in a failed
            # transaction; roll back so the caller can keep using it.
            try:
                db.rollback()
            except Exception as rollback_error:
                self._report_error(
                    logger,
                    f"Rollback failed after Bing scrape error: {rollback_error}",
                )

        return {"downloaded": downloaded, "rejected": rejected}

    def test_connection(self, api_key: ApiKey) -> str:
        """Run a one-result probe query to verify that *api_key* works.

        Returns:
            A status string including the ``totalEstimatedMatches`` value
            from the response (0 when the field is absent).

        Raises:
            httpx.HTTPStatusError: If the API answers with an error status.
        """
        headers = {"Ocp-Apim-Subscription-Key": api_key.api_key}
        with httpx.Client(timeout=10, headers=headers) as client:
            response = client.get(
                self.BASE_URL,
                params={"q": "Monstera deliciosa plant", "count": 1},
            )
            response.raise_for_status()
            data = response.json()
            count = data.get("totalEstimatedMatches", 0)
            return f"Bing Image Search working ({count:,} estimated matches)"