Initial commit — PlantGuideScraper project
This commit is contained in:
135
backend/app/scrapers/bing.py
Normal file
135
backend/app/scrapers/bing.py
Normal file
@@ -0,0 +1,135 @@
|
||||
import hashlib
|
||||
import time
|
||||
import logging
|
||||
from typing import Dict, Optional
|
||||
|
||||
import httpx
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from app.scrapers.base import BaseScraper
|
||||
from app.models import Species, Image, ApiKey
|
||||
from app.workers.quality_tasks import download_and_process_image
|
||||
|
||||
|
||||
class BingScraper(BaseScraper):
    """Scraper for Bing Image Search v7 API (Azure Cognitive Services)."""

    name = "bing"
    requires_api_key = True

    BASE_URL = "https://api.bing.microsoft.com/v7.0/images/search"

    # Appended to every search query to steer results away from non-photo
    # material (herbarium sheets, drawings, pressed specimens, ...).
    NEGATIVE_TERMS = "-herbarium -specimen -illustration -drawing -diagram -dried -pressed"

    # Maps the value of a result's "license" field to the license code stored
    # on the Image row. Values not listed here are stored as "UNKNOWN".
    # NOTE(review): confirm the API response actually carries a "license"
    # field per result; if it does not, every image ends up as "UNKNOWN".
    LICENSE_MAP = {
        "Public": "CC0",
        "Share": "CC-BY-SA",
        "ShareCommercially": "CC-BY",
        "Modify": "CC-BY-SA",
        "ModifyCommercially": "CC-BY",
    }

    # Requests per second used when the API key row has no usable rate limit.
    DEFAULT_RATE_LIMIT_PER_SEC = 3.0

    def _build_queries(self, species: Species) -> list[str]:
        """Return the search queries to run for *species*.

        The scientific name is always searched; a second query for the common
        name is added when the species has one. Both carry NEGATIVE_TERMS.
        """
        queries = [f'"{species.scientific_name}" plant photo {self.NEGATIVE_TERMS}']
        if species.common_name:
            queries.append(f'"{species.common_name}" houseplant photo {self.NEGATIVE_TERMS}')
        return queries

    @staticmethod
    def _report_error(logger: Optional[logging.Logger], message: str) -> None:
        """Send *message* to *logger*, or print it when no logger was given."""
        if logger:
            logger.error(message)
        else:
            print(message)

    def scrape_species(
        self,
        species: Species,
        db: Session,
        logger: Optional[logging.Logger] = None,
    ) -> Dict[str, int]:
        """Search Bing for photos of *species* and queue new ones for download.

        For every search result not yet stored (matched on source + source_id)
        an Image row with status "pending" is committed and a
        ``download_and_process_image`` task is enqueued for it.

        Args:
            species: Species to search for.
            db: Database session used to look up and store Image rows. It is
                rolled back if scraping fails so the caller can keep using it.
            logger: Optional logger for error reporting; errors are printed
                when it is omitted.

        Returns:
            ``{"downloaded": <images queued>, "rejected": 0}``. Nothing is
            rejected at this stage, so "rejected" is always 0 here. Errors are
            reported and swallowed; counts reflect work done before the error.
        """
        api_key = self.get_api_key(db)
        if not api_key:
            return {"downloaded": 0, "rejected": 0}

        rate_limit = api_key.rate_limit_per_sec or self.DEFAULT_RATE_LIMIT_PER_SEC
        if rate_limit <= 0:
            # A negative limit would make time.sleep() raise mid-scrape.
            rate_limit = self.DEFAULT_RATE_LIMIT_PER_SEC
        delay = 1.0 / rate_limit

        downloaded = 0
        rejected = 0
        seen_urls: set[str] = set()

        headers = {
            "Ocp-Apim-Subscription-Key": api_key.api_key,
        }

        try:
            queries = self._build_queries(species)

            with httpx.Client(timeout=30, headers=headers) as client:
                for query in queries:
                    params = {
                        "q": query,
                        "imageType": "Photo",
                        "license": "ShareCommercially",
                        "count": 50,
                    }

                    response = client.get(self.BASE_URL, params=params)
                    response.raise_for_status()
                    data = response.json()

                    for result in data.get("value", []):
                        url = result.get("contentUrl")
                        # Skip results without a URL and URLs already handled
                        # by an earlier query in this same run.
                        if not url or url in seen_urls:
                            continue
                        seen_urls.add(url)

                        # Use Bing's imageId, fall back to md5 hash. The hash
                        # is only a fingerprint, so mark it as non-security
                        # use to keep it working on FIPS-restricted builds.
                        source_id = result.get("imageId") or hashlib.md5(
                            url.encode(), usedforsecurity=False
                        ).hexdigest()[:16]

                        existing = db.query(Image).filter(
                            Image.source == self.name,
                            Image.source_id == source_id,
                        ).first()
                        if existing:
                            continue

                        # Map license
                        bing_license = result.get("license", "")
                        license_code = self.LICENSE_MAP.get(bing_license, "UNKNOWN")

                        host = result.get("hostPageDisplayUrl", "")
                        attribution = f"via Bing ({host})" if host else "via Bing Image Search"

                        image = Image(
                            species_id=species.id,
                            source=self.name,
                            source_id=source_id,
                            url=url,
                            width=result.get("width"),
                            height=result.get("height"),
                            license=license_code,
                            attribution=attribution,
                            status="pending",
                        )
                        db.add(image)
                        # Commit before enqueueing so the worker can load the
                        # row by id.
                        db.commit()

                        download_and_process_image.delay(image.id)
                        downloaded += 1

                    # Throttle between search requests.
                    time.sleep(delay)

        except Exception as e:
            self._report_error(
                logger, f"Error scraping Bing for {species.scientific_name}: {e}"
            )
            # A failed flush/commit leaves the session unusable until it is
            # rolled back; since the error is swallowed here, the caller would
            # otherwise hit a follow-up failure on its next use of `db`.
            try:
                db.rollback()
            except Exception as rollback_error:
                self._report_error(
                    logger,
                    f"Error rolling back session after Bing scrape failure: {rollback_error}",
                )

        return {"downloaded": downloaded, "rejected": rejected}

    def test_connection(self, api_key: ApiKey) -> str:
        """Run a one-result search with *api_key* and describe the outcome.

        Returns:
            A human-readable success message including the estimated match
            count reported by the API.

        Raises:
            httpx.HTTPStatusError: If the API answers with an error status
                (propagated from ``raise_for_status``).
        """
        headers = {"Ocp-Apim-Subscription-Key": api_key.api_key}
        with httpx.Client(timeout=10, headers=headers) as client:
            response = client.get(
                self.BASE_URL,
                params={"q": "Monstera deliciosa plant", "count": 1},
            )
            response.raise_for_status()
            data = response.json()

        # `or 0` also covers an explicit null, which `:,` cannot format.
        count = data.get("totalEstimatedMatches") or 0
        return f"Bing Image Search working ({count:,} estimated matches)"
|
||||
Reference in New Issue
Block a user