Files
PlantGuideScraper/backend/app/scrapers/wikimedia.py
2026-04-12 09:54:27 -05:00

147 lines
4.7 KiB
Python

import logging
import re
import time
from typing import Dict, Optional

import httpx
from sqlalchemy.orm import Session

from app.models import ApiKey, Image, Species
from app.scrapers.base import BaseScraper
from app.workers.quality_tasks import download_and_process_image
class WikimediaScraper(BaseScraper):
    """Scraper for commercially reusable species images on Wikimedia Commons.

    Searches the MediaWiki API (File: namespace) for bitmap files matching a
    species' scientific name, rejects images that are too small or carry a
    non-commercial license, records accepted images as ``pending`` rows, and
    queues each one for asynchronous download/processing.
    """

    name = "wikimedia"
    requires_api_key = False

    BASE_URL = "https://commons.wikimedia.org/w/api.php"
    HEADERS = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15"
    }

    # Minimum accepted width/height in pixels; smaller images are rejected.
    MIN_DIMENSION = 256

    @staticmethod
    def _is_commercial_safe(license_name: str) -> bool:
        """Return True when *license_name* permits commercial reuse.

        Accepts CC BY, CC BY-SA, CC0 and public-domain marks. Explicitly
        rejects the NonCommercial (-NC) and NoDerivatives (-ND) CC variants,
        which a bare ``"CC BY" in name`` substring test would wrongly accept
        (e.g. "CC BY-NC 2.0" contains "CC BY").
        """
        upper = license_name.upper()
        if "-NC" in upper or "-ND" in upper:
            return False
        return "CC BY" in upper or "CC0" in upper or "PUBLIC DOMAIN" in upper

    @staticmethod
    def _build_attribution(metadata: Dict, license_code: str) -> str:
        """Build a plain-text attribution line from Commons extmetadata."""
        artist = metadata.get("Artist", {}).get("value", "Unknown")
        # The Artist field is frequently HTML (e.g. a wrapped link); strip
        # tags so the stored attribution is plain text.
        if "<" in artist:
            artist = re.sub(r"<[^>]+>", "", artist).strip()
        return f"{artist} via Wikimedia Commons ({license_code})"

    def scrape_species(
        self,
        species: Species,
        db: Session,
        logger: Optional[logging.Logger] = None
    ) -> Dict[str, int]:
        """Scrape images from Wikimedia Commons for a species.

        Args:
            species: Species row; its ``scientific_name`` is the search term.
            db: Open SQLAlchemy session used for dedup checks and inserts.
            logger: Optional logger for errors; falls back to a module logger.

        Returns:
            ``{"downloaded": n, "rejected": m}`` — images queued vs. images
            skipped for size or license reasons.
        """
        log = logger or logging.getLogger(__name__)
        api_key = self.get_api_key(db)
        rate_limit = api_key.rate_limit_per_sec if api_key else 1.0
        if not rate_limit or rate_limit <= 0:
            # Guard: a zero/negative configured rate would make the
            # per-image sleep divide by zero.
            rate_limit = 1.0
        downloaded = 0
        rejected = 0
        try:
            # Search the File: namespace for bitmap files matching the name.
            params = {
                "action": "query",
                "format": "json",
                "generator": "search",
                "gsrsearch": f"filetype:bitmap {species.scientific_name}",
                "gsrnamespace": 6,  # File namespace
                "gsrlimit": 50,
                "prop": "imageinfo",
                "iiprop": "url|extmetadata|size",
            }
            with httpx.Client(timeout=30, headers=self.HEADERS) as client:
                response = client.get(self.BASE_URL, params=params)
                response.raise_for_status()
                data = response.json()
            pages = data.get("query", {}).get("pages", {})
            for page_id, page in pages.items():
                # Negative page ids are "missing page" placeholders.
                if int(page_id) < 0:
                    continue
                imageinfo = page.get("imageinfo", [{}])[0]
                url = imageinfo.get("url", "")
                if not url:
                    continue
                width = imageinfo.get("width", 0)
                height = imageinfo.get("height", 0)
                if width < self.MIN_DIMENSION or height < self.MIN_DIMENSION:
                    rejected += 1
                    continue
                metadata = imageinfo.get("extmetadata", {})
                license_code = metadata.get("LicenseShortName", {}).get("value", "")
                if not self._is_commercial_safe(license_code):
                    rejected += 1
                    continue
                source_id = str(page_id)
                # Skip images already recorded from this source.
                existing = db.query(Image).filter(
                    Image.source == self.name,
                    Image.source_id == source_id,
                ).first()
                if existing:
                    continue
                image = Image(
                    species_id=species.id,
                    source=self.name,
                    source_id=source_id,
                    url=url,
                    license=license_code,
                    attribution=self._build_attribution(metadata, license_code),
                    width=width,
                    height=height,
                    status="pending",
                )
                db.add(image)
                # Commit before queuing so the worker can see the row.
                db.commit()
                download_and_process_image.delay(image.id)
                downloaded += 1
                # Stay under the configured request rate.
                time.sleep(1.0 / rate_limit)
        except Exception:
            # Best-effort per-species scrape: log with traceback and return
            # the partial counts rather than aborting the whole batch.
            log.exception(
                "Error scraping Wikimedia for %s", species.scientific_name
            )
        return {"downloaded": downloaded, "rejected": rejected}

    def test_connection(self, api_key: ApiKey) -> str:
        """Test Wikimedia API connection via a lightweight siteinfo query."""
        params = {
            "action": "query",
            "format": "json",
            "meta": "siteinfo",
        }
        with httpx.Client(timeout=10, headers=self.HEADERS) as client:
            response = client.get(self.BASE_URL, params=params)
            # Raises on HTTP errors, which the caller reports as a failure.
            response.raise_for_status()
        return "Wikimedia Commons API connection successful"