Initial commit — PlantGuideScraper project

This commit is contained in:
Trey T
2026-04-12 09:54:27 -05:00
commit 6926f502c5
87 changed files with 29120 additions and 0 deletions

View File

@@ -0,0 +1,146 @@
import time
import logging
from typing import Dict, Optional
import httpx
from sqlalchemy.orm import Session
from app.scrapers.base import BaseScraper
from app.models import Species, Image, ApiKey
from app.workers.quality_tasks import download_and_process_image
class FlickrScraper(BaseScraper):
    """Scraper for Flickr images via their REST API.

    Searches ``flickr.photos.search`` for a species' scientific name,
    restricted to the license IDs in ``ALLOWED_LICENSES``, creates a
    ``pending`` ``Image`` row for every photo not already stored, and hands
    each new row to ``download_and_process_image``.
    """

    name = "flickr"
    requires_api_key = True

    BASE_URL = "https://api.flickr.com/services/rest/"

    HEADERS = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15"
    }

    # Commercial-safe license IDs
    # 4 = CC BY 2.0, 7 = No known copyright, 8 = US Gov, 9 = CC0
    ALLOWED_LICENSES = "4,7,8,9"

    # Flickr license ID (as a string) -> license code stored on the Image row.
    LICENSE_MAP = {
        "4": "CC-BY",
        "7": "NO-KNOWN-COPYRIGHT",
        "8": "US-GOV",
        "9": "CC0",
    }

    def scrape_species(
        self,
        species: Species,
        db: Session,
        logger: Optional[logging.Logger] = None
    ) -> Dict[str, int]:
        """Scrape images from Flickr for a species.

        Args:
            species: Species whose ``scientific_name`` is used as the search
                text and whose ``id`` is stored on each new image.
            db: Session used to look up existing images and to commit new ones.
            logger: Logger that receives error reports; a module-level logger
                is used when omitted.

        Returns:
            A dict with the counts ``"downloaded"`` (images queued) and
            ``"rejected"`` (photos skipped for a missing URL, missing id or
            unmapped license). An ``"error"`` key holding a message is added
            when no API key is configured, when Flickr reports a failure, or
            when an exception interrupts the scrape (counts then reflect the
            work completed before the failure).
        """
        log = logger or logging.getLogger(__name__)

        api_key = self.get_api_key(db)
        if not api_key:
            return {"downloaded": 0, "rejected": 0, "error": "No API key configured"}

        # A missing or non-positive rate limit must not abort the scrape with
        # a ZeroDivisionError/TypeError; treat it as "no throttling".
        rate_limit = api_key.rate_limit_per_sec
        delay = 1.0 / rate_limit if rate_limit and rate_limit > 0 else 0.0

        # Read these once: ORM attributes may be expired by each commit, and
        # the name is still needed for the error log after a rollback.
        species_id = species.id
        scientific_name = species.scientific_name

        downloaded = 0
        rejected = 0
        error = None

        try:
            params = {
                "method": "flickr.photos.search",
                "api_key": api_key.api_key,
                "text": scientific_name,
                "license": self.ALLOWED_LICENSES,
                "content_type": 1,  # Photos only
                "media": "photos",
                "extras": "license,url_l,url_o,owner_name",
                "per_page": 100,
                "format": "json",
                "nojsoncallback": 1,
            }

            # Only the search itself needs the HTTP client; release it before
            # the (throttled) per-photo loop.
            with httpx.Client(timeout=30, headers=self.HEADERS) as client:
                response = client.get(self.BASE_URL, params=params)
                response.raise_for_status()
                data = response.json()

            if data.get("stat") != "ok":
                return {"downloaded": 0, "rejected": 0, "error": data.get("message")}

            # Tolerate "photos"/"photo" being absent or null in the payload.
            photos = (data.get("photos") or {}).get("photo") or []

            for photo in photos:
                # Get best URL (original or large)
                url = photo.get("url_o") or photo.get("url_l")
                if not url:
                    rejected += 1
                    continue

                # Get license; anything outside LICENSE_MAP is rejected
                license_code = self.LICENSE_MAP.get(str(photo.get("license", "")))
                if license_code is None:
                    rejected += 1
                    continue

                # A photo without an id cannot be de-duplicated; str(None)
                # would otherwise be stored as the literal source_id "None".
                photo_id = photo.get("id")
                if photo_id is None:
                    rejected += 1
                    continue
                source_id = str(photo_id)

                # Check if already exists
                existing = db.query(Image).filter(
                    Image.source == self.name,
                    Image.source_id == source_id,
                ).first()
                if existing:
                    continue

                # Build attribution
                owner = photo.get("ownername") or "Unknown"
                attribution = f"Photo by {owner} on Flickr ({license_code})"

                # Create image record
                image = Image(
                    species_id=species_id,
                    source=self.name,
                    source_id=source_id,
                    url=url,
                    license=license_code,
                    attribution=attribution,
                    status="pending",
                )
                db.add(image)
                try:
                    db.commit()
                except Exception:
                    # A failed commit leaves the session unusable until it is
                    # rolled back; restore it before reporting the error.
                    db.rollback()
                    raise

                # Queue for download
                download_and_process_image.delay(image.id)
                downloaded += 1

                # Rate limiting
                if delay:
                    time.sleep(delay)

        except Exception as e:
            # Best-effort scrape: report the failure instead of raising, but
            # surface it through the logger and the result dict.
            log.exception("Error scraping Flickr for %s: %s", scientific_name, e)
            error = str(e)

        result = {"downloaded": downloaded, "rejected": rejected}
        if error is not None:
            result["error"] = error
        return result

    def test_connection(self, api_key: ApiKey) -> str:
        """Test Flickr API connection.

        Calls ``flickr.test.echo`` with the given key.

        Args:
            api_key: Key record whose ``api_key`` value is sent to Flickr.

        Returns:
            A success message when Flickr answers with ``stat == "ok"``.

        Raises:
            httpx.HTTPStatusError: If the HTTP response has an error status.
            Exception: If Flickr answers with a non-"ok" ``stat``; the message
                is Flickr's ``message`` field when present.
        """
        params = {
            "method": "flickr.test.echo",
            "api_key": api_key.api_key,
            "format": "json",
            "nojsoncallback": 1,
        }

        with httpx.Client(timeout=10, headers=self.HEADERS) as client:
            response = client.get(self.BASE_URL, params=params)
            response.raise_for_status()
            data = response.json()

        if data.get("stat") != "ok":
            raise Exception(data.get("message", "API test failed"))

        return "Flickr API connection successful"