Initial commit — PlantGuideScraper project

2026-04-12 09:54:27 -05:00
commit 6926f502c5
87 changed files with 29120 additions and 0 deletions
--- a/backend/app/scrapers/flickr.py
+++ b/backend/app/scrapers/flickr.py
@@ -0,0 +1,146 @@
+import time
+import logging
+from typing import Dict, Optional
+
+import httpx
+from sqlalchemy.orm import Session
+
+from app.scrapers.base import BaseScraper
+from app.models import Species, Image, ApiKey
+from app.workers.quality_tasks import download_and_process_image
+
+
+class FlickrScraper(BaseScraper):
+    """Scraper for Flickr images via their API.
+
+    Searches Flickr for photos matching a species' scientific name, keeps only
+    photos whose license ID is listed in ``LICENSE_MAP``, stores a pending
+    ``Image`` row for each photo not seen before and queues it for download.
+    """
+
+    name = "flickr"
+    requires_api_key = True
+
+    BASE_URL = "https://api.flickr.com/services/rest/"
+
+    HEADERS = {
+        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15"
+    }
+
+    # Commercial-safe license IDs
+    # 4 = CC BY 2.0, 7 = No known copyright, 8 = US Gov, 9 = CC0
+    ALLOWED_LICENSES = "4,7,8,9"
+
+    # Flickr license ID -> license code stored on the Image record
+    LICENSE_MAP = {
+        "4": "CC-BY",
+        "7": "NO-KNOWN-COPYRIGHT",
+        "8": "US-GOV",
+        "9": "CC0",
+    }
+
+    def scrape_species(
+        self,
+        species: Species,
+        db: Session,
+        logger: Optional[logging.Logger] = None
+    ) -> Dict[str, int]:
+        """Scrape images from Flickr for a species.
+
+        Runs a single ``flickr.photos.search`` request (up to 100 results) for
+        ``species.scientific_name``, restricted to ``ALLOWED_LICENSES``.
+
+        Args:
+            species: Species whose scientific name is used as the search text.
+            db: Session used to look up existing images and commit new ones.
+            logger: Logger for error reporting; when omitted, this module's
+                logger is used.
+
+        Returns:
+            A dict with ``downloaded`` (images queued for download) and
+            ``rejected`` (photos skipped for a missing URL, missing ID or
+            unmapped license) counts. When no API key is configured or Flickr
+            reports a non-"ok" status, an ``error`` entry is included as well.
+            Unexpected exceptions are logged and the counts gathered so far
+            are returned.
+        """
+        log = logger if logger is not None else logging.getLogger(__name__)
+
+        # get_api_key is not defined in this class (presumably inherited from
+        # BaseScraper); a falsy result is treated as "no key configured".
+        api_key = self.get_api_key(db)
+        if not api_key:
+            return {"downloaded": 0, "rejected": 0, "error": "No API key configured"}
+
+        rate_limit = api_key.rate_limit_per_sec
+
+        downloaded = 0
+        rejected = 0
+
+        try:
+            params = {
+                "method": "flickr.photos.search",
+                "api_key": api_key.api_key,
+                "text": species.scientific_name,
+                "license": self.ALLOWED_LICENSES,
+                "content_type": 1,  # Photos only
+                "media": "photos",
+                "extras": "license,url_l,url_o,owner_name",
+                "per_page": 100,
+                "format": "json",
+                "nojsoncallback": 1,
+            }
+
+            with httpx.Client(timeout=30, headers=self.HEADERS) as client:
+                response = client.get(self.BASE_URL, params=params)
+                response.raise_for_status()
+                data = response.json()
+
+            if data.get("stat") != "ok":
+                return {"downloaded": 0, "rejected": 0, "error": data.get("message")}
+
+            photos = data.get("photos", {}).get("photo", [])
+
+            for photo in photos:
+                # Get best URL (original or large)
+                url = photo.get("url_o") or photo.get("url_l")
+                if not url:
+                    rejected += 1
+                    continue
+
+                # Get license; anything outside LICENSE_MAP is rejected
+                license_id = str(photo.get("license", ""))
+                license_code = self.LICENSE_MAP.get(license_id)
+                if license_code is None:
+                    rejected += 1
+                    continue
+
+                # A photo without an ID cannot be de-duplicated: str(None)
+                # would store the literal "None" and collide with every other
+                # ID-less photo, so reject it instead.
+                photo_id = photo.get("id")
+                if photo_id is None or photo_id == "":
+                    rejected += 1
+                    continue
+                source_id = str(photo_id)
+
+                # Check if already exists
+                existing = db.query(Image).filter(
+                    Image.source == self.name,
+                    Image.source_id == source_id,
+                ).first()
+
+                if existing:
+                    continue
+
+                # Build attribution
+                owner = photo.get("ownername", "Unknown")
+                attribution = f"Photo by {owner} on Flickr ({license_code})"
+
+                # Create image record
+                image = Image(
+                    species_id=species.id,
+                    source=self.name,
+                    source_id=source_id,
+                    url=url,
+                    license=license_code,
+                    attribution=attribution,
+                    status="pending",
+                )
+                db.add(image)
+                # Commit before queueing so the task is handed the ID of a
+                # row that has already been committed.
+                db.commit()
+
+                # Queue for download
+                download_and_process_image.delay(image.id)
+                downloaded += 1
+
+            # Rate limiting; skipped when no positive rate is configured so a
+            # zero or missing value is not reported as a scrape failure.
+            if rate_limit and rate_limit > 0:
+                time.sleep(1.0 / rate_limit)
+
+        except Exception as exc:
+            # Best-effort scraper: report the failure and return the counts
+            # gathered so far instead of propagating.
+            detail = str(exc)
+            # HTTP error messages may embed the request URL, which carries the
+            # API key as a query parameter; mask it before it reaches the log.
+            secret = str(api_key.api_key or "")
+            if secret:
+                detail = detail.replace(secret, "***")
+            log.error(
+                "Error scraping Flickr for %s: %s",
+                species.scientific_name,
+                detail,
+            )
+            # Discard any half-finished transaction so the caller's session
+            # remains usable after a failed commit.
+            db.rollback()
+
+        return {"downloaded": downloaded, "rejected": rejected}
+
+    def test_connection(self, api_key: ApiKey) -> str:
+        """Test Flickr API connection.
+
+        Calls ``flickr.test.echo`` with the given key.
+
+        Args:
+            api_key: Key record whose ``api_key`` value is sent to Flickr.
+
+        Returns:
+            A success message when Flickr answers with ``stat == "ok"``.
+
+        Raises:
+            httpx.HTTPStatusError: If the HTTP response has an error status.
+            Exception: If Flickr answers with a non-"ok" status; the message
+                is Flickr's own, or "API test failed" when none is given.
+        """
+        params = {
+            "method": "flickr.test.echo",
+            "api_key": api_key.api_key,
+            "format": "json",
+            "nojsoncallback": 1,
+        }
+
+        with httpx.Client(timeout=10, headers=self.HEADERS) as client:
+            response = client.get(self.BASE_URL, params=params)
+            response.raise_for_status()
+            data = response.json()
+
+        if data.get("stat") != "ok":
+            raise Exception(data.get("message", "API test failed"))
+
+        return "Flickr API connection successful"