160 lines
6.2 KiB
Python
160 lines
6.2 KiB
Python
import time
|
|
import logging
|
|
from typing import Dict, Optional
|
|
|
|
import httpx
|
|
from sqlalchemy.orm import Session
|
|
|
|
from app.scrapers.base import BaseScraper
|
|
from app.models import Species, Image, ApiKey
|
|
from app.workers.quality_tasks import download_and_process_image
|
|
|
|
|
|
class GBIFScraper(BaseScraper):
    """Scraper for GBIF (Global Biodiversity Information Facility) images.

    Queries the GBIF occurrence search endpoint for still images of a
    species, keeps only media whose license is in ``ALLOWED_LICENSES``,
    stores an ``Image`` row for each new one and queues it for download.
    """

    name = "gbif"
    requires_api_key = False  # GBIF is free to use

    BASE_URL = "https://api.gbif.org/v1"

    HEADERS = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15"
    }

    # Map GBIF license URLs to short codes
    LICENSE_MAP = {
        "http://creativecommons.org/publicdomain/zero/1.0/legalcode": "CC0",
        "http://creativecommons.org/licenses/by/4.0/legalcode": "CC-BY",
        "http://creativecommons.org/licenses/by-nc/4.0/legalcode": "CC-BY-NC",
        "http://creativecommons.org/publicdomain/zero/1.0/": "CC0",
        "http://creativecommons.org/licenses/by/4.0/": "CC-BY",
        "http://creativecommons.org/licenses/by-nc/4.0/": "CC-BY-NC",
        "https://creativecommons.org/publicdomain/zero/1.0/legalcode": "CC0",
        "https://creativecommons.org/licenses/by/4.0/legalcode": "CC-BY",
        "https://creativecommons.org/licenses/by-nc/4.0/legalcode": "CC-BY-NC",
        "https://creativecommons.org/publicdomain/zero/1.0/": "CC0",
        "https://creativecommons.org/licenses/by/4.0/": "CC-BY",
        "https://creativecommons.org/licenses/by-nc/4.0/": "CC-BY-NC",
    }

    # Only allow commercial-safe licenses
    ALLOWED_LICENSES = {"CC0", "CC-BY"}

    # Requests per second used when no usable rate limit is configured
    DEFAULT_RATE_LIMIT = 1.0

    @staticmethod
    def _make_source_id(occurrence_key, url: str) -> str:
        """Return a source ID for one media URL of one occurrence.

        The built-in ``hash()`` is salted per process for strings, so it
        cannot be used here: the ID must be identical on every run for the
        "already exists" lookup to match rows from earlier runs. IDs written
        by the previous ``hash()``-based scheme were not reproducible and
        will not match the IDs produced here.
        """
        import hashlib

        url_digest = hashlib.sha256(url.encode("utf-8")).hexdigest()[:8]
        return f"{occurrence_key}_{url_digest}"

    @staticmethod
    def _build_attribution(media: dict, license_code: str) -> str:
        """Return the attribution string for one GBIF media record."""
        creator = media.get("creator", "")
        rights_holder = media.get("rightsHolder", "")

        parts = []
        if creator:
            parts.append(f"Photo by {creator}")
        if rights_holder and rights_holder != creator:
            parts.append(f"Rights: {rights_holder}")
        parts.append(f"via GBIF ({license_code})")
        return " | ".join(parts)

    def scrape_species(
        self,
        species: Species,
        db: Session,
        logger: Optional[logging.Logger] = None
    ) -> Dict[str, int]:
        """Scrape images from GBIF for a species.

        Args:
            species: Species whose ``scientific_name`` is searched and whose
                ``id`` is stored on each new image.
            db: Database session used for the duplicate lookup and inserts.
            logger: Logger for error reporting; a module-level logger is
                used when omitted.

        Returns:
            A dict with ``"downloaded"`` (images stored and queued) and
            ``"rejected"`` (media without a URL or an allowed license).
            Errors are logged, not raised; the counts reached before the
            error are still returned.
        """
        log = logger or logging.getLogger(__name__)

        # GBIF doesn't require API key, but we still respect rate limits
        api_key = self.get_api_key(db)
        rate_limit = api_key.rate_limit_per_sec if api_key else None
        if not rate_limit or rate_limit <= 0:
            # A missing, zero or negative limit would make the delay
            # computation below fail and abort the scrape.
            rate_limit = self.DEFAULT_RATE_LIMIT
        delay = 1.0 / rate_limit

        downloaded = 0
        rejected = 0

        try:
            params = {
                "scientificName": species.scientific_name,
                "mediaType": "StillImage",
                "limit": 100,
            }

            with httpx.Client(timeout=30, headers=self.HEADERS) as client:
                response = client.get(
                    f"{self.BASE_URL}/occurrence/search",
                    params=params,
                )
                response.raise_for_status()
                data = response.json()

            # "or []" also covers keys that are present but null
            for occurrence in data.get("results") or []:
                occurrence_key = occurrence.get("key", "")

                for media in occurrence.get("media") or []:
                    # Only process still images
                    if media.get("type") != "StillImage":
                        continue

                    url = media.get("identifier")
                    if not url:
                        rejected += 1
                        continue

                    # Check license (unknown URLs map to None and are rejected)
                    license_code = self.LICENSE_MAP.get(media.get("license", ""))
                    if license_code not in self.ALLOWED_LICENSES:
                        rejected += 1
                        continue

                    source_id = self._make_source_id(occurrence_key, url)

                    # Skip media that is already stored
                    existing = db.query(Image).filter(
                        Image.source == self.name,
                        Image.source_id == source_id,
                    ).first()
                    if existing:
                        continue

                    # Create image record
                    image = Image(
                        species_id=species.id,
                        source=self.name,
                        source_id=source_id,
                        url=url,
                        license=license_code,
                        attribution=self._build_attribution(media, license_code),
                        status="pending",
                    )
                    db.add(image)
                    db.commit()

                    # Queue for download
                    download_and_process_image.delay(image.id)
                    downloaded += 1

                    # Rate limiting
                    time.sleep(delay)

        except Exception as e:
            log.error("Error scraping GBIF for %s: %s", species.scientific_name, e)
            # A failed flush/commit leaves the session unusable until it is
            # rolled back; do that here so the caller can keep using it.
            try:
                db.rollback()
            except Exception as rollback_error:
                log.error("Rollback after GBIF scrape error failed: %s", rollback_error)

        return {"downloaded": downloaded, "rejected": rejected}

    def test_connection(self, api_key: ApiKey) -> str:
        """Test GBIF API connection.

        Args:
            api_key: Unused; GBIF needs no authentication.

        Returns:
            A success message including the ``count`` reported by GBIF.

        Raises:
            httpx.HTTPStatusError: If GBIF answers with an error status
                (from ``raise_for_status``).
        """
        # GBIF doesn't require authentication, just test the endpoint
        with httpx.Client(timeout=10, headers=self.HEADERS) as client:
            response = client.get(
                f"{self.BASE_URL}/occurrence/search",
                params={"limit": 1},
            )
            response.raise_for_status()
            data = response.json()

        count = data.get("count", 0)
        return f"GBIF API connection successful ({count:,} total occurrences available)"
|