Initial commit — PlantGuideScraper project

Author: Trey T
Date: 2026-04-12 09:54:27 -05:00
Commit: 6926f502c5
87 changed files with 29120 additions and 0 deletions

View File: app/workers/__init__.py

@@ -0,0 +1 @@
# Celery workers

View File: app/workers/celery_app.py

@@ -0,0 +1,36 @@
from celery import Celery
from app.config import get_settings
settings = get_settings()
celery_app = Celery(
"plant_scraper",
broker=settings.redis_url,
backend=settings.redis_url,
include=[
"app.workers.scrape_tasks",
"app.workers.quality_tasks",
"app.workers.export_tasks",
"app.workers.stats_tasks",
],
)
celery_app.conf.update(
task_serializer="json",
accept_content=["json"],
result_serializer="json",
timezone="UTC",
enable_utc=True,
task_track_started=True,
task_time_limit=3600 * 24, # 24 hour max per task
worker_prefetch_multiplier=1,
task_acks_late=True,
beat_schedule={
"refresh-stats-every-5min": {
"task": "app.workers.stats_tasks.refresh_stats",
"schedule": 300.0, # Every 5 minutes
},
},
beat_schedule_filename="/tmp/celerybeat-schedule",
)
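
The Celery app above pulls its broker URL from `app.config.get_settings`, which is part of this commit but not shown in this excerpt. A minimal sketch of what that module might look like, assuming environment-variable configuration (the field names `redis_url`, `images_path`, and `exports_path` come from the call sites in these workers; the env var names and defaults are assumptions):

import os
from functools import lru_cache


class Settings:
    """Hypothetical settings holder; the real project may use pydantic-settings instead."""

    def __init__(self) -> None:
        self.redis_url = os.getenv("REDIS_URL", "redis://localhost:6379/0")
        self.images_path = os.getenv("IMAGES_PATH", "./data/images")
        self.exports_path = os.getenv("EXPORTS_PATH", "./data/exports")


@lru_cache
def get_settings() -> Settings:
    # Cached so every caller shares one instance.
    return Settings()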

View File: app/workers/export_tasks.py

@@ -0,0 +1,170 @@
import json
import os
import random
import shutil
import zipfile
from datetime import datetime
from pathlib import Path
from app.workers.celery_app import celery_app
from app.database import SessionLocal
from app.models import Export, Image, Species
from app.config import get_settings
settings = get_settings()
@celery_app.task(bind=True)
def generate_export(self, export_id: int):
"""Generate a zip export for CoreML training."""
    db = SessionLocal()
    export = None  # defined before the try so the except handler can reference it safely
    try:
export = db.query(Export).filter(Export.id == export_id).first()
if not export:
return {"error": "Export not found"}
# Update status
export.status = "generating"
export.celery_task_id = self.request.id
db.commit()
# Parse filter criteria
criteria = json.loads(export.filter_criteria) if export.filter_criteria else {}
min_images = criteria.get("min_images_per_species", 100)
licenses = criteria.get("licenses")
min_quality = criteria.get("min_quality")
species_ids = criteria.get("species_ids")
# Build query for images
query = db.query(Image).filter(Image.status == "downloaded")
if licenses:
query = query.filter(Image.license.in_(licenses))
if min_quality:
query = query.filter(Image.quality_score >= min_quality)
if species_ids:
query = query.filter(Image.species_id.in_(species_ids))
# Group by species and filter by min count
from sqlalchemy import func
species_counts = db.query(
Image.species_id,
func.count(Image.id).label("count")
).filter(Image.status == "downloaded").group_by(Image.species_id).all()
valid_species_ids = [s.species_id for s in species_counts if s.count >= min_images]
if species_ids:
valid_species_ids = [s for s in valid_species_ids if s in species_ids]
if not valid_species_ids:
export.status = "failed"
export.error_message = "No species meet the criteria"
export.completed_at = datetime.utcnow()
db.commit()
return {"error": "No species meet the criteria"}
# Create export directory
export_dir = Path(settings.exports_path) / f"export_{export_id}"
train_dir = export_dir / "Training"
test_dir = export_dir / "Testing"
train_dir.mkdir(parents=True, exist_ok=True)
test_dir.mkdir(parents=True, exist_ok=True)
total_images = 0
species_count = 0
# Process each valid species
for i, species_id in enumerate(valid_species_ids):
species = db.query(Species).filter(Species.id == species_id).first()
if not species:
continue
            # Get images for this species (license/quality filters are already applied on `query`)
            images = query.filter(Image.species_id == species_id).all()
if len(images) < min_images:
continue
species_count += 1
# Create species folders
species_name = species.scientific_name.replace(" ", "_")
(train_dir / species_name).mkdir(exist_ok=True)
(test_dir / species_name).mkdir(exist_ok=True)
# Shuffle and split
random.shuffle(images)
split_idx = int(len(images) * export.train_split)
train_images = images[:split_idx]
test_images = images[split_idx:]
# Copy images
for j, img in enumerate(train_images):
if img.local_path and os.path.exists(img.local_path):
ext = Path(img.local_path).suffix or ".jpg"
dest = train_dir / species_name / f"img_{j:05d}{ext}"
shutil.copy2(img.local_path, dest)
total_images += 1
for j, img in enumerate(test_images):
if img.local_path and os.path.exists(img.local_path):
ext = Path(img.local_path).suffix or ".jpg"
dest = test_dir / species_name / f"img_{j:05d}{ext}"
shutil.copy2(img.local_path, dest)
total_images += 1
# Update progress
self.update_state(
state="PROGRESS",
meta={
"current": i + 1,
"total": len(valid_species_ids),
"species": species.scientific_name,
}
)
# Create zip file
zip_path = Path(settings.exports_path) / f"export_{export_id}.zip"
with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zipf:
for root, dirs, files in os.walk(export_dir):
for file in files:
file_path = Path(root) / file
arcname = file_path.relative_to(export_dir)
zipf.write(file_path, arcname)
# Clean up directory
shutil.rmtree(export_dir)
# Update export record
export.status = "completed"
export.file_path = str(zip_path)
export.file_size = zip_path.stat().st_size
export.species_count = species_count
export.image_count = total_images
export.completed_at = datetime.utcnow()
db.commit()
return {
"status": "completed",
"species_count": species_count,
"image_count": total_images,
"file_size": export.file_size,
}
    except Exception as e:
        db.rollback()  # reset the session in case the failure came from the DB layer
        if export:
export.status = "failed"
export.error_message = str(e)
export.completed_at = datetime.utcnow()
db.commit()
raise
finally:
db.close()
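
For context, `filter_criteria` is stored as a JSON string, so kicking off an export might look like the sketch below (the `Export` constructor fields are assumptions inferred from the attributes the task reads, not confirmed by this excerpt):

import json
from app.database import SessionLocal
from app.models import Export
from app.workers.export_tasks import generate_export

db = SessionLocal()
export = Export(
    filter_criteria=json.dumps({
        "min_images_per_species": 100,
        "licenses": ["cc0", "cc-by"],
        "min_quality": 150.0,
    }),
    train_split=0.8,  # 80% into Training/, the rest into Testing/
    status="pending",
)
db.add(export)
db.commit()
generate_export.delay(export.id)  # runs on a Celery worker
db.close()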

View File: app/workers/quality_tasks.py

@@ -0,0 +1,224 @@
import os
from pathlib import Path
import httpx
from PIL import Image as PILImage
import imagehash
import numpy as np
from scipy import ndimage
from app.workers.celery_app import celery_app
from app.database import SessionLocal
from app.models import Image
from app.config import get_settings
settings = get_settings()
def calculate_blur_score(image_path: str) -> float:
"""Calculate blur score using Laplacian variance. Higher = sharper."""
try:
img = PILImage.open(image_path).convert("L")
img_array = np.array(img)
laplacian = ndimage.laplace(img_array)
return float(np.var(laplacian))
except Exception:
return 0.0
def calculate_phash(image_path: str) -> str:
"""Calculate perceptual hash for deduplication."""
try:
img = PILImage.open(image_path)
return str(imagehash.phash(img))
except Exception:
return ""
def check_color_distribution(image_path: str) -> tuple[bool, str]:
"""Check if image has healthy color distribution for a plant photo.
Returns (passed, reason) tuple.
Rejects:
- Low color variance (mean channel std < 25): herbarium specimens (brown on white)
- No green + low variance (green ratio < 5% AND mean std < 40): monochrome illustrations
"""
try:
img = PILImage.open(image_path).convert("RGB")
arr = np.array(img, dtype=np.float64)
# Per-channel standard deviation
channel_stds = arr.std(axis=(0, 1)) # [R_std, G_std, B_std]
mean_std = float(channel_stds.mean())
if mean_std < 25:
return False, f"Low color variance ({mean_std:.1f})"
# Check green ratio
channel_means = arr.mean(axis=(0, 1))
total = channel_means.sum()
green_ratio = channel_means[1] / total if total > 0 else 0
if green_ratio < 0.05 and mean_std < 40:
return False, f"No green ({green_ratio:.2%}) + low variance ({mean_std:.1f})"
return True, ""
except Exception:
return True, "" # Don't reject on error
def resize_image(image_path: str, target_size: int = 512) -> bool:
    """Resize image in place to fit within target_size while maintaining aspect ratio."""
    try:
        img = PILImage.open(image_path)
        img.thumbnail((target_size, target_size), PILImage.Resampling.LANCZOS)
        if img.mode != "RGB":
            # Files are stored with a .jpg extension; JPEG cannot encode
            # RGBA/palette images, so convert before saving.
            img = img.convert("RGB")
        img.save(image_path, quality=95)
        return True
    except Exception:
        return False
@celery_app.task
def download_and_process_image(image_id: int):
"""Download image, check quality, dedupe, and resize."""
    db = SessionLocal()
    image = None  # defined before the try so the except handler can reference it safely
    try:
image = db.query(Image).filter(Image.id == image_id).first()
if not image:
return {"error": "Image not found"}
# Create directory for species
species = image.species
species_dir = Path(settings.images_path) / species.scientific_name.replace(" ", "_")
species_dir.mkdir(parents=True, exist_ok=True)
# Download image
filename = f"{image.source}_{image.source_id or image.id}.jpg"
local_path = species_dir / filename
try:
headers = {
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15"
}
with httpx.Client(timeout=30, headers=headers, follow_redirects=True) as client:
response = client.get(image.url)
response.raise_for_status()
with open(local_path, "wb") as f:
f.write(response.content)
except Exception as e:
image.status = "rejected"
db.commit()
return {"error": f"Download failed: {e}"}
# Check minimum size
try:
with PILImage.open(local_path) as img:
width, height = img.size
if width < 256 or height < 256:
os.remove(local_path)
image.status = "rejected"
db.commit()
return {"error": "Image too small"}
image.width = width
image.height = height
except Exception as e:
if local_path.exists():
os.remove(local_path)
image.status = "rejected"
db.commit()
return {"error": f"Invalid image: {e}"}
# Calculate perceptual hash for deduplication
phash = calculate_phash(str(local_path))
if phash:
# Check for duplicates
existing = db.query(Image).filter(
Image.phash == phash,
Image.id != image.id,
Image.status == "downloaded"
).first()
if existing:
os.remove(local_path)
image.status = "rejected"
image.phash = phash
db.commit()
return {"error": "Duplicate image"}
image.phash = phash
# Calculate blur score
quality_score = calculate_blur_score(str(local_path))
image.quality_score = quality_score
# Reject very blurry images (threshold can be tuned)
if quality_score < 100: # Low variance = blurry
os.remove(local_path)
image.status = "rejected"
db.commit()
return {"error": "Image too blurry"}
# Check color distribution (reject herbarium specimens, illustrations)
color_ok, color_reason = check_color_distribution(str(local_path))
if not color_ok:
os.remove(local_path)
image.status = "rejected"
db.commit()
return {"error": f"Non-photo content: {color_reason}"}
# Resize to 512x512 max
resize_image(str(local_path))
# Update image record
image.local_path = str(local_path)
image.status = "downloaded"
db.commit()
return {
"status": "success",
"path": str(local_path),
"quality_score": quality_score,
}
    except Exception as e:
        db.rollback()  # reset the session before recording the rejection
        if image:
image.status = "rejected"
db.commit()
return {"error": str(e)}
finally:
db.close()
@celery_app.task(bind=True)
def batch_process_pending_images(self, source: str | None = None, chunk_size: int = 500):
"""Process ALL pending images in chunks, with progress tracking."""
db = SessionLocal()
try:
query = db.query(Image).filter(Image.status == "pending")
if source:
query = query.filter(Image.source == source)
total = query.count()
queued = 0
        # Keyset pagination: workers flip rows out of "pending" while this loop
        # runs, so OFFSET-based paging would skip images as the set shrinks.
        last_id = 0
        while True:
            chunk = (
                query.filter(Image.id > last_id)
                .order_by(Image.id)
                .limit(chunk_size)
                .all()
            )
            if not chunk:
                break
            for image in chunk:
                download_and_process_image.delay(image.id)
                queued += 1
            last_id = chunk[-1].id
self.update_state(
state="PROGRESS",
meta={"queued": queued, "total": total},
)
return {"queued": queued, "total": total}
finally:
db.close()
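
The helpers above can be exercised directly to sanity-check the thresholds before a large run; a quick local sketch (`sample.jpg` stands in for any test image you supply):

from app.workers.quality_tasks import (
    calculate_blur_score,
    calculate_phash,
    check_color_distribution,
)

path = "sample.jpg"  # any local test image
blur = calculate_blur_score(path)  # Laplacian variance; the task rejects < 100 as blurry
phash = calculate_phash(path)  # 64-bit perceptual hash (16 hex chars) used for dedup
color_ok, reason = check_color_distribution(path)
print(f"blur={blur:.1f} phash={phash} color_ok={color_ok} {reason}")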

View File: app/workers/scrape_tasks.py

@@ -0,0 +1,164 @@
import json
from datetime import datetime
from app.workers.celery_app import celery_app
from app.database import SessionLocal
from app.models import Job, Species, Image
from app.utils.logging import get_job_logger
@celery_app.task(bind=True)
def run_scrape_job(self, job_id: int):
"""Main scrape task that dispatches to source-specific scrapers."""
logger = get_job_logger(job_id)
logger.info(f"Starting scrape job {job_id}")
db = SessionLocal()
job = None
try:
job = db.query(Job).filter(Job.id == job_id).first()
if not job:
logger.error(f"Job {job_id} not found")
return {"error": "Job not found"}
logger.info(f"Job: {job.name}, Source: {job.source}")
# Update job status
job.status = "running"
job.started_at = datetime.utcnow()
job.celery_task_id = self.request.id
db.commit()
# Get species to scrape
if job.species_filter:
species_ids = json.loads(job.species_filter)
query = db.query(Species).filter(Species.id.in_(species_ids))
logger.info(f"Filtered to species IDs: {species_ids}")
else:
query = db.query(Species)
logger.info("Scraping all species")
# Filter by image count if requested
if job.only_without_images or job.max_images:
from sqlalchemy import func
# Subquery to count downloaded images per species
image_count_subquery = (
db.query(Image.species_id, func.count(Image.id).label("count"))
.filter(Image.status == "downloaded")
.group_by(Image.species_id)
.subquery()
)
# Left join with the count subquery
query = query.outerjoin(
image_count_subquery,
Species.id == image_count_subquery.c.species_id
)
if job.only_without_images:
# Filter where count is NULL or 0
                query = query.filter(
                    image_count_subquery.c.count.is_(None)
                    | (image_count_subquery.c.count == 0)
                )
logger.info("Filtering to species without images")
elif job.max_images:
# Filter where count is NULL or less than max_images
                query = query.filter(
                    image_count_subquery.c.count.is_(None)
                    | (image_count_subquery.c.count < job.max_images)
                )
logger.info(f"Filtering to species with fewer than {job.max_images} images")
species_list = query.all()
logger.info(f"Total species to scrape: {len(species_list)}")
job.progress_total = len(species_list)
db.commit()
# Import scraper based on source
from app.scrapers import get_scraper
scraper = get_scraper(job.source)
if not scraper:
error_msg = f"Unknown source: {job.source}"
logger.error(error_msg)
job.status = "failed"
job.error_message = error_msg
job.completed_at = datetime.utcnow()
db.commit()
return {"error": error_msg}
logger.info(f"Using scraper: {scraper.name}")
# Scrape each species
for i, species in enumerate(species_list):
try:
# Update progress
job.progress_current = i + 1
db.commit()
logger.info(f"[{i+1}/{len(species_list)}] Scraping: {species.scientific_name}")
# Update task state for real-time monitoring
self.update_state(
state="PROGRESS",
meta={
"current": i + 1,
"total": len(species_list),
"species": species.scientific_name,
}
)
# Run scraper for this species
results = scraper.scrape_species(species, db, logger)
downloaded = results.get("downloaded", 0)
rejected = results.get("rejected", 0)
job.images_downloaded += downloaded
job.images_rejected += rejected
db.commit()
logger.info(f" -> Downloaded: {downloaded}, Rejected: {rejected}")
except Exception as e:
# Log error but continue with other species
logger.error(f"Error scraping {species.scientific_name}: {e}", exc_info=True)
continue
# Mark job complete
job.status = "completed"
job.completed_at = datetime.utcnow()
db.commit()
logger.info(f"Job {job_id} completed. Total downloaded: {job.images_downloaded}, rejected: {job.images_rejected}")
return {
"status": "completed",
"downloaded": job.images_downloaded,
"rejected": job.images_rejected,
}
    except Exception as e:
        logger.error(f"Job {job_id} failed with error: {e}", exc_info=True)
        db.rollback()  # reset the session before writing the failure status
        if job:
job.status = "failed"
job.error_message = str(e)
job.completed_at = datetime.utcnow()
db.commit()
raise
finally:
db.close()
@celery_app.task
def pause_scrape_job(job_id: int):
"""Pause a running scrape job."""
db = SessionLocal()
try:
job = db.query(Job).filter(Job.id == job_id).first()
if job and job.status == "running":
job.status = "paused"
db.commit()
# Revoke the Celery task
if job.celery_task_id:
celery_app.control.revoke(job.celery_task_id, terminate=True)
return {"status": "paused"}
finally:
db.close()
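
`run_scrape_job` depends on `app.scrapers.get_scraper`, which is part of the commit but not shown in this excerpt. From the call sites, each scraper needs a `name` attribute and a `scrape_species(species, db, logger)` method returning per-species counts; a sketch of that implied interface (everything beyond those call sites is an assumption):

from typing import Optional, Protocol

from app.models import Species


class Scraper(Protocol):
    name: str

    def scrape_species(self, species: Species, db, logger) -> dict:
        """Return counts such as {"downloaded": 12, "rejected": 3}."""
        ...


def get_scraper(source: str) -> Optional[Scraper]:
    # Hypothetical registry lookup; run_scrape_job treats a None return
    # as an unknown source and fails the job.
    ...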

View File: app/workers/stats_tasks.py

@@ -0,0 +1,193 @@
import json
import os
from datetime import datetime
from sqlalchemy import text
from app.workers.celery_app import celery_app
from app.database import SessionLocal
from app.models.cached_stats import CachedStats
from app.config import get_settings
def get_directory_size_fast(path: str) -> int:
"""Get directory size in bytes using fast os.scandir."""
total = 0
try:
with os.scandir(path) as it:
for entry in it:
try:
if entry.is_file(follow_symlinks=False):
total += entry.stat(follow_symlinks=False).st_size
elif entry.is_dir(follow_symlinks=False):
total += get_directory_size_fast(entry.path)
except (OSError, PermissionError):
pass
except (OSError, PermissionError):
pass
return total
@celery_app.task
def refresh_stats():
"""Calculate and cache dashboard statistics."""
print("=== STATS TASK: Starting refresh ===", flush=True)
db = SessionLocal()
try:
# Use raw SQL for maximum performance on SQLite
# All counts in a single query
counts_sql = text("""
SELECT
(SELECT COUNT(*) FROM species) as total_species,
(SELECT COUNT(*) FROM images) as total_images,
(SELECT COUNT(*) FROM images WHERE status = 'downloaded') as images_downloaded,
(SELECT COUNT(*) FROM images WHERE status = 'pending') as images_pending,
(SELECT COUNT(*) FROM images WHERE status = 'rejected') as images_rejected
""")
counts = db.execute(counts_sql).fetchone()
total_species = counts[0] or 0
total_images = counts[1] or 0
images_downloaded = counts[2] or 0
images_pending = counts[3] or 0
images_rejected = counts[4] or 0
# Per-source stats - single query with GROUP BY
source_sql = text("""
SELECT
source,
COUNT(*) as total,
SUM(CASE WHEN status = 'downloaded' THEN 1 ELSE 0 END) as downloaded,
SUM(CASE WHEN status = 'pending' THEN 1 ELSE 0 END) as pending,
SUM(CASE WHEN status = 'rejected' THEN 1 ELSE 0 END) as rejected
FROM images
GROUP BY source
""")
source_stats_raw = db.execute(source_sql).fetchall()
sources = [
{
"source": s[0],
"image_count": s[1],
"downloaded": s[2] or 0,
"pending": s[3] or 0,
"rejected": s[4] or 0,
}
for s in source_stats_raw
]
# Per-license stats - single indexed query
license_sql = text("""
SELECT license, COUNT(*) as count
FROM images
WHERE status = 'downloaded'
GROUP BY license
""")
license_stats_raw = db.execute(license_sql).fetchall()
        licenses = [
            {"license": row[0], "count": row[1]}
            for row in license_stats_raw
        ]
# Job stats - single query
job_sql = text("""
SELECT
SUM(CASE WHEN status = 'running' THEN 1 ELSE 0 END) as running,
SUM(CASE WHEN status = 'pending' THEN 1 ELSE 0 END) as pending,
SUM(CASE WHEN status = 'completed' THEN 1 ELSE 0 END) as completed,
SUM(CASE WHEN status = 'failed' THEN 1 ELSE 0 END) as failed
FROM jobs
""")
job_counts = db.execute(job_sql).fetchone()
jobs = {
"running": job_counts[0] or 0,
"pending": job_counts[1] or 0,
"completed": job_counts[2] or 0,
"failed": job_counts[3] or 0,
}
# Top species by image count - optimized with index
top_sql = text("""
SELECT s.id, s.scientific_name, s.common_name, COUNT(i.id) as image_count
FROM species s
INNER JOIN images i ON i.species_id = s.id AND i.status = 'downloaded'
GROUP BY s.id
ORDER BY image_count DESC
LIMIT 10
""")
top_species_raw = db.execute(top_sql).fetchall()
top_species = [
{
"id": s[0],
"scientific_name": s[1],
"common_name": s[2],
"image_count": s[3],
}
for s in top_species_raw
]
# Under-represented species - use pre-computed counts
under_sql = text("""
SELECT s.id, s.scientific_name, s.common_name, COALESCE(img_counts.cnt, 0) as image_count
FROM species s
LEFT JOIN (
SELECT species_id, COUNT(*) as cnt
FROM images
WHERE status = 'downloaded'
GROUP BY species_id
) img_counts ON img_counts.species_id = s.id
WHERE COALESCE(img_counts.cnt, 0) < 100
ORDER BY image_count ASC
LIMIT 10
""")
under_rep_raw = db.execute(under_sql).fetchall()
under_represented = [
{
"id": s[0],
"scientific_name": s[1],
"common_name": s[2],
"image_count": s[3],
}
for s in under_rep_raw
]
# Calculate disk usage (fast recursive scan)
settings = get_settings()
disk_usage_bytes = get_directory_size_fast(settings.images_path)
disk_usage_mb = round(disk_usage_bytes / (1024 * 1024), 2)
# Build the stats object
stats = {
"total_species": total_species,
"total_images": total_images,
"images_downloaded": images_downloaded,
"images_pending": images_pending,
"images_rejected": images_rejected,
"disk_usage_mb": disk_usage_mb,
"sources": sources,
"licenses": licenses,
"jobs": jobs,
"top_species": top_species,
"under_represented": under_represented,
}
# Store in database
cached = db.query(CachedStats).filter(CachedStats.key == "dashboard_stats").first()
if cached:
cached.value = json.dumps(stats)
cached.updated_at = datetime.utcnow()
else:
cached = CachedStats(key="dashboard_stats", value=json.dumps(stats))
db.add(cached)
db.commit()
print(f"=== STATS TASK: Refreshed (species={total_species}, images={total_images}) ===", flush=True)
return {"status": "success", "total_species": total_species, "total_images": total_images}
except Exception as e:
print(f"=== STATS TASK ERROR: {e} ===", flush=True)
raise
finally:
db.close()
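
Consumers read the cached row instead of re-running the aggregate queries; a minimal sketch of the reader side (the function name and empty-dict fallback are assumptions):

import json

from app.database import SessionLocal
from app.models.cached_stats import CachedStats


def get_dashboard_stats() -> dict:
    """Return the latest snapshot written by refresh_stats, or {} if none exists yet."""
    db = SessionLocal()
    try:
        cached = db.query(CachedStats).filter(CachedStats.key == "dashboard_stats").first()
        return json.loads(cached.value) if cached else {}
    finally:
        db.close()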