Initial commit — PlantGuideScraper project
This commit is contained in:
1
backend/app/workers/__init__.py
Normal file
1
backend/app/workers/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
# Celery workers
|
||||
36
backend/app/workers/celery_app.py
Normal file
36
backend/app/workers/celery_app.py
Normal file
@@ -0,0 +1,36 @@
|
||||
from celery import Celery

from app.config import get_settings

settings = get_settings()

# Celery application shared by every worker module.  A single Redis instance
# serves as both the message broker and the result backend.
celery_app = Celery(
    "plant_scraper",
    broker=settings.redis_url,
    backend=settings.redis_url,
    include=[
        "app.workers.scrape_tasks",
        "app.workers.quality_tasks",
        "app.workers.export_tasks",
        "app.workers.stats_tasks",
    ],
)

celery_app.conf.update(
    task_serializer="json",
    accept_content=["json"],
    result_serializer="json",
    timezone="UTC",
    enable_utc=True,
    task_track_started=True,
    task_time_limit=3600 * 24,  # 24 hour max per task
    # Fetch one message at a time: tasks here are long-running, so
    # prefetching would starve other workers.
    worker_prefetch_multiplier=1,
    # Acknowledge only after the task finishes so a crashed worker's
    # tasks get redelivered instead of silently lost.
    task_acks_late=True,
    beat_schedule={
        "refresh-stats-every-5min": {
            "task": "app.workers.stats_tasks.refresh_stats",
            "schedule": 300.0,  # Every 5 minutes
        },
    },
    # Keep the beat schedule file out of the app directory.
    beat_schedule_filename="/tmp/celerybeat-schedule",
)
|
||||
170
backend/app/workers/export_tasks.py
Normal file
170
backend/app/workers/export_tasks.py
Normal file
@@ -0,0 +1,170 @@
|
||||
import json
|
||||
import os
|
||||
import random
|
||||
import shutil
|
||||
import zipfile
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
from app.workers.celery_app import celery_app
|
||||
from app.database import SessionLocal
|
||||
from app.models import Export, Image, Species
|
||||
from app.config import get_settings
|
||||
|
||||
settings = get_settings()
|
||||
|
||||
|
||||
@celery_app.task(bind=True)
def generate_export(self, export_id: int):
    """Generate a zip export for CoreML training.

    Builds a Training/Testing folder hierarchy (one sub-folder per species),
    copies downloaded images according to ``export.train_split``, zips the
    result, and records the outcome on the ``Export`` row.

    Args:
        export_id: Primary key of the Export row describing the export.

    Returns:
        A summary dict on success, or ``{"error": ...}`` when the export
        cannot be generated.
    """
    db = SessionLocal()
    # Bug fix: initialize before the try-block so the except-handler below
    # can never hit an unbound name if the initial query itself raises.
    export = None
    try:
        export = db.query(Export).filter(Export.id == export_id).first()
        if not export:
            return {"error": "Export not found"}

        # Update status
        export.status = "generating"
        export.celery_task_id = self.request.id
        db.commit()

        # Parse filter criteria
        criteria = json.loads(export.filter_criteria) if export.filter_criteria else {}
        min_images = criteria.get("min_images_per_species", 100)
        licenses = criteria.get("licenses")
        min_quality = criteria.get("min_quality")
        species_ids = criteria.get("species_ids")

        # Build query for images — license/quality/species filters applied ONCE.
        query = db.query(Image).filter(Image.status == "downloaded")

        if licenses:
            query = query.filter(Image.license.in_(licenses))

        if min_quality:
            query = query.filter(Image.quality_score >= min_quality)

        if species_ids:
            query = query.filter(Image.species_id.in_(species_ids))

        # Group by species and filter by min count.
        # NOTE(review): these counts ignore the license/quality filters above;
        # the per-species len(images) check later enforces them for real.
        from sqlalchemy import func
        species_counts = db.query(
            Image.species_id,
            func.count(Image.id).label("count")
        ).filter(Image.status == "downloaded").group_by(Image.species_id).all()

        valid_species_ids = [s.species_id for s in species_counts if s.count >= min_images]

        if species_ids:
            valid_species_ids = [s for s in valid_species_ids if s in species_ids]

        if not valid_species_ids:
            export.status = "failed"
            export.error_message = "No species meet the criteria"
            export.completed_at = datetime.utcnow()
            db.commit()
            return {"error": "No species meet the criteria"}

        # Create export directory
        export_dir = Path(settings.exports_path) / f"export_{export_id}"
        train_dir = export_dir / "Training"
        test_dir = export_dir / "Testing"
        train_dir.mkdir(parents=True, exist_ok=True)
        test_dir.mkdir(parents=True, exist_ok=True)

        total_images = 0
        species_count = 0

        # Process each valid species
        for i, species_id in enumerate(valid_species_ids):
            species = db.query(Species).filter(Species.id == species_id).first()
            if not species:
                continue

            # Get images for this species.  `query` already carries the
            # license/quality filters; the original code re-applied them
            # here, which was redundant (same predicates twice).
            images = query.filter(Image.species_id == species_id).all()
            if len(images) < min_images:
                continue

            species_count += 1

            # Create species folders
            species_name = species.scientific_name.replace(" ", "_")
            (train_dir / species_name).mkdir(exist_ok=True)
            (test_dir / species_name).mkdir(exist_ok=True)

            # Shuffle and split
            random.shuffle(images)
            split_idx = int(len(images) * export.train_split)
            train_images = images[:split_idx]
            test_images = images[split_idx:]

            # Copy images into the two split folders
            for j, img in enumerate(train_images):
                if img.local_path and os.path.exists(img.local_path):
                    ext = Path(img.local_path).suffix or ".jpg"
                    dest = train_dir / species_name / f"img_{j:05d}{ext}"
                    shutil.copy2(img.local_path, dest)
                    total_images += 1

            for j, img in enumerate(test_images):
                if img.local_path and os.path.exists(img.local_path):
                    ext = Path(img.local_path).suffix or ".jpg"
                    dest = test_dir / species_name / f"img_{j:05d}{ext}"
                    shutil.copy2(img.local_path, dest)
                    total_images += 1

            # Update progress for real-time monitoring
            self.update_state(
                state="PROGRESS",
                meta={
                    "current": i + 1,
                    "total": len(valid_species_ids),
                    "species": species.scientific_name,
                }
            )

        # Create zip file
        zip_path = Path(settings.exports_path) / f"export_{export_id}.zip"
        with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zipf:
            for root, dirs, files in os.walk(export_dir):
                for file in files:
                    file_path = Path(root) / file
                    arcname = file_path.relative_to(export_dir)
                    zipf.write(file_path, arcname)

        # Clean up the unzipped directory
        shutil.rmtree(export_dir)

        # Update export record
        export.status = "completed"
        export.file_path = str(zip_path)
        export.file_size = zip_path.stat().st_size
        export.species_count = species_count
        export.image_count = total_images
        export.completed_at = datetime.utcnow()
        db.commit()

        return {
            "status": "completed",
            "species_count": species_count,
            "image_count": total_images,
            "file_size": export.file_size,
        }

    except Exception as e:
        if export:
            export.status = "failed"
            export.error_message = str(e)
            export.completed_at = datetime.utcnow()
            db.commit()
        raise
    finally:
        db.close()
|
||||
224
backend/app/workers/quality_tasks.py
Normal file
224
backend/app/workers/quality_tasks.py
Normal file
@@ -0,0 +1,224 @@
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
import httpx
|
||||
from PIL import Image as PILImage
|
||||
import imagehash
|
||||
import numpy as np
|
||||
from scipy import ndimage
|
||||
|
||||
from app.workers.celery_app import celery_app
|
||||
from app.database import SessionLocal
|
||||
from app.models import Image
|
||||
from app.config import get_settings
|
||||
|
||||
settings = get_settings()
|
||||
|
||||
|
||||
def calculate_blur_score(image_path: str) -> float:
    """Blur metric via the variance of the Laplacian (higher = sharper).

    Returns 0.0 when the file cannot be read or decoded.
    """
    try:
        grayscale = np.array(PILImage.open(image_path).convert("L"))
        edge_response = ndimage.laplace(grayscale)
        return float(np.var(edge_response))
    except Exception:
        return 0.0
|
||||
|
||||
|
||||
def calculate_phash(image_path: str) -> str:
    """Perceptual hash of the image for deduplication; "" if unreadable."""
    try:
        picture = PILImage.open(image_path)
        return str(imagehash.phash(picture))
    except Exception:
        return ""
|
||||
|
||||
|
||||
def check_color_distribution(image_path: str) -> tuple[bool, str]:
    """Check if image has healthy color distribution for a plant photo.

    Returns (passed, reason) tuple.
    Rejects:
    - Low color variance (mean channel std < 25): herbarium specimens (brown on white)
    - No green + low variance (green ratio < 5% AND mean std < 40): monochrome illustrations
    """
    try:
        pixels = np.array(PILImage.open(image_path).convert("RGB"), dtype=np.float64)

        # Average of the per-channel standard deviations [R_std, G_std, B_std]
        mean_std = float(pixels.std(axis=(0, 1)).mean())

        if mean_std < 25:
            return False, f"Low color variance ({mean_std:.1f})"

        # Fraction of total brightness contributed by the green channel
        channel_means = pixels.mean(axis=(0, 1))
        total = channel_means.sum()
        green_ratio = channel_means[1] / total if total > 0 else 0

        if green_ratio < 0.05 and mean_std < 40:
            return False, f"No green ({green_ratio:.2%}) + low variance ({mean_std:.1f})"

        return True, ""
    except Exception:
        return True, ""  # Don't reject on error
|
||||
|
||||
|
||||
def resize_image(image_path: str, target_size: int = 512) -> bool:
    """Resize image in place to fit target_size while maintaining aspect ratio.

    Args:
        image_path: Path to the image file (rewritten in place).
        target_size: Maximum edge length in pixels.

    Returns:
        True on success, False on any failure.
    """
    try:
        img = PILImage.open(image_path)
        # Bug fix: files here carry a .jpg extension, and JPEG cannot encode
        # alpha/palette modes (RGBA, P, LA).  The original code let save()
        # raise on such images and silently returned False, leaving them
        # un-resized.  Normalize to RGB first.
        if img.mode not in ("RGB", "L"):
            img = img.convert("RGB")
        img.thumbnail((target_size, target_size), PILImage.Resampling.LANCZOS)
        img.save(image_path, quality=95)
        return True
    except Exception:
        return False
|
||||
|
||||
|
||||
@celery_app.task
def download_and_process_image(image_id: int):
    """Download image, check quality, dedupe, and resize.

    Pipeline: download -> minimum-size check -> perceptual-hash dedupe ->
    blur check -> color-distribution check -> resize to 512px max.
    Any failed stage marks the row "rejected" and removes the local file.

    Args:
        image_id: Primary key of the Image row to process.

    Returns:
        Dict with "status"/"path"/"quality_score" on success, or
        ``{"error": ...}`` describing why the image was rejected.
    """
    db = SessionLocal()
    # Bug fix: initialize before the try-block so the except-handler below
    # can never hit an unbound name if the initial query itself raises.
    image = None
    try:
        image = db.query(Image).filter(Image.id == image_id).first()
        if not image:
            return {"error": "Image not found"}

        # Create directory for species
        species = image.species
        species_dir = Path(settings.images_path) / species.scientific_name.replace(" ", "_")
        species_dir.mkdir(parents=True, exist_ok=True)

        # Download image
        filename = f"{image.source}_{image.source_id or image.id}.jpg"
        local_path = species_dir / filename

        try:
            # Browser-like UA: some image hosts reject generic clients.
            headers = {
                "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15"
            }
            with httpx.Client(timeout=30, headers=headers, follow_redirects=True) as client:
                response = client.get(image.url)
                response.raise_for_status()

            with open(local_path, "wb") as f:
                f.write(response.content)
        except Exception as e:
            image.status = "rejected"
            db.commit()
            return {"error": f"Download failed: {e}"}

        # Check minimum size
        try:
            with PILImage.open(local_path) as img:
                width, height = img.size
                if width < 256 or height < 256:
                    os.remove(local_path)
                    image.status = "rejected"
                    db.commit()
                    return {"error": "Image too small"}
                image.width = width
                image.height = height
        except Exception as e:
            if local_path.exists():
                os.remove(local_path)
            image.status = "rejected"
            db.commit()
            return {"error": f"Invalid image: {e}"}

        # Calculate perceptual hash for deduplication
        phash = calculate_phash(str(local_path))
        if phash:
            # Check for duplicates among already-downloaded images
            existing = db.query(Image).filter(
                Image.phash == phash,
                Image.id != image.id,
                Image.status == "downloaded"
            ).first()

            if existing:
                os.remove(local_path)
                image.status = "rejected"
                # Keep the hash on the rejected row for later inspection.
                image.phash = phash
                db.commit()
                return {"error": "Duplicate image"}

            image.phash = phash

        # Calculate blur score
        quality_score = calculate_blur_score(str(local_path))
        image.quality_score = quality_score

        # Reject very blurry images (threshold can be tuned)
        if quality_score < 100:  # Low variance = blurry
            os.remove(local_path)
            image.status = "rejected"
            db.commit()
            return {"error": "Image too blurry"}

        # Check color distribution (reject herbarium specimens, illustrations)
        color_ok, color_reason = check_color_distribution(str(local_path))
        if not color_ok:
            os.remove(local_path)
            image.status = "rejected"
            db.commit()
            return {"error": f"Non-photo content: {color_reason}"}

        # Resize to 512x512 max
        resize_image(str(local_path))

        # Update image record
        image.local_path = str(local_path)
        image.status = "downloaded"
        db.commit()

        return {
            "status": "success",
            "path": str(local_path),
            "quality_score": quality_score,
        }

    except Exception as e:
        if image:
            image.status = "rejected"
            db.commit()
        return {"error": str(e)}
    finally:
        db.close()
|
||||
|
||||
|
||||
@celery_app.task(bind=True)
def batch_process_pending_images(self, source: str = None, chunk_size: int = 500):
    """Process ALL pending images in chunks, with progress tracking.

    Bug fix: the original paginated with OFFSET over a ``status == "pending"``
    filter while the tasks it had already queued were concurrently flipping
    rows out of "pending".  Each processed row shifted the result window,
    so later pages skipped images.  We now snapshot the matching IDs once
    and iterate over that fixed list.

    Args:
        source: Optional source name to restrict the batch to.
        chunk_size: How many tasks to queue between progress updates.

    Returns:
        Dict with the number of queued tasks and the total matched.
    """
    db = SessionLocal()
    try:
        id_query = db.query(Image.id).filter(Image.status == "pending")
        if source:
            id_query = id_query.filter(Image.source == source)

        # Materialize the ID list up front so concurrent status changes
        # cannot shift the pagination window underneath us.
        image_ids = [row[0] for row in id_query.order_by(Image.id).all()]
        total = len(image_ids)
        queued = 0

        for start in range(0, total, chunk_size):
            for image_id in image_ids[start:start + chunk_size]:
                download_and_process_image.delay(image_id)
                queued += 1

            self.update_state(
                state="PROGRESS",
                meta={"queued": queued, "total": total},
            )

        return {"queued": queued, "total": total}
    finally:
        db.close()
|
||||
164
backend/app/workers/scrape_tasks.py
Normal file
164
backend/app/workers/scrape_tasks.py
Normal file
@@ -0,0 +1,164 @@
|
||||
import json
|
||||
from datetime import datetime
|
||||
|
||||
from app.workers.celery_app import celery_app
|
||||
from app.database import SessionLocal
|
||||
from app.models import Job, Species, Image
|
||||
from app.utils.logging import get_job_logger
|
||||
|
||||
|
||||
@celery_app.task(bind=True)
def run_scrape_job(self, job_id: int):
    """Main scrape task that dispatches to source-specific scrapers.

    Loads the Job row, selects the target species (optionally filtered by an
    explicit ID list and/or current image counts), resolves the scraper for
    ``job.source``, and scrapes each species in turn while committing progress
    counters.  Per-species errors are logged and skipped; anything fatal marks
    the job "failed" and re-raises.
    """
    logger = get_job_logger(job_id)
    logger.info(f"Starting scrape job {job_id}")

    db = SessionLocal()
    job = None  # kept None until loaded so the except-handler can test it safely
    try:
        job = db.query(Job).filter(Job.id == job_id).first()
        if not job:
            logger.error(f"Job {job_id} not found")
            return {"error": "Job not found"}

        logger.info(f"Job: {job.name}, Source: {job.source}")

        # Update job status
        job.status = "running"
        job.started_at = datetime.utcnow()
        job.celery_task_id = self.request.id
        db.commit()

        # Get species to scrape: either the explicit ID list stored as JSON
        # on the job, or every species in the database.
        if job.species_filter:
            species_ids = json.loads(job.species_filter)
            query = db.query(Species).filter(Species.id.in_(species_ids))
            logger.info(f"Filtered to species IDs: {species_ids}")
        else:
            query = db.query(Species)
            logger.info("Scraping all species")

        # Filter by image count if requested
        if job.only_without_images or job.max_images:
            from sqlalchemy import func
            # Subquery to count downloaded images per species
            image_count_subquery = (
                db.query(Image.species_id, func.count(Image.id).label("count"))
                .filter(Image.status == "downloaded")
                .group_by(Image.species_id)
                .subquery()
            )
            # Left join with the count subquery (species with no images get NULL)
            query = query.outerjoin(
                image_count_subquery,
                Species.id == image_count_subquery.c.species_id
            )

            if job.only_without_images:
                # Filter where count is NULL or 0
                query = query.filter(
                    (image_count_subquery.c.count == None) | (image_count_subquery.c.count == 0)
                )
                logger.info("Filtering to species without images")
            elif job.max_images:
                # Filter where count is NULL or less than max_images
                query = query.filter(
                    (image_count_subquery.c.count == None) | (image_count_subquery.c.count < job.max_images)
                )
                logger.info(f"Filtering to species with fewer than {job.max_images} images")

        species_list = query.all()
        logger.info(f"Total species to scrape: {len(species_list)}")

        job.progress_total = len(species_list)
        db.commit()

        # Import scraper based on source (lazy import avoids a module cycle)
        from app.scrapers import get_scraper
        scraper = get_scraper(job.source)

        if not scraper:
            error_msg = f"Unknown source: {job.source}"
            logger.error(error_msg)
            job.status = "failed"
            job.error_message = error_msg
            job.completed_at = datetime.utcnow()
            db.commit()
            return {"error": error_msg}

        logger.info(f"Using scraper: {scraper.name}")

        # Scrape each species
        for i, species in enumerate(species_list):
            try:
                # Update progress (committed so the API sees it immediately)
                job.progress_current = i + 1
                db.commit()

                logger.info(f"[{i+1}/{len(species_list)}] Scraping: {species.scientific_name}")

                # Update task state for real-time monitoring
                self.update_state(
                    state="PROGRESS",
                    meta={
                        "current": i + 1,
                        "total": len(species_list),
                        "species": species.scientific_name,
                    }
                )

                # Run scraper for this species and accumulate its counters
                results = scraper.scrape_species(species, db, logger)
                downloaded = results.get("downloaded", 0)
                rejected = results.get("rejected", 0)
                job.images_downloaded += downloaded
                job.images_rejected += rejected
                db.commit()

                logger.info(f" -> Downloaded: {downloaded}, Rejected: {rejected}")

            except Exception as e:
                # Log error but continue with other species
                logger.error(f"Error scraping {species.scientific_name}: {e}", exc_info=True)
                continue

        # Mark job complete
        job.status = "completed"
        job.completed_at = datetime.utcnow()
        db.commit()

        logger.info(f"Job {job_id} completed. Total downloaded: {job.images_downloaded}, rejected: {job.images_rejected}")

        return {
            "status": "completed",
            "downloaded": job.images_downloaded,
            "rejected": job.images_rejected,
        }

    except Exception as e:
        logger.error(f"Job {job_id} failed with error: {e}", exc_info=True)
        if job:
            job.status = "failed"
            job.error_message = str(e)
            job.completed_at = datetime.utcnow()
            db.commit()
        raise
    finally:
        db.close()
|
||||
|
||||
|
||||
@celery_app.task
def pause_scrape_job(job_id: int):
    """Pause a running scrape job and revoke its in-flight Celery task.

    Only jobs currently in the "running" state are affected; anything else
    is left untouched (and nothing is returned).
    """
    db = SessionLocal()
    try:
        job = db.query(Job).filter(Job.id == job_id).first()
        is_running = job is not None and job.status == "running"
        if is_running:
            job.status = "paused"
            db.commit()
            # Revoke the Celery task so the worker stops immediately
            task_id = job.celery_task_id
            if task_id:
                celery_app.control.revoke(task_id, terminate=True)
            return {"status": "paused"}
    finally:
        db.close()
|
||||
193
backend/app/workers/stats_tasks.py
Normal file
193
backend/app/workers/stats_tasks.py
Normal file
@@ -0,0 +1,193 @@
|
||||
import json
|
||||
import os
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
from sqlalchemy import func, case, text
|
||||
|
||||
from app.workers.celery_app import celery_app
|
||||
from app.database import SessionLocal
|
||||
from app.models import Species, Image, Job
|
||||
from app.models.cached_stats import CachedStats
|
||||
from app.config import get_settings
|
||||
|
||||
|
||||
def get_directory_size_fast(path: str) -> int:
    """Get directory size in bytes using fast os.scandir (iterative walk).

    Symlinks are never followed; unreadable entries and directories are
    silently skipped, and a missing root yields 0.
    """
    total = 0
    pending = [path]
    while pending:
        current = pending.pop()
        try:
            with os.scandir(current) as entries:
                for entry in entries:
                    try:
                        if entry.is_file(follow_symlinks=False):
                            total += entry.stat(follow_symlinks=False).st_size
                        elif entry.is_dir(follow_symlinks=False):
                            pending.append(entry.path)
                    except (OSError, PermissionError):
                        pass
        except (OSError, PermissionError):
            pass
    return total
|
||||
|
||||
|
||||
@celery_app.task
def refresh_stats():
    """Calculate and cache dashboard statistics.

    Triggered every 5 minutes by Celery beat.  Computes global, per-source,
    per-license and per-job counts plus top / under-represented species using
    raw SQL (chosen for speed on SQLite), measures disk usage of the images
    directory, and stores the whole JSON blob in the CachedStats row keyed
    "dashboard_stats" for the dashboard API to read.
    """
    print("=== STATS TASK: Starting refresh ===", flush=True)

    db = SessionLocal()
    try:
        # Use raw SQL for maximum performance on SQLite
        # All counts in a single query
        counts_sql = text("""
            SELECT
                (SELECT COUNT(*) FROM species) as total_species,
                (SELECT COUNT(*) FROM images) as total_images,
                (SELECT COUNT(*) FROM images WHERE status = 'downloaded') as images_downloaded,
                (SELECT COUNT(*) FROM images WHERE status = 'pending') as images_pending,
                (SELECT COUNT(*) FROM images WHERE status = 'rejected') as images_rejected
        """)
        counts = db.execute(counts_sql).fetchone()
        # `or 0` guards against NULLs from an empty database
        total_species = counts[0] or 0
        total_images = counts[1] or 0
        images_downloaded = counts[2] or 0
        images_pending = counts[3] or 0
        images_rejected = counts[4] or 0

        # Per-source stats - single query with GROUP BY
        source_sql = text("""
            SELECT
                source,
                COUNT(*) as total,
                SUM(CASE WHEN status = 'downloaded' THEN 1 ELSE 0 END) as downloaded,
                SUM(CASE WHEN status = 'pending' THEN 1 ELSE 0 END) as pending,
                SUM(CASE WHEN status = 'rejected' THEN 1 ELSE 0 END) as rejected
            FROM images
            GROUP BY source
        """)
        source_stats_raw = db.execute(source_sql).fetchall()
        sources = [
            {
                "source": s[0],
                "image_count": s[1],
                "downloaded": s[2] or 0,
                "pending": s[3] or 0,
                "rejected": s[4] or 0,
            }
            for s in source_stats_raw
        ]

        # Per-license stats - single indexed query
        license_sql = text("""
            SELECT license, COUNT(*) as count
            FROM images
            WHERE status = 'downloaded'
            GROUP BY license
        """)
        license_stats_raw = db.execute(license_sql).fetchall()
        licenses = [
            {"license": l[0], "count": l[1]}
            for l in license_stats_raw
        ]

        # Job stats - single query
        job_sql = text("""
            SELECT
                SUM(CASE WHEN status = 'running' THEN 1 ELSE 0 END) as running,
                SUM(CASE WHEN status = 'pending' THEN 1 ELSE 0 END) as pending,
                SUM(CASE WHEN status = 'completed' THEN 1 ELSE 0 END) as completed,
                SUM(CASE WHEN status = 'failed' THEN 1 ELSE 0 END) as failed
            FROM jobs
        """)
        job_counts = db.execute(job_sql).fetchone()
        jobs = {
            "running": job_counts[0] or 0,
            "pending": job_counts[1] or 0,
            "completed": job_counts[2] or 0,
            "failed": job_counts[3] or 0,
        }

        # Top species by image count - optimized with index
        top_sql = text("""
            SELECT s.id, s.scientific_name, s.common_name, COUNT(i.id) as image_count
            FROM species s
            INNER JOIN images i ON i.species_id = s.id AND i.status = 'downloaded'
            GROUP BY s.id
            ORDER BY image_count DESC
            LIMIT 10
        """)
        top_species_raw = db.execute(top_sql).fetchall()
        top_species = [
            {
                "id": s[0],
                "scientific_name": s[1],
                "common_name": s[2],
                "image_count": s[3],
            }
            for s in top_species_raw
        ]

        # Under-represented species - use pre-computed counts.
        # LEFT JOIN + COALESCE keeps species with zero downloaded images.
        under_sql = text("""
            SELECT s.id, s.scientific_name, s.common_name, COALESCE(img_counts.cnt, 0) as image_count
            FROM species s
            LEFT JOIN (
                SELECT species_id, COUNT(*) as cnt
                FROM images
                WHERE status = 'downloaded'
                GROUP BY species_id
            ) img_counts ON img_counts.species_id = s.id
            WHERE COALESCE(img_counts.cnt, 0) < 100
            ORDER BY image_count ASC
            LIMIT 10
        """)
        under_rep_raw = db.execute(under_sql).fetchall()
        under_represented = [
            {
                "id": s[0],
                "scientific_name": s[1],
                "common_name": s[2],
                "image_count": s[3],
            }
            for s in under_rep_raw
        ]

        # Calculate disk usage (fast recursive scan)
        settings = get_settings()
        disk_usage_bytes = get_directory_size_fast(settings.images_path)
        disk_usage_mb = round(disk_usage_bytes / (1024 * 1024), 2)

        # Build the stats object
        stats = {
            "total_species": total_species,
            "total_images": total_images,
            "images_downloaded": images_downloaded,
            "images_pending": images_pending,
            "images_rejected": images_rejected,
            "disk_usage_mb": disk_usage_mb,
            "sources": sources,
            "licenses": licenses,
            "jobs": jobs,
            "top_species": top_species,
            "under_represented": under_represented,
        }

        # Store in database (upsert the single dashboard_stats row)
        cached = db.query(CachedStats).filter(CachedStats.key == "dashboard_stats").first()
        if cached:
            cached.value = json.dumps(stats)
            cached.updated_at = datetime.utcnow()
        else:
            cached = CachedStats(key="dashboard_stats", value=json.dumps(stats))
            db.add(cached)

        db.commit()
        print(f"=== STATS TASK: Refreshed (species={total_species}, images={total_images}) ===", flush=True)

        return {"status": "success", "total_species": total_species, "total_images": total_images}

    except Exception as e:
        print(f"=== STATS TASK ERROR: {e} ===", flush=True)
        raise
    finally:
        db.close()
|
||||
Reference in New Issue
Block a user