Initial commit — PlantGuideScraper project

This commit is contained in:
Trey T
2026-04-12 09:54:27 -05:00
commit 6926f502c5
87 changed files with 29120 additions and 0 deletions

190
backend/app/api/stats.py Normal file
View File

@@ -0,0 +1,190 @@
import json
from typing import Optional

from fastapi import APIRouter, Depends, HTTPException
from sqlalchemy import case, func
from sqlalchemy.orm import Session

from app.database import get_db
from app.models import Species, Image, Job
from app.models.cached_stats import CachedStats
from app.schemas.stats import StatsResponse, SourceStats, LicenseStats, SpeciesStats, JobStats
router = APIRouter()
@router.get("", response_model=StatsResponse)
def get_stats(db: Session = Depends(get_db)):
    """Get dashboard statistics from cache (updated every 60s by Celery)."""
    # Look up the pre-computed snapshot that the Celery beat task writes
    # under a well-known key.
    row = db.query(CachedStats).filter(CachedStats.key == "dashboard_stats").first()

    if row is None:
        # Cold start: the cache has never been populated (Celery has not run
        # yet). Serve an all-zero payload instead of running live queries.
        return StatsResponse(
            total_species=0,
            total_images=0,
            images_downloaded=0,
            images_pending=0,
            images_rejected=0,
            disk_usage_mb=0.0,
            sources=[],
            licenses=[],
            jobs=JobStats(running=0, pending=0, completed=0, failed=0),
            top_species=[],
            under_represented=[],
        )

    # Rehydrate the JSON snapshot into the response schema objects.
    payload = json.loads(row.value)
    return StatsResponse(
        total_species=payload["total_species"],
        total_images=payload["total_images"],
        images_downloaded=payload["images_downloaded"],
        images_pending=payload["images_pending"],
        images_rejected=payload["images_rejected"],
        disk_usage_mb=payload["disk_usage_mb"],
        sources=[SourceStats(**item) for item in payload["sources"]],
        licenses=[LicenseStats(**item) for item in payload["licenses"]],
        jobs=JobStats(**payload["jobs"]),
        top_species=[SpeciesStats(**item) for item in payload["top_species"]],
        under_represented=[SpeciesStats(**item) for item in payload["under_represented"]],
    )
@router.post("/refresh")
def refresh_stats_now(db: Session = Depends(get_db)):
    """Manually trigger a stats refresh."""
    # Imported locally — presumably to avoid a circular import between the
    # API and worker modules; confirm against app.workers.stats_tasks.
    from app.workers.stats_tasks import refresh_stats

    # Fire-and-forget: enqueue the Celery task and return immediately.
    refresh_stats.delay()
    return {"status": "refresh_queued"}
@router.get("/sources")
def get_source_stats(db: Session = Depends(get_db)):
    """Get per-source breakdown."""

    # One SUM(CASE WHEN status = X THEN 1 ELSE 0 END) per status value lets
    # a single GROUP BY pass produce every counter at once.
    def status_total(status):
        return func.sum(case((Image.status == status, 1), else_=0))

    rows = (
        db.query(
            Image.source,
            func.count(Image.id).label("total"),
            status_total("downloaded").label("downloaded"),
            status_total("pending").label("pending"),
            status_total("rejected").label("rejected"),
        )
        .group_by(Image.source)
        .all()
    )

    # SUM over zero matching rows yields NULL, hence the `or 0` fallbacks.
    return [
        {
            "source": row.source,
            "total": row.total,
            "downloaded": row.downloaded or 0,
            "pending": row.pending or 0,
            "rejected": row.rejected or 0,
        }
        for row in rows
    ]
@router.get("/species")
def get_species_stats(
    min_count: int = 0,
    max_count: Optional[int] = None,
    db: Session = Depends(get_db),
):
    """Get per-species image counts.

    Args:
        min_count: Only include species with at least this many downloaded
            images (0 disables the lower bound).
        max_count: Only include species with at most this many downloaded
            images (None disables the upper bound).

    Returns:
        A list of dicts with species identity fields and ``image_count``,
        sorted by image count descending.
    """
    # Build the aggregate once and reuse it, so the SELECT column, both
    # HAVING filters, and the ORDER BY are guaranteed to stay in sync.
    image_count = func.count(Image.id)
    query = db.query(
        Species.id,
        Species.scientific_name,
        Species.common_name,
        Species.genus,
        image_count.label("image_count"),
    ).outerjoin(
        # Outer join keeps species with zero downloaded images in the result.
        Image, (Image.species_id == Species.id) & (Image.status == "downloaded")
    ).group_by(Species.id)
    if min_count > 0:
        query = query.having(image_count >= min_count)
    if max_count is not None:
        query = query.having(image_count <= max_count)
    stats = query.order_by(image_count.desc()).all()
    return [
        {
            "id": s.id,
            "scientific_name": s.scientific_name,
            "common_name": s.common_name,
            "genus": s.genus,
            "image_count": s.image_count,
        }
        for s in stats
    ]
@router.get("/distribution")
def get_image_distribution(db: Session = Depends(get_db)):
    """Get distribution of images per species for ML training assessment.

    Returns counts of species at various image thresholds to help
    determine dataset quality for training image classifiers.
    """
    from sqlalchemy import text

    # Single-pass raw SQL: a CTE computes the downloaded-image count per
    # species, and the outer SELECT buckets those counts into histogram
    # bins, trainability thresholds, and summary aggregates.
    distribution_sql = text("""
        WITH species_counts AS (
            SELECT
                s.id,
                COUNT(i.id) as cnt
            FROM species s
            LEFT JOIN images i ON i.species_id = s.id AND i.status = 'downloaded'
            GROUP BY s.id
        )
        SELECT
            COUNT(*) as total_species,
            SUM(CASE WHEN cnt = 0 THEN 1 ELSE 0 END) as with_0,
            SUM(CASE WHEN cnt >= 1 AND cnt < 10 THEN 1 ELSE 0 END) as with_1_9,
            SUM(CASE WHEN cnt >= 10 AND cnt < 25 THEN 1 ELSE 0 END) as with_10_24,
            SUM(CASE WHEN cnt >= 25 AND cnt < 50 THEN 1 ELSE 0 END) as with_25_49,
            SUM(CASE WHEN cnt >= 50 AND cnt < 100 THEN 1 ELSE 0 END) as with_50_99,
            SUM(CASE WHEN cnt >= 100 AND cnt < 200 THEN 1 ELSE 0 END) as with_100_199,
            SUM(CASE WHEN cnt >= 200 THEN 1 ELSE 0 END) as with_200_plus,
            SUM(CASE WHEN cnt >= 10 THEN 1 ELSE 0 END) as trainable_10,
            SUM(CASE WHEN cnt >= 25 THEN 1 ELSE 0 END) as trainable_25,
            SUM(CASE WHEN cnt >= 50 THEN 1 ELSE 0 END) as trainable_50,
            SUM(CASE WHEN cnt >= 100 THEN 1 ELSE 0 END) as trainable_100,
            AVG(cnt) as avg_images,
            MAX(cnt) as max_images,
            MIN(cnt) as min_images,
            SUM(cnt) as total_images
        FROM species_counts
    """)
    row = db.execute(distribution_sql).fetchone()

    # Unpack positionally, in the exact order of the SELECT list above, so
    # each value has a readable name instead of an opaque index.
    (
        total_species,
        with_0,
        with_1_9,
        with_10_24,
        with_25_49,
        with_50_99,
        with_100_199,
        with_200_plus,
        trainable_10,
        trainable_25,
        trainable_50,
        trainable_100,
        avg_images,
        max_images,
        min_images,
        total_images,
    ) = row

    # SUM/AVG over an empty table yield NULL, hence the `or 0` fallbacks.
    return {
        "total_species": total_species or 0,
        "distribution": {
            "0_images": with_0 or 0,
            "1_to_9": with_1_9 or 0,
            "10_to_24": with_10_24 or 0,
            "25_to_49": with_25_49 or 0,
            "50_to_99": with_50_99 or 0,
            "100_to_199": with_100_199 or 0,
            "200_plus": with_200_plus or 0,
        },
        "trainable_species": {
            "min_10_images": trainable_10 or 0,
            "min_25_images": trainable_25 or 0,
            "min_50_images": trainable_50 or 0,
            "min_100_images": trainable_100 or 0,
        },
        "summary": {
            "avg_images_per_species": round(avg_images or 0, 1),
            "max_images": max_images or 0,
            "min_images": min_images or 0,
            "total_downloaded_images": total_images or 0,
        },
        "recommendations": {
            "for_basic_model": f"{trainable_10 or 0} species with 10+ images",
            "for_good_model": f"{trainable_50 or 0} species with 50+ images",
            "for_excellent_model": f"{trainable_100 or 0} species with 100+ images",
        }
    }