Initial commit — PlantGuideScraper project

This commit is contained in:
Trey T
2026-04-12 09:54:27 -05:00
commit 6926f502c5
87 changed files with 29120 additions and 0 deletions

View File

@@ -0,0 +1,8 @@
from app.models.species import Species
from app.models.image import Image
from app.models.job import Job
from app.models.api_key import ApiKey
from app.models.export import Export
from app.models.cached_stats import CachedStats
# Public names re-exported by the models package, one per imported model.
__all__ = [
    "Species",
    "Image",
    "Job",
    "ApiKey",
    "Export",
    "CachedStats",
]

View File

@@ -0,0 +1,18 @@
from sqlalchemy import Column, Integer, String, Float, Boolean
from app.database import Base
class ApiKey(Base):
    """Stored API credentials and rate-limit setting for one external source."""

    __tablename__ = "api_keys"

    id = Column(Integer, index=True, primary_key=True)

    # Source name; unique, so there is at most one row per source.
    # Known values: 'flickr', 'inaturalist', 'wikimedia', 'trefle'.
    source = Column(String, nullable=False, unique=True)

    # Credential fields. For OAuth sources the key/secret pair doubles as the
    # Client ID / Client Secret.
    api_key = Column(String, nullable=False)
    api_secret = Column(String, nullable=True)

    # Only needed by OAuth sources such as Wikimedia.
    access_token = Column(String, nullable=True)

    # Defaults applied when a row is inserted without explicit values.
    rate_limit_per_sec = Column(Float, default=1.0)
    enabled = Column(Boolean, default=True)

    def __repr__(self):
        """Return a debug string with id, source and enabled flag only.

        The key, secret and token columns are not included in the output.
        """
        return f"<ApiKey(id={self.id}, source='{self.source}', enabled={self.enabled})>"

View File

@@ -0,0 +1,14 @@
from datetime import datetime
from sqlalchemy import Column, Integer, String, Text, DateTime
from app.database import Base
def _utcnow():
    """Return the current UTC time as a naive ``datetime``.

    Yields the same kind of value as ``datetime.utcnow()`` (UTC wall clock,
    ``tzinfo`` of ``None``) without calling that method, which is deprecated
    since Python 3.12.
    """
    # Imported locally because the module header only imports ``datetime``.
    from datetime import timezone

    return datetime.now(timezone.utc).replace(tzinfo=None)


class CachedStats(Base):
    """Stores pre-calculated statistics updated by Celery beat."""

    __tablename__ = "cached_stats"

    id = Column(Integer, primary_key=True, index=True)

    # Lookup key for a stats entry; unique and at most 50 characters.
    key = Column(String(50), unique=True, nullable=False, index=True)

    # JSON-encoded stats
    value = Column(Text, nullable=False)

    # Set on insert and refreshed on every update, both via the Python-side
    # callable above (naive UTC, matching the previously stored values).
    updated_at = Column(DateTime, default=_utcnow, onupdate=_utcnow)

View File

@@ -0,0 +1,24 @@
from sqlalchemy import Column, Integer, String, Float, DateTime, Text, func
from app.database import Base
class Export(Base):
    """Record of one export: its parameters, progress state and output."""

    __tablename__ = "exports"

    id = Column(Integer, index=True, primary_key=True)
    name = Column(String, nullable=False)

    # JSON text with the keys: min_images, licenses, min_quality, species_ids
    filter_criteria = Column(Text, nullable=True)

    # Defaults to 0.8 when not supplied.
    train_split = Column(Float, default=0.8)

    # One of: pending, generating, completed, failed
    status = Column(String, default="pending")

    # Output details; all nullable.
    file_path = Column(String, nullable=True)
    file_size = Column(Integer, nullable=True)
    species_count = Column(Integer, nullable=True)
    image_count = Column(Integer, nullable=True)

    celery_task_id = Column(String, nullable=True)

    # created_at is filled in by the database (server-side now()).
    created_at = Column(DateTime, server_default=func.now())
    completed_at = Column(DateTime, nullable=True)
    error_message = Column(Text, nullable=True)

    def __repr__(self):
        """Return a debug string showing id, name and status."""
        return f"<Export(id={self.id}, name='{self.name}', status='{self.status}')>"

View File

@@ -0,0 +1,36 @@
from sqlalchemy import Column, Integer, String, Float, DateTime, ForeignKey, func, UniqueConstraint, Index
from sqlalchemy.orm import relationship
from app.database import Base
class Image(Base):
    """One image record belonging to a species, with source and status."""

    __tablename__ = "images"

    id = Column(Integer, index=True, primary_key=True)

    # Owning species row (required).
    species_id = Column(
        Integer,
        ForeignKey("species.id"),
        nullable=False,
        index=True,
    )

    # Where the image came from. The (source, source_id) pair is unique, see
    # __table_args__ below.
    source = Column(String, nullable=False, index=True)
    source_id = Column(String, nullable=True)
    url = Column(String, nullable=False)
    local_path = Column(String, nullable=True)

    # Licensing details; license is mandatory, attribution is optional.
    license = Column(String, nullable=False, index=True)
    attribution = Column(String, nullable=True)

    # Optional image properties.
    width = Column(Integer, nullable=True)
    height = Column(Integer, nullable=True)
    phash = Column(String, nullable=True, index=True)
    quality_score = Column(Float, nullable=True)

    # One of: pending, downloaded, rejected, deleted
    status = Column(String, default="pending", index=True)

    # Filled in by the database (server-side now()).
    created_at = Column(DateTime, server_default=func.now())

    __table_args__ = (
        UniqueConstraint("source", "source_id", name="uq_source_source_id"),
        # Composite indexes for common query patterns:
        # counting images per species by status
        Index("ix_images_species_status", "species_id", "status"),
        # listing images by status
        Index("ix_images_status_created", "status", "created_at"),
    )

    # Paired with Species.images through back_populates.
    species = relationship("Species", back_populates="images")

    def __repr__(self):
        """Return a debug string showing id, source and status."""
        return f"<Image(id={self.id}, source='{self.source}', status='{self.status}')>"

27
backend/app/models/job.py Normal file
View File

@@ -0,0 +1,27 @@
from sqlalchemy import Column, Integer, String, DateTime, Text, Boolean, func
from app.database import Base
class Job(Base):
    """A scrape job: what to scrape, its progress counters and outcome."""

    __tablename__ = "jobs"

    id = Column(Integer, index=True, primary_key=True)
    name = Column(String, nullable=False)
    source = Column(String, nullable=False)

    # Selection options.
    # JSON array of species IDs, or NULL for all species.
    species_filter = Column(Text, nullable=True)
    # If True, only scrape species with 0 images.
    only_without_images = Column(Boolean, default=False)
    # If set, only scrape species with fewer than N images.
    max_images = Column(Integer, nullable=True)

    # One of: pending, running, paused, completed, failed
    status = Column(String, default="pending", index=True)

    # Counters, all defaulting to 0.
    progress_current = Column(Integer, default=0)
    progress_total = Column(Integer, default=0)
    images_downloaded = Column(Integer, default=0)
    images_rejected = Column(Integer, default=0)

    celery_task_id = Column(String, nullable=True)

    # Timestamps and failure details; created_at is filled in by the database.
    started_at = Column(DateTime, nullable=True)
    completed_at = Column(DateTime, nullable=True)
    error_message = Column(Text, nullable=True)
    created_at = Column(DateTime, server_default=func.now())

    def __repr__(self):
        """Return a debug string showing id, name and status."""
        return f"<Job(id={self.id}, name='{self.name}', status='{self.status}')>"

View File

@@ -0,0 +1,21 @@
from sqlalchemy import Column, Integer, String, DateTime, func
from sqlalchemy.orm import relationship
from app.database import Base
class Species(Base):
    """A species entry, keyed by a unique scientific name, with its images."""

    __tablename__ = "species"

    id = Column(Integer, index=True, primary_key=True)

    # Required and unique.
    scientific_name = Column(String, nullable=False, unique=True, index=True)

    # Optional naming and classification fields.
    common_name = Column(String, nullable=True)
    genus = Column(String, nullable=True, index=True)
    family = Column(String, nullable=True)

    # Filled in by the database (server-side now()).
    created_at = Column(DateTime, server_default=func.now())

    # Paired with Image.species; the cascade setting propagates all session
    # operations to the images and deletes images removed from this collection.
    images = relationship(
        "Image",
        back_populates="species",
        cascade="all, delete-orphan",
    )

    def __repr__(self):
        """Return a debug string showing id and scientific name."""
        return f"<Species(id={self.id}, scientific_name='{self.scientific_name}')>"