Initial commit — PlantGuideScraper project

This commit is contained in:
Trey T
2026-04-12 09:54:27 -05:00
commit 6926f502c5
87 changed files with 29120 additions and 0 deletions

View File

@@ -0,0 +1 @@
# Utility functions

View File

@@ -0,0 +1,80 @@
"""Image deduplication utilities using perceptual hashing."""
from typing import Optional
import imagehash
from PIL import Image as PILImage
def calculate_phash(image_path: str) -> Optional[str]:
    """Compute the perceptual hash (pHash) of an image file.

    Args:
        image_path: Path to the image file.

    Returns:
        Hex string of the perceptual hash, or None if the image
        could not be opened or hashed.
    """
    try:
        with PILImage.open(image_path) as img:
            digest = imagehash.phash(img)
    except Exception:
        # Best-effort: unreadable/corrupt images simply yield no hash.
        return None
    return str(digest)
def calculate_dhash(image_path: str) -> Optional[str]:
    """Compute the difference hash (dHash) of an image file.

    Faster but less accurate than phash.

    Args:
        image_path: Path to the image file.

    Returns:
        Hex string of the difference hash, or None if the image
        could not be opened or hashed.
    """
    try:
        with PILImage.open(image_path) as img:
            digest = imagehash.dhash(img)
    except Exception:
        # Best-effort: unreadable/corrupt images simply yield no hash.
        return None
    return str(digest)
def hashes_are_similar(hash1: str, hash2: str, threshold: int = 10) -> bool:
    """Report whether two hashes are close enough to be duplicates.

    Args:
        hash1: First hash hex string.
        hash2: Second hash hex string.
        threshold: Maximum Hamming distance to count as similar (default 10).

    Returns:
        True if the Hamming distance between the hashes is within
        threshold; False on any parse failure.
    """
    try:
        distance = imagehash.hex_to_hash(hash1) - imagehash.hex_to_hash(hash2)
    except Exception:
        # Malformed hash strings are treated as "not similar".
        return False
    return distance <= threshold
def hamming_distance(hash1: str, hash2: str) -> int:
    """Compute the Hamming distance between two hash hex strings.

    Args:
        hash1: First hash string.
        hash2: Second hash string.

    Returns:
        Hamming distance (0 = identical, higher = more different);
        64 (the maximum) when either hash cannot be parsed.
    """
    try:
        parsed_a = imagehash.hex_to_hash(hash1)
        parsed_b = imagehash.hex_to_hash(hash2)
    except Exception:
        return 64  # Maximum distance
    return int(parsed_a - parsed_b)

View File

@@ -0,0 +1,109 @@
"""Image quality assessment utilities."""
import numpy as np
from PIL import Image as PILImage
from scipy import ndimage
def calculate_blur_score(image_path: str) -> float:
    """
    Calculate blur score using Laplacian variance.
    Higher score = sharper image.

    Args:
        image_path: Path to image file

    Returns:
        Variance of Laplacian (higher = sharper), or 0.0 on failure
    """
    try:
        # Use a context manager so the file handle is always released
        # (the original leaked it; every other helper here uses `with`).
        with PILImage.open(image_path) as img:
            gray = np.array(img.convert("L"))
        # The Laplacian responds to edges; low variance means few sharp
        # edges, i.e. a blurry image.
        laplacian = ndimage.laplace(gray)
        return float(np.var(laplacian))
    except Exception:
        # Best-effort: unreadable images score 0.0 (maximally blurry).
        return 0.0
def is_too_blurry(image_path: str, threshold: float = 100.0) -> bool:
    """Decide whether an image is too blurry to use for training.

    Args:
        image_path: Path to the image file.
        threshold: Minimum acceptable blur score (default 100).

    Returns:
        True if the image's blur score falls below threshold.
    """
    return calculate_blur_score(image_path) < threshold
def get_image_dimensions(image_path: str) -> tuple[int, int]:
    """Read an image's pixel dimensions.

    Args:
        image_path: Path to the image file.

    Returns:
        (width, height) tuple, or (0, 0) if the file cannot be opened.
    """
    try:
        with PILImage.open(image_path) as img:
            width, height = img.size
    except Exception:
        # Unreadable files report zero size rather than raising.
        return (0, 0)
    return (width, height)
def is_too_small(image_path: str, min_size: int = 256) -> bool:
    """Decide whether an image is too small to use for training.

    Args:
        image_path: Path to the image file.
        min_size: Minimum acceptable size for each dimension (default 256).

    Returns:
        True if either dimension is below min_size.
    """
    dimensions = get_image_dimensions(image_path)
    return any(side < min_size for side in dimensions)
def resize_image(
    image_path: str,
    output_path: "str | None" = None,  # was mis-annotated `str = None`
    max_size: int = 512,
    quality: int = 95,
) -> bool:
    """
    Resize image to max dimension while preserving aspect ratio.

    The result is always re-encoded as JPEG, whatever the input format.

    Args:
        image_path: Path to input image
        output_path: Path for output (defaults to overwriting input)
        max_size: Maximum dimension size (default 512)
        quality: JPEG quality (default 95)

    Returns:
        True if successful
    """
    try:
        output_path = output_path or image_path
        with PILImage.open(image_path) as img:
            # Only downscale; never enlarge images smaller than max_size.
            if max(img.size) > max_size:
                img.thumbnail((max_size, max_size), PILImage.Resampling.LANCZOS)
            # JPEG cannot store alpha or palette data. The original only
            # converted RGBA/P and crashed on other modes (e.g. "LA",
            # "PA", "I;16"); normalize anything JPEG can't save directly.
            if img.mode not in ("RGB", "L", "CMYK"):
                img = img.convert("RGB")
            img.save(output_path, "JPEG", quality=quality)
        return True
    except Exception:
        # Best-effort: report failure instead of raising.
        return False

View File

@@ -0,0 +1,92 @@
import logging
import os
from datetime import datetime
from pathlib import Path
from app.config import get_settings
settings = get_settings()
def setup_logging():
    """Configure root logging to a dated file plus the console."""
    log_dir = Path(settings.logs_path)
    log_dir.mkdir(parents=True, exist_ok=True)

    # One log file per calendar day, e.g. scraper_2026-04-12.log.
    today = datetime.now().strftime('%Y-%m-%d')
    log_file = log_dir / f"scraper_{today}.log"

    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=[
            logging.FileHandler(log_file),
            logging.StreamHandler(),
        ],
    )
    return logging.getLogger("plant_scraper")
def get_logger(name: str = "plant_scraper"):
    """Return a logger that writes to a dated file and the console.

    Handlers are attached only on the first call for a given name;
    later calls return the already-configured logger unchanged.
    """
    log_dir = Path(settings.logs_path)
    log_dir.mkdir(parents=True, exist_ok=True)

    logger = logging.getLogger(name)
    if logger.handlers:
        return logger

    logger.setLevel(logging.INFO)

    # NOTE(review): the date is baked in when the handler is created —
    # a long-lived process keeps writing to the file named for its
    # start date; this is not true daily rotation.
    day = datetime.now().strftime('%Y-%m-%d')
    file_handler = logging.FileHandler(log_dir / f"scraper_{day}.log")
    file_handler.setLevel(logging.INFO)
    file_handler.setFormatter(logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    ))

    console_handler = logging.StreamHandler()
    console_handler.setLevel(logging.INFO)
    console_handler.setFormatter(logging.Formatter(
        '%(asctime)s - %(levelname)s - %(message)s'
    ))

    logger.addHandler(file_handler)
    logger.addHandler(console_handler)
    return logger
def get_job_logger(job_id: int):
    """Return a DEBUG-level logger dedicated to one scrape job.

    All records go to a per-job file; records at INFO and above are
    also mirrored into the shared daily log. Handlers are attached
    only on the first call for a given job id.
    """
    log_dir = Path(settings.logs_path)
    log_dir.mkdir(parents=True, exist_ok=True)

    logger = logging.getLogger(f"job_{job_id}")
    if logger.handlers:
        return logger

    logger.setLevel(logging.DEBUG)

    # Full-detail log dedicated to this job.
    job_handler = logging.FileHandler(log_dir / f"job_{job_id}.log")
    job_handler.setLevel(logging.DEBUG)
    job_handler.setFormatter(logging.Formatter(
        '%(asctime)s - %(levelname)s - %(message)s'
    ))

    # Mirror INFO+ records into the shared daily log as well.
    day = datetime.now().strftime('%Y-%m-%d')
    daily_handler = logging.FileHandler(log_dir / f"scraper_{day}.log")
    daily_handler.setLevel(logging.INFO)
    daily_handler.setFormatter(logging.Formatter(
        '%(asctime)s - job_%(name)s - %(levelname)s - %(message)s'
    ))

    logger.addHandler(job_handler)
    logger.addHandler(daily_handler)
    return logger