Initial commit — PlantGuideScraper project
This commit is contained in:
1
backend/app/utils/__init__.py
Normal file
1
backend/app/utils/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
# Utility functions
|
||||
80
backend/app/utils/dedup.py
Normal file
80
backend/app/utils/dedup.py
Normal file
@@ -0,0 +1,80 @@
|
||||
"""Image deduplication utilities using perceptual hashing."""
|
||||
|
||||
from typing import Optional
|
||||
|
||||
import imagehash
|
||||
from PIL import Image as PILImage
|
||||
|
||||
|
||||
def calculate_phash(image_path: str) -> Optional[str]:
    """Return the perceptual hash (pHash) of an image as a hex string.

    Failures (missing file, unreadable image) yield ``None`` instead of
    raising, so callers can treat hashing as best-effort.

    Args:
        image_path: Filesystem path to the image file.

    Returns:
        Hex string of the perceptual hash, or ``None`` on any failure.
    """
    result: Optional[str] = None
    try:
        with PILImage.open(image_path) as source:
            result = str(imagehash.phash(source))
    except Exception:
        pass
    return result
|
||||
|
||||
|
||||
def calculate_dhash(image_path: str) -> Optional[str]:
    """Return the difference hash (dHash) of an image as a hex string.

    dHash is faster to compute than pHash but less robust for
    near-duplicate detection.

    Args:
        image_path: Filesystem path to the image file.

    Returns:
        Hex string of the difference hash, or ``None`` on any failure.
    """
    result: Optional[str] = None
    try:
        with PILImage.open(image_path) as source:
            result = str(imagehash.dhash(source))
    except Exception:
        pass
    return result
|
||||
|
||||
|
||||
def hashes_are_similar(hash1: str, hash2: str, threshold: int = 10) -> bool:
    """Decide whether two hex hashes are close enough to be duplicates.

    Args:
        hash1: First hash as a hex string.
        hash2: Second hash as a hex string.
        threshold: Maximum Hamming distance still counted as similar
            (default 10).

    Returns:
        ``True`` when the Hamming distance is within ``threshold``;
        ``False`` on mismatch or when either hash cannot be parsed.
    """
    try:
        # Subtracting two ImageHash objects yields their Hamming distance.
        distance = imagehash.hex_to_hash(hash1) - imagehash.hex_to_hash(hash2)
        return distance <= threshold
    except Exception:
        return False
|
||||
|
||||
|
||||
def hamming_distance(hash1: str, hash2: str) -> int:
    """Compute the Hamming distance between two hex hash strings.

    Args:
        hash1: First hash as a hex string.
        hash2: Second hash as a hex string.

    Returns:
        Number of differing bits (0 = identical). Unparseable or
        incompatible hashes report 64, the maximum for a 64-bit hash.
    """
    try:
        # ImageHash overloads subtraction to return the bitwise distance.
        diff = imagehash.hex_to_hash(hash1) - imagehash.hex_to_hash(hash2)
        return int(diff)
    except Exception:
        return 64  # Maximum distance
|
||||
109
backend/app/utils/image_quality.py
Normal file
109
backend/app/utils/image_quality.py
Normal file
@@ -0,0 +1,109 @@
|
||||
"""Image quality assessment utilities."""
|
||||
|
||||
import numpy as np
|
||||
from PIL import Image as PILImage
|
||||
from scipy import ndimage
|
||||
|
||||
|
||||
def calculate_blur_score(image_path: str) -> float:
    """
    Calculate blur score using Laplacian variance.
    Higher score = sharper image.

    Args:
        image_path: Path to image file

    Returns:
        Variance of Laplacian (higher = sharper); 0.0 on any failure
    """
    try:
        # Fix: open via context manager so the file handle is closed
        # promptly; the previous version left it open until GC.
        with PILImage.open(image_path) as img:
            # Grayscale so the Laplacian measures luminance edges only.
            img_array = np.array(img.convert("L"))
        laplacian = ndimage.laplace(img_array)
        return float(np.var(laplacian))
    except Exception:
        return 0.0
|
||||
|
||||
|
||||
def is_too_blurry(image_path: str, threshold: float = 100.0) -> bool:
    """Report whether an image is too blurry to use for training.

    Args:
        image_path: Path to the image file.
        threshold: Minimum acceptable Laplacian-variance score
            (default 100).

    Returns:
        ``True`` when the sharpness score falls below ``threshold``.
    """
    return calculate_blur_score(image_path) < threshold
|
||||
|
||||
|
||||
def get_image_dimensions(image_path: str) -> tuple[int, int]:
    """Read an image's pixel dimensions.

    Args:
        image_path: Path to the image file.

    Returns:
        ``(width, height)``, or ``(0, 0)`` when the file cannot be read.
    """
    try:
        with PILImage.open(image_path) as img:
            width, height = img.size
        return (width, height)
    except Exception:
        return (0, 0)
|
||||
|
||||
|
||||
def is_too_small(image_path: str, min_size: int = 256) -> bool:
    """Report whether an image is too small to use for training.

    Args:
        image_path: Path to the image file.
        min_size: Minimum allowed size for each dimension (default 256).

    Returns:
        ``True`` when either dimension is below ``min_size``. Unreadable
        files report ``(0, 0)`` dimensions and therefore count as too small.
    """
    # min() over (width, height) is equivalent to checking each side.
    return min(get_image_dimensions(image_path)) < min_size
|
||||
|
||||
|
||||
def resize_image(
    image_path: str,
    output_path: "str | None" = None,
    max_size: int = 512,
    quality: int = 95,
) -> bool:
    """
    Resize image to max dimension while preserving aspect ratio.

    The result is always saved as JPEG, regardless of the input format.

    Args:
        image_path: Path to input image
        output_path: Path for output (defaults to overwriting input)
        max_size: Maximum dimension size (default 512)
        quality: JPEG quality (default 95)

    Returns:
        True if successful, False on any failure (unreadable input,
        unwritable output)
    """
    try:
        output_path = output_path or image_path

        with PILImage.open(image_path) as img:
            # Only resize if larger than max_size; never enlarge.
            if max(img.size) > max_size:
                img.thumbnail((max_size, max_size), PILImage.Resampling.LANCZOS)

            # JPEG cannot store alpha channels or palettes.
            # Fix: "LA" (grayscale + alpha) previously slipped through this
            # check, making img.save raise and the function silently return
            # False for such images.
            if img.mode in ("RGBA", "LA", "P"):
                img = img.convert("RGB")

            img.save(output_path, "JPEG", quality=quality)

        return True
    except Exception:
        return False
|
||||
92
backend/app/utils/logging.py
Normal file
92
backend/app/utils/logging.py
Normal file
@@ -0,0 +1,92 @@
|
||||
import logging
|
||||
import os
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
from app.config import get_settings
|
||||
|
||||
# Module-level settings instance; provides logs_path for all loggers below.
settings = get_settings()
|
||||
|
||||
|
||||
def setup_logging():
    """Configure root logging to a dated file plus the console.

    Creates the log directory if needed, then installs a file handler
    (one file per calendar day) and a stream handler on the root logger.

    Returns:
        The "plant_scraper" logger.
    """
    log_dir = Path(settings.logs_path)
    log_dir.mkdir(parents=True, exist_ok=True)

    # One log file per calendar day.
    today = datetime.now().strftime('%Y-%m-%d')
    log_file = log_dir / f"scraper_{today}.log"

    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=[
            logging.FileHandler(log_file),
            logging.StreamHandler(),
        ],
    )

    return logging.getLogger("plant_scraper")
|
||||
|
||||
|
||||
def get_logger(name: str = "plant_scraper"):
    """Return a logger that writes to the daily log file and the console.

    Handlers are attached only on first use of a given name, so repeated
    calls do not stack duplicate handlers.

    Args:
        name: Logger name (default "plant_scraper").

    Returns:
        Configured ``logging.Logger`` instance.
    """
    logs_path = Path(settings.logs_path)
    logs_path.mkdir(parents=True, exist_ok=True)

    logger = logging.getLogger(name)

    if not logger.handlers:
        logger.setLevel(logging.INFO)
        # Fix: without this, records also propagate to the root logger, so
        # when setup_logging() has configured root handlers every message
        # was emitted twice (to both the file and the console).
        logger.propagate = False

        # File handler writing to the shared per-day log file
        log_file = logs_path / f"scraper_{datetime.now().strftime('%Y-%m-%d')}.log"
        file_handler = logging.FileHandler(log_file)
        file_handler.setLevel(logging.INFO)
        file_handler.setFormatter(logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
        ))

        # Console handler
        console_handler = logging.StreamHandler()
        console_handler.setLevel(logging.INFO)
        console_handler.setFormatter(logging.Formatter(
            '%(asctime)s - %(levelname)s - %(message)s'
        ))

        logger.addHandler(file_handler)
        logger.addHandler(console_handler)

    return logger
|
||||
|
||||
|
||||
def get_job_logger(job_id: int):
    """Return a logger dedicated to one job.

    Writes DEBUG-level detail to ``job_<id>.log`` and mirrors INFO-level
    records into the shared daily log file. Handlers are attached only on
    the first call for a given job id.

    Args:
        job_id: Numeric identifier of the job.

    Returns:
        Configured ``logging.Logger`` named ``job_<id>``.
    """
    logs_path = Path(settings.logs_path)
    logs_path.mkdir(parents=True, exist_ok=True)

    logger = logging.getLogger(f"job_{job_id}")

    if not logger.handlers:
        logger.setLevel(logging.DEBUG)
        # Fix: stop propagation so records are not duplicated through the
        # root logger when setup_logging() has configured root handlers.
        logger.propagate = False

        # Job-specific log file (full DEBUG detail)
        job_log_file = logs_path / f"job_{job_id}.log"
        file_handler = logging.FileHandler(job_log_file)
        file_handler.setLevel(logging.DEBUG)
        file_handler.setFormatter(logging.Formatter(
            '%(asctime)s - %(levelname)s - %(message)s'
        ))

        # Also mirror INFO+ records into the shared daily file
        daily_log_file = logs_path / f"scraper_{datetime.now().strftime('%Y-%m-%d')}.log"
        daily_handler = logging.FileHandler(daily_log_file)
        daily_handler.setLevel(logging.INFO)
        # Fix: the logger name is already "job_<id>"; the previous format
        # string prefixed another "job_", emitting "job_job_<id>".
        daily_handler.setFormatter(logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
        ))

        logger.addHandler(file_handler)
        logger.addHandler(daily_handler)

    return logger
|
||||
Reference in New Issue
Block a user