Initial commit — PlantGuideScraper project

This commit is contained in:
Trey T
2026-04-12 09:54:27 -05:00
commit 6926f502c5
87 changed files with 29120 additions and 0 deletions

View File

@@ -0,0 +1,80 @@
"""Image deduplication utilities using perceptual hashing."""
from typing import Optional
import imagehash
from PIL import Image as PILImage
def calculate_phash(image_path: str) -> Optional[str]:
    """
    Calculate perceptual hash for an image.

    This is a best-effort helper: it never raises. Any exception raised
    while opening or hashing the image is logged at DEBUG level (with
    traceback) on this module's logger and reported to the caller as None.

    Args:
        image_path: Path to image file

    Returns:
        Hex string of perceptual hash, or None if failed
    """
    try:
        # `with` releases the opened image as soon as hashing is finished,
        # including when hashing itself raises.
        with PILImage.open(image_path) as img:
            return str(imagehash.phash(img))
    except Exception:
        # Keep the "None on failure" contract, but leave a trace so that a
        # missing file, a corrupt image and a programming error can be told
        # apart when debugging. Imported here so the block is self-contained.
        import logging

        logging.getLogger(__name__).debug(
            "Could not compute phash for %r", image_path, exc_info=True
        )
        return None
def calculate_dhash(image_path: str) -> Optional[str]:
    """
    Calculate difference hash for an image.
    Faster but less accurate than phash.

    This is a best-effort helper: it never raises. Any exception raised
    while opening or hashing the image is logged at DEBUG level (with
    traceback) on this module's logger and reported to the caller as None.

    Args:
        image_path: Path to image file

    Returns:
        Hex string of difference hash, or None if failed
    """
    try:
        # `with` releases the opened image as soon as hashing is finished,
        # including when hashing itself raises.
        with PILImage.open(image_path) as img:
            return str(imagehash.dhash(img))
    except Exception:
        # Keep the "None on failure" contract, but leave a trace so that a
        # missing file, a corrupt image and a programming error can be told
        # apart when debugging. Imported here so the block is self-contained.
        import logging

        logging.getLogger(__name__).debug(
            "Could not compute dhash for %r", image_path, exc_info=True
        )
        return None
def hashes_are_similar(hash1: str, hash2: str, threshold: int = 10) -> bool:
    """
    Decide whether two hex-encoded image hashes are close enough to be
    treated as potential duplicates.

    Args:
        hash1: First hash string
        hash2: Second hash string
        threshold: Largest Hamming distance still counted as similar
            (default 10)

    Returns:
        True when the distance between the two decoded hashes is at most
        ``threshold``; False otherwise, including when decoding or
        comparing the hashes raises.
    """
    try:
        # Decode in argument order: hash1 first, then hash2.
        decoded_a, decoded_b = (
            imagehash.hex_to_hash(text) for text in (hash1, hash2)
        )
        # Subtracting two decoded hashes yields their Hamming distance.
        distance = decoded_a - decoded_b
        return distance <= threshold
    except Exception:
        # Any failure is reported as "not similar" rather than raised.
        return False
def hamming_distance(hash1: str, hash2: str) -> int:
    """
    Return the Hamming distance between two hex-encoded image hashes.

    Args:
        hash1: First hash string
        hash2: Second hash string

    Returns:
        Hamming distance (0 = identical, higher = more different). If the
        hashes cannot be decoded or compared, 64 is returned instead of
        raising.
    """
    # Maximum distance, used as the failure sentinel.
    # NOTE(review): 64 presumably corresponds to a 64-bit (8x8) hash;
    # confirm if hashes of other sizes are ever stored.
    fallback_distance = 64
    try:
        first = imagehash.hex_to_hash(hash1)
        second = imagehash.hex_to_hash(hash2)
        # Subtraction of decoded hashes gives the distance; coerce to int.
        result = int(first - second)
    except Exception:
        result = fallback_distance
    return result