""" Fixes estimated_rep_duration on all Exercise records using three sources: 1. **Exact match** from JSON workout files (AI/all_workouts_data/ and AI/cho/workouts/) Each set has `estimated_duration` (total seconds) and `reps`. We compute per_rep = estimated_duration / reps, averaged across all appearances of each exercise. 2. **Fuzzy match** from the same JSON data for exercises whose DB name doesn't match exactly. Uses name normalization (strip parentheticals, punctuation, plurals) + difflib with a 0.85 cutoff, rejecting matches where the equipment type differs (e.g. barbell vs dumbbell). 3. **Movement-pattern lookup** for exercises not found by either method. Uses the exercise's `movement_patterns` field against PATTERN_DURATIONS. 4. **Category-based defaults** for exercises that don't match any pattern. Falls back to DEFAULT_DURATION (3.0s). Duration-only exercises (is_duration=True AND is_reps=False) are skipped since they use the `duration` field instead. Usage: python manage.py fix_rep_durations python manage.py fix_rep_durations --dry-run """ import difflib import glob import json import os import re import statistics from collections import defaultdict from django.conf import settings from django.core.management.base import BaseCommand from exercise.models import Exercise # Movement-pattern lookup table: maps movement pattern keywords to per-rep durations. PATTERN_DURATIONS = { 'compound_push': 3.0, 'compound_pull': 3.0, 'squat': 3.0, 'hinge': 3.0, 'lunge': 3.0, 'isolation_push': 2.5, 'isolation_pull': 2.5, 'isolation': 2.5, 'olympic': 2.0, 'explosive': 2.0, 'plyometric': 2.0, 'carry': 1.0, 'core': 2.5, } # Category defaults keyed by substring match on movement_patterns. # Order matters: first match wins. More specific patterns go first. 
CATEGORY_DEFAULTS = [
    # Explosive / ballistic -- fast reps
    ('plyometric', 1.5),
    ('combat', 1.0),
    ('cardio/locomotion', 1.0),
    # Compound lower -- heavy, slower
    ('lower pull - hip hinge', 5.0),
    ('lower push - squat', 4.5),
    ('lower push - lunge', 4.0),
    ('lower pull', 4.5),
    ('lower push', 4.0),
    # Compound upper
    ('upper push - horizontal', 3.5),
    ('upper push - vertical', 3.5),
    ('upper pull - vertical', 4.0),
    ('upper pull - horizonal', 3.5),  # note: typo is in DB
    ('upper pull - horizontal', 3.5),  # also match corrected version
    ('upper push', 3.5),
    ('upper pull', 3.5),
    # Isolation / machine
    ('machine', 2.5),
    ('arms', 2.5),
    # Core
    ('core - anti-extension', 3.5),
    ('core - carry', 3.0),
    ('core', 3.0),
    # Mobility / yoga -- slow, controlled
    ('yoga', 5.0),
    ('mobility - static', 5.0),
    ('mobility - dynamic', 4.0),
    ('mobility', 4.0),
    # Olympic lifts -- explosive, technical
    ('olympic', 4.0),
    # Isolation
    ('isolation', 2.5),
    # Carry / farmer walk
    ('carry', 3.0),
    # Agility
    ('agility', 1.5),
    # Stretch / activation
    ('stretch', 5.0),
    ('activation', 3.0),
    ('warm up', 3.0),
    ('warmup', 3.0),
]

# Fallback if nothing matches
DEFAULT_DURATION = 3.0

# For backwards compat, also expose as DEFAULT_PER_REP
DEFAULT_PER_REP = DEFAULT_DURATION

# Equipment words -- if these differ between DB and JSON name, reject the match
EQUIPMENT_WORDS = {
    'barbell', 'dumbbell', 'kettlebell', 'cable', 'band', 'machine',
    'smith', 'trx', 'ez-bar', 'ez bar', 'landmine', 'medicine ball',
    'resistance band', 'bodyweight',
}

# Different spellings of the same piece of equipment, mapped to one canonical
# EQUIPMENT_WORDS entry so that e.g. "EZ Bar Curl" and "EZ-Bar Curl" agree.
_EQUIPMENT_ALIASES = {
    'ez bar': 'ez-bar',
    'resistance band': 'band',
}

# EQUIPMENT_WORDS is a set, so iterating it directly gives a hash-seed
# dependent order. Several entries overlap as substrings ("band" vs
# "resistance band"), so the search order must be fixed: longest (most
# specific) first, alphabetical on ties.
_EQUIPMENT_SEARCH_ORDER = tuple(
    sorted(EQUIPMENT_WORDS, key=lambda word: (-len(word), word))
)

# Patterns used by _normalize_name, compiled once at import time.
_PARENTHETICAL_RE = re.compile(r'\([^)]*\)')
_SIDE_NOISE_RE = re.compile(
    r'\b(each side|per side|each leg|per leg|each arm|per arm)\b'
)
_DIRECTION_RE = re.compile(r'\b(forward|backward|forwards|backwards)\b')
_PUNCTUATION_RE = re.compile(r'[^\w\s]')


def _normalize_name(name):
    """
    Normalize an exercise name for fuzzy comparison.

    Lowercases the name, drops parenthetical content, "each/per side|leg|arm"
    noise and forward/backward direction words, replaces punctuation with
    spaces, collapses whitespace and strips a trailing "s" from each word
    (unless the word ends in "ss" or is two characters or shorter).

    Returns the normalized name as a single space-separated string, which may
    be empty (e.g. for a name that is entirely parenthetical).
    """
    n = name.lower().strip()
    # Remove parenthetical content: "Squat (Back)" -> "Squat"
    n = _PARENTHETICAL_RE.sub('', n)
    # Remove common suffixes/noise
    n = _SIDE_NOISE_RE.sub('', n)
    # Remove direction words (forward/backward variants are same exercise)
    n = _DIRECTION_RE.sub('', n)
    # Normalize punctuation; str.split() below collapses the whitespace
    n = _PUNCTUATION_RE.sub(' ', n)

    # De-pluralize each word (handles "lunges"->"lunge", "curls"->"curl")
    words = []
    for w in n.split():
        if w.endswith('s') and not w.endswith('ss') and len(w) > 2:
            w = w[:-1]
        words.append(w)
    return ' '.join(words)


def _extract_equipment(name):
    """
    Extract the equipment word from an exercise name, if any.

    The name is searched case-insensitively for each EQUIPMENT_WORDS entry as
    a substring, most specific (longest) entry first, so the result does not
    depend on set iteration order. Alias spellings are folded to a single
    canonical entry ("ez bar" -> "ez-bar", "resistance band" -> "band").

    Returns the canonical equipment word, or None if none is present.
    """
    name_lower = name.lower()
    for eq in _EQUIPMENT_SEARCH_ORDER:
        if eq in name_lower:
            return _EQUIPMENT_ALIASES.get(eq, eq)
    return None


def _is_positive_number(value):
    """Return True if value is an int/float strictly greater than zero."""
    return isinstance(value, (int, float)) and value > 0


def _as_list(value):
    """Return value if it is a list, otherwise an empty list."""
    return value if isinstance(value, list) else []


class Command(BaseCommand):
    """Management command that recomputes Exercise.estimated_rep_duration."""

    help = 'Fix estimated_rep_duration using JSON workout data + pattern/category defaults'

    def add_arguments(self, parser):
        """Register the --dry-run flag."""
        parser.add_argument(
            '--dry-run',
            action='store_true',
            help='Show what would change without writing to DB',
        )

    def handle(self, *args, **options):
        """
        Recompute estimated_rep_duration for every Exercise and print a report.

        With --dry-run nothing is written; each value that would change is
        printed instead. Without it, only exercises whose value actually
        differs are saved, matching what the dry run reports.
        """
        dry_run = options['dry_run']

        # -- Step 1: Parse JSON files for real per-rep timing --
        json_durations = self._parse_json_files()
        self.stdout.write(
            f'Parsed JSON: {len(json_durations)} exercises with real timing data'
        )

        # -- Step 1b: Build fuzzy lookup from normalized JSON names --
        fuzzy_index = self._build_fuzzy_index(json_durations)

        # -- Step 2: Update exercises --
        source_counts = dict.fromkeys(
            ('json-exact', 'json-fuzzy', 'pattern', 'category'), 0
        )
        skipped_duration_only = 0
        set_null = 0
        unchanged = 0
        fuzzy_matches = []

        for ex in Exercise.objects.all():
            # Duration-only exercises (is_duration=True AND is_reps=False)
            # must not carry a per-rep value: clear it if one is present.
            if ex.is_duration and not ex.is_reps:
                if ex.estimated_rep_duration is not None:
                    if not dry_run:
                        ex.estimated_rep_duration = None
                        ex.save(update_fields=['estimated_rep_duration'])
                    set_null += 1
                else:
                    skipped_duration_only += 1
                continue

            # Edge case: neither reps nor duration -- leave untouched
            if not ex.is_reps and not ex.is_duration:
                unchanged += 1
                continue

            new_val, source, matched_name = self._resolve_duration(
                ex, json_durations, fuzzy_index
            )
            source_counts[source] += 1
            if matched_name is not None:
                fuzzy_matches.append((ex.name, matched_name, new_val))

            old_val = ex.estimated_rep_duration
            if old_val == new_val:
                # Nothing to report or write.
                continue

            if dry_run:
                # Test against None explicitly so a stored 0.0 is shown as a
                # number rather than as "None".
                old_repr = 'None' if old_val is None else f'{old_val:.2f}s'
                self.stdout.write(
                    f' [{source}] {ex.name}: {old_repr} -> {new_val:.2f}s'
                )
            else:
                ex.estimated_rep_duration = new_val
                ex.save(update_fields=['estimated_rep_duration'])

        from_json_exact = source_counts['json-exact']
        from_json_fuzzy = source_counts['json-fuzzy']
        from_pattern = source_counts['pattern']
        from_category = source_counts['category']

        self.stdout.write(self.style.SUCCESS(
            f'\n{"[DRY RUN] " if dry_run else ""}'
            f'Updated {from_json_exact + from_json_fuzzy + from_pattern + from_category + set_null} exercises: '
            f'{from_json_exact} from JSON (exact), {from_json_fuzzy} from JSON (fuzzy), '
            f'{from_pattern} from pattern lookup, {from_category} from category defaults, '
            f'{set_null} set to null (duration-only), '
            f'{skipped_duration_only} already null (duration-only), '
            f'{unchanged} unchanged'
        ))

        # Show fuzzy matches for review
        if fuzzy_matches:
            self.stdout.write(f'\nFuzzy matches ({len(fuzzy_matches)}):')
            for db_name, json_name, val in sorted(fuzzy_matches):
                self.stdout.write(f' {db_name:50s} -> {json_name} ({val:.2f}s)')

        # -- Step 3: Show summary stats --
        reps_exercises = Exercise.objects.filter(is_reps=True)
        total_reps = reps_exercises.count()
        with_duration = reps_exercises.exclude(estimated_rep_duration__isnull=True).count()
        without_duration = reps_exercises.filter(estimated_rep_duration__isnull=True).count()

        coverage_pct = (with_duration / total_reps * 100) if total_reps > 0 else 0
        self.stdout.write(
            f'\nCoverage: {with_duration}/{total_reps} rep-based exercises '
            f'have estimated_rep_duration ({coverage_pct:.1f}%)'
        )
        if without_duration > 0:
            self.stdout.write(
                f' {without_duration} exercises still missing estimated_rep_duration'
            )

        if not dry_run:
            durations = list(
                reps_exercises
                .exclude(estimated_rep_duration__isnull=True)
                .values_list('estimated_rep_duration', flat=True)
            )
            if durations:
                self.stdout.write(
                    f'\nNew stats for rep-based exercises ({len(durations)}):'
                    f'\n Min: {min(durations):.2f}s'
                    f'\n Max: {max(durations):.2f}s'
                    f'\n Mean: {statistics.mean(durations):.2f}s'
                    f'\n Median: {statistics.median(durations):.2f}s'
                )

    def _resolve_duration(self, ex, json_durations, fuzzy_index):
        """
        Pick the per-rep duration for one exercise.

        Sources are tried in order: exact JSON name, fuzzy JSON name,
        PATTERN_DURATIONS, then CATEGORY_DEFAULTS / DEFAULT_DURATION.

        Returns (duration, source_label, matched_json_name) where
        matched_json_name is only set for the 'json-fuzzy' source.
        """
        name_lower = ex.name.lower().strip()
        if name_lower in json_durations:
            return json_durations[name_lower], 'json-exact', None

        fuzzy_result = self._fuzzy_match(ex.name, json_durations, fuzzy_index)
        if fuzzy_result is not None:
            new_val, matched_name = fuzzy_result
            return new_val, 'json-fuzzy', matched_name

        pattern_val = self._get_pattern_duration(ex)
        if pattern_val is not None:
            return pattern_val, 'pattern', None

        return self._get_category_default(ex), 'category', None

    def _build_fuzzy_index(self, json_durations):
        """
        Build a dict of {normalized_name: original_name} for fuzzy matching.

        When several JSON names normalize to the same string the first one
        seen is kept. Names that normalize to an empty string are left out so
        they can never be matched.
        """
        index = {}
        for original_name in json_durations:
            norm = _normalize_name(original_name)
            if norm:
                index.setdefault(norm, original_name)
        return index

    def _fuzzy_match(self, db_name, json_durations, fuzzy_index):
        """
        Try to fuzzy-match a DB exercise name to a JSON exercise name.

        Strategy:
        1. Exact match on normalized names. Rejected (returns None without
           trying the later steps) only when both names carry equipment and
           it differs.
        2. Containment match: all words of the shorter name appear in the
           longer, the shorter has at least 2 words and the longer has at
           most 2 extra words.
        3. High-cutoff difflib (0.88) with word overlap >= 75%.

        Steps 2 and 3 require the extracted equipment to be identical on both
        sides (including both being absent).

        Returns (duration_value, matched_json_name) or None.
        """
        db_norm = _normalize_name(db_name)
        if not db_norm:
            # Nothing left to compare after normalization.
            return None
        db_equipment = _extract_equipment(db_name)
        db_words = set(db_norm.split())

        # First try: exact match on normalized names
        if db_norm in fuzzy_index:
            original = fuzzy_index[db_norm]
            json_equipment = _extract_equipment(original)
            if db_equipment and json_equipment and db_equipment != json_equipment:
                return None
            return json_durations[original], original

        # Second try: containment match -- shorter name's words are a
        # subset of the longer name's words (e.g. "barbell good morning"
        # is contained in "barbell russian good morning")
        for json_norm, original in fuzzy_index.items():
            json_words = set(json_norm.split())
            shorter, longer = (
                (db_words, json_words) if len(db_words) <= len(json_words)
                else (json_words, db_words)
            )
            # All words of the shorter must appear in the longer
            if len(shorter) < 2 or not shorter.issubset(longer):
                continue
            # But names shouldn't differ by too many words (max 2 extra)
            if len(longer) - len(shorter) > 2:
                continue
            # Equipment must be the same on both sides, or absent on both.
            if db_equipment != _extract_equipment(original):
                continue
            return json_durations[original], original

        # Third try: high-cutoff difflib with strict word overlap
        matches = difflib.get_close_matches(
            db_norm, list(fuzzy_index), n=3, cutoff=0.88,
        )
        for match_norm in matches:
            original = fuzzy_index[match_norm]
            if db_equipment != _extract_equipment(original):
                continue
            # Require >= 75% word overlap
            match_words = set(match_norm.split())
            overlap = len(db_words & match_words)
            total = max(len(db_words), len(match_words))
            if total > 0 and overlap / total < 0.75:
                continue
            return json_durations[original], original

        return None

    def _parse_json_files(self):
        """
        Parse all workout JSON files and compute average per-rep duration
        for each exercise.

        Files are read from AI/all_workouts_data/*.json and
        AI/cho/workouts/*.json under settings.BASE_DIR. Files that are not
        valid UTF-8 JSON are reported on stderr and skipped.

        Returns {lowercase_name: avg_seconds_per_rep}, rounded to 2 decimals.
        """
        base = settings.BASE_DIR
        patterns = [
            os.path.join(base, 'AI', 'all_workouts_data', '*.json'),
            os.path.join(base, 'AI', 'cho', 'workouts', '*.json'),
        ]

        files = []
        for pat in patterns:
            files.extend(sorted(glob.glob(pat)))

        exercise_samples = defaultdict(list)
        for fpath in files:
            try:
                with open(fpath, encoding='utf-8') as f:
                    data = json.load(f)
            except (json.JSONDecodeError, UnicodeDecodeError) as exc:
                # Best effort: one bad file must not abort the whole run,
                # but the operator should know its data was ignored.
                self.stderr.write(f'Skipping unreadable JSON file {fpath}: {exc}')
                continue

            for name, per_rep in self._iter_per_rep_samples(data):
                exercise_samples[name].append(per_rep)

        # Average across all samples per exercise
        return {
            name: round(statistics.mean(samples), 2)
            for name, samples in exercise_samples.items()
        }

    @staticmethod
    def _iter_per_rep_samples(data):
        """
        Yield (lowercase_name, seconds_per_rep) for each usable set in data.

        data is the decoded content of one workout file: a workout dict or a
        list of them. A set is used when its type is 'reps', it names an
        exercise, and reps / estimated_duration are positive numbers giving
        between 0.5 and 20 seconds per rep. Anything malformed is skipped.
        """
        if isinstance(data, dict):
            workouts = [data]
        else:
            workouts = _as_list(data)

        for workout in workouts:
            if not isinstance(workout, dict):
                continue
            for section in _as_list(workout.get('sections')):
                if not isinstance(section, dict):
                    continue
                for s in _as_list(section.get('sets')):
                    if not isinstance(s, dict) or s.get('type') != 'reps':
                        continue

                    ex = s.get('exercise')
                    if not isinstance(ex, dict):
                        continue
                    name = ex.get('name')
                    if not isinstance(name, str) or not name.strip():
                        continue

                    reps = s.get('reps')
                    est_dur = s.get('estimated_duration')
                    # Guards against missing values and strings like "10-12".
                    if not (_is_positive_number(reps) and _is_positive_number(est_dur)):
                        continue

                    per_rep = est_dur / reps
                    # Sanity: ignore outliers (< 0.5s or > 20s per rep)
                    if 0.5 <= per_rep <= 20.0:
                        yield name.strip().lower(), per_rep

    def _get_pattern_duration(self, exercise):
        """
        Return a per-rep duration based on the PATTERN_DURATIONS lookup table.

        Each key is tested as a substring of the exercise's lowercased
        movement_patterns, in table order. Returns the first match, or None
        if movement_patterns is empty or nothing matches.

        NOTE(review): keys such as 'squat', 'core' and 'carry' are also
        substrings of CATEGORY_DEFAULTS keywords ('lower push - squat',
        'core - carry', ...), so such exercises are resolved here and never
        reach the category table -- confirm this precedence is intended.
        """
        patterns_str = (exercise.movement_patterns or '').lower()
        if not patterns_str:
            return None

        for pattern_key, duration in PATTERN_DURATIONS.items():
            if pattern_key in patterns_str:
                return duration

        return None

    def _get_category_default(self, exercise):
        """
        Return a per-rep duration based on the exercise's movement_patterns
        using the more detailed CATEGORY_DEFAULTS table.

        Keywords are tested as substrings in list order; the first hit wins.
        Returns DEFAULT_DURATION when nothing matches.
        """
        patterns = (exercise.movement_patterns or '').lower()

        for keyword, duration in CATEGORY_DEFAULTS:
            if keyword in patterns:
                return duration

        return DEFAULT_DURATION