workout generator audit: rules engine, structure rules, split patterns, injury UX, metadata cleanup
- Add rules_engine.py with quantitative rules for all 8 workout types - Add quality gate retry loop in generate_single_workout() - Expand calibrate_structure_rules to all 120 combinations (8 types × 5 goals × 3 sections) - Wire WeeklySplitPattern DB records into _pick_weekly_split() - Enforce movement patterns from WorkoutStructureRule in exercise selection - Add straight-set strength support (single main lift, 4-6 rounds) - Add modality consistency check for duration-dominant workout types - Add InjuryStep component to onboarding and preferences - Add sibling exercise exclusion in regenerate and preview_day endpoints - Display generator warnings on dashboard - Expand fix_rep_durations, fix_exercise_flags, fix_movement_pattern_typo - Add audit_exercise_data and check_rules_drift management commands - Add Next.js frontend with dashboard, onboarding, preferences, history pages - Add generator app with ML-powered workout generation pipeline - 96 new tests across 7 test modules Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
463
generator/management/commands/fix_rep_durations.py
Normal file
463
generator/management/commands/fix_rep_durations.py
Normal file
@@ -0,0 +1,463 @@
|
||||
"""
|
||||
Fixes estimated_rep_duration on all Exercise records using three sources:
|
||||
|
||||
1. **Exact match** from JSON workout files (AI/all_workouts_data/ and AI/cho/workouts/)
|
||||
Each set has `estimated_duration` (total seconds) and `reps`.
|
||||
We compute per_rep = estimated_duration / reps, averaged across all
|
||||
appearances of each exercise.
|
||||
|
||||
2. **Fuzzy match** from the same JSON data for exercises whose DB name
|
||||
doesn't match exactly. Uses name normalization (strip parentheticals,
|
||||
punctuation, plurals) + difflib with a 0.85 cutoff, rejecting matches
|
||||
where the equipment type differs (e.g. barbell vs dumbbell).
|
||||
|
||||
3. **Movement-pattern lookup** for exercises not found by either method.
|
||||
Uses the exercise's `movement_patterns` field against PATTERN_DURATIONS.
|
||||
|
||||
4. **Category-based defaults** for exercises that don't match any pattern.
|
||||
Falls back to DEFAULT_DURATION (3.0s).
|
||||
|
||||
Duration-only exercises (is_duration=True AND is_reps=False) are skipped
|
||||
since they use the `duration` field instead.
|
||||
|
||||
Usage:
|
||||
python manage.py fix_rep_durations
|
||||
python manage.py fix_rep_durations --dry-run
|
||||
"""
|
||||
|
||||
import difflib
|
||||
import glob
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import statistics
|
||||
from collections import defaultdict
|
||||
|
||||
from django.conf import settings
|
||||
from django.core.management.base import BaseCommand
|
||||
|
||||
from exercise.models import Exercise
|
||||
|
||||
|
||||
# Movement-pattern lookup table: maps movement pattern keywords to per-rep durations.
# Keys are matched as substrings against Exercise.movement_patterns (see
# Command._get_pattern_duration); dict insertion order decides ties, so the
# first key found in the field wins. Values are seconds per rep.
PATTERN_DURATIONS = {
    # Compound movements -- controlled tempo
    'compound_push': 3.0,
    'compound_pull': 3.0,
    'squat': 3.0,
    'hinge': 3.0,
    'lunge': 3.0,
    # Isolation movements -- slightly quicker reps
    'isolation_push': 2.5,
    'isolation_pull': 2.5,
    'isolation': 2.5,
    # Explosive / ballistic -- fast reps
    'olympic': 2.0,
    'explosive': 2.0,
    'plyometric': 2.0,
    # Carries and core
    'carry': 1.0,
    'core': 2.5,
}
|
||||
|
||||
# Category defaults keyed by substring match on movement_patterns.
# Order matters: first match wins. More specific patterns go first.
# This is the last lookup Command._get_category_default tries before
# falling back to DEFAULT_DURATION; a list of (keyword, seconds_per_rep)
# tuples rather than a dict so the priority ordering is explicit.
CATEGORY_DEFAULTS = [
    # Explosive / ballistic -- fast reps
    ('plyometric', 1.5),
    ('combat', 1.0),
    ('cardio/locomotion', 1.0),

    # Compound lower -- heavy, slower
    ('lower pull - hip hinge', 5.0),
    ('lower push - squat', 4.5),
    ('lower push - lunge', 4.0),
    ('lower pull', 4.5),
    ('lower push', 4.0),

    # Compound upper
    ('upper push - horizontal', 3.5),
    ('upper push - vertical', 3.5),
    ('upper pull - vertical', 4.0),
    ('upper pull - horizonal', 3.5),  # note: typo is in DB
    ('upper pull - horizontal', 3.5),  # also match corrected version
    ('upper push', 3.5),
    ('upper pull', 3.5),

    # Isolation / machine
    ('machine', 2.5),
    ('arms', 2.5),

    # Core
    ('core - anti-extension', 3.5),
    ('core - carry', 3.0),
    ('core', 3.0),

    # Mobility / yoga -- slow, controlled
    ('yoga', 5.0),
    ('mobility - static', 5.0),
    ('mobility - dynamic', 4.0),
    ('mobility', 4.0),

    # Olympic lifts -- explosive, technical
    ('olympic', 4.0),

    # Isolation
    ('isolation', 2.5),

    # Carry / farmer walk
    ('carry', 3.0),

    # Agility
    ('agility', 1.5),

    # Stretch / activation
    ('stretch', 5.0),
    ('activation', 3.0),
    ('warm up', 3.0),
    ('warmup', 3.0),
]
|
||||
|
||||
# Fallback if nothing matches (seconds per rep)
DEFAULT_DURATION = 3.0

# For backwards compat, also expose as DEFAULT_PER_REP
DEFAULT_PER_REP = DEFAULT_DURATION
|
||||
|
||||
# Equipment words -- if these differ between DB and JSON name, reject the match
|
||||
EQUIPMENT_WORDS = {
|
||||
'barbell', 'dumbbell', 'kettlebell', 'cable', 'band', 'machine',
|
||||
'smith', 'trx', 'ez-bar', 'ez bar', 'landmine', 'medicine ball',
|
||||
'resistance band', 'bodyweight',
|
||||
}
|
||||
|
||||
|
||||
def _normalize_name(name):
|
||||
"""Normalize an exercise name for fuzzy comparison."""
|
||||
n = name.lower().strip()
|
||||
# Remove parenthetical content: "Squat (Back)" -> "Squat"
|
||||
n = re.sub(r'\([^)]*\)', '', n)
|
||||
# Remove common suffixes/noise
|
||||
n = re.sub(r'\b(each side|per side|each leg|per leg|each arm|per arm)\b', '', n)
|
||||
# Remove direction words (forward/backward variants are same exercise)
|
||||
n = re.sub(r'\b(forward|backward|forwards|backwards)\b', '', n)
|
||||
# Normalize punctuation and whitespace
|
||||
n = re.sub(r'[^\w\s]', ' ', n)
|
||||
n = re.sub(r'\s+', ' ', n).strip()
|
||||
# De-pluralize each word (handles "lunges"->"lunge", "curls"->"curl")
|
||||
words = []
|
||||
for w in n.split():
|
||||
if w.endswith('s') and not w.endswith('ss') and len(w) > 2:
|
||||
w = w[:-1]
|
||||
words.append(w)
|
||||
return ' '.join(words)
|
||||
|
||||
|
||||
def _extract_equipment(name):
|
||||
"""Extract the equipment word from an exercise name, if any."""
|
||||
name_lower = name.lower()
|
||||
for eq in EQUIPMENT_WORDS:
|
||||
if eq in name_lower:
|
||||
return eq
|
||||
return None
|
||||
|
||||
|
||||
class Command(BaseCommand):
    """Fix ``Exercise.estimated_rep_duration`` for every exercise.

    Resolution order per exercise:
      1. exact name match against per-rep timings parsed from JSON workouts,
      2. fuzzy name match against the same JSON data,
      3. ``PATTERN_DURATIONS`` substring lookup on ``movement_patterns``,
      4. ``CATEGORY_DEFAULTS`` keyword lookup, falling back to
         ``DEFAULT_DURATION``.

    Duration-only exercises (is_duration=True, is_reps=False) get
    ``estimated_rep_duration`` cleared instead, since they use the
    ``duration`` field.
    """

    help = 'Fix estimated_rep_duration using JSON workout data + pattern/category defaults'

    def add_arguments(self, parser):
        # --dry-run reports planned changes without touching the DB.
        parser.add_argument(
            '--dry-run',
            action='store_true',
            help='Show what would change without writing to DB',
        )

    def handle(self, *args, **options):
        dry_run = options['dry_run']

        # -- Step 1: Parse JSON files for real per-rep timing --
        json_durations = self._parse_json_files()
        self.stdout.write(
            f'Parsed JSON: {len(json_durations)} exercises with real timing data'
        )

        # -- Step 1b: Build fuzzy lookup from normalized JSON names --
        fuzzy_index = self._build_fuzzy_index(json_durations)

        # -- Step 2: Update exercises --
        exercises = Exercise.objects.all()
        from_json_exact = 0
        from_json_fuzzy = 0
        from_pattern = 0
        from_category = 0
        skipped_duration_only = 0
        set_null = 0
        unchanged = 0
        fuzzy_matches = []  # (db_name, matched_json_name, value) for review

        for ex in exercises:
            # Duration-only exercises (is_duration=True AND is_reps=False)
            # use the `duration` field; per-rep timing must be null.
            if ex.is_duration and not ex.is_reps:
                if ex.estimated_rep_duration is not None:
                    if not dry_run:
                        ex.estimated_rep_duration = None
                        ex.save(update_fields=['estimated_rep_duration'])
                    set_null += 1
                else:
                    skipped_duration_only += 1
                continue

            # Edge case: neither reps- nor duration-based -- nothing to fix.
            if not ex.is_reps and not ex.is_duration:
                unchanged += 1
                continue

            # Resolution order: exact JSON -> fuzzy JSON -> pattern -> category.
            name_lower = ex.name.lower().strip()
            if name_lower in json_durations:
                new_val = json_durations[name_lower]
                source = 'json-exact'
                from_json_exact += 1
            else:
                fuzzy_result = self._fuzzy_match(ex.name, json_durations, fuzzy_index)
                if fuzzy_result is not None:
                    new_val, matched_name = fuzzy_result
                    source = 'json-fuzzy'
                    from_json_fuzzy += 1
                    fuzzy_matches.append((ex.name, matched_name, new_val))
                else:
                    pattern_val = self._get_pattern_duration(ex)
                    if pattern_val is not None:
                        new_val = pattern_val
                        source = 'pattern'
                        from_pattern += 1
                    else:
                        new_val = self._get_category_default(ex)
                        source = 'category'
                        from_category += 1

            old_val = ex.estimated_rep_duration

            if dry_run:
                if old_val != new_val:
                    # Format the old value with an explicit `is None` check:
                    # the previous truthiness test (`... if old_val else ...`)
                    # misreported a legitimate stored 0.0 as "None".
                    old_str = 'None' if old_val is None else f'{old_val:.2f}s'
                    self.stdout.write(
                        f' [{source}] {ex.name}: {old_str} -> {new_val:.2f}s'
                    )
            elif old_val != new_val:
                # Only hit the DB when the value actually changes; the
                # summary below counts sources, not writes, so output is
                # unaffected.
                ex.estimated_rep_duration = new_val
                ex.save(update_fields=['estimated_rep_duration'])

        self.stdout.write(self.style.SUCCESS(
            f'\n{"[DRY RUN] " if dry_run else ""}'
            f'Updated {from_json_exact + from_json_fuzzy + from_pattern + from_category + set_null} exercises: '
            f'{from_json_exact} from JSON (exact), {from_json_fuzzy} from JSON (fuzzy), '
            f'{from_pattern} from pattern lookup, {from_category} from category defaults, '
            f'{set_null} set to null (duration-only), '
            f'{skipped_duration_only} already null (duration-only), '
            f'{unchanged} unchanged'
        ))

        # Show fuzzy matches for review
        if fuzzy_matches:
            self.stdout.write(f'\nFuzzy matches ({len(fuzzy_matches)}):')
            for db_name, json_name, val in sorted(fuzzy_matches):
                self.stdout.write(f' {db_name:50s} -> {json_name} ({val:.2f}s)')

        # -- Step 3: Show summary stats --
        reps_exercises = Exercise.objects.filter(is_reps=True)
        total_reps = reps_exercises.count()
        with_duration = reps_exercises.exclude(estimated_rep_duration__isnull=True).count()
        without_duration = reps_exercises.filter(estimated_rep_duration__isnull=True).count()

        coverage_pct = (with_duration / total_reps * 100) if total_reps > 0 else 0
        self.stdout.write(
            f'\nCoverage: {with_duration}/{total_reps} rep-based exercises '
            f'have estimated_rep_duration ({coverage_pct:.1f}%)'
        )
        if without_duration > 0:
            self.stdout.write(
                f' {without_duration} exercises still missing estimated_rep_duration'
            )

        # Distribution stats only make sense after a real write.
        if not dry_run:
            durations = list(
                reps_exercises
                .exclude(estimated_rep_duration__isnull=True)
                .values_list('estimated_rep_duration', flat=True)
            )
            if durations:
                self.stdout.write(
                    f'\nNew stats for rep-based exercises ({len(durations)}):'
                    f'\n Min: {min(durations):.2f}s'
                    f'\n Max: {max(durations):.2f}s'
                    f'\n Mean: {statistics.mean(durations):.2f}s'
                    f'\n Median: {statistics.median(durations):.2f}s'
                )

    def _build_fuzzy_index(self, json_durations):
        """
        Build a dict of {normalized_name: original_name} for fuzzy matching.

        Keeps the first occurrence when two JSON names normalize to the
        same string.
        """
        index = {}
        for original_name in json_durations:
            norm = _normalize_name(original_name)
            if norm not in index:
                index[norm] = original_name
        return index

    def _fuzzy_match(self, db_name, json_durations, fuzzy_index):
        """
        Try to fuzzy-match a DB exercise name to a JSON exercise name.

        Strategy:
        1. Exact match on normalized names
        2. Containment match: all words of the shorter name appear in the longer
        3. High-cutoff difflib (0.88) with word overlap >= 75%

        Equipment must match in all cases.

        Returns (duration_value, matched_json_name) or None.
        """
        db_norm = _normalize_name(db_name)
        db_equipment = _extract_equipment(db_name)
        db_words = set(db_norm.split())

        # First try: exact match on normalized names
        if db_norm in fuzzy_index:
            original = fuzzy_index[db_norm]
            json_equipment = _extract_equipment(original)
            if db_equipment and json_equipment and db_equipment != json_equipment:
                return None
            return json_durations[original], original

        # Second try: containment match -- shorter name's words are a
        # subset of the longer name's words (e.g. "barbell good morning"
        # is contained in "barbell russian good morning")
        for json_norm, original in fuzzy_index.items():
            json_words = set(json_norm.split())
            shorter, longer = (
                (db_words, json_words) if len(db_words) <= len(json_words)
                else (json_words, db_words)
            )
            # All words of the shorter must appear in the longer
            if shorter.issubset(longer) and len(shorter) >= 2:
                # But names shouldn't differ by too many words (max 2 extra)
                if len(longer) - len(shorter) > 2:
                    continue
                json_equipment = _extract_equipment(original)
                if db_equipment and json_equipment and db_equipment != json_equipment:
                    continue
                # One side names equipment and the other doesn't: reject.
                if (db_equipment is None) != (json_equipment is None):
                    continue
                return json_durations[original], original

        # Third try: high-cutoff difflib with strict word overlap
        normalized_json_names = list(fuzzy_index.keys())
        matches = difflib.get_close_matches(
            db_norm, normalized_json_names, n=3, cutoff=0.88,
        )

        for match_norm in matches:
            original = fuzzy_index[match_norm]
            json_equipment = _extract_equipment(original)
            if db_equipment and json_equipment and db_equipment != json_equipment:
                continue
            if (db_equipment is None) != (json_equipment is None):
                continue
            # Require >= 75% word overlap
            match_words = set(match_norm.split())
            overlap = len(db_words & match_words)
            total = max(len(db_words), len(match_words))
            if total > 0 and overlap / total < 0.75:
                continue
            return json_durations[original], original

        return None

    def _parse_json_files(self):
        """
        Parse all workout JSON files and compute average per-rep duration
        for each exercise. Returns {lowercase_name: avg_seconds_per_rep}.
        """
        base = settings.BASE_DIR
        patterns = [
            os.path.join(base, 'AI', 'all_workouts_data', '*.json'),
            os.path.join(base, 'AI', 'cho', 'workouts', '*.json'),
        ]
        files = []
        for pat in patterns:
            files.extend(sorted(glob.glob(pat)))

        exercise_samples = defaultdict(list)

        for fpath in files:
            # Explicit encoding: JSON files are UTF-8 regardless of the
            # platform's locale default.
            with open(fpath, encoding='utf-8') as f:
                try:
                    data = json.load(f)
                except (json.JSONDecodeError, UnicodeDecodeError):
                    # Malformed/mis-encoded file: skip, best-effort parse.
                    continue

            # A file may hold either one workout dict or a list of them.
            workouts = [data] if isinstance(data, dict) else data

            for workout in workouts:
                if not isinstance(workout, dict):
                    continue
                for section in workout.get('sections', []):
                    for s in section.get('sets', []):
                        if not isinstance(s, dict):
                            continue
                        ex = s.get('exercise', {})
                        if not isinstance(ex, dict):
                            continue
                        name = ex.get('name', '').strip()
                        if not name:
                            continue

                        reps = s.get('reps', 0) or 0
                        est_dur = s.get('estimated_duration', 0) or 0
                        set_type = s.get('type', '')

                        if set_type == 'reps' and reps > 0 and est_dur > 0:
                            per_rep = est_dur / reps
                            # Sanity: ignore outliers (< 0.5s or > 20s per rep)
                            if 0.5 <= per_rep <= 20.0:
                                exercise_samples[name.lower()].append(per_rep)

        # Average across all samples per exercise
        result = {}
        for name, samples in exercise_samples.items():
            result[name] = round(statistics.mean(samples), 2)

        return result

    def _get_pattern_duration(self, exercise):
        """
        Return a per-rep duration based on the PATTERN_DURATIONS lookup table.
        Checks the exercise's movement_patterns field for matching patterns.
        Returns the first match, or None if no match.
        """
        patterns_str = (exercise.movement_patterns or '').lower()
        if not patterns_str:
            return None

        for pattern_key, duration in PATTERN_DURATIONS.items():
            if pattern_key in patterns_str:
                return duration

        return None

    def _get_category_default(self, exercise):
        """
        Return a per-rep duration based on the exercise's movement_patterns
        using the more detailed CATEGORY_DEFAULTS table.
        Always returns a value (DEFAULT_DURATION if nothing matches).
        """
        patterns = (exercise.movement_patterns or '').lower()

        for keyword, duration in CATEGORY_DEFAULTS:
            if keyword in patterns:
                return duration

        return DEFAULT_DURATION
|
||||
Reference in New Issue
Block a user