#!/usr/bin/env python3 """ Reference implementation of the FlightAware-based route+schedule lookup. This is the canonical algorithm the Swift port (FlightAwareScheduleClient) mirrors. No auth, no Turnstile, no headless browser — two plain GETs per search, both hitting open FlightAware web pages. Pipeline for ("DFW", "AMS", 2026-06-06): 1. Resolve dep_icao = "KDFW", arr_icao = "EHAM" (deterministic for US, curated table for international hubs). 2. GET https://flightaware.com/analysis/route.rvt?origin=KDFW&destination=EHAM and parse the "Itemized List" table → distinct flight idents (e.g. "AAL220"). 3. For each ident, GET https://flightaware.com/live/flight/ and extract the embedded `trackpollBootstrap` JSON via a brace-balanced scan over the script body. 4. From trackpollBootstrap.flights[*].activityLog.flights, project each scheduled leg whose gateDepartureTimes.scheduled falls on the requested local-departure date. 5. Emit (flightNumber, aircraft, dep_utc, arr_utc, dep_tz, arr_tz, dep_gate, dep_terminal, arr_gate, arr_terminal, duration_min). Usage: python3 scripts/probe_flightaware.py DFW AMS 2026-06-06 """ from __future__ import annotations import json import re import subprocess import sys from datetime import date, datetime, timezone # Small IATA→ICAO map. Production lookup lives in AirportDatabase.swift — # this mirrors enough major hubs to validate the script end-to-end. 
IATA_TO_ICAO_INTL: dict[str, str] = {
    # Europe
    "AMS": "EHAM", "LHR": "EGLL", "CDG": "LFPG", "FRA": "EDDF", "MAD": "LEMD",
    "BCN": "LEBL", "FCO": "LIRF", "MUC": "EDDM", "ZRH": "LSZH", "VIE": "LOWW",
    "BRU": "EBBR", "DUB": "EIDW", "LIS": "LPPT", "ATH": "LGAV", "IST": "LTFM",
    # Middle East
    "DOH": "OTHH", "DXB": "OMDB", "AUH": "OMAA",
    # Asia / Pacific
    "HND": "RJTT", "NRT": "RJAA", "ICN": "RKSI", "PEK": "ZBAA", "PVG": "ZSPD",
    "HKG": "VHHH", "SIN": "WSSS", "BKK": "VTBS", "SYD": "YSSY", "MEL": "YMML",
    "AKL": "NZAA",
    # Africa / Latin America
    "JNB": "FAOR", "GRU": "SBGR", "EZE": "SAEZ", "MEX": "MMMX", "CUN": "MMUN",
    # US airports outside the contiguous states. These do NOT follow the
    # "K" + IATA rule, so they must be listed explicitly.
    "ANC": "PANC", "HNL": "PHNL", "OGG": "PHOG", "KOA": "PHKO", "LIH": "PHLI",
    "SJU": "TJSJ", "GUM": "PGUM",
    # Canadian hubs ("C" prefix). Listed explicitly rather than derived from
    # the leading "Y", because a few US airports also have Y-prefixed codes.
    "YYZ": "CYYZ", "YVR": "CYVR", "YUL": "CYUL", "YYC": "CYYC", "YOW": "CYOW",
    "YEG": "CYEG",
}


def iata_to_icao(iata: str) -> str:
    """Map a 3-letter IATA airport code to its ICAO location indicator.

    Codes present in ``IATA_TO_ICAO_INTL`` use the curated value. Everything
    else is assumed to be an airport in the contiguous United States, where
    the ICAO code is the IATA code with a ``K`` prefix.

    Args:
        iata: Airport code in any letter case.

    Returns:
        The 4-letter ICAO code, upper-cased.

    Raises:
        ValueError: If *iata* is not exactly three ASCII letters.
    """
    iata = iata.upper()
    # The code is interpolated into a URL by the caller, so reject anything
    # that is not three plain letters instead of only checking the length.
    if len(iata) != 3 or not (iata.isascii() and iata.isalpha()):
        raise ValueError(f"bad IATA: {iata!r}")
    # Heuristic fallback: contiguous-US airports → K-prefix.
    return IATA_TO_ICAO_INTL.get(iata, "K" + iata)


_UA = (
    "Mozilla/5.0 (iPhone; CPU iPhone OS 17_5 like Mac OS X) "
    "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.5 "
    "Mobile/15E148 Safari/604.1"
)


def fetch(url: str) -> str:
    """GET *url* with curl and return the response body as text.

    Curl follows redirects (``-L``); URLSession in iOS follows redirects by
    default too, so this mirrors the runtime behaviour.

    Args:
        url: Absolute URL to fetch.

    Returns:
        The body decoded as UTF-8, with undecodable bytes replaced.

    Raises:
        RuntimeError: If curl exits with a non-zero status.
        subprocess.TimeoutExpired: If curl runs longer than 30 seconds
            (curl's own ``--max-time`` is 25 seconds).
    """
    r = subprocess.run(
        [
            "/usr/bin/curl", "-sSL", "--max-time", "25",
            "-A", _UA,
            "-H", "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            url,
        ],
        capture_output=True,
        timeout=30,
    )
    if r.returncode != 0:
        raise RuntimeError(f"curl failed: {r.stderr.decode(errors='replace')}")
    return r.stdout.decode("utf-8", errors="replace")


# ---------------------------------------------------------------------------
# Step 2: parse route.rvt → distinct flight idents
# ---------------------------------------------------------------------------
# Row shape inside the route.rvt "Itemized List" table:
#   <Day> <hh:mm>[AP]M <ident> <origin> ...
# The day column lacks delimiters in the text-stripped form but the regex
# below tolerates the whitespace fuzz.
# After tag-stripping the row reads
#   "Fri 02:46PM CDT AAL220 KDFW AMS / EHAM B772 FL350 …"
# i.e. timezone abbrev between time and ident. The `.+?` between them
# tolerates that (CDT / EDT / UTC / etc).
_ROUTE_ROW_RE = re.compile(
    r"(?P<day>Sun|Mon|Tue|Wed|Thu|Fri|Sat)\s+"
    r"\d{1,2}:\d{2}[AP]M.+?"
    r"(?P<ident>[A-Z]{2,3}\d{1,4})\s+"
    r"(?P<origin>[A-Z]{4})\s+",
    re.MULTILINE,
)


def parse_route_idents(route_html: str) -> list[str]:
    """Return distinct flight idents listed on the route analysis page.

    Tags are replaced by spaces and whitespace is collapsed before matching,
    so the table rows are scanned as one flat line of text.

    Args:
        route_html: Raw HTML of the route.rvt page.

    Returns:
        Idents (e.g. ``"AAL220"``) in first-seen order, without duplicates.
    """
    text = re.sub(r"<[^>]+>", " ", route_html)
    text = re.sub(r"\s+", " ", text)
    # dict.fromkeys de-duplicates while preserving first-seen order.
    return list(dict.fromkeys(
        m.group("ident") for m in _ROUTE_ROW_RE.finditer(text)
    ))


# ---------------------------------------------------------------------------
# Step 3: brace-balanced extract of `var trackpollBootstrap = {...};`
# ---------------------------------------------------------------------------

_TRACKPOLL_RE = re.compile(r"var\s+trackpollBootstrap\s*=\s*\{")


def extract_trackpoll(html: str) -> dict:
    """Extract and decode the ``trackpollBootstrap`` JSON object from *html*.

    Starting at the opening brace of the assignment, the text is scanned
    character by character, tracking brace depth and skipping the contents
    of double-quoted strings, until the matching closing brace is found.

    Args:
        html: Raw HTML of a flight page.

    Returns:
        The decoded JSON object.

    Raises:
        ValueError: If the assignment is not found, the braces never balance,
            or the balanced span is not valid JSON (``json.JSONDecodeError``
            is a ``ValueError`` subclass).
    """
    m = _TRACKPOLL_RE.search(html)
    if not m:
        raise ValueError("no trackpollBootstrap blob in HTML")
    start = m.end() - 1  # position of opening {
    i = start
    depth = 0
    in_str = False
    n = len(html)
    while i < n:
        c = html[i]
        if in_str:
            if c == "\\":
                # Skip the escaped character so \" does not end the string.
                i += 2
                continue
            if c == '"':
                in_str = False
        else:
            if c == '"':
                in_str = True
            elif c == "{":
                depth += 1
            elif c == "}":
                depth -= 1
                if depth == 0:
                    return json.loads(html[start:i + 1])
        i += 1
    raise ValueError("trackpollBootstrap blob unbalanced")


# ---------------------------------------------------------------------------
# Step 4–5: project scheduled flights for the requested date
# ---------------------------------------------------------------------------

def _local_date(moment: datetime, tz_name: str) -> date:
    """Return the calendar date of *moment* in zone *tz_name*.

    Best-effort: when the zone cannot be loaded the date of *moment* in its
    own timezone is returned instead.
    """
    try:
        from zoneinfo import ZoneInfo
        return moment.astimezone(ZoneInfo(tz_name)).date()
    except (ImportError, KeyError, ValueError, OSError):
        # ImportError: zoneinfo unavailable. KeyError covers
        # ZoneInfoNotFoundError (unknown zone / no tz database).
        # ValueError: malformed key. OSError: unreadable tz data.
        return moment.date()


def scheduled_flights_for(ident: str, dep_iata: str, arr_iata: str,
                          target_date: date) -> list[dict]:
    """Pull and project the trackpoll JSON for a single ident.

    Args:
        ident: Flight ident used in the flight page URL (e.g. ``"AAL220"``).
        dep_iata: IATA code the leg's origin must match.
        arr_iata: IATA code the leg's destination must match.
        target_date: Requested departure date in the origin's local time.

    Returns:
        One dict per matching leg that has both scheduled gate times and
        whose scheduled departure falls on *target_date* locally.

    Raises:
        RuntimeError: Propagated from ``fetch``.
        ValueError: Propagated from ``extract_trackpoll``.
    """
    url = f"https://flightaware.com/live/flight/{ident}"
    html = fetch(url)
    data = extract_trackpoll(html)
    out: list[dict] = []
    # `or {}` / `or []` guard against JSON nulls, matching how the gate-time
    # lookups below are already protected.
    for flight in (data.get("flights") or {}).values():
        for leg in (flight.get("activityLog") or {}).get("flights") or []:
            o = leg.get("origin") or {}
            d = leg.get("destination") or {}
            if o.get("iata") != dep_iata or d.get("iata") != arr_iata:
                continue
            sched_dep = (leg.get("gateDepartureTimes") or {}).get("scheduled")
            sched_arr = (leg.get("gateArrivalTimes") or {}).get("scheduled")
            if not sched_dep or not sched_arr:
                continue
            dep_dt = datetime.fromtimestamp(sched_dep, tz=timezone.utc)
            arr_dt = datetime.fromtimestamp(sched_arr, tz=timezone.utc)
            # Filter by *local* departure date — a flight that leaves
            # at 23:50 in the origin TZ on the 6th appears as the 7th
            # in UTC for west-of-UTC airports.
            tz_str = (o.get("TZ") or "").lstrip(":") or "UTC"
            if _local_date(dep_dt, tz_str) != target_date:
                continue
            out.append({
                "ident": ident,
                "flightNumber": _ident_to_iata(ident),
                "aircraft": leg.get("aircraftType"),
                "aircraftFriendly": leg.get("aircraftTypeFriendly"),
                "depUTC": dep_dt.isoformat(),
                "arrUTC": arr_dt.isoformat(),
                "depTZ": tz_str,
                "arrTZ": (d.get("TZ") or "").lstrip(":") or "UTC",
                "depGate": o.get("gate"),
                "depTerminal": o.get("terminal"),
                "arrGate": d.get("gate"),
                "arrTerminal": d.get("terminal"),
                "durationMin": int((arr_dt - dep_dt).total_seconds() // 60),
            })
    return out


# Airline ICAO → IATA prefix for human-facing flight numbers.
# Trimmed list of carriers FlightAware uses idents for. The Swift port
# delegates to a fuller carriers DB.
_AIRLINE_ICAO_TO_IATA = {
    # North America
    "AAL": "AA", "DAL": "DL", "UAL": "UA", "SWA": "WN", "ASA": "AS",
    "JBU": "B6", "FFT": "F9", "SKW": "OO", "NKS": "NK", "RPA": "YX",
    "AAY": "G4", "HAL": "HA",
    "AWI": "ZW",  # Air Wisconsin is ZW; 9E belongs to Endeavor (EDV).
    "ENY": "MQ", "EDV": "9E",
    # Europe
    "BAW": "BA", "DLH": "LH", "KLM": "KL", "AFR": "AF", "VIR": "VS",
    "IBE": "IB", "SAS": "SK", "FIN": "AY", "TAP": "TP", "AZA": "AZ",
    "SWR": "LX", "AUA": "OS", "LOT": "LO", "TRA": "HV", "EZY": "U2",
    "RYR": "FR", "WZZ": "W6", "PGT": "PC",
    # Asia / Pacific / Canada
    "QFA": "QF", "VOZ": "VA", "ANZ": "NZ", "JST": "JQ", "ANA": "NH",
    "JAL": "JL", "ACA": "AC", "WJA": "WS", "EVA": "BR", "CAL": "CI",
    "CES": "MU", "CCA": "CA", "CSN": "CZ", "AAR": "OZ", "KAL": "KE",
    "SIA": "SQ", "THA": "TG", "CPA": "CX", "AIC": "AI", "GIA": "GA",
    "MAS": "MH", "PAL": "PR",
    # Middle East / Africa
    "QTR": "QR", "UAE": "EK", "ETD": "EY", "RJA": "RJ", "SVA": "SV",
    "ETH": "ET", "MEA": "ME",
    # Latin America / other
    "LAN": "LA", "TAM": "JJ", "AVA": "AV", "AMX": "AM", "VIV": "VB",
    "VOI": "Y4", "ELY": "LY",
}


def _ident_to_iata(ident: str) -> str:
    """Convert an ident to a display flight number: AAL220 → 'AA220'.

    Idents that are not 2–3 capital letters followed by 1–4 digits are
    returned unchanged. A carrier prefix missing from
    ``_AIRLINE_ICAO_TO_IATA`` is kept as-is in front of the number.
    """
    m = re.match(r"^([A-Z]{2,3})(\d{1,4})$", ident)
    if not m:
        return ident
    icao_carrier, num = m.groups()
    return _AIRLINE_ICAO_TO_IATA.get(icao_carrier, icao_carrier) + num


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------

def main() -> None:
    """Run the lookup for ``<DEP> <ARR> <YYYY-MM-DD>`` taken from ``sys.argv``.

    Prints progress for each pipeline step, then the matching flights sorted
    by scheduled departure. A failure on one ident is reported and skipped.
    Exits with status 2 on missing or invalid arguments.
    """
    if len(sys.argv) < 4:
        print(
            "usage: probe_flightaware.py <DEP_IATA> <ARR_IATA> <YYYY-MM-DD>",
            file=sys.stderr,
        )
        sys.exit(2)
    dep_iata = sys.argv[1].upper()
    arr_iata = sys.argv[2].upper()
    try:
        target = datetime.strptime(sys.argv[3], "%Y-%m-%d").date()
        dep_icao = iata_to_icao(dep_iata)
        arr_icao = iata_to_icao(arr_iata)
    except ValueError as exc:
        # Bad date or bad airport code: report it instead of a traceback.
        print(f"error: {exc}", file=sys.stderr)
        sys.exit(2)

    print(f"[1/4] {dep_iata}({dep_icao}) → {arr_iata}({arr_icao}) on {target}")

    route_url = (
        "https://flightaware.com/analysis/route.rvt"
        f"?origin={dep_icao}&destination={arr_icao}"
    )
    print(f"[2/4] GET {route_url}")
    route_html = fetch(route_url)
    idents = parse_route_idents(route_html)
    print(f" found {len(idents)} distinct idents: {idents[:10]}")

    print("[3/4] fetching trackpoll for each ident…")
    all_flights: list[dict] = []
    for ident in idents:
        # Top-level boundary: one failing ident must not abort the run.
        try:
            flights = scheduled_flights_for(ident, dep_iata, arr_iata, target)
        except Exception as e:
            print(f" {ident}: ERROR {type(e).__name__}: {e}")
        else:
            print(f" {ident}: {len(flights)} scheduled on {target}")
            all_flights.extend(flights)

    # depUTC values are ISO-8601 strings in UTC, so they sort chronologically.
    all_flights.sort(key=lambda f: f["depUTC"])
    print(f"[4/4] total scheduled direct flights: {len(all_flights)}")
    print()
    for f in all_flights:
        aircraft = f["aircraftFriendly"] or f["aircraft"] or "?"
        print(f" {f['flightNumber']:8s} {aircraft}")
        print(f" {f['depUTC']} → {f['arrUTC']}")
        print(f" gate {f['depGate'] or '?'} term {f['depTerminal'] or '?'}"
              f" → gate {f['arrGate'] or '?'} term {f['arrTerminal'] or '?'}")
        print(f" {f['durationMin']} min ({f['depTZ']} → {f['arrTZ']})")
        print()


if __name__ == "__main__":
    main()