Files
Flights/scripts/probe_flightaware.py
T
Trey T ba0688a412 Search: FlightAware backbone, blob catalog, diagnostic infra
route-explorer's /api/token sits behind invisible Cloudflare Turnstile
that requires Apple's Private Access Token attestation. Third-party
iOS apps don't qualify for PAT issuance, and Linux Docker containers
can't pass it either (cross-OS fingerprint, even with patchright /
Camoufox). Migrates direct-flight search to FlightAware; multi-stop
and where-can-I-go remain via embedded SFSafariViewController.

- FlightAwareScheduleClient — scrapes route.rvt + trackpoll JSON for
  real schedules without auth. T+0..2 day window. Tests against
  captured HTML fixtures.
- BlobRouteClient — pulls the public Vercel blob route catalog
  route-explorer's frontend reads (no auth, no Turnstile).
- DiagnosticLogger + LoggingURLSessionDelegate + DiagnosticsView —
  device-shareable forensic trace. Boot header captures device, OS,
  locale, UA; share-sheet export of session logs.
- TurnstileDebugView — live WKWebView gate inspector. Used to prove
  the PAT-entitlement gap on a real device.
- RouteExplorerBrowserView — SFSafariViewController wrapper. Real
  Safari clears Turnstile naturally; the in-app browser opens at
  pre-filled search URLs. Surfaced from Search ("Open in
  route-explorer") and Settings → Tools.
- RouteExplorerTokenStore + RouteExplorerSetupView — bookmarklet
  capture flow (token round-tripped via flights://routeexplorer-token
  URL scheme). Kept dormant for future use.

backend/ — Docker proxy attempts (Playwright, patchright, Camoufox).
All fail on Linux because Cloudflare auto-denies before the Turnstile
widget renders. Documented; kept as scaffolding for a future paid-
solver integration.

scripts/probe_flightaware.py — reference algorithm for the FA path.
scripts/probe_nodriver.py — local-Mac sanity check confirming the
gate clears with real macOS Chrome (proves the blocker is
fingerprint-level, not network-level).
2026-06-06 01:09:59 -05:00

287 lines
11 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
Reference implementation of the FlightAware-based route+schedule lookup.
This is the canonical algorithm the Swift port (FlightAwareScheduleClient)
mirrors. No auth, no Turnstile, no headless browser — two plain GETs per
search, both hitting open FlightAware web pages.
Pipeline for ("DFW", "AMS", 2026-06-06):
1. Resolve dep_icao = "KDFW", arr_icao = "EHAM" (deterministic for US,
curated table for international hubs).
2. GET https://flightaware.com/analysis/route.rvt?origin=KDFW&destination=EHAM
and parse the "Itemized List" table → distinct flight idents
(e.g. "AAL220").
3. For each ident, GET https://flightaware.com/live/flight/<ident> and
extract the embedded `trackpollBootstrap` JSON via a brace-balanced
scan over the script body.
4. From trackpollBootstrap.flights[*].activityLog.flights, project
each scheduled leg whose gateDepartureTimes.scheduled falls on the
requested local-departure date.
5. Emit (flightNumber, aircraft, dep_utc, arr_utc, dep_tz, arr_tz,
dep_gate, dep_terminal, arr_gate, arr_terminal, duration_min).
Usage:
python3 scripts/probe_flightaware.py DFW AMS 2026-06-06
"""
from __future__ import annotations
import json
import re
import subprocess
import sys
from datetime import date, datetime, timezone
# Small IATA→ICAO map. Production lookup lives in AirportDatabase.swift —
# this mirrors enough major hubs to validate the script end-to-end.
IATA_TO_ICAO_INTL: dict[str, str] = {
    # Europe / Middle East
    "AMS": "EHAM", "LHR": "EGLL", "CDG": "LFPG", "FRA": "EDDF",
    "MAD": "LEMD", "BCN": "LEBL", "FCO": "LIRF", "MUC": "EDDM",
    "ZRH": "LSZH", "VIE": "LOWW", "BRU": "EBBR", "DUB": "EIDW",
    "LIS": "LPPT", "ATH": "LGAV", "IST": "LTFM", "DOH": "OTHH",
    "DXB": "OMDB", "AUH": "OMAA",
    # Asia / Pacific
    "HND": "RJTT", "NRT": "RJAA",
    "ICN": "RKSI", "PEK": "ZBAA", "PVG": "ZSPD", "HKG": "VHHH",
    "SIN": "WSSS", "BKK": "VTBS", "SYD": "YSSY", "MEL": "YMML",
    "AKL": "NZAA",
    # Africa / Latin America
    "JNB": "FAOR", "GRU": "SBGR", "EZE": "SAEZ",
    "MEX": "MMMX", "CUN": "MMUN",
    # US airports outside the contiguous 48 states (not K-prefixed)
    "ANC": "PANC", "FAI": "PAFA", "HNL": "PHNL", "OGG": "PHOG",
    "KOA": "PHKO", "LIH": "PHLI", "SJU": "TJSJ", "STT": "TIST",
    "GUM": "PGUM",
    # Canadian hubs (C-prefixed)
    "YYZ": "CYYZ", "YVR": "CYVR", "YUL": "CYUL", "YYC": "CYYC",
}


def iata_to_icao(iata: str) -> str:
    """Map a 3-letter IATA airport code to its 4-letter ICAO code.

    Codes present in ``IATA_TO_ICAO_INTL`` use the curated value. Any other
    code is assumed to be an airport in the contiguous United States, where
    the ICAO code is "K" followed by the IATA code.

    Args:
        iata: IATA airport code, case-insensitive.

    Returns:
        The upper-case ICAO code.

    Raises:
        ValueError: if *iata* is not exactly three ASCII letters.
    """
    iata = iata.upper()
    # IATA location codes are three letters; anything else would only
    # produce a bogus "K???" lookup further down the pipeline.
    if len(iata) != 3 or not (iata.isascii() and iata.isalpha()):
        raise ValueError(f"bad IATA: {iata!r}")
    if iata in IATA_TO_ICAO_INTL:
        return IATA_TO_ICAO_INTL[iata]
    # Heuristic: contiguous-US airports are K + IATA. Airports that do not
    # follow it (AK/HI, territories, Canada, overseas) must be listed in the
    # curated map above.
    return "K" + iata
# Mobile Safari user agent sent with every request.
_UA = " ".join((
    "Mozilla/5.0 (iPhone; CPU iPhone OS 17_5 like Mac OS X)",
    "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.5",
    "Mobile/15E148 Safari/604.1",
))


def fetch(url: str) -> str:
    """Download *url* with curl and return the response body as text.

    curl is invoked with redirect-following (``-L``), a 25 second transfer
    limit and the ``_UA`` user agent; the subprocess itself is given a
    30 second timeout. The body is decoded as UTF-8 with undecodable bytes
    replaced.

    Raises:
        RuntimeError: if curl exits with a non-zero status.
        subprocess.TimeoutExpired: if the curl process exceeds 30 seconds.
    """
    command = [
        "/usr/bin/curl", "-sSL", "--max-time", "25",
        "-A", _UA,
        "-H", "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        url,
    ]
    completed = subprocess.run(command, capture_output=True, timeout=30)
    if completed.returncode == 0:
        return completed.stdout.decode("utf-8", errors="replace")
    raise RuntimeError(f"curl failed: {completed.stderr.decode(errors='replace')}")
# ---------------------------------------------------------------------------
# Step 2: parse route.rvt → distinct flight idents
# ---------------------------------------------------------------------------
# Row shape inside the route.rvt "Itemized List" table:
# <day> <HH:MM>[AP]M <TZ> <IDENT> <ORIGIN_ICAO> <DEST_IATA/ICAO> ...
# The day column lacks delimiters in the text-stripped form but the regex
# below tolerates the whitespace fuzz.
# After tag-stripping the row reads
# "Fri 02:46PM CDT AAL220 KDFW AMS / EHAM B772 FL350 …"
# i.e. timezone abbrev between time and ident. The `.+?` between them
# tolerates that (CDT / EDT / UTC / etc).
# One table row after tag-stripping: day-of-week, clock time, anything
# (lazily) up to the flight ident, then the 4-letter origin code.
_ROUTE_ROW_RE = re.compile(
    r"(?P<dow>Sun|Mon|Tue|Wed|Thu|Fri|Sat)\s+"
    r"\d{1,2}:\d{2}[AP]M.+?"
    r"(?P<ident>[A-Z]{2,3}\d{1,4})\s+"
    r"(?P<origin>[A-Z]{4})\s+",
    re.MULTILINE,
)


def parse_route_idents(route_html: str) -> list[str]:
    """Extract the distinct flight idents from a route analysis page.

    The HTML is reduced to plain text (tags replaced by spaces, whitespace
    runs collapsed) and scanned with ``_ROUTE_ROW_RE``.

    Returns:
        Idents in order of first appearance, without duplicates.
    """
    without_tags = re.sub(r"<[^>]+>", " ", route_html)
    flattened = re.sub(r"\s+", " ", without_tags)
    rows = _ROUTE_ROW_RE.finditer(flattened)
    # dict keys keep insertion order, so this de-duplicates while
    # preserving the order in which idents were first seen.
    return list(dict.fromkeys(row.group("ident") for row in rows))
# ---------------------------------------------------------------------------
# Step 3: brace-balanced extract of `var trackpollBootstrap = {...};`
# ---------------------------------------------------------------------------
# Matches up to and including the opening brace of the bootstrap object.
_TRACKPOLL_RE = re.compile(r"var\s+trackpollBootstrap\s*=\s*\{")


def extract_trackpoll(html: str) -> dict:
    """Locate and parse the ``trackpollBootstrap`` object embedded in *html*.

    Starting at the opening brace, the text is scanned character by
    character, tracking brace depth and ignoring braces that appear inside
    double-quoted strings. The balanced span is handed to ``json.loads``.

    Raises:
        ValueError: if the assignment is not found, or the braces never
            balance before the end of the text. ``json.loads`` may also
            raise if the balanced span is not valid JSON.
    """
    found = _TRACKPOLL_RE.search(html)
    if found is None:
        raise ValueError("no trackpollBootstrap blob in HTML")
    open_at = found.end() - 1  # index of the opening "{"
    nesting = 0
    inside_string = False
    skip_next = False
    for pos in range(open_at, len(html)):
        if skip_next:
            # Character following a backslash inside a string: never
            # interpreted, whatever it is.
            skip_next = False
            continue
        ch = html[pos]
        if inside_string:
            if ch == "\\":
                skip_next = True
            elif ch == '"':
                inside_string = False
            continue
        if ch == '"':
            inside_string = True
        elif ch == "{":
            nesting += 1
        elif ch == "}":
            nesting -= 1
            if nesting == 0:
                return json.loads(html[open_at:pos + 1])
    raise ValueError("trackpollBootstrap blob unbalanced")
# ---------------------------------------------------------------------------
# Steps 4-5: project scheduled flights for the requested date
# ---------------------------------------------------------------------------
def scheduled_flights_for(ident: str, dep_iata: str, arr_iata: str,
                          target_date: date) -> list[dict]:
    """Pull and project the trackpoll JSON for a single ident.

    Fetches the live-flight page for *ident*, extracts the embedded
    trackpoll object and walks every leg under
    ``flights[*].activityLog.flights``. A leg is kept when its origin and
    destination ``iata`` fields equal *dep_iata* / *arr_iata*, it has both a
    scheduled gate departure and a scheduled gate arrival, and the scheduled
    departure falls on *target_date* in the origin's local timezone.

    Args:
        ident: Flight ident used in the page URL (e.g. "AAL220").
        dep_iata: Expected origin IATA code, compared verbatim.
        arr_iata: Expected destination IATA code, compared verbatim.
        target_date: Local departure date to keep.

    Returns:
        One dict per matching leg with keys ``ident``, ``flightNumber``,
        ``aircraft``, ``aircraftFriendly``, ``depUTC``, ``arrUTC`` (ISO-8601
        strings), ``depTZ``, ``arrTZ``, ``depGate``, ``depTerminal``,
        ``arrGate``, ``arrTerminal`` and ``durationMin``.

    Raises:
        Whatever ``fetch`` and ``extract_trackpoll`` raise for a failed
        download or a page without a parseable blob.
    """
    # zoneinfo is only in the standard library from Python 3.9; without it
    # the date filter degrades to the UTC date.
    try:
        from zoneinfo import ZoneInfo
    except ImportError:
        ZoneInfo = None

    url = f"https://flightaware.com/live/flight/{ident}"
    data = extract_trackpoll(fetch(url))
    out: list[dict] = []
    # `or {}` / `or []` throughout: a key may be present with a JSON null
    # value, which .get(key, default) would pass through as None.
    for flight in (data.get("flights") or {}).values():
        legs = ((flight or {}).get("activityLog") or {}).get("flights") or []
        for leg in legs:
            o = leg.get("origin") or {}
            d = leg.get("destination") or {}
            if o.get("iata") != dep_iata or d.get("iata") != arr_iata:
                continue
            sched_dep = (leg.get("gateDepartureTimes") or {}).get("scheduled")
            sched_arr = (leg.get("gateArrivalTimes") or {}).get("scheduled")
            if not sched_dep or not sched_arr:
                continue
            # The scheduled values are interpreted as epoch seconds.
            dep_dt = datetime.fromtimestamp(sched_dep, tz=timezone.utc)
            arr_dt = datetime.fromtimestamp(sched_arr, tz=timezone.utc)
            # Filter by *local* departure date — a flight that leaves
            # at 23:50 in the origin TZ on the 6th appears as the 7th
            # in UTC for west-of-UTC airports.
            tz_str = (o.get("TZ") or "").lstrip(":") or "UTC"
            if ZoneInfo is None:
                local_dep_date = dep_dt.date()
            else:
                try:
                    local_dep_date = dep_dt.astimezone(ZoneInfo(tz_str)).date()
                except (KeyError, ValueError, OSError):
                    # Unknown or malformed zone name (ZoneInfoNotFoundError
                    # is a KeyError): best-effort fallback to the UTC date.
                    local_dep_date = dep_dt.date()
            if local_dep_date != target_date:
                continue
            out.append({
                "ident": ident,
                "flightNumber": _ident_to_iata(ident),
                "aircraft": leg.get("aircraftType"),
                "aircraftFriendly": leg.get("aircraftTypeFriendly"),
                "depUTC": dep_dt.isoformat(),
                "arrUTC": arr_dt.isoformat(),
                "depTZ": tz_str,
                "arrTZ": (d.get("TZ") or "").lstrip(":") or "UTC",
                "depGate": o.get("gate"),
                "depTerminal": o.get("terminal"),
                "arrGate": d.get("gate"),
                "arrTerminal": d.get("terminal"),
                "durationMin": int((arr_dt - dep_dt).total_seconds() // 60),
            })
    return out
# Airline ICAO → IATA prefix for human-facing flight numbers.
# Trimmed list of carriers FlightAware uses idents for. The Swift port
# delegates to a fuller carriers DB.
_AIRLINE_ICAO_TO_IATA = {
"AAL": "AA", "DAL": "DL", "UAL": "UA", "SWA": "WN", "ASA": "AS",
"JBU": "B6", "FFT": "F9", "SKW": "OO", "NKS": "NK", "RPA": "YX",
"AAY": "G4", "HAL": "HA", "AWI": "9E", "ENY": "MQ", "EDV": "9E",
"BAW": "BA", "DLH": "LH", "KLM": "KL", "AFR": "AF", "VIR": "VS",
"IBE": "IB", "SAS": "SK", "FIN": "AY", "TAP": "TP", "AZA": "AZ",
"SWR": "LX", "AUA": "OS", "LOT": "LO", "TRA": "HV", "EZY": "U2",
"RYR": "FR", "WZZ": "W6", "PGT": "PC",
"QFA": "QF", "VOZ": "VA", "ANZ": "NZ", "JST": "JQ",
"ANA": "NH", "JAL": "JL", "ACA": "AC", "WJA": "WS",
"EVA": "BR", "CAL": "CI", "CES": "MU", "CCA": "CA", "CSN": "CZ",
"AAR": "OZ", "KAL": "KE", "SIA": "SQ", "THA": "TG", "CPA": "CX",
"AIC": "AI", "GIA": "GA", "MAS": "MH", "PAL": "PR",
"QTR": "QR", "UAE": "EK", "ETD": "EY", "RJA": "RJ", "SVA": "SV",
"ETH": "ET", "MEA": "ME", "LAN": "LA", "TAM": "JJ", "AVA": "AV",
"AMX": "AM", "VIV": "VB", "VOI": "Y4", "ELY": "LY",
}
def _ident_to_iata(ident: str) -> str:
"""AAL220 → 'AA220' for display."""
m = re.match(r"^([A-Z]{2,3})(\d{1,4})$", ident)
if not m:
return ident
icao_carrier, num = m.groups()
return _AIRLINE_ICAO_TO_IATA.get(icao_carrier, icao_carrier) + num
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def main() -> None:
    """CLI entry point: look up scheduled direct flights for a route and date.

    Reads ``<dep_iata> <arr_iata> <YYYY-MM-DD>`` from ``sys.argv``, resolves
    both airports to ICAO, fetches the route analysis page, then fetches the
    flight page of every ident found and prints the legs scheduled on the
    requested date, sorted by UTC departure.

    Exits with status 2 when arguments are missing, the date does not parse,
    or an airport code is rejected. A failure on one ident is reported and
    does not abort the remaining lookups.
    """
    usage = "usage: probe_flightaware.py <dep_iata> <arr_iata> <YYYY-MM-DD>"
    if len(sys.argv) < 4:
        print(usage)
        sys.exit(2)
    dep_iata = sys.argv[1].upper()
    arr_iata = sys.argv[2].upper()
    try:
        target = datetime.strptime(sys.argv[3], "%Y-%m-%d").date()
        dep_icao = iata_to_icao(dep_iata)
        arr_icao = iata_to_icao(arr_iata)
    except ValueError as e:
        # Bad date or airport code: report it like any other usage error
        # instead of dumping a traceback.
        print(f"error: {e}")
        print(usage)
        sys.exit(2)
    print(f"[1/4] {dep_iata}({dep_icao}) → {arr_iata}({arr_icao}) on {target}")
    route_url = (
        "https://flightaware.com/analysis/route.rvt"
        f"?origin={dep_icao}&destination={arr_icao}"
    )
    print(f"[2/4] GET {route_url}")
    route_html = fetch(route_url)
    idents = parse_route_idents(route_html)
    print(f" found {len(idents)} distinct idents: {idents[:10]}")
    print("[3/4] fetching trackpoll for each ident…")
    all_flights: list[dict] = []
    for ident in idents:
        # Top-level boundary: one bad ident page must not sink the others,
        # so any failure is reported and the loop moves on.
        try:
            flights = scheduled_flights_for(ident, dep_iata, arr_iata, target)
            print(f" {ident}: {len(flights)} scheduled on {target}")
            all_flights.extend(flights)
        except Exception as e:
            print(f" {ident}: ERROR {type(e).__name__}: {e}")
    # depUTC values are ISO-8601 strings all in UTC, so string order is
    # chronological order.
    all_flights.sort(key=lambda f: f["depUTC"])
    print(f"[4/4] total scheduled direct flights: {len(all_flights)}")
    print()
    for f in all_flights:
        print(f" {f['flightNumber']:8s} {f['aircraftFriendly'] or f['aircraft'] or '?'}")
        print(f" {f['depUTC']} → {f['arrUTC']}")
        print(f" gate {f['depGate'] or '?'} term {f['depTerminal'] or '?'}"
              f" → gate {f['arrGate'] or '?'} term {f['arrTerminal'] or '?'}")
        print(f" {f['durationMin']} min ({f['depTZ']} → {f['arrTZ']})")
        print()


if __name__ == "__main__":
    main()