Search: FlightAware backbone, blob catalog, diagnostic infra
route-explorer's /api/token sits behind invisible Cloudflare Turnstile
that requires Apple's Private Access Token attestation. Third-party
iOS apps don't qualify for PAT issuance, and Linux Docker containers
can't pass it either (cross-OS fingerprint, even with patchright /
Camoufox). This commit migrates direct-flight search to FlightAware;
multi-stop and where-can-I-go searches remain available via an embedded
SFSafariViewController.
- FlightAwareScheduleClient — scrapes route.rvt + trackpoll JSON for
real schedules without auth. T+0..2 day window. Tests against
captured HTML fixtures.
- BlobRouteClient — pulls the public Vercel blob route catalog
route-explorer's frontend reads (no auth, no Turnstile).
- DiagnosticLogger + LoggingURLSessionDelegate + DiagnosticsView —
device-shareable forensic trace. Boot header captures device, OS,
locale, UA; share-sheet export of session logs.
- TurnstileDebugView — live WKWebView gate inspector. Used to prove
the PAT-entitlement gap on a real device.
- RouteExplorerBrowserView — SFSafariViewController wrapper. Real
Safari clears Turnstile naturally; the in-app browser opens at
pre-filled search URLs. Surfaced from Search ("Open in
route-explorer") and Settings → Tools.
- RouteExplorerTokenStore + RouteExplorerSetupView — bookmarklet
capture flow (token round-tripped via flights://routeexplorer-token
URL scheme). Kept dormant for future use.
backend/ — Docker proxy attempts (Playwright, patchright, Camoufox).
All fail on Linux because Cloudflare auto-denies before the Turnstile
widget renders. Documented; kept as scaffolding for a future paid-
solver integration.
scripts/probe_flightaware.py — reference algorithm for the FA path.
scripts/probe_nodriver.py — local-Mac sanity check confirming the
gate clears with real macOS Chrome (proves the blocker is
fingerprint-level, not network-level).
This commit is contained in:
@@ -0,0 +1,286 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Reference implementation of the FlightAware-based route+schedule lookup.
|
||||
This is the canonical algorithm the Swift port (FlightAwareScheduleClient)
|
||||
mirrors. No auth, no Turnstile, no headless browser — one plain GET for the
|
||||
route page plus one per flight ident, all hitting open FlightAware web pages.
|
||||
|
||||
Pipeline for ("DFW", "AMS", 2026-06-06):
|
||||
1. Resolve dep_icao = "KDFW", arr_icao = "EHAM" (deterministic for US,
|
||||
curated table for international hubs).
|
||||
2. GET https://flightaware.com/analysis/route.rvt?origin=KDFW&destination=EHAM
|
||||
and parse the "Itemized List" table → distinct flight idents
|
||||
(e.g. "AAL220").
|
||||
3. For each ident, GET https://flightaware.com/live/flight/<ident> and
|
||||
extract the embedded `trackpollBootstrap` JSON via a brace-balanced
|
||||
scan over the script body.
|
||||
4. From trackpollBootstrap.flights[*].activityLog.flights, project
|
||||
each scheduled leg whose gateDepartureTimes.scheduled falls on the
|
||||
requested local-departure date.
|
||||
5. Emit (flightNumber, aircraft, dep_utc, arr_utc, dep_tz, arr_tz,
|
||||
dep_gate, dep_terminal, arr_gate, arr_terminal, duration_min).
|
||||
|
||||
Usage:
|
||||
python3 scripts/probe_flightaware.py DFW AMS 2026-06-06
|
||||
"""
|
||||
from __future__ import annotations
|
||||
import json
|
||||
import re
|
||||
import subprocess
|
||||
import sys
|
||||
from datetime import date, datetime, timezone
|
||||
|
||||
# Small IATA→ICAO map. Production lookup lives in AirportDatabase.swift —
# this mirrors enough major hubs to validate the script end-to-end.
# Any code NOT listed here is assumed to be a contiguous-US airport and is
# K-prefixed by iata_to_icao(), so every airport outside the lower 48 that
# the script should handle must appear in this table.
IATA_TO_ICAO_INTL: dict[str, str] = {
    "AMS": "EHAM", "LHR": "EGLL", "CDG": "LFPG", "FRA": "EDDF",
    "MAD": "LEMD", "BCN": "LEBL", "FCO": "LIRF", "MUC": "EDDM",
    "ZRH": "LSZH", "VIE": "LOWW", "BRU": "EBBR", "DUB": "EIDW",
    "LIS": "LPPT", "ATH": "LGAV", "IST": "LTFM", "DOH": "OTHH",
    "DXB": "OMDB", "AUH": "OMAA", "HND": "RJTT", "NRT": "RJAA",
    "ICN": "RKSI", "PEK": "ZBAA", "PVG": "ZSPD", "HKG": "VHHH",
    "SIN": "WSSS", "BKK": "VTBS", "SYD": "YSSY", "MEL": "YMML",
    "AKL": "NZAA", "JNB": "FAOR", "GRU": "SBGR", "EZE": "SAEZ",
    "MEX": "MMMX", "CUN": "MMUN",
    # US airports outside the contiguous states do not use the K prefix.
    "ANC": "PANC", "FAI": "PAFA", "HNL": "PHNL", "OGG": "PHOG",
    "KOA": "PHKO", "LIH": "PHLI", "SJU": "TJSJ", "GUM": "PGUM",
    # Major Canadian hubs use the C prefix.
    "YYZ": "CYYZ", "YVR": "CYVR", "YUL": "CYUL", "YYC": "CYYC",
}


def iata_to_icao(iata: str) -> str:
    """Map a three-letter IATA airport code to its ICAO code.

    Codes present in ``IATA_TO_ICAO_INTL`` resolve to the curated value.
    Every other code is treated as a contiguous-US airport and gets the
    ``K`` prefix; this is a heuristic, so airports outside the lower 48
    that are missing from the table resolve to a wrong code.

    Args:
        iata: Airport code in any letter case.

    Returns:
        The four-letter ICAO code.

    Raises:
        ValueError: If *iata* is not exactly three ASCII letters.
    """
    iata = iata.upper()
    # Callers build request URLs from the result, so reject anything that is
    # not three plain letters (e.g. "A&B") instead of blindly K-prefixing it.
    if len(iata) != 3 or not (iata.isascii() and iata.isalpha()):
        raise ValueError(f"bad IATA: {iata!r}")
    return IATA_TO_ICAO_INTL.get(iata, "K" + iata)
|
||||
|
||||
|
||||
# User-Agent string sent with every request (an iPhone Safari UA).
_UA = (
    "Mozilla/5.0 (iPhone; CPU iPhone OS 17_5 like Mac OS X) "
    "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.5 "
    "Mobile/15E148 Safari/604.1"
)


def fetch(url: str) -> str:
    """Download *url* with curl and return the response body as text.

    curl is told to follow redirects (``-L``); per the original author's
    note, URLSession on iOS follows redirects by default too, so this
    mirrors the runtime behaviour of the Swift client.

    Raises:
        RuntimeError: If curl exits with a non-zero status; the message
            carries curl's stderr output.
    """
    argv = [
        "/usr/bin/curl", "-sSL", "--max-time", "25",
        "-A", _UA,
        "-H", "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        url,
    ]
    # The subprocess timeout (30 s) is slightly above curl's own --max-time
    # (25 s), so curl normally gets to report its own timeout first.
    proc = subprocess.run(argv, capture_output=True, timeout=30)
    if proc.returncode:
        raise RuntimeError(f"curl failed: {proc.stderr.decode(errors='replace')}")
    # Undecodable bytes are replaced rather than raising.
    return proc.stdout.decode("utf-8", errors="replace")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Step 2: parse route.rvt → distinct flight idents
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Row shape inside the route.rvt "Itemized List" table:
#   <day> <HH:MM>[AP]M <TZ> <IDENT> <ORIGIN_ICAO> <DEST_IATA/ICAO> ...
# After tag-stripping and whitespace collapsing a row reads
#   "Fri 02:46PM CDT AAL220 KDFW AMS / EHAM B772 FL350 …"
# i.e. a timezone abbreviation sits between the time and the ident. The lazy
# `.+?` between them tolerates that (CDT / EDT / UTC / etc).
# The `\b` anchors keep the day name and the ident from matching in the
# middle of a longer token (e.g. "AAL2205" inside "XAAL2205").
_ROUTE_ROW_RE = re.compile(
    r"\b(?P<dow>Sun|Mon|Tue|Wed|Thu|Fri|Sat)\s+"
    r"\d{1,2}:\d{2}[AP]M.+?"
    r"\b(?P<ident>[A-Z]{2,3}\d{1,4})\s+"
    r"(?P<origin>[A-Z]{4})\s+"
)


def parse_route_idents(route_html: str) -> list[str]:
    """Return the distinct flight idents listed on the route analysis page.

    The HTML is reduced to plain text (tags replaced by spaces, whitespace
    runs collapsed) and scanned with ``_ROUTE_ROW_RE``.

    Args:
        route_html: Raw HTML of the route.rvt page.

    Returns:
        Idents such as ``"AAL220"`` in first-seen order, without duplicates.
        Empty when no row matches.
    """
    text = re.sub(r"<[^>]+>", " ", route_html)
    text = re.sub(r"\s+", " ", text)
    # dict.fromkeys de-duplicates while preserving first-occurrence order.
    return list(dict.fromkeys(
        row.group("ident") for row in _ROUTE_ROW_RE.finditer(text)
    ))
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Step 3: brace-balanced extract of `var trackpollBootstrap = {...};`
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Matches up to and including the opening brace of the bootstrap assignment.
_TRACKPOLL_RE = re.compile(r"var\s+trackpollBootstrap\s*=\s*\{")


def extract_trackpoll(html: str) -> dict:
    """Extract and parse the ``trackpollBootstrap`` object embedded in *html*.

    Starting at the opening brace of ``var trackpollBootstrap = {``, walk
    forward counting braces (ignoring any that appear inside double-quoted
    strings) until the matching close brace, then hand that slice to
    ``json.loads``.

    Raises:
        ValueError: If the assignment is not found, or the braces never
            balance before the end of the input.
    """
    anchor = _TRACKPOLL_RE.search(html)
    if anchor is None:
        raise ValueError("no trackpollBootstrap blob in HTML")
    start = anchor.end() - 1  # index of the opening "{"
    depth = 0
    inside_string = False
    skip_next = False  # set after a backslash inside a string
    for pos in range(start, len(html)):
        if skip_next:
            # Character following a backslash: part of an escape, never a
            # string terminator.
            skip_next = False
            continue
        ch = html[pos]
        if inside_string:
            if ch == "\\":
                skip_next = True
            elif ch == '"':
                inside_string = False
            continue
        if ch == '"':
            inside_string = True
        elif ch == "{":
            depth += 1
        elif ch == "}":
            depth -= 1
            if depth == 0:
                return json.loads(html[start:pos + 1])
    raise ValueError("trackpollBootstrap blob unbalanced")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Step 4–5: project scheduled flights for the requested date
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _airport_tz(airport: dict) -> str:
    """Return the airport's ``TZ`` value with any leading ':' removed, or 'UTC'."""
    return (airport.get("TZ") or "").lstrip(":") or "UTC"


def scheduled_flights_for(ident: str, dep_iata: str, arr_iata: str,
                          target_date: date) -> list[dict]:
    """Fetch one ident's flight page and project its legs for *target_date*.

    Downloads ``https://flightaware.com/live/flight/<ident>``, extracts the
    embedded trackpoll JSON and walks ``flights[*].activityLog.flights``.
    A leg is kept when its origin/destination ``iata`` fields equal
    *dep_iata* / *arr_iata*, both scheduled gate times are present, and the
    scheduled departure falls on *target_date* in the origin's timezone.

    Args:
        ident: FlightAware flight ident, e.g. ``"AAL220"``.
        dep_iata: Origin IATA code to match against ``origin.iata``.
        arr_iata: Destination IATA code to match against ``destination.iata``.
        target_date: Requested local departure date.

    Returns:
        One dict per matching leg with keys ``ident``, ``flightNumber``,
        ``aircraft``, ``aircraftFriendly``, ``depUTC``, ``arrUTC``,
        ``depTZ``, ``arrTZ``, ``depGate``, ``depTerminal``, ``arrGate``,
        ``arrTerminal`` and ``durationMin``.

    Raises:
        RuntimeError: Propagated from ``fetch`` when the download fails.
        ValueError: Propagated from ``extract_trackpoll`` when the page has
            no parseable trackpoll blob.
    """
    # zoneinfo is stdlib from Python 3.9; without it the date filter falls
    # back to the UTC calendar date, as it does for unknown zone names.
    try:
        from zoneinfo import ZoneInfo
    except ImportError:
        ZoneInfo = None

    html = fetch(f"https://flightaware.com/live/flight/{ident}")
    data = extract_trackpoll(html)
    out: list[dict] = []
    # `or {}` / `or []` guard against keys that are present but JSON null.
    for flight in (data.get("flights") or {}).values():
        legs = ((flight or {}).get("activityLog") or {}).get("flights") or []
        for leg in legs:
            o = leg.get("origin") or {}
            d = leg.get("destination") or {}
            if o.get("iata") != dep_iata or d.get("iata") != arr_iata:
                continue
            sched_dep = (leg.get("gateDepartureTimes") or {}).get("scheduled")
            sched_arr = (leg.get("gateArrivalTimes") or {}).get("scheduled")
            if not sched_dep or not sched_arr:
                continue
            # The scheduled values are treated as epoch seconds.
            dep_dt = datetime.fromtimestamp(sched_dep, tz=timezone.utc)
            arr_dt = datetime.fromtimestamp(sched_arr, tz=timezone.utc)
            # Filter by *local* departure date — a flight that leaves
            # at 23:50 in the origin TZ on the 6th appears as the 7th
            # in UTC for west-of-UTC airports.
            tz_str = _airport_tz(o)
            local_dep_date = dep_dt.date()  # UTC fallback
            if ZoneInfo is not None:
                try:
                    local_dep_date = dep_dt.astimezone(ZoneInfo(tz_str)).date()
                except (KeyError, ValueError, OSError):
                    # Unknown or malformed zone name, or tz data unreadable:
                    # keep the UTC date. ZoneInfoNotFoundError is a KeyError.
                    pass
            if local_dep_date != target_date:
                continue
            out.append({
                "ident": ident,
                "flightNumber": _ident_to_iata(ident),
                "aircraft": leg.get("aircraftType"),
                "aircraftFriendly": leg.get("aircraftTypeFriendly"),
                "depUTC": dep_dt.isoformat(),
                "arrUTC": arr_dt.isoformat(),
                "depTZ": tz_str,
                "arrTZ": _airport_tz(d),
                "depGate": o.get("gate"),
                "depTerminal": o.get("terminal"),
                "arrGate": d.get("gate"),
                "arrTerminal": d.get("terminal"),
                "durationMin": int((arr_dt - dep_dt).total_seconds() // 60),
            })
    return out
|
||||
|
||||
|
||||
# Airline ICAO → IATA prefix for human-facing flight numbers.
# Trimmed list of carriers FlightAware uses idents for. The Swift port
# delegates to a fuller carriers DB.
_AIRLINE_ICAO_TO_IATA = {
    "AAL": "AA", "DAL": "DL", "UAL": "UA", "SWA": "WN", "ASA": "AS",
    "JBU": "B6", "FFT": "F9", "SKW": "OO", "NKS": "NK", "RPA": "YX",
    # AWI is Air Wisconsin (ZW); 9E belongs to Endeavor Air (EDV).
    "AAY": "G4", "HAL": "HA", "AWI": "ZW", "ENY": "MQ", "EDV": "9E",
    "BAW": "BA", "DLH": "LH", "KLM": "KL", "AFR": "AF", "VIR": "VS",
    "IBE": "IB", "SAS": "SK", "FIN": "AY", "TAP": "TP", "AZA": "AZ",
    "SWR": "LX", "AUA": "OS", "LOT": "LO", "TRA": "HV", "EZY": "U2",
    "RYR": "FR", "WZZ": "W6", "PGT": "PC",
    "QFA": "QF", "VOZ": "VA", "ANZ": "NZ", "JST": "JQ",
    "ANA": "NH", "JAL": "JL", "ACA": "AC", "WJA": "WS",
    "EVA": "BR", "CAL": "CI", "CES": "MU", "CCA": "CA", "CSN": "CZ",
    "AAR": "OZ", "KAL": "KE", "SIA": "SQ", "THA": "TG", "CPA": "CX",
    "AIC": "AI", "GIA": "GA", "MAS": "MH", "PAL": "PR",
    "QTR": "QR", "UAE": "EK", "ETD": "EY", "RJA": "RJ", "SVA": "SV",
    "ETH": "ET", "MEA": "ME", "LAN": "LA", "TAM": "JJ", "AVA": "AV",
    "AMX": "AM", "VIV": "VB", "VOI": "Y4", "ELY": "LY",
}


def _ident_to_iata(ident: str) -> str:
    """Convert an ICAO-style ident to a display flight number.

    ``"AAL220"`` becomes ``"AA220"``. An ident that is not 2–3 capital
    letters followed by 1–4 digits is returned unchanged, and a carrier
    prefix missing from ``_AIRLINE_ICAO_TO_IATA`` is kept as-is in front of
    the number.
    """
    m = re.match(r"^([A-Z]{2,3})(\d{1,4})$", ident)
    if not m:
        return ident
    icao_carrier, num = m.groups()
    return _AIRLINE_ICAO_TO_IATA.get(icao_carrier, icao_carrier) + num
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Main
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def main() -> None:
    """Command-line entry point: look up direct flights for a route and date.

    Reads ``<dep_iata> <arr_iata> <YYYY-MM-DD>`` from ``sys.argv``, resolves
    the ICAO codes, fetches the route page, then fetches each listed ident
    and prints the scheduled flights sorted by UTC departure time.

    Exits with status 2 when arguments are missing, the date does not parse
    as ``YYYY-MM-DD``, or an airport code is rejected. A failure while
    processing a single ident is reported on its own line and does not stop
    the run.
    """
    usage = "usage: probe_flightaware.py <dep_iata> <arr_iata> <YYYY-MM-DD>"
    if len(sys.argv) < 4:
        print(usage, file=sys.stderr)
        sys.exit(2)
    dep_iata = sys.argv[1].upper()
    arr_iata = sys.argv[2].upper()
    try:
        target = datetime.strptime(sys.argv[3], "%Y-%m-%d").date()
        dep_icao = iata_to_icao(dep_iata)
        arr_icao = iata_to_icao(arr_iata)
    except ValueError as exc:
        # Bad date or airport code: report it as a usage error, not a traceback.
        print(f"error: {exc}", file=sys.stderr)
        print(usage, file=sys.stderr)
        sys.exit(2)
    print(f"[1/4] {dep_iata}({dep_icao}) → {arr_iata}({arr_icao}) on {target}")

    route_url = (
        "https://flightaware.com/analysis/route.rvt"
        f"?origin={dep_icao}&destination={arr_icao}"
    )
    print(f"[2/4] GET {route_url}")
    route_html = fetch(route_url)
    idents = parse_route_idents(route_html)
    print(f" found {len(idents)} distinct idents: {idents[:10]}")

    print("[3/4] fetching trackpoll for each ident…")
    all_flights: list[dict] = []
    for ident in idents:
        # Best-effort per ident: one broken page must not abort the others.
        try:
            flights = scheduled_flights_for(ident, dep_iata, arr_iata, target)
            print(f" {ident}: {len(flights)} scheduled on {target}")
            all_flights.extend(flights)
        except Exception as e:
            print(f" {ident}: ERROR {type(e).__name__}: {e}")

    # depUTC values are ISO-8601 strings in UTC, so string order is time order.
    all_flights.sort(key=lambda f: f["depUTC"])
    print(f"[4/4] total scheduled direct flights: {len(all_flights)}")
    print()
    for f in all_flights:
        print(f" {f['flightNumber']:8s} {f['aircraftFriendly'] or f['aircraft']}")
        print(f" {f['depUTC']} → {f['arrUTC']}")
        print(f" gate {f['depGate'] or '?'} term {f['depTerminal'] or '?'}"
              f" → gate {f['arrGate'] or '?'} term {f['arrTerminal'] or '?'}")
        print(f" {f['durationMin']} min ({f['depTZ']} → {f['arrTZ']})")
        print()


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user