Files
OFApp/server/login_helper.py
T
Trey T aa4f1157d1 Route SimpCity forum scraping through FlareSolverr + add turbo.cr resolver
DDoS-Guard now binds session cookies to the issuing browser's fingerprint, so
direct Node fetch returns 403 even with valid cookies. Page HTML for any
forum_site with stored cookies is now fetched via a FlareSolverr browser
session opened once per scrape job.

- Hybrid cookie refresh: FlareSolverr clears the DDoS-Guard captcha, those
  cookies seed undetected_chromedriver, Turnstile auto-solves in the real
  browser, login form submits, final cookies + browser UA persist to forum_sites
- Per-site user_agent column so subsequent scraper requests match the UA the
  cookies were issued for (DDoS-Guard rejects UA mismatches)
- XenForo search rewritten as proper CSRF POST /search/search → results page
  parse, replacing the broken ?q=... GET that only returned the search form
- Pagination regex fallback in detectMaxPage catches XenForo pages that
  cheerio's class-based selectors miss
- New scrapers/turbo.js handles turbo.cr /embed/ and /a/ URLs by rendering
  the page via FlareSolverr and grabbing the signed mp4 from the resolved
  <video src> attribute (gallery-dl can't extract these — obfuscated WASM)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-29 19:33:54 -05:00

294 lines
11 KiB
Python

#!/usr/bin/env python3
"""
Login helper using undetected_chromedriver. The 4th argument is a JSON array of
pre-seeded cookies (from FlareSolverr) that satisfy DDoS-Guard, so Chrome lands
directly on the login form without facing the captcha. Turnstile auto-solves in
the real browser context.
Usage:
xvfb-run python3 login_helper.py <login_url> <username> <password> [cookies_json]
cookies_json: optional JSON array like [{"name":"__ddg9_","value":"...","domain":"..."}]
Outputs JSON to stdout:
{"ok": true, "cookies": "name=val; name2=val2", "user_agent": "<browser UA>", "url": "<final_url>"}
{"ok": false, "error": "reason"}
"""
import sys
import json
import time
import os
import shutil
def _first_existing_path(candidates):
    """Return the first path in *candidates* that exists on disk, or None."""
    return next((p for p in candidates if os.path.exists(p)), None)


def _parse_seed_cookies(raw):
    """Decode the cookies_json argument into a list, or [] when unusable.

    Args:
        raw: The raw JSON text taken from the command line.

    Returns:
        The decoded list of cookie entries. An empty list is returned (and
        the reason logged) when the text is not valid JSON or is not a JSON
        array, so a bad argument degrades to "no seed cookies" instead of
        crashing later during injection.
    """
    try:
        decoded = json.loads(raw)
    except ValueError as e:  # json.JSONDecodeError is a ValueError subclass
        log(f"Could not parse seed cookies: {e}")
        return []
    if not isinstance(decoded, list):
        log(f"Could not parse seed cookies: expected a JSON array, got {type(decoded).__name__}")
        return []
    log(f"Got {len(decoded)} seed cookies from FlareSolverr")
    return decoded


def _detect_chromium_major(chromium_path):
    """Return the major version reported by `<chromium_path> --version`, or None.

    Detection is best-effort: any failure is logged and None is returned.
    """
    import subprocess
    try:
        result = subprocess.run([chromium_path, '--version'], capture_output=True, text=True, timeout=5)
        # e.g. "Chromium 131.0.6778.139" or "Chromium 131.0.6778.139 Alpine Linux"
        for part in result.stdout.strip().split():
            if '.' in part and part[0].isdigit():
                major = int(part.split('.')[0])
                log(f"Chromium version: {part} (major: {major})")
                return major
    except (OSError, subprocess.SubprocessError, ValueError) as e:
        log(f"Could not detect chromium version: {e}")
    return None


def _inject_seed_cookies(driver, base_url, seed_cookies):
    """Load a page on *base_url*, then add each seed cookie to the browser.

    Entries that are malformed or that the driver rejects are logged and
    skipped; they never abort the login attempt.
    """
    # Must visit the domain first before add_cookie works.
    landing_url = base_url + "/favicon.ico"
    log(f"Visiting {landing_url} to set cookie domain context...")
    try:
        driver.get(landing_url)
    except Exception:
        driver.get("about:blank")
        # Pass the URL as a script argument rather than splicing it into the
        # JavaScript source, so quotes in the URL cannot break the script.
        driver.execute_script("document.location = arguments[0];", landing_url)
    time.sleep(2)
    log(f"Injecting {len(seed_cookies)} seed cookies...")
    for c in seed_cookies:
        if not isinstance(c, dict) or "name" not in c or "value" not in c:
            log("  skip malformed seed cookie (missing name/value)")
            continue
        cookie_dict = {
            "name": c["name"],
            "value": c["value"],
            "path": c.get("path", "/"),
        }
        # The domain is forwarded unchanged; if the driver rejects it (for
        # example a domain mismatch) the cookie is skipped below.
        if c.get("domain"):
            cookie_dict["domain"] = c["domain"]
        if c.get("secure"):
            cookie_dict["secure"] = c["secure"]
        try:
            driver.add_cookie(cookie_dict)
        except Exception as e:
            log(f"  skip {c['name']}: {e}")


def _wait_for_turnstile(driver, timeout_s):
    """Poll once per second for the Turnstile response token.

    Returns:
        The token string, or "" if none appeared within *timeout_s* seconds.
    """
    # Read the JS .value property; the HTML attribute is not updated when the
    # widget fills the field.
    script = 'var el = document.querySelector(\'input[name="cf-turnstile-response"]\'); return el ? el.value : null;'
    for elapsed in range(1, timeout_s + 1):
        try:
            token = driver.execute_script(script)
        except Exception:
            token = None  # best-effort poll; try again on the next tick
        if token:
            return token
        time.sleep(1)
        if elapsed % 10 == 0:
            log(f"Still waiting for Turnstile... ({elapsed}s)")
    return ""


def _find_login_error(driver, by):
    """Return the first non-empty error text shown on the page, or a default.

    Args:
        driver: The active WebDriver.
        by: Selenium's ``By`` locator class (passed in because Selenium is
            imported lazily inside ``main``).
    """
    error_msg = "Login failed — no user cookie returned"
    # XenForo shows multiple variants of the error container.
    for sel in ['.blockMessage--error', '.errorOverlay', '.formRow--error', '.alert--error', '.blockMessage']:
        try:
            txt = driver.find_element(by.CSS_SELECTOR, sel).text.strip()
        except Exception:
            continue
        if txt:
            error_msg = txt[:300]
            log(f"Found error in {sel}: {error_msg}")
            break
    return error_msg


def main():
    """Log in through a real Chromium session and print the result as JSON.

    Reads ``<login_url> <username> <password> [cookies_json]`` from
    ``sys.argv``. Optional seed cookies are injected before the login page is
    opened. The login form is then filled and submitted, and the resulting
    cookies plus the browser's user agent are printed to stdout as one JSON
    object. Progress messages go through ``log``.

    On any failure a JSON object with ``"ok": false`` is printed and the
    process exits with status 1. The browser is always shut down.

    NOTE(review): the password arrives via argv, which is typically visible to
    other local users in the process list. The interface is left unchanged
    here; consider stdin or an environment variable in a follow-up.
    """
    if len(sys.argv) < 4:
        print(json.dumps({"ok": False, "error": "Usage: login_helper.py <login_url> <username> <password> [cookies_json]"}))
        sys.exit(1)
    login_url, username, password = sys.argv[1:4]
    seed_cookies = []
    if len(sys.argv) >= 5 and sys.argv[4]:
        seed_cookies = _parse_seed_cookies(sys.argv[4])

    # Imported lazily so a missing package yields a JSON error, not a traceback.
    try:
        import undetected_chromedriver as uc
        from selenium.webdriver.common.by import By
        from selenium.webdriver.support.ui import WebDriverWait
        from selenium.webdriver.support import expected_conditions as EC
    except ImportError as e:
        print(json.dumps({"ok": False, "error": f"Missing dependency: {e}"}))
        sys.exit(1)

    from urllib.parse import urlparse

    turnstile_timeout_s = 60
    driver = None
    try:
        # Find chromium binary: honour CHROMIUM_PATH when it exists, otherwise
        # fall back to well-known install locations.
        chromium_path = os.environ.get('CHROMIUM_PATH', '')
        if not chromium_path or not os.path.exists(chromium_path):
            chromium_path = _first_existing_path(
                ['/usr/bin/chromium-browser', '/usr/bin/chromium', '/usr/lib/chromium/chromium']
            ) or chromium_path

        # Find system chromedriver (Alpine: chromium-chromedriver package)
        chromedriver_path = _first_existing_path(
            ['/usr/bin/chromedriver', '/usr/lib/chromium/chromedriver']
        )

        # Get chromium version for undetected_chromedriver
        version_main = None
        if chromium_path:
            version_main = _detect_chromium_major(chromium_path)
        else:
            log("Could not detect chromium version: no chromium binary found")

        options = uc.ChromeOptions()
        if chromium_path:
            options.binary_location = chromium_path
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')
        options.add_argument('--disable-gpu')
        options.add_argument('--window-size=1920,1080')
        # Don't pin a fake UA — Turnstile detects spoofed UAs and refuses to
        # auto-solve. The natural Chromium UA must be matched by all scraper
        # requests instead (forum.js reads it from the forum_sites row).
        log(f"Chromium: {chromium_path}")
        log(f"Chromedriver: {chromedriver_path}")

        # Create the driver
        # Use system chromedriver to avoid downloading (fails on Alpine/musl)
        driver = uc.Chrome(
            options=options,
            driver_executable_path=chromedriver_path,
            headless=False,
            version_main=version_main,
        )
        driver.set_window_size(1920, 1080)

        # Pre-seed DDoS-Guard cookies so Chrome skips the captcha challenge entirely
        parsed = urlparse(login_url)
        base_url = f"{parsed.scheme}://{parsed.netloc}"
        if seed_cookies:
            _inject_seed_cookies(driver, base_url, seed_cookies)

        log(f"Navigating to {login_url}...")
        driver.get(login_url)

        # Wait for login form to appear
        log("Waiting for login form...")
        try:
            WebDriverWait(driver, 30).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, 'input[name="login"]'))
            )
            log("Login form found")
        except Exception:
            # Capture what the browser is showing, then let the outer handler
            # report the failure.
            try:
                log(f"Login form timeout. Title: '{driver.title}', url: {driver.current_url}")
                log(f"Page source snippet: {driver.page_source[:500]}")
                driver.save_screenshot('/tmp/login_timeout.png')
                log("Screenshot saved to /tmp/login_timeout.png")
            except Exception:
                pass
            raise

        # Wait for Turnstile to auto-solve (should work in undetected headed mode)
        log("Waiting for Turnstile to solve...")
        turnstile_token = _wait_for_turnstile(driver, turnstile_timeout_s)
        if turnstile_token:
            log(f"Turnstile solved (token: {turnstile_token[:20]}...)")
        else:
            log(f"Warning: Turnstile token not found after {turnstile_timeout_s}s — attempting login anyway")

        # Fill the login form, typing slowly to appear human
        log("Filling login form...")
        login_input = driver.find_element(By.CSS_SELECTOR, 'input[name="login"]')
        login_input.clear()
        for ch in username:
            login_input.send_keys(ch)
            time.sleep(0.03)
        pass_input = driver.find_element(By.CSS_SELECTOR, 'input[name="password"]')
        pass_input.clear()
        for ch in password:
            pass_input.send_keys(ch)
            time.sleep(0.03)

        # Check remember checkbox (optional; absent on some forms)
        try:
            remember = driver.find_element(By.CSS_SELECTOR, 'input[name="remember"]')
            if not remember.is_selected():
                driver.execute_script("arguments[0].checked = true;", remember)
        except Exception:
            pass

        # Submit form: click the button, or fall back to submitting via JS
        log("Submitting login form...")
        try:
            submit_btn = driver.find_element(
                By.CSS_SELECTOR,
                'button[type="submit"], input[type="submit"], .button--primary',
            )
            submit_btn.click()
        except Exception:
            driver.execute_script("""
                var form = document.querySelector('form.block-body') ||
                           document.querySelector('form[action*="login"]');
                if (form) form.submit();
            """)

        # Wait for navigation after submit
        log("Waiting for redirect...")
        time.sleep(5)
        try:
            WebDriverWait(driver, 15).until(
                lambda d: d.execute_script("return document.readyState") == "complete"
            )
        except Exception:
            pass
        final_url = driver.current_url
        log(f"After submit: {final_url}")

        # Extract cookies
        cookies = driver.get_cookies()
        cookie_str = "; ".join(f"{c['name']}={c['value']}" for c in cookies)

        # Check for login success: a user cookie must have been issued
        has_user_cookie = any(c['name'] in ('xf_user', 'ogaddgmetaprof_user') for c in cookies)
        if not has_user_cookie:
            error_msg = _find_login_error(driver, By)
            # Dump page title, source snippet and a screenshot for debugging
            try:
                page_title = driver.title or ''
                log(f"Page title after submit: '{page_title}'")
                snippet = driver.page_source[:1500]
                log(f"Page source snippet: {snippet}")
                driver.save_screenshot('/tmp/login_failed.png')
                log("Screenshot saved to /tmp/login_failed.png")
            except Exception:
                pass
            cookie_names = [c['name'] for c in cookies]
            log(f"Cookie names: {cookie_names}")
            print(json.dumps({"ok": False, "error": error_msg, "url": final_url}))
            # SystemExit is not an Exception subclass, so it bypasses the
            # handler below while the finally clause still quits the browser.
            sys.exit(1)

        # Capture the browser's real UA so subsequent scraper requests can match it
        try:
            real_ua = driver.execute_script('return navigator.userAgent;')
        except Exception:
            real_ua = ''
        log(f"Login successful — {len(cookies)} cookies, UA: {real_ua}")
        print(json.dumps({"ok": True, "cookies": cookie_str, "user_agent": real_ua, "url": final_url}))
    except Exception as e:
        log(f"Fatal error: {e}")
        print(json.dumps({"ok": False, "error": str(e)}))
        sys.exit(1)
    finally:
        if driver:
            try:
                driver.quit()
            except Exception:
                pass
def log(msg):
    """Emit a prefixed diagnostic line on stderr.

    stdout is reserved for the single JSON result object, so all progress
    and debug output is routed here instead.
    """
    line = f"[login_helper] {msg}"
    print(line, file=sys.stderr, flush=True)
# Run the login flow only when executed as a script, not when imported.
if __name__ == "__main__":
    main()