Route SimpCity forum scraping through FlareSolverr + add turbo.cr resolver

DDoS-Guard now binds session cookies to the issuing browser's fingerprint, so
direct Node fetch returns 403 even with valid cookies. Page HTML for any
forum_site with stored cookies is now fetched via a FlareSolverr browser
session opened once per scrape job.

- Hybrid cookie refresh: FlareSolverr clears the DDoS-Guard captcha, those
  cookies seed undetected_chromedriver, Turnstile auto-solves in the real
  browser, login form submits, final cookies + browser UA persist to forum_sites
- Per-site user_agent column so subsequent scraper requests match the UA the
  cookies were issued for (DDoS-Guard rejects UA mismatches)
- XenForo search rewritten as a proper POST to /search/search carrying the
  CSRF token, followed by a parse of the results page; this replaces the
  broken ?q=... GET that only returned the search form
- Pagination regex fallback in detectMaxPage catches XenForo pages that
  cheerio's class-based selectors miss
- New scrapers/turbo.js handles turbo.cr /embed/ and /a/ URLs by rendering
  the page via FlareSolverr and grabbing the signed mp4 from the resolved
  <video src> attribute (gallery-dl can't extract these — obfuscated WASM)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Trey T
2026-04-29 19:33:54 -05:00
parent 236f36aae6
commit aa4f1157d1
6 changed files with 589 additions and 78 deletions
+97 -20
View File
@@ -1,10 +1,14 @@
#!/usr/bin/env python3
"""
Login helper using undetected_chromedriver to bypass Cloudflare Turnstile.
Runs Chrome in headed mode with Xvfb (virtual display) so Turnstile sees a real browser.
Login helper using undetected_chromedriver. The optional 4th argument is a JSON
array of pre-seeded cookies (from FlareSolverr) that satisfy DDoS-Guard, so
Chrome lands directly on the login form without facing the captcha. Turnstile
auto-solves in the real browser context.
Usage:
xvfb-run python3 login_helper.py <login_url> <username> <password>
xvfb-run python3 login_helper.py <login_url> <username> <password> [cookies_json]
cookies_json: JSON array like [{"name":"__ddg9_","value":"...","domain":"..."}]
Outputs JSON to stdout:
{"ok": true, "cookies": "name=val; name2=val2", "user_agent": "<browser UA>", "url": "<final_url>"}
@@ -19,12 +23,19 @@ import shutil
def main():
if len(sys.argv) < 4:
print(json.dumps({"ok": False, "error": "Usage: login_helper.py <login_url> <username> <password>"}))
print(json.dumps({"ok": False, "error": "Usage: login_helper.py <login_url> <username> <password> [cookies_json]"}))
sys.exit(1)
login_url = sys.argv[1]
username = sys.argv[2]
password = sys.argv[3]
seed_cookies = []
if len(sys.argv) >= 5 and sys.argv[4]:
try:
seed_cookies = json.loads(sys.argv[4])
log(f"Got {len(seed_cookies)} seed cookies from FlareSolverr")
except Exception as e:
log(f"Could not parse seed cookies: {e}")
try:
import undetected_chromedriver as uc
@@ -76,6 +87,9 @@ def main():
options.add_argument('--disable-dev-shm-usage')
options.add_argument('--disable-gpu')
options.add_argument('--window-size=1920,1080')
# Don't pin a fake UA — Turnstile detects spoofed UAs and refuses to
# auto-solve. The natural Chromium UA must be matched by all scraper
# requests instead (forum.js reads it from the forum_sites row).
log(f"Chromium: {chromium_path}")
log(f"Chromedriver: {chromedriver_path}")
@@ -90,23 +104,68 @@ def main():
)
driver.set_window_size(1920, 1080)
# Pre-seed DDoS-Guard cookies so Chrome skips the captcha challenge entirely
from urllib.parse import urlparse
parsed = urlparse(login_url)
base_url = f"{parsed.scheme}://{parsed.netloc}"
if seed_cookies:
# Must visit the domain first before add_cookie works
log(f"Visiting {base_url}/.well-known/ddos-guard/check.js to set cookie domain context...")
try:
# Use a static asset path that DDoS-Guard whitelists (returns 200 fast)
driver.get(base_url + "/favicon.ico")
except Exception:
driver.get("about:blank")
driver.execute_script(f"document.location = '{base_url}/favicon.ico';")
time.sleep(2)
log(f"Injecting {len(seed_cookies)} seed cookies...")
for c in seed_cookies:
cookie_dict = {
"name": c["name"],
"value": c["value"],
"path": c.get("path", "/"),
}
# Selenium add_cookie rejects domain mismatches. NOTE(review): the domain is passed through as-is (a leading dot is not stripped here) — confirm that is intended
if c.get("domain"):
cookie_dict["domain"] = c["domain"]
if c.get("secure"):
cookie_dict["secure"] = c["secure"]
try:
driver.add_cookie(cookie_dict)
except Exception as e:
log(f" skip {c['name']}: {e}")
log(f"Navigating to {login_url}...")
driver.get(login_url)
# Wait for DDoS-Guard to solve and login form to appear
log("Waiting for login form (DDoS-Guard solving)...")
WebDriverWait(driver, 60).until(
EC.presence_of_element_located((By.CSS_SELECTOR, 'input[name="login"]'))
)
log("Login form found")
# Wait for login form to appear
log("Waiting for login form...")
try:
WebDriverWait(driver, 30).until(
EC.presence_of_element_located((By.CSS_SELECTOR, 'input[name="login"]'))
)
log("Login form found")
except Exception:
try:
log(f"Login form timeout. Title: '{driver.title}', url: {driver.current_url}")
log(f"Page source snippet: {driver.page_source[:500]}")
driver.save_screenshot('/tmp/login_timeout.png')
log("Screenshot saved to /tmp/login_timeout.png")
except Exception:
pass
raise
# Wait for Turnstile to auto-solve (should work in undetected headed mode)
log("Waiting for Turnstile to solve...")
turnstile_token = ""
for i in range(45):
for i in range(60):
try:
el = driver.find_element(By.CSS_SELECTOR, 'input[name="cf-turnstile-response"]')
val = el.get_attribute("value")
# Read the live JS .value property directly rather than relying on get_attribute's property/attribute resolution
val = driver.execute_script(
'var el = document.querySelector(\'input[name="cf-turnstile-response"]\'); return el ? el.value : null;'
)
if val:
turnstile_token = val
break
@@ -178,22 +237,40 @@ def main():
has_user_cookie = any(c['name'] in ('xf_user', 'ogaddgmetaprof_user') for c in cookies)
if not has_user_cookie:
# Check for error message
# Check for error message — XenForo shows multiple variants
error_msg = "Login failed — no user cookie returned"
for sel in ['.blockMessage--error', '.errorOverlay', '.formRow--error', '.alert--error', '.blockMessage']:
try:
el = driver.find_element(By.CSS_SELECTOR, sel)
txt = el.text.strip()
if txt:
error_msg = txt[:300]
log(f"Found error in {sel}: {error_msg}")
break
except Exception:
pass
# Also dump the page title, a page-source snippet, and a screenshot for debugging
try:
error_el = driver.find_element(By.CSS_SELECTOR, '.blockMessage--error')
error_msg = error_el.text.strip()
page_title = driver.title or ''
log(f"Page title after submit: '{page_title}'")
snippet = driver.page_source[:1500]
log(f"Page source snippet: {snippet}")
driver.save_screenshot('/tmp/login_failed.png')
log("Screenshot saved to /tmp/login_failed.png")
except Exception:
pass
# Also dump all cookie names for debugging
cookie_names = [c['name'] for c in cookies]
log(f"Cookie names: {cookie_names}")
log(f"Error: {error_msg}")
print(json.dumps({"ok": False, "error": error_msg, "url": final_url}))
sys.exit(1)
log(f"Login successful — {len(cookies)} cookies")
print(json.dumps({"ok": True, "cookies": cookie_str, "url": final_url}))
# Capture the browser's real UA so subsequent scraper requests can match it
try:
real_ua = driver.execute_script('return navigator.userAgent;')
except Exception:
real_ua = ''
log(f"Login successful — {len(cookies)} cookies, UA: {real_ua}")
print(json.dumps({"ok": True, "cookies": cookie_str, "user_agent": real_ua, "url": final_url}))
except Exception as e:
log(f"Fatal error: {e}")