aa4f1157d1
DDoS-Guard now binds session cookies to the issuing browser's fingerprint, so direct Node fetch returns 403 even with valid cookies. Page HTML for any forum_site with stored cookies is now fetched via a FlareSolverr browser session opened once per scrape job. - Hybrid cookie refresh: FlareSolverr clears the DDoS-Guard captcha, those cookies seed undetected_chromedriver, Turnstile auto-solves in the real browser, login form submits, final cookies + browser UA persist to forum_sites - Per-site user_agent column so subsequent scraper requests match the UA the cookies were issued for (DDoS-Guard rejects UA mismatches) - XenForo search rewritten as proper CSRF POST /search/search → results page parse, replacing the broken ?q=... GET that only returned the search form - Pagination regex fallback in detectMaxPage catches XenForo pages that cheerio's class-based selectors miss - New scrapers/turbo.js handles turbo.cr /embed/ and /a/ URLs by rendering the page via FlareSolverr and grabbing the signed mp4 from the resolved <video src> attribute (gallery-dl can't extract these — obfuscated WASM) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
294 lines
11 KiB
Python
294 lines
11 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Login helper using undetected_chromedriver. The 4th argument is a JSON array of
|
|
pre-seeded cookies (from FlareSolverr) that satisfy DDoS-Guard, so Chrome lands
|
|
directly on the login form without facing the captcha. Turnstile auto-solves in
|
|
the real browser context.
|
|
|
|
Usage:
|
|
xvfb-run python3 login_helper.py <login_url> <username> <password> [cookies_json]
|
|
|
|
cookies_json: JSON array like [{"name":"__ddg9_","value":"...","domain":"..."}]
|
|
|
|
Outputs JSON to stdout:
|
|
{"ok": true, "cookies": "name=val; name2=val2", "user_agent": "<browser UA>", "url": "<final_url>"}
|
|
{"ok": false, "error": "reason"}
|
|
"""
|
|
import sys
|
|
import json
|
|
import time
|
|
import os
|
|
import shutil
|
|
|
|
|
|
# Install locations probed when CHROMIUM_PATH is unset or does not exist.
_CHROMIUM_CANDIDATES = (
    '/usr/bin/chromium-browser',
    '/usr/bin/chromium',
    '/usr/lib/chromium/chromium',
)
# System chromedriver locations (Alpine: chromium-chromedriver package).
_CHROMEDRIVER_CANDIDATES = (
    '/usr/bin/chromedriver',
    '/usr/lib/chromium/chromedriver',
)
# Seconds to poll for the Turnstile token before submitting without it.
_TURNSTILE_TIMEOUT_S = 60
# Cookie names whose presence is treated as proof of a successful login.
_USER_COOKIE_NAMES = ('xf_user', 'ogaddgmetaprof_user')
# Selectors probed, in order, for an error message after a failed login.
_ERROR_SELECTORS = (
    '.blockMessage--error',
    '.errorOverlay',
    '.formRow--error',
    '.alert--error',
    '.blockMessage',
)


def _first_existing(paths, default=None):
    """Return the first entry of *paths* that exists on disk, else *default*."""
    return next((p for p in paths if os.path.exists(p)), default)


def _parse_seed_cookies(raw):
    """Decode the optional cookies_json argument into a list of cookie dicts.

    Anything unusable is logged and dropped rather than raised, so a bad seed
    payload degrades to "no seed cookies" instead of aborting the login:
    invalid JSON, a non-array payload, and entries that are not objects
    carrying both ``name`` and ``value`` are all ignored.
    """
    if not raw:
        return []
    try:
        decoded = json.loads(raw)
    except (ValueError, RecursionError) as e:
        log(f"Could not parse seed cookies: {e}")
        return []
    if not isinstance(decoded, list):
        log(f"Ignoring seed cookies: expected a JSON array, got {type(decoded).__name__}")
        return []

    usable = [
        entry for entry in decoded
        if isinstance(entry, dict) and "name" in entry and "value" in entry
    ]
    dropped = len(decoded) - len(usable)
    if dropped:
        log(f"Ignoring {dropped} malformed seed cookie(s) (need 'name' and 'value')")
    log(f"Got {len(usable)} seed cookies from FlareSolverr")
    return usable


def _detect_chromium_major(chromium_path):
    """Return the major version reported by ``<chromium_path> --version``.

    Returns None (after logging) when the binary cannot be run or its output
    contains no dotted version number.
    """
    import subprocess

    try:
        result = subprocess.run(
            [chromium_path, '--version'], capture_output=True, text=True, timeout=5
        )
        # e.g. "Chromium 131.0.6778.139" or "Chromium 131.0.6778.139 Alpine Linux"
        ver_str = next(
            (
                part for part in result.stdout.strip().split()
                if '.' in part and part[0].isdigit()
            ),
            None,
        )
        if not ver_str:
            return None
        major = int(ver_str.split('.')[0])
    except (OSError, subprocess.SubprocessError, ValueError) as e:
        log(f"Could not detect chromium version: {e}")
        return None
    log(f"Chromium version: {ver_str} (major: {major})")
    return major


def _to_selenium_cookie(c):
    """Build the dict passed to ``driver.add_cookie`` from one seed cookie."""
    cookie_dict = {
        "name": c["name"],
        "value": c["value"],
        "path": c.get("path", "/"),
    }
    # The domain is forwarded exactly as received; cookies the driver rejects
    # (e.g. a domain mismatch) are skipped by the caller.
    if c.get("domain"):
        cookie_dict["domain"] = c["domain"]
    if c.get("secure"):
        cookie_dict["secure"] = c["secure"]
    return cookie_dict


def _inject_seed_cookies(driver, base_url, seed_cookies):
    """Load a page on the target origin, then add every seed cookie.

    The browser must be on the cookie's domain before add_cookie works, so a
    static asset is loaded first. Cookies the driver refuses are logged and
    skipped individually.
    """
    # Static asset path that DDoS-Guard whitelists (returns 200 fast).
    warmup_url = base_url + "/favicon.ico"
    log(f"Visiting {warmup_url} to set cookie domain context...")
    try:
        driver.get(warmup_url)
    except Exception:
        # Fall back to a script-driven navigation. The URL is passed as an
        # argument rather than spliced into the JavaScript source.
        driver.get("about:blank")
        driver.execute_script("document.location = arguments[0];", warmup_url)
        # Script-driven navigation is asynchronous; give it time to settle.
        time.sleep(2)

    log(f"Injecting {len(seed_cookies)} seed cookies...")
    for c in seed_cookies:
        try:
            driver.add_cookie(_to_selenium_cookie(c))
        except Exception as e:
            log(f"  skip {c.get('name')}: {e}")


def _wait_for_turnstile(driver, timeout_s=_TURNSTILE_TIMEOUT_S):
    """Poll once per second for the Turnstile response token.

    Returns the token, or "" when none appeared within *timeout_s* seconds.
    """
    for i in range(timeout_s):
        try:
            # Use JS .value (property), not get_attribute (which reads HTML attribute)
            val = driver.execute_script(
                'var el = document.querySelector(\'input[name="cf-turnstile-response"]\'); return el ? el.value : null;'
            )
            if val:
                return val
        except Exception:
            # Best effort: a failed poll is simply retried on the next tick.
            pass
        time.sleep(1)
        if i % 10 == 9:
            log(f"Still waiting for Turnstile... ({i+1}s)")
    return ""


def _type_slowly(element, text, delay=0.03):
    """Clear *element* and type *text* one character at a time to appear human."""
    element.clear()
    for ch in text:
        element.send_keys(ch)
        time.sleep(delay)


def main():
    """Log in through a real Chromium session and print the outcome as JSON.

    Reads ``<login_url> <username> <password> [cookies_json]`` from
    ``sys.argv``. The optional cookies (from FlareSolverr) are injected before
    the login page is opened. The function then waits for the login form and
    the Turnstile token, submits the credentials, and decides success by the
    presence of a user cookie.

    Exactly one JSON object is printed to stdout; all diagnostics go through
    ``log``. On success the object carries ``ok``, ``cookies``, ``user_agent``
    and ``url``. On any failure it carries ``ok: false`` and ``error`` (plus
    ``url`` when the login itself was rejected) and the process exits with
    status 1. The browser is always shut down on the way out.
    """
    if len(sys.argv) < 4:
        print(json.dumps({"ok": False, "error": "Usage: login_helper.py <login_url> <username> <password> [cookies_json]"}))
        sys.exit(1)

    login_url, username, password = sys.argv[1:4]
    seed_cookies = _parse_seed_cookies(sys.argv[4] if len(sys.argv) >= 5 else "")

    # Imported here so a missing package is reported as JSON, not a traceback.
    try:
        import undetected_chromedriver as uc
        from selenium.webdriver.common.by import By
        from selenium.webdriver.support.ui import WebDriverWait
        from selenium.webdriver.support import expected_conditions as EC
    except ImportError as e:
        print(json.dumps({"ok": False, "error": f"Missing dependency: {e}"}))
        sys.exit(1)

    driver = None
    try:
        # Find chromium binary: honour CHROMIUM_PATH when it points at a real
        # file, otherwise probe the known install locations.
        chromium_path = os.environ.get('CHROMIUM_PATH', '')
        if not chromium_path or not os.path.exists(chromium_path):
            chromium_path = _first_existing(_CHROMIUM_CANDIDATES, default=chromium_path)

        # Find system chromedriver (Alpine: chromium-chromedriver package)
        chromedriver_path = _first_existing(_CHROMEDRIVER_CANDIDATES)

        # Get chromium version for undetected_chromedriver
        version_main = _detect_chromium_major(chromium_path)

        options = uc.ChromeOptions()
        options.binary_location = chromium_path
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')
        options.add_argument('--disable-gpu')
        options.add_argument('--window-size=1920,1080')
        # Don't pin a fake UA — Turnstile detects spoofed UAs and refuses to
        # auto-solve. The natural Chromium UA must be matched by all scraper
        # requests instead (forum.js reads it from the forum_sites row).

        log(f"Chromium: {chromium_path}")
        log(f"Chromedriver: {chromedriver_path}")

        # Create the driver
        # Use system chromedriver to avoid downloading (fails on Alpine/musl)
        driver = uc.Chrome(
            options=options,
            driver_executable_path=chromedriver_path,
            headless=False,
            version_main=version_main,
        )
        driver.set_window_size(1920, 1080)

        # Pre-seed DDoS-Guard cookies so Chrome skips the captcha challenge entirely
        from urllib.parse import urlparse
        parsed = urlparse(login_url)
        base_url = f"{parsed.scheme}://{parsed.netloc}"

        if seed_cookies:
            _inject_seed_cookies(driver, base_url, seed_cookies)

        log(f"Navigating to {login_url}...")
        driver.get(login_url)

        # Wait for login form to appear
        log("Waiting for login form...")
        try:
            WebDriverWait(driver, 30).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, 'input[name="login"]'))
            )
            log("Login form found")
        except Exception:
            # Capture what the browser is showing, then let the outer handler
            # report the original failure.
            try:
                log(f"Login form timeout. Title: '{driver.title}', url: {driver.current_url}")
                log(f"Page source snippet: {driver.page_source[:500]}")
                driver.save_screenshot('/tmp/login_timeout.png')
                log("Screenshot saved to /tmp/login_timeout.png")
            except Exception as dump_err:
                log(f"Could not capture debug info: {dump_err}")
            raise

        # Wait for Turnstile to auto-solve (should work in undetected headed mode)
        log("Waiting for Turnstile to solve...")
        turnstile_token = _wait_for_turnstile(driver)
        if turnstile_token:
            log(f"Turnstile solved (token: {turnstile_token[:20]}...)")
        else:
            log(f"Warning: Turnstile token not found after {_TURNSTILE_TIMEOUT_S}s — attempting login anyway")

        # Fill the login form
        log("Filling login form...")
        login_input = driver.find_element(By.CSS_SELECTOR, 'input[name="login"]')
        _type_slowly(login_input, username)

        pass_input = driver.find_element(By.CSS_SELECTOR, 'input[name="password"]')
        _type_slowly(pass_input, password)

        # Check remember checkbox
        try:
            remember = driver.find_element(By.CSS_SELECTOR, 'input[name="remember"]')
            if not remember.is_selected():
                driver.execute_script("arguments[0].checked = true;", remember)
        except Exception:
            # The checkbox is optional; carry on without it.
            pass

        # Submit form
        log("Submitting login form...")
        try:
            submit_btn = driver.find_element(
                By.CSS_SELECTOR,
                'button[type="submit"], input[type="submit"], .button--primary',
            )
            submit_btn.click()
        except Exception:
            # No clickable submit control: submit the form element directly.
            driver.execute_script("""
                var form = document.querySelector('form.block-body') ||
                document.querySelector('form[action*="login"]');
                if (form) form.submit();
            """)

        # Wait for navigation after submit
        log("Waiting for redirect...")
        time.sleep(5)
        try:
            WebDriverWait(driver, 15).until(
                lambda d: d.execute_script("return document.readyState") == "complete"
            )
        except Exception:
            # A page that never reports "complete" is still inspected below.
            pass

        final_url = driver.current_url
        log(f"After submit: {final_url}")

        # Extract cookies
        cookies = driver.get_cookies()
        cookie_str = "; ".join(f"{c['name']}={c['value']}" for c in cookies)

        # Check for login success
        has_user_cookie = any(c['name'] in _USER_COOKIE_NAMES for c in cookies)

        if not has_user_cookie:
            # Check for error message — XenForo shows multiple variants
            error_msg = "Login failed — no user cookie returned"
            for sel in _ERROR_SELECTORS:
                try:
                    el = driver.find_element(By.CSS_SELECTOR, sel)
                    txt = el.text.strip()
                    if txt:
                        error_msg = txt[:300]
                        log(f"Found error in {sel}: {error_msg}")
                        break
                except Exception:
                    # Selector not present on this page; try the next one.
                    pass
            # Also dump the page title, a source snippet and a screenshot
            try:
                page_title = driver.title or ''
                log(f"Page title after submit: '{page_title}'")
                snippet = driver.page_source[:1500]
                log(f"Page source snippet: {snippet}")
                driver.save_screenshot('/tmp/login_failed.png')
                log("Screenshot saved to /tmp/login_failed.png")
            except Exception as dump_err:
                log(f"Could not capture debug info: {dump_err}")
            cookie_names = [c['name'] for c in cookies]
            log(f"Cookie names: {cookie_names}")
            print(json.dumps({"ok": False, "error": error_msg, "url": final_url}))
            sys.exit(1)

        # Capture the browser's real UA so subsequent scraper requests can match it
        try:
            real_ua = driver.execute_script('return navigator.userAgent;')
        except Exception:
            real_ua = ''
        log(f"Login successful — {len(cookies)} cookies, UA: {real_ua}")
        print(json.dumps({"ok": True, "cookies": cookie_str, "user_agent": real_ua, "url": final_url}))

    except Exception as e:
        log(f"Fatal error: {e}")
        print(json.dumps({"ok": False, "error": str(e)}))
        sys.exit(1)
    finally:
        # Runs on success, on failure and on sys.exit, so Chromium never leaks.
        if driver:
            try:
                driver.quit()
            except Exception:
                pass
|
|
|
|
def log(msg):
    """Write a prefixed diagnostic line to stderr, leaving stdout free for the JSON result."""
    stream = sys.stderr
    stream.write(f"[login_helper] {msg}\n")
    # Flush immediately so the line is not held in a buffer.
    stream.flush()
|
|
|
|
# Script entry point: run the login flow only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()