Route SimpCity forum scraping through FlareSolverr + add turbo.cr resolver
DDoS-Guard now binds session cookies to the issuing browser's fingerprint, so direct Node fetch returns 403 even with valid cookies. Page HTML for any forum_site with stored cookies is now fetched via a FlareSolverr browser session opened once per scrape job.

- Hybrid cookie refresh: FlareSolverr clears the DDoS-Guard captcha, those cookies seed undetected_chromedriver, Turnstile auto-solves in the real browser, login form submits, final cookies + browser UA persist to forum_sites
- Per-site user_agent column so subsequent scraper requests match the UA the cookies were issued for (DDoS-Guard rejects UA mismatches)
- XenForo search rewritten as proper CSRF POST /search/search → results page parse, replacing the broken ?q=... GET that only returned the search form
- Pagination regex fallback in detectMaxPage catches XenForo pages that cheerio's class-based selectors miss
- New scrapers/turbo.js handles turbo.cr /embed/ and /a/ URLs by rendering the page via FlareSolverr and grabbing the signed mp4 from the resolved <video src> attribute (gallery-dl can't extract these — obfuscated WASM)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
+97
-20
@@ -1,10 +1,14 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Login helper using undetected_chromedriver to bypass Cloudflare Turnstile.
|
||||
Runs Chrome in headed mode with Xvfb (virtual display) so Turnstile sees a real browser.
|
||||
Login helper using undetected_chromedriver. The 4th argument is a JSON array of
|
||||
pre-seeded cookies (from FlareSolverr) that satisfy DDoS-Guard, so Chrome lands
|
||||
directly on the login form without facing the captcha. Turnstile auto-solves in
|
||||
the real browser context.
|
||||
|
||||
Usage:
|
||||
xvfb-run python3 login_helper.py <login_url> <username> <password>
|
||||
xvfb-run python3 login_helper.py <login_url> <username> <password> <cookies_json>
|
||||
|
||||
cookies_json: JSON array like [{"name":"__ddg9_","value":"...","domain":"..."}]
|
||||
|
||||
Outputs JSON to stdout:
|
||||
{"ok": true, "cookies": "name=val; name2=val2", "url": "<final_url>"}
|
||||
@@ -19,12 +23,19 @@ import shutil
|
||||
|
||||
def main():
|
||||
if len(sys.argv) < 4:
|
||||
print(json.dumps({"ok": False, "error": "Usage: login_helper.py <login_url> <username> <password>"}))
|
||||
print(json.dumps({"ok": False, "error": "Usage: login_helper.py <login_url> <username> <password> [cookies_json]"}))
|
||||
sys.exit(1)
|
||||
|
||||
login_url = sys.argv[1]
|
||||
username = sys.argv[2]
|
||||
password = sys.argv[3]
|
||||
seed_cookies = []
|
||||
if len(sys.argv) >= 5 and sys.argv[4]:
|
||||
try:
|
||||
seed_cookies = json.loads(sys.argv[4])
|
||||
log(f"Got {len(seed_cookies)} seed cookies from FlareSolverr")
|
||||
except Exception as e:
|
||||
log(f"Could not parse seed cookies: {e}")
|
||||
|
||||
try:
|
||||
import undetected_chromedriver as uc
|
||||
@@ -76,6 +87,9 @@ def main():
|
||||
options.add_argument('--disable-dev-shm-usage')
|
||||
options.add_argument('--disable-gpu')
|
||||
options.add_argument('--window-size=1920,1080')
|
||||
# Don't pin a fake UA — Turnstile detects spoofed UAs and refuses to
|
||||
# auto-solve. The natural Chromium UA must be matched by all scraper
|
||||
# requests instead (forum.js reads it from the forum_sites row).
|
||||
|
||||
log(f"Chromium: {chromium_path}")
|
||||
log(f"Chromedriver: {chromedriver_path}")
|
||||
@@ -90,23 +104,68 @@ def main():
|
||||
)
|
||||
driver.set_window_size(1920, 1080)
|
||||
|
||||
# Pre-seed DDoS-Guard cookies so Chrome skips the captcha challenge entirely
|
||||
from urllib.parse import urlparse
|
||||
parsed = urlparse(login_url)
|
||||
base_url = f"{parsed.scheme}://{parsed.netloc}"
|
||||
|
||||
if seed_cookies:
|
||||
# Must visit the domain first before add_cookie works
|
||||
log(f"Visiting {base_url}/.well-known/ddos-guard/check.js to set cookie domain context...")
|
||||
try:
|
||||
# Use a static asset path that DDoS-Guard whitelists (returns 200 fast)
|
||||
driver.get(base_url + "/favicon.ico")
|
||||
except Exception:
|
||||
driver.get("about:blank")
|
||||
driver.execute_script(f"document.location = '{base_url}/favicon.ico';")
|
||||
time.sleep(2)
|
||||
|
||||
log(f"Injecting {len(seed_cookies)} seed cookies...")
|
||||
for c in seed_cookies:
|
||||
cookie_dict = {
|
||||
"name": c["name"],
|
||||
"value": c["value"],
|
||||
"path": c.get("path", "/"),
|
||||
}
|
||||
# Selenium add_cookie wants domain (without leading dot) and rejects mismatches
|
||||
if c.get("domain"):
|
||||
cookie_dict["domain"] = c["domain"]
|
||||
if c.get("secure"):
|
||||
cookie_dict["secure"] = c["secure"]
|
||||
try:
|
||||
driver.add_cookie(cookie_dict)
|
||||
except Exception as e:
|
||||
log(f" skip {c['name']}: {e}")
|
||||
|
||||
log(f"Navigating to {login_url}...")
|
||||
driver.get(login_url)
|
||||
|
||||
# Wait for DDoS-Guard to solve and login form to appear
|
||||
log("Waiting for login form (DDoS-Guard solving)...")
|
||||
WebDriverWait(driver, 60).until(
|
||||
EC.presence_of_element_located((By.CSS_SELECTOR, 'input[name="login"]'))
|
||||
)
|
||||
log("Login form found")
|
||||
# Wait for login form to appear
|
||||
log("Waiting for login form...")
|
||||
try:
|
||||
WebDriverWait(driver, 30).until(
|
||||
EC.presence_of_element_located((By.CSS_SELECTOR, 'input[name="login"]'))
|
||||
)
|
||||
log("Login form found")
|
||||
except Exception:
|
||||
try:
|
||||
log(f"Login form timeout. Title: '{driver.title}', url: {driver.current_url}")
|
||||
log(f"Page source snippet: {driver.page_source[:500]}")
|
||||
driver.save_screenshot('/tmp/login_timeout.png')
|
||||
log("Screenshot saved to /tmp/login_timeout.png")
|
||||
except Exception:
|
||||
pass
|
||||
raise
|
||||
|
||||
# Wait for Turnstile to auto-solve (should work in undetected headed mode)
|
||||
log("Waiting for Turnstile to solve...")
|
||||
turnstile_token = ""
|
||||
for i in range(45):
|
||||
for i in range(60):
|
||||
try:
|
||||
el = driver.find_element(By.CSS_SELECTOR, 'input[name="cf-turnstile-response"]')
|
||||
val = el.get_attribute("value")
|
||||
# Use JS .value (property), not get_attribute (which reads HTML attribute)
|
||||
val = driver.execute_script(
|
||||
'var el = document.querySelector(\'input[name="cf-turnstile-response"]\'); return el ? el.value : null;'
|
||||
)
|
||||
if val:
|
||||
turnstile_token = val
|
||||
break
|
||||
@@ -178,22 +237,40 @@ def main():
|
||||
has_user_cookie = any(c['name'] in ('xf_user', 'ogaddgmetaprof_user') for c in cookies)
|
||||
|
||||
if not has_user_cookie:
|
||||
# Check for error message
|
||||
# Check for error message — XenForo shows multiple variants
|
||||
error_msg = "Login failed — no user cookie returned"
|
||||
for sel in ['.blockMessage--error', '.errorOverlay', '.formRow--error', '.alert--error', '.blockMessage']:
|
||||
try:
|
||||
el = driver.find_element(By.CSS_SELECTOR, sel)
|
||||
txt = el.text.strip()
|
||||
if txt:
|
||||
error_msg = txt[:300]
|
||||
log(f"Found error in {sel}: {error_msg}")
|
||||
break
|
||||
except Exception:
|
||||
pass
|
||||
# Also try dumping page title and any visible "Invalid"/"failed"/"incorrect" text
|
||||
try:
|
||||
error_el = driver.find_element(By.CSS_SELECTOR, '.blockMessage--error')
|
||||
error_msg = error_el.text.strip()
|
||||
page_title = driver.title or ''
|
||||
log(f"Page title after submit: '{page_title}'")
|
||||
snippet = driver.page_source[:1500]
|
||||
log(f"Page source snippet: {snippet}")
|
||||
driver.save_screenshot('/tmp/login_failed.png')
|
||||
log("Screenshot saved to /tmp/login_failed.png")
|
||||
except Exception:
|
||||
pass
|
||||
# Also dump all cookie names for debugging
|
||||
cookie_names = [c['name'] for c in cookies]
|
||||
log(f"Cookie names: {cookie_names}")
|
||||
log(f"Error: {error_msg}")
|
||||
print(json.dumps({"ok": False, "error": error_msg, "url": final_url}))
|
||||
sys.exit(1)
|
||||
|
||||
log(f"Login successful — {len(cookies)} cookies")
|
||||
print(json.dumps({"ok": True, "cookies": cookie_str, "url": final_url}))
|
||||
# Capture the browser's real UA so subsequent scraper requests can match it
|
||||
try:
|
||||
real_ua = driver.execute_script('return navigator.userAgent;')
|
||||
except Exception:
|
||||
real_ua = ''
|
||||
log(f"Login successful — {len(cookies)} cookies, UA: {real_ua}")
|
||||
print(json.dumps({"ok": True, "cookies": cookie_str, "user_agent": real_ua, "url": final_url}))
|
||||
|
||||
except Exception as e:
|
||||
log(f"Fatal error: {e}")
|
||||
|
||||
Reference in New Issue
Block a user