Route SimpCity forum scraping through FlareSolverr + add turbo.cr resolver
DDoS-Guard now binds session cookies to the issuing browser's fingerprint, so direct Node fetch returns 403 even with valid cookies. Page HTML for any forum_site with stored cookies is now fetched via a FlareSolverr browser session opened once per scrape job.

- Hybrid cookie refresh: FlareSolverr clears the DDoS-Guard captcha, those cookies seed undetected_chromedriver, Turnstile auto-solves in the real browser, the login form is submitted, and the final cookies + browser UA are persisted to forum_sites
- Per-site user_agent column so subsequent scraper requests match the UA the cookies were issued for (DDoS-Guard rejects UA mismatches)
- XenForo search rewritten as a proper CSRF POST to /search/search followed by a results-page parse, replacing the broken ?q=... GET that only returned the search form
- Pagination regex fallback in detectMaxPage catches XenForo pages that cheerio's class-based selectors miss
- New scrapers/turbo.js handles turbo.cr /embed/ and /a/ URLs by rendering the page via FlareSolverr and grabbing the signed mp4 from the resolved <video src> attribute (gallery-dl can't extract these — obfuscated WASM)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
+4
-1
@@ -137,6 +137,9 @@ if (!forumCols.includes('password')) {
|
|||||||
if (!forumCols.includes('cookie_expires_at')) {
|
if (!forumCols.includes('cookie_expires_at')) {
|
||||||
db.exec('ALTER TABLE forum_sites ADD COLUMN cookie_expires_at TEXT');
|
db.exec('ALTER TABLE forum_sites ADD COLUMN cookie_expires_at TEXT');
|
||||||
}
|
}
|
||||||
|
if (!forumCols.includes('user_agent')) {
|
||||||
|
db.exec("ALTER TABLE forum_sites ADD COLUMN user_agent TEXT DEFAULT ''");
|
||||||
|
}
|
||||||
|
|
||||||
export function getAuthConfig() {
|
export function getAuthConfig() {
|
||||||
const row = db.prepare('SELECT * FROM auth_config LIMIT 1').get();
|
const row = db.prepare('SELECT * FROM auth_config LIMIT 1').get();
|
||||||
@@ -768,7 +771,7 @@ export function createForumSite(name, baseUrl, cookies, username, password) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
export function updateForumSite(id, fields) {
|
export function updateForumSite(id, fields) {
|
||||||
const allowed = ['name', 'base_url', 'cookies', 'username', 'password', 'cookie_expires_at'];
|
const allowed = ['name', 'base_url', 'cookies', 'username', 'password', 'cookie_expires_at', 'user_agent'];
|
||||||
const sets = [];
|
const sets = [];
|
||||||
const vals = [];
|
const vals = [];
|
||||||
for (const [k, v] of Object.entries(fields)) {
|
for (const [k, v] of Object.entries(fields)) {
|
||||||
|
|||||||
+142
-26
@@ -13,10 +13,113 @@ const CHROMIUM_PATH = process.env.CHROMIUM_PATH || '/usr/bin/chromium-browser';
|
|||||||
const __filename = fileURLToPath(import.meta.url);
|
const __filename = fileURLToPath(import.meta.url);
|
||||||
const __dirname = path.dirname(__filename);
|
const __dirname = path.dirname(__filename);
|
||||||
|
|
||||||
|
/**
 * Send one command to the FlareSolverr HTTP API (`POST {FLARESOLVERR_URL}/v1`)
 * and return the decoded JSON reply.
 *
 * @param {object} payload - Command body, serialized as JSON (e.g. `{ cmd: 'sessions.create' }`).
 * @param {number} [timeoutMs=130000] - Abort the HTTP request after this many milliseconds.
 * @returns {Promise<object>} The decoded response; its `status` field is always `'ok'`.
 * @throws {Error} When the body is not valid JSON, or when the decoded `status`
 *   is anything other than `'ok'`. Abort/timeout and network errors raised by
 *   `fetch` propagate unchanged.
 */
async function fsCall(payload, timeoutMs = 130000) {
  const resp = await fetch(`${FLARESOLVERR_URL}/v1`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(payload),
    signal: AbortSignal.timeout(timeoutMs),
  });

  let data;
  try {
    data = await resp.json();
  } catch (err) {
    // Keep abort/timeout errors recognizable for callers that inspect err.name.
    if (err?.name === 'AbortError' || err?.name === 'TimeoutError') throw err;
    // A non-JSON body (e.g. an HTML error page) would otherwise surface as a
    // bare SyntaxError with no hint of which HTTP status produced it.
    throw new Error(
      `FlareSolverr error: HTTP ${resp.status} with non-JSON body (${err.message})`,
      { cause: err },
    );
  }

  // Optional chaining: a literal `null` JSON body must produce the same
  // descriptive error instead of a TypeError.
  if (data?.status !== 'ok') {
    throw new Error(`FlareSolverr error: ${data?.message || JSON.stringify(data)}`);
  }
  return data;
}
|
||||||
|
|
||||||
|
/**
 * Serialize cookie objects into a single `name=value; name2=value2` string.
 *
 * @param {Array<{name: string, value: string}>|null|undefined} cookies - Cookie
 *   objects; only `name` and `value` are read.
 * @returns {string} The pairs joined with `'; '`, or `''` when `cookies` is
 *   empty, `null` or `undefined`.
 */
function cookieArrayToString(cookies) {
  // A missing cookie list is treated as "no cookies" rather than a TypeError.
  if (cookies == null) return '';
  return cookies.map(({ name, value }) => `${name}=${value}`).join('; ');
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Refresh forum cookies using undetected_chromedriver (Python).
|
* Create a FlareSolverr browser session and warm it up so DDoS-Guard /
|
||||||
* Runs login_helper.py via xvfb-run so Chrome runs in headed mode
|
* Cloudflare cookies are seeded for the target host. Returns a sessionId that
|
||||||
* with a virtual display — this is what lets Turnstile auto-solve.
|
* must be passed to fsGet() and finally fsDestroySession().
|
||||||
|
*/
|
||||||
|
/**
 * Create a FlareSolverr browser session and optionally warm it up with one GET.
 *
 * @param {string} [warmUpUrl] - When truthy, this URL is requested once inside
 *   the new session. A failed warm-up is logged and otherwise ignored.
 * @returns {Promise<string>} The session id reported by `sessions.create`.
 * @throws {Error} If `sessions.create` fails or its reply carries no session id.
 */
export async function fsCreateSession(warmUpUrl) {
  const created = await fsCall({ cmd: 'sessions.create' }, 60000);
  const sessionId = created.session;

  // Fail here with a clear message instead of handing `undefined` to callers,
  // which would only blow up later (e.g. on `sessionId.slice(...)`).
  if (!sessionId) {
    throw new Error('FlareSolverr error: sessions.create returned no session id');
  }

  if (warmUpUrl) {
    try {
      await fsCall({ cmd: 'request.get', url: warmUpUrl, session: sessionId, maxTimeout: 90000 });
    } catch (e) {
      // Best effort: the session itself is still usable, so only warn.
      console.warn(`[flaresolverr] Warm-up GET ${warmUpUrl} failed: ${e.message}`);
    }
  }

  return sessionId;
}
|
||||||
|
|
||||||
|
/**
 * Tear down a FlareSolverr session. Does nothing for a falsy id; a failed
 * destroy call is logged as a warning and never rethrown.
 *
 * @param {string} [sessionId] - Session id to destroy.
 * @returns {Promise<void>}
 */
export async function fsDestroySession(sessionId) {
  if (sessionId) {
    const request = { cmd: 'sessions.destroy', session: sessionId };
    try {
      await fsCall(request, 30000);
    } catch (err) {
      console.warn(`[flaresolverr] Failed to destroy session ${sessionId}: ${err.message}`);
    }
  }
}
|
||||||
|
|
||||||
|
/**
 * Convert a `name=value; name2=value2` cookie string into the `{ name, value }`
 * objects sent in the `cookies` field of a FlareSolverr request.
 *
 * Pairs with an empty name or empty value are dropped, and so are cookies named
 * `__ddg*` or `ddg_last_challenge` — presumably so the session keeps the
 * DDoS-Guard cookies it obtained itself (NOTE(review): confirm intent).
 *
 * @param {string} [cookieStr] - Cookie header style string; falsy means none.
 * @returns {Array<{name: string, value: string}>}
 */
function parseSessionCookies(cookieStr) {
  return (cookieStr || '')
    .split(';')
    .map((pair) => {
      // Split on the first '=' only: cookie values may themselves contain '='.
      const [name, ...rest] = pair.trim().split('=');
      return { name: name.trim(), value: rest.join('=') };
    })
    .filter(({ name, value }) =>
      name && value && !name.startsWith('__ddg') && name !== 'ddg_last_challenge');
}

/**
 * GET a URL through a FlareSolverr session. The site's auth cookies (as a
 * cookie-string) are passed along with the request so it is authenticated.
 *
 * @param {string} sessionId - Session id from fsCreateSession().
 * @param {string} url - URL to fetch.
 * @param {string} [cookieStr] - `name=value; ...` cookies to send.
 * @param {number} [retries=1] - Extra attempts after the first; attempts are
 *   spaced 3 seconds apart.
 * @returns {Promise<{status: number, html: string, finalUrl: string}>} The
 *   solution of the first attempt that reports a status, or
 *   `{ status: 0, html: '', finalUrl: '' }` if no attempt did.
 * @throws {Error} The error from the final attempt, if that attempt threw.
 */
export async function fsGet(sessionId, url, cookieStr, retries = 1) {
  const cookies = parseSessionCookies(cookieStr);

  for (let attempt = 0; attempt <= retries; attempt++) {
    try {
      const r = await fsCall({
        cmd: 'request.get',
        url,
        session: sessionId,
        cookies,
        maxTimeout: 90000,
      });
      if (r.solution && r.solution.status) {
        return { status: r.solution.status, html: r.solution.response, finalUrl: r.solution.url };
      }
    } catch (e) {
      if (attempt >= retries) throw e;
    }
    // Back off only when another attempt follows; sleeping after the last one
    // would just delay the fallback result.
    if (attempt < retries) {
      await new Promise((resolve) => setTimeout(resolve, 3000));
    }
  }

  return { status: 0, html: '', finalUrl: '' };
}

/**
 * POST form-encoded data through a FlareSolverr session.
 *
 * @param {string} sessionId - Session id from fsCreateSession().
 * @param {string} url - URL to post to.
 * @param {string} [cookieStr] - `name=value; ...` cookies to send.
 * @param {string} postData - Form-encoded request body.
 * @returns {Promise<{status: number, html: string, finalUrl: string}>}
 * @throws {Error} If the call fails or the reply carries no `solution`.
 */
export async function fsPost(sessionId, url, cookieStr, postData) {
  const cookies = parseSessionCookies(cookieStr);

  const r = await fsCall({
    cmd: 'request.post',
    url,
    session: sessionId,
    cookies,
    postData,
    maxTimeout: 90000,
  });

  // Still an error as before, but a descriptive one instead of a TypeError
  // from dereferencing an undefined `solution`.
  if (!r.solution) {
    throw new Error('FlareSolverr error: request.post returned no solution');
  }
  return { status: r.solution.status, html: r.solution.response, finalUrl: r.solution.url };
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Refresh forum cookies via a hybrid flow:
|
||||||
|
* 1) FlareSolverr clears DDoS-Guard's "I'm not a robot" captcha and returns the
|
||||||
|
* __ddg* cookies in JSON form.
|
||||||
|
* 2) undetected_chromedriver is launched with those cookies pre-loaded, so it
|
||||||
|
* lands directly on the login page (skipping the captcha). Turnstile then
|
||||||
|
* auto-solves in the real browser context, the form is submitted, and we
|
||||||
|
* extract the final session cookies (including the user-identity cookie).
|
||||||
|
*
|
||||||
|
* This is the only flow we've found that handles both DDoS-Guard captcha and
|
||||||
|
* Cloudflare Turnstile without external paid services.
|
||||||
*/
|
*/
|
||||||
export async function refreshForumCookies(siteId) {
|
export async function refreshForumCookies(siteId) {
|
||||||
const site = getForumSiteById(siteId);
|
const site = getForumSiteById(siteId);
|
||||||
@@ -25,66 +128,79 @@ export async function refreshForumCookies(siteId) {
|
|||||||
throw new Error('Forum site has no saved credentials — set username and password first');
|
throw new Error('Forum site has no saved credentials — set username and password first');
|
||||||
}
|
}
|
||||||
|
|
||||||
const baseUrl = site.base_url || 'https://simpcity.su';
|
const baseUrl = (site.base_url || 'https://simpcity.cr').replace(/\/$/, '');
|
||||||
const loginUrl = `${baseUrl}/login/`;
|
const loginUrl = `${baseUrl}/login/`;
|
||||||
const helperPath = path.join(__dirname, 'login_helper.py');
|
|
||||||
|
|
||||||
console.log(`[flaresolverr] Refreshing cookies for site ${siteId} (${site.name})`);
|
console.log(`[flaresolverr] Refreshing cookies for site ${siteId} (${site.name})`);
|
||||||
console.log(`[flaresolverr] Login URL: ${loginUrl}`);
|
|
||||||
|
|
||||||
// Run the Python helper with xvfb-run for virtual display
|
// Step 1: get DDoS-Guard cookies via FlareSolverr (no login attempt yet)
|
||||||
|
let ddgCookies = [];
|
||||||
|
let fsSessionId = null;
|
||||||
|
try {
|
||||||
|
const sess = await fsCall({ cmd: 'sessions.create' }, 60000);
|
||||||
|
fsSessionId = sess.session;
|
||||||
|
const getRes = await fsCall({
|
||||||
|
cmd: 'request.get',
|
||||||
|
url: baseUrl + '/',
|
||||||
|
session: fsSessionId,
|
||||||
|
maxTimeout: 120000,
|
||||||
|
});
|
||||||
|
ddgCookies = getRes.solution.cookies || [];
|
||||||
|
console.log(`[flaresolverr] DDoS-Guard cleared, got ${ddgCookies.length} cookies`);
|
||||||
|
} finally {
|
||||||
|
if (fsSessionId) {
|
||||||
|
try { await fsCall({ cmd: 'sessions.destroy', session: fsSessionId }, 30000); } catch {}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (ddgCookies.length === 0) {
|
||||||
|
throw new Error('FlareSolverr returned no cookies — DDoS-Guard not bypassed');
|
||||||
|
}
|
||||||
|
|
||||||
|
// Step 2: launch chromedriver with pre-loaded cookies + perform login
|
||||||
|
const helperPath = path.join(__dirname, 'login_helper.py');
|
||||||
|
const cookiesJson = JSON.stringify(ddgCookies);
|
||||||
|
|
||||||
// Escape arguments for shell safety
|
// Escape arguments for shell safety
|
||||||
const escapedUrl = loginUrl.replace(/'/g, "'\\''");
|
const esc = (s) => s.replace(/'/g, "'\\''");
|
||||||
const escapedUser = site.username.replace(/'/g, "'\\''");
|
const cmd = `xvfb-run --auto-servernum --server-args='-screen 0 1920x1080x24' python3 -u '${helperPath}' '${esc(loginUrl)}' '${esc(site.username)}' '${esc(site.password)}' '${esc(cookiesJson)}'`;
|
||||||
const escapedPass = site.password.replace(/'/g, "'\\''");
|
|
||||||
|
|
||||||
const cmd = `xvfb-run --auto-servernum --server-args='-screen 0 1920x1080x24' python3 '${helperPath}' '${escapedUrl}' '${escapedUser}' '${escapedPass}'`;
|
|
||||||
|
|
||||||
try {
|
try {
|
||||||
const { stdout, stderr } = await execAsync(cmd, {
|
const { stdout, stderr } = await execAsync(cmd, {
|
||||||
timeout: 120000, // 2 minutes
|
timeout: 180000,
|
||||||
maxBuffer: 10 * 1024 * 1024,
|
maxBuffer: 10 * 1024 * 1024,
|
||||||
env: { ...process.env, CHROMIUM_PATH },
|
env: { ...process.env, CHROMIUM_PATH },
|
||||||
});
|
});
|
||||||
|
|
||||||
// Log stderr (debug output from login_helper.py)
|
|
||||||
if (stderr) {
|
if (stderr) {
|
||||||
for (const line of stderr.split('\n').filter(Boolean)) {
|
for (const line of stderr.split('\n').filter(Boolean)) {
|
||||||
console.log(`[flaresolverr] ${line}`);
|
console.log(`[flaresolverr] ${line}`);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Parse JSON from stdout
|
|
||||||
const result = JSON.parse(stdout.trim());
|
const result = JSON.parse(stdout.trim());
|
||||||
|
if (!result.ok) throw new Error(result.error || 'Login failed');
|
||||||
|
|
||||||
if (!result.ok) {
|
|
||||||
throw new Error(result.error || 'Login failed');
|
|
||||||
}
|
|
||||||
|
|
||||||
// Update DB with new cookies
|
|
||||||
const expiresAt = new Date(Date.now() + 25 * 24 * 60 * 60 * 1000).toISOString();
|
const expiresAt = new Date(Date.now() + 25 * 24 * 60 * 60 * 1000).toISOString();
|
||||||
updateForumSite(siteId, {
|
updateForumSite(siteId, {
|
||||||
cookies: result.cookies,
|
cookies: result.cookies,
|
||||||
cookie_expires_at: expiresAt,
|
cookie_expires_at: expiresAt,
|
||||||
|
user_agent: result.user_agent || '',
|
||||||
});
|
});
|
||||||
|
|
||||||
console.log(`[flaresolverr] Cookie refresh successful for site ${siteId}`);
|
console.log(`[flaresolverr] Cookie refresh successful for site ${siteId} (UA: ${result.user_agent || 'default'})`);
|
||||||
return result.cookies;
|
return result.cookies;
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
// If execAsync fails, the error might have stderr info
|
|
||||||
if (err.stderr) {
|
if (err.stderr) {
|
||||||
for (const line of err.stderr.split('\n').filter(Boolean)) {
|
for (const line of err.stderr.split('\n').filter(Boolean)) {
|
||||||
console.error(`[flaresolverr] ${line}`);
|
console.error(`[flaresolverr] ${line}`);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// Try to parse stdout for a structured error
|
|
||||||
if (err.stdout) {
|
if (err.stdout) {
|
||||||
try {
|
try {
|
||||||
const result = JSON.parse(err.stdout.trim());
|
const result = JSON.parse(err.stdout.trim());
|
||||||
if (result.error) throw new Error(result.error);
|
if (result.error) throw new Error(result.error);
|
||||||
} catch (parseErr) {
|
} catch {}
|
||||||
// Not JSON, use original error
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
throw new Error(`Cookie refresh failed: ${err.message}`);
|
throw new Error(`Cookie refresh failed: ${err.message}`);
|
||||||
}
|
}
|
||||||
|
|||||||
+97
-20
@@ -1,10 +1,14 @@
|
|||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
"""
|
"""
|
||||||
Login helper using undetected_chromedriver to bypass Cloudflare Turnstile.
|
Login helper using undetected_chromedriver. The 4th argument is a JSON array of
|
||||||
Runs Chrome in headed mode with Xvfb (virtual display) so Turnstile sees a real browser.
|
pre-seeded cookies (from FlareSolverr) that satisfy DDoS-Guard, so Chrome lands
|
||||||
|
directly on the login form without facing the captcha. Turnstile auto-solves in
|
||||||
|
the real browser context.
|
||||||
|
|
||||||
Usage:
|
Usage:
|
||||||
xvfb-run python3 login_helper.py <login_url> <username> <password>
|
xvfb-run python3 login_helper.py <login_url> <username> <password> <cookies_json>
|
||||||
|
|
||||||
|
cookies_json: JSON array like [{"name":"__ddg9_","value":"...","domain":"..."}]
|
||||||
|
|
||||||
Outputs JSON to stdout:
|
Outputs JSON to stdout:
|
||||||
{"ok": true, "cookies": "name=val; name2=val2", "url": "<final_url>"}
|
{"ok": true, "cookies": "name=val; name2=val2", "url": "<final_url>"}
|
||||||
@@ -19,12 +23,19 @@ import shutil
|
|||||||
|
|
||||||
def main():
|
def main():
|
||||||
if len(sys.argv) < 4:
|
if len(sys.argv) < 4:
|
||||||
print(json.dumps({"ok": False, "error": "Usage: login_helper.py <login_url> <username> <password>"}))
|
print(json.dumps({"ok": False, "error": "Usage: login_helper.py <login_url> <username> <password> [cookies_json]"}))
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
login_url = sys.argv[1]
|
login_url = sys.argv[1]
|
||||||
username = sys.argv[2]
|
username = sys.argv[2]
|
||||||
password = sys.argv[3]
|
password = sys.argv[3]
|
||||||
|
seed_cookies = []
|
||||||
|
if len(sys.argv) >= 5 and sys.argv[4]:
|
||||||
|
try:
|
||||||
|
seed_cookies = json.loads(sys.argv[4])
|
||||||
|
log(f"Got {len(seed_cookies)} seed cookies from FlareSolverr")
|
||||||
|
except Exception as e:
|
||||||
|
log(f"Could not parse seed cookies: {e}")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
import undetected_chromedriver as uc
|
import undetected_chromedriver as uc
|
||||||
@@ -76,6 +87,9 @@ def main():
|
|||||||
options.add_argument('--disable-dev-shm-usage')
|
options.add_argument('--disable-dev-shm-usage')
|
||||||
options.add_argument('--disable-gpu')
|
options.add_argument('--disable-gpu')
|
||||||
options.add_argument('--window-size=1920,1080')
|
options.add_argument('--window-size=1920,1080')
|
||||||
|
# Don't pin a fake UA — Turnstile detects spoofed UAs and refuses to
|
||||||
|
# auto-solve. The natural Chromium UA must be matched by all scraper
|
||||||
|
# requests instead (forum.js reads it from the forum_sites row).
|
||||||
|
|
||||||
log(f"Chromium: {chromium_path}")
|
log(f"Chromium: {chromium_path}")
|
||||||
log(f"Chromedriver: {chromedriver_path}")
|
log(f"Chromedriver: {chromedriver_path}")
|
||||||
@@ -90,23 +104,68 @@ def main():
|
|||||||
)
|
)
|
||||||
driver.set_window_size(1920, 1080)
|
driver.set_window_size(1920, 1080)
|
||||||
|
|
||||||
|
# Pre-seed DDoS-Guard cookies so Chrome skips the captcha challenge entirely
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
parsed = urlparse(login_url)
|
||||||
|
base_url = f"{parsed.scheme}://{parsed.netloc}"
|
||||||
|
|
||||||
|
if seed_cookies:
|
||||||
|
# Must visit the domain first before add_cookie works
|
||||||
|
log(f"Visiting {base_url}/.well-known/ddos-guard/check.js to set cookie domain context...")
|
||||||
|
try:
|
||||||
|
# Use a static asset path that DDoS-Guard whitelists (returns 200 fast)
|
||||||
|
driver.get(base_url + "/favicon.ico")
|
||||||
|
except Exception:
|
||||||
|
driver.get("about:blank")
|
||||||
|
driver.execute_script(f"document.location = '{base_url}/favicon.ico';")
|
||||||
|
time.sleep(2)
|
||||||
|
|
||||||
|
log(f"Injecting {len(seed_cookies)} seed cookies...")
|
||||||
|
for c in seed_cookies:
|
||||||
|
cookie_dict = {
|
||||||
|
"name": c["name"],
|
||||||
|
"value": c["value"],
|
||||||
|
"path": c.get("path", "/"),
|
||||||
|
}
|
||||||
|
# Selenium add_cookie wants domain (without leading dot) and rejects mismatches
|
||||||
|
if c.get("domain"):
|
||||||
|
cookie_dict["domain"] = c["domain"]
|
||||||
|
if c.get("secure"):
|
||||||
|
cookie_dict["secure"] = c["secure"]
|
||||||
|
try:
|
||||||
|
driver.add_cookie(cookie_dict)
|
||||||
|
except Exception as e:
|
||||||
|
log(f" skip {c['name']}: {e}")
|
||||||
|
|
||||||
log(f"Navigating to {login_url}...")
|
log(f"Navigating to {login_url}...")
|
||||||
driver.get(login_url)
|
driver.get(login_url)
|
||||||
|
|
||||||
# Wait for DDoS-Guard to solve and login form to appear
|
# Wait for login form to appear
|
||||||
log("Waiting for login form (DDoS-Guard solving)...")
|
log("Waiting for login form...")
|
||||||
WebDriverWait(driver, 60).until(
|
try:
|
||||||
EC.presence_of_element_located((By.CSS_SELECTOR, 'input[name="login"]'))
|
WebDriverWait(driver, 30).until(
|
||||||
)
|
EC.presence_of_element_located((By.CSS_SELECTOR, 'input[name="login"]'))
|
||||||
log("Login form found")
|
)
|
||||||
|
log("Login form found")
|
||||||
|
except Exception:
|
||||||
|
try:
|
||||||
|
log(f"Login form timeout. Title: '{driver.title}', url: {driver.current_url}")
|
||||||
|
log(f"Page source snippet: {driver.page_source[:500]}")
|
||||||
|
driver.save_screenshot('/tmp/login_timeout.png')
|
||||||
|
log("Screenshot saved to /tmp/login_timeout.png")
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
raise
|
||||||
|
|
||||||
# Wait for Turnstile to auto-solve (should work in undetected headed mode)
|
# Wait for Turnstile to auto-solve (should work in undetected headed mode)
|
||||||
log("Waiting for Turnstile to solve...")
|
log("Waiting for Turnstile to solve...")
|
||||||
turnstile_token = ""
|
turnstile_token = ""
|
||||||
for i in range(45):
|
for i in range(60):
|
||||||
try:
|
try:
|
||||||
el = driver.find_element(By.CSS_SELECTOR, 'input[name="cf-turnstile-response"]')
|
# Use JS .value (property), not get_attribute (which reads HTML attribute)
|
||||||
val = el.get_attribute("value")
|
val = driver.execute_script(
|
||||||
|
'var el = document.querySelector(\'input[name="cf-turnstile-response"]\'); return el ? el.value : null;'
|
||||||
|
)
|
||||||
if val:
|
if val:
|
||||||
turnstile_token = val
|
turnstile_token = val
|
||||||
break
|
break
|
||||||
@@ -178,22 +237,40 @@ def main():
|
|||||||
has_user_cookie = any(c['name'] in ('xf_user', 'ogaddgmetaprof_user') for c in cookies)
|
has_user_cookie = any(c['name'] in ('xf_user', 'ogaddgmetaprof_user') for c in cookies)
|
||||||
|
|
||||||
if not has_user_cookie:
|
if not has_user_cookie:
|
||||||
# Check for error message
|
# Check for error message — XenForo shows multiple variants
|
||||||
error_msg = "Login failed — no user cookie returned"
|
error_msg = "Login failed — no user cookie returned"
|
||||||
|
for sel in ['.blockMessage--error', '.errorOverlay', '.formRow--error', '.alert--error', '.blockMessage']:
|
||||||
|
try:
|
||||||
|
el = driver.find_element(By.CSS_SELECTOR, sel)
|
||||||
|
txt = el.text.strip()
|
||||||
|
if txt:
|
||||||
|
error_msg = txt[:300]
|
||||||
|
log(f"Found error in {sel}: {error_msg}")
|
||||||
|
break
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
# Also try dumping page title and any visible "Invalid"/"failed"/"incorrect" text
|
||||||
try:
|
try:
|
||||||
error_el = driver.find_element(By.CSS_SELECTOR, '.blockMessage--error')
|
page_title = driver.title or ''
|
||||||
error_msg = error_el.text.strip()
|
log(f"Page title after submit: '{page_title}'")
|
||||||
|
snippet = driver.page_source[:1500]
|
||||||
|
log(f"Page source snippet: {snippet}")
|
||||||
|
driver.save_screenshot('/tmp/login_failed.png')
|
||||||
|
log("Screenshot saved to /tmp/login_failed.png")
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
# Also dump all cookie names for debugging
|
|
||||||
cookie_names = [c['name'] for c in cookies]
|
cookie_names = [c['name'] for c in cookies]
|
||||||
log(f"Cookie names: {cookie_names}")
|
log(f"Cookie names: {cookie_names}")
|
||||||
log(f"Error: {error_msg}")
|
|
||||||
print(json.dumps({"ok": False, "error": error_msg, "url": final_url}))
|
print(json.dumps({"ok": False, "error": error_msg, "url": final_url}))
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
log(f"Login successful — {len(cookies)} cookies")
|
# Capture the browser's real UA so subsequent scraper requests can match it
|
||||||
print(json.dumps({"ok": True, "cookies": cookie_str, "url": final_url}))
|
try:
|
||||||
|
real_ua = driver.execute_script('return navigator.userAgent;')
|
||||||
|
except Exception:
|
||||||
|
real_ua = ''
|
||||||
|
log(f"Login successful — {len(cookies)} cookies, UA: {real_ua}")
|
||||||
|
print(json.dumps({"ok": True, "cookies": cookie_str, "user_agent": real_ua, "url": final_url}))
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
log(f"Fatal error: {e}")
|
log(f"Fatal error: {e}")
|
||||||
|
|||||||
+144
-6
@@ -1,8 +1,9 @@
|
|||||||
import { Router } from 'express';
|
import { Router } from 'express';
|
||||||
import { mkdirSync } from 'fs';
|
import { mkdirSync } from 'fs';
|
||||||
import { join } from 'path';
|
import { join } from 'path';
|
||||||
import { scrapeForumPage, getPageUrl, detectMaxPage, CookieExpiredError } from './scrapers/forum.js';
|
import * as cheerio from 'cheerio';
|
||||||
import { refreshForumCookies } from './flaresolverr.js';
|
import { scrapeForumPage, getPageUrl, detectMaxPage, CookieExpiredError, fixCookieIp, FORUM_UA } from './scrapers/forum.js';
|
||||||
|
import { refreshForumCookies, fsCreateSession, fsDestroySession, fsGet, fsPost } from './flaresolverr.js';
|
||||||
import { parseUserUrl, fetchAllPosts, fetchSearchPosts, downloadFiles } from './scrapers/coomer.js';
|
import { parseUserUrl, fetchAllPosts, fetchSearchPosts, downloadFiles } from './scrapers/coomer.js';
|
||||||
import { parseMediaUrl, fetchAllMedia, fetchAllMediaFromHtml, downloadMedia } from './scrapers/medialink.js';
|
import { parseMediaUrl, fetchAllMedia, fetchAllMediaFromHtml, downloadMedia } from './scrapers/medialink.js';
|
||||||
import { parseMegaUrl, listAllFiles, downloadMegaFiles } from './scrapers/mega.js';
|
import { parseMegaUrl, listAllFiles, downloadMegaFiles } from './scrapers/mega.js';
|
||||||
@@ -75,14 +76,17 @@ function jobToJson(job) {
|
|||||||
async function runForumScrape(job) {
|
async function runForumScrape(job) {
|
||||||
let { url, startPage, endPage, delay, folderName, siteId, lastPageOnly } = job.config;
|
let { url, startPage, endPage, delay, folderName, siteId, lastPageOnly } = job.config;
|
||||||
let { cookies } = job.config;
|
let { cookies } = job.config;
|
||||||
|
let userAgent = job.config.userAgent || '';
|
||||||
|
|
||||||
// Load cookies from forum site record if siteId provided and no cookies passed
|
// Load cookies from forum site record if siteId provided and no cookies passed
|
||||||
if (!cookies && siteId) {
|
if (!cookies && siteId) {
|
||||||
const site = getForumSiteById(siteId);
|
const site = getForumSiteById(siteId);
|
||||||
if (site && site.cookies) {
|
if (site && site.cookies) {
|
||||||
cookies = site.cookies;
|
cookies = site.cookies;
|
||||||
|
userAgent = site.user_agent || userAgent;
|
||||||
job.config.cookies = cookies;
|
job.config.cookies = cookies;
|
||||||
addLog(job, `Loaded cookies from forum site: ${site.name}`);
|
job.config.userAgent = userAgent;
|
||||||
|
addLog(job, `Loaded cookies from forum site: ${site.name}${userAgent ? ` (UA pinned)` : ''}`);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -92,10 +96,24 @@ async function runForumScrape(job) {
|
|||||||
const downloadedSet = new Set();
|
const downloadedSet = new Set();
|
||||||
let totalImages = 0;
|
let totalImages = 0;
|
||||||
|
|
||||||
|
// When a siteId is in play, page HTML must be fetched through FlareSolverr —
|
||||||
|
// direct fetch hits DDoS-Guard's browser-fingerprint check and gets 403.
|
||||||
|
let fsSession = null;
|
||||||
|
if (siteId && cookies) {
|
||||||
|
try {
|
||||||
|
const baseHost = new URL(url).origin;
|
||||||
|
addLog(job, `Opening FlareSolverr session for ${baseHost}...`);
|
||||||
|
fsSession = await fsCreateSession(baseHost + '/');
|
||||||
|
addLog(job, `FlareSolverr session ready (${fsSession.slice(0, 8)}...)`);
|
||||||
|
} catch (e) {
|
||||||
|
addLog(job, `FlareSolverr session failed (${e.message}) — falling back to direct fetch`);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// If lastPageOnly, detect the last page and only scrape that
|
// If lastPageOnly, detect the last page and only scrape that
|
||||||
if (lastPageOnly) {
|
if (lastPageOnly) {
|
||||||
addLog(job, 'Detecting last page...');
|
addLog(job, 'Detecting last page...');
|
||||||
const maxPage = await detectMaxPage(url, (msg) => addLog(job, msg), cookies);
|
const maxPage = await detectMaxPage(url, (msg) => addLog(job, msg), cookies, userAgent, fsSession);
|
||||||
if (maxPage) {
|
if (maxPage) {
|
||||||
startPage = maxPage;
|
startPage = maxPage;
|
||||||
endPage = maxPage;
|
endPage = maxPage;
|
||||||
@@ -122,15 +140,18 @@ async function runForumScrape(job) {
|
|||||||
|
|
||||||
let count;
|
let count;
|
||||||
try {
|
try {
|
||||||
count = await scrapeForumPage(pageUrl, outputDir, downloadedSet, (msg) => addLog(job, msg), cookies);
|
count = await scrapeForumPage(pageUrl, outputDir, downloadedSet, (msg) => addLog(job, msg), cookies, userAgent, fsSession);
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
if (err instanceof CookieExpiredError && siteId) {
|
if (err instanceof CookieExpiredError && siteId) {
|
||||||
addLog(job, `Cookie expired (HTTP ${err.statusCode}) — attempting auto-refresh via FlareSolverr...`);
|
addLog(job, `Cookie expired (HTTP ${err.statusCode}) — attempting auto-refresh via FlareSolverr...`);
|
||||||
try {
|
try {
|
||||||
cookies = await refreshForumCookies(siteId);
|
cookies = await refreshForumCookies(siteId);
|
||||||
|
const refreshed = getForumSiteById(siteId);
|
||||||
|
userAgent = refreshed?.user_agent || userAgent;
|
||||||
job.config.cookies = cookies;
|
job.config.cookies = cookies;
|
||||||
|
job.config.userAgent = userAgent;
|
||||||
addLog(job, 'Cookies refreshed successfully — retrying page...');
|
addLog(job, 'Cookies refreshed successfully — retrying page...');
|
||||||
count = await scrapeForumPage(pageUrl, outputDir, downloadedSet, (msg) => addLog(job, msg), cookies);
|
count = await scrapeForumPage(pageUrl, outputDir, downloadedSet, (msg) => addLog(job, msg), cookies, userAgent, fsSession);
|
||||||
} catch (refreshErr) {
|
} catch (refreshErr) {
|
||||||
addLog(job, `Cookie refresh failed: ${refreshErr.message}`);
|
addLog(job, `Cookie refresh failed: ${refreshErr.message}`);
|
||||||
addLog(job, 'Stopping scrape — fix credentials or refresh cookies manually');
|
addLog(job, 'Stopping scrape — fix credentials or refresh cookies manually');
|
||||||
@@ -156,6 +177,10 @@ async function runForumScrape(job) {
|
|||||||
addLog(job, `Error: ${err.message}`);
|
addLog(job, `Error: ${err.message}`);
|
||||||
job.progress.errors++;
|
job.progress.errors++;
|
||||||
} finally {
|
} finally {
|
||||||
|
if (fsSession) {
|
||||||
|
await fsDestroySession(fsSession);
|
||||||
|
addLog(job, `FlareSolverr session closed`);
|
||||||
|
}
|
||||||
job.running = false;
|
job.running = false;
|
||||||
job.completedAt = new Date().toISOString();
|
job.completedAt = new Date().toISOString();
|
||||||
addLog(job, `Done! ${totalImages} files saved to ${folderName}/`);
|
addLog(job, `Done! ${totalImages} files saved to ${folderName}/`);
|
||||||
@@ -653,6 +678,119 @@ router.post('/api/scrape/forum/detect-pages', async (req, res) => {
|
|||||||
res.json({ maxPage, logs });
|
res.json({ maxPage, logs });
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Search a forum site for threads matching a query, return preview images per thread
|
||||||
|
const SEARCH_SKIP_PATTERNS = ['avatar', 'smilie', 'emoji', 'icon', 'logo', 'button', 'sprite', 'badge', 'rank', 'star'];
|
||||||
|
const PREVIEW_IMG_EXTS = /\.(jpg|jpeg|png|webp|gif)(\?|$)/i;
|
||||||
|
|
||||||
|
/**
 * Coerce a client-supplied numeric option to an integer within [min, max].
 * Values that do not parse as an integer fall back to `fallback`.
 */
function clampSearchInt(value, fallback, min, max) {
  const n = Number.parseInt(value, 10);
  if (Number.isNaN(n)) return fallback;
  return Math.min(max, Math.max(min, n));
}

/**
 * Parse a XenForo search-results page into a de-duplicated list of threads.
 * Each entry is `{ threadUrl, title }`, where `threadUrl` is the thread root
 * (`<baseUrl>/threads/<slug>.<id>/`) regardless of which post the hit linked to.
 */
function parseSearchThreads(html, baseUrl) {
  const $ = cheerio.load(html);
  const seen = new Set();
  const threads = [];
  $('h3.contentRow-title a[href*="/threads/"]').each((_, el) => {
    const $a = $(el);
    let href;
    try {
      href = new URL($a.attr('href'), baseUrl).href;
    } catch {
      return; // unparseable href: skip this hit
    }
    const m = href.match(/\/threads\/([^\/]+\.\d+)\//);
    if (!m) return;
    const threadRoot = `${baseUrl}/threads/${m[1]}/`;
    if (seen.has(threadRoot)) return;
    seen.add(threadRoot);
    const title = $a.text().replace(/\s+/g, ' ').trim();
    if (!title || title.length < 3) return;
    threads.push({ threadUrl: threadRoot, title });
  });
  return threads;
}

/**
 * Collect de-duplicated preview image URLs from a thread page, in document order.
 * Prefers the full-size image a thumbnail links to; otherwise strips the
 * `.th.` / `.md.` thumbnail markers from the image's own URL.
 */
function collectPreviewImages(html, pageUrl) {
  const $p = cheerio.load(html);
  const imgUrls = [];
  $p('.message-body img, .bbWrapper img').each((_, el) => {
    const $img = $p(el);
    const src = $img.attr('src') || $img.attr('data-src') || $img.attr('data-url');
    if (!src) return;
    let absSrc;
    try {
      absSrc = new URL(src, pageUrl).href;
    } catch {
      return; // unparseable src: skip this image
    }
    const lower = absSrc.toLowerCase();
    if (SEARCH_SKIP_PATTERNS.some((p) => lower.includes(p))) return;

    // A thumbnail wrapped in a link to an image file: use the link target.
    const $parentA = $img.closest('a');
    if ($parentA.length && $parentA.attr('href')) {
      try {
        const aHref = new URL($parentA.attr('href'), pageUrl).href;
        if (PREVIEW_IMG_EXTS.test(aHref)) {
          imgUrls.push(aHref);
          return;
        }
      } catch {
        // Bad link href: fall through and use the <img> URL itself.
      }
    }

    const upgraded = absSrc.replace('.th.', '.').replace('.md.', '.');
    if (PREVIEW_IMG_EXTS.test(upgraded) || /\/data\/attachments|proxy\.php/.test(upgraded)) {
      imgUrls.push(upgraded);
    }
  });
  return [...new Set(imgUrls)];
}

/**
 * POST /api/scrape/forum/search
 *
 * Body: `{ query, siteId = 2, maxThreads = 5, previewsPerThread = 4, titleOnly = true }`.
 *
 * Searches a XenForo forum through a FlareSolverr session using the site's
 * stored cookies:
 *   1. GET /search/ to obtain the `_xfToken` CSRF token.
 *   2. POST /search/search and parse the thread hits from the returned page.
 *   3. For the first `maxThreads` hits, fetch the thread's last page and return
 *      the last `previewsPerThread` image URLs found there.
 *
 * Responds with `{ query, results }`; each result carries `threadUrl`, `title`,
 * `previews` and, when available, `lastPageUrl`, `lastPageNum` and `error`.
 * A failure on one thread is reported on that result and does not abort the rest.
 */
router.post('/api/scrape/forum/search', async (req, res) => {
  // req.body is undefined when no body parser matched the request.
  const {
    query,
    siteId = 2,
    maxThreads = 5,
    previewsPerThread = 4,
    titleOnly = true,
  } = req.body ?? {};
  if (!query) return res.status(400).json({ error: 'query is required' });

  // Both knobs come from the client and drive how many browser fetches we make
  // and how the preview list is sliced, so force them to bounded integers.
  const threadLimit = clampSearchInt(maxThreads, 5, 0, 50);
  const previewLimit = clampSearchInt(previewsPerThread, 4, 0, 50);

  const site = getForumSiteById(siteId);
  if (!site) return res.status(404).json({ error: `Forum site ${siteId} not found` });
  if (!site.cookies) return res.status(400).json({ error: 'Forum site has no cookies — refresh first' });

  const baseUrl = (site.base_url || 'https://simpcity.cr').replace(/\/$/, '');
  const cookies = site.cookies;

  let fsSession = null;
  try {
    fsSession = await fsCreateSession(baseUrl + '/');

    // Step 1: GET search form to grab the XenForo CSRF token
    const formRes = await fsGet(fsSession, baseUrl + '/search/', cookies);
    if (formRes.status !== 200) {
      return res.status(formRes.status).json({ error: `Search form fetch failed: HTTP ${formRes.status}` });
    }
    // Fast path is the regex; fall back to a DOM lookup so a different
    // attribute order on the hidden input does not look like expired cookies.
    const xfMatch = formRes.html.match(/name="_xfToken"\s+value="([^"]+)"/);
    const xfToken = xfMatch
      ? xfMatch[1]
      : cheerio.load(formRes.html)('input[name="_xfToken"]').attr('value');
    if (!xfToken) {
      return res.status(503).json({
        error: `No _xfToken on search form — cookies likely expired. Refresh via /api/flaresolverr/refresh/${siteId}`,
      });
    }

    // Step 2: POST the search; XenForo redirects to /search/<id>/ with results
    const postBody = new URLSearchParams({
      keywords: query,
      'c[title_only]': titleOnly ? '1' : '',
      'c[users]': '',
      _xfToken: xfToken,
    });
    const postRes = await fsPost(fsSession, baseUrl + '/search/search', cookies, postBody.toString());

    // Parse thread results from contentRow-title anchors (XenForo result layout)
    const threads = parseSearchThreads(postRes.html, baseUrl);
    if (threads.length === 0) {
      return res.json({ query, results: [] });
    }

    // For top N threads, fetch last page and pull preview image URLs
    const results = [];
    for (const t of threads.slice(0, threadLimit)) {
      try {
        const maxPage = await detectMaxPage(t.threadUrl, () => {}, cookies, '', fsSession);
        const lastPageUrl = maxPage && maxPage > 1 ? `${t.threadUrl}page-${maxPage}` : t.threadUrl;
        const pageRes = await fsGet(fsSession, lastPageUrl, cookies);
        if (pageRes.status !== 200) {
          results.push({ ...t, lastPageUrl, lastPageNum: maxPage || 1, previews: [], error: `HTTP ${pageRes.status}` });
          continue;
        }
        const unique = collectPreviewImages(pageRes.html, lastPageUrl);
        // slice(-0) is slice(0) and would return every image, so handle 0 explicitly.
        const previews = previewLimit > 0 ? unique.slice(-previewLimit) : [];
        results.push({ ...t, lastPageUrl, lastPageNum: maxPage || 1, previews });
      } catch (err) {
        results.push({ ...t, previews: [], error: err.message });
      }
    }

    res.json({ query, results });
  } catch (err) {
    console.error('[scrape/forum/search]', err);
    res.status(500).json({ error: err.message });
  } finally {
    if (fsSession) {
      // The response is already sent by now; a cleanup failure must not turn
      // into an unhandled rejection from this async handler.
      try {
        await fsDestroySession(fsSession);
      } catch (err) {
        console.error('[scrape/forum/search] failed to destroy FlareSolverr session', err);
      }
    }
  }
});
|
||||||
|
|
||||||
// --- Forum Sites CRUD ---
|
// --- Forum Sites CRUD ---
|
||||||
|
|
||||||
router.get('/api/scrape/forum-sites', (_req, res) => {
|
router.get('/api/scrape/forum-sites', (_req, res) => {
|
||||||
|
|||||||
+64
-25
@@ -5,6 +5,8 @@ import { pipeline } from 'stream/promises';
|
|||||||
import { execFile } from 'child_process';
|
import { execFile } from 'child_process';
|
||||||
import { promisify } from 'util';
|
import { promisify } from 'util';
|
||||||
import { upsertMediaFile } from '../db.js';
|
import { upsertMediaFile } from '../db.js';
|
||||||
|
import { fsGet } from '../flaresolverr.js';
|
||||||
|
import { isTurboUrl, downloadTurbo } from './turbo.js';
|
||||||
|
|
||||||
const execFileAsync = promisify(execFile);
|
const execFileAsync = promisify(execFile);
|
||||||
|
|
||||||
@@ -20,11 +22,13 @@ export class CookieExpiredError extends Error {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Replace DDoS-Guard __ddg9_ cookie IP with server's IP so cookies work from any browser
|
// Replace DDoS-Guard __ddg9_ cookie IP with server's IP so cookies work from any browser
|
||||||
function fixCookieIp(cookies) {
|
export function fixCookieIp(cookies) {
|
||||||
if (!cookies) return cookies;
|
if (!cookies) return cookies;
|
||||||
return cookies.replace(/__ddg9_=[^;]+/, `__ddg9_=${SERVER_IP}`);
|
return cookies.replace(/__ddg9_=[^;]+/, `__ddg9_=${SERVER_IP}`);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
export const FORUM_UA = UA;
|
||||||
|
|
||||||
const IMAGE_EXTS = new Set(['.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp', '.tiff']);
|
const IMAGE_EXTS = new Set(['.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp', '.tiff']);
|
||||||
const VIDEO_EXTS = new Set(['.mp4', '.mov', '.avi', '.webm', '.mkv', '.m4v', '.wmv', '.flv', '.ts']);
|
const VIDEO_EXTS = new Set(['.mp4', '.mov', '.avi', '.webm', '.mkv', '.m4v', '.wmv', '.flv', '.ts']);
|
||||||
const SKIP_PATTERNS = ['avatar', 'smilie', 'emoji', 'icon', 'logo', 'button', 'sprite', 'badge', 'rank', 'star', 'dc_thumbnails'];
|
const SKIP_PATTERNS = ['avatar', 'smilie', 'emoji', 'icon', 'logo', 'button', 'sprite', 'badge', 'rank', 'star', 'dc_thumbnails'];
|
||||||
@@ -69,13 +73,20 @@ export function getPageUrl(baseUrl, pageNum) {
|
|||||||
return url.split('#')[0];
|
return url.split('#')[0];
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function detectMaxPage(baseUrl, logFn, cookies) {
|
export async function detectMaxPage(baseUrl, logFn, cookies, userAgent, fsSession) {
|
||||||
try {
|
try {
|
||||||
const headers = { 'User-Agent': UA };
|
let html;
|
||||||
if (cookies) headers['Cookie'] = fixCookieIp(cookies);
|
if (fsSession) {
|
||||||
const resp = await fetch(baseUrl, { headers, signal: AbortSignal.timeout(15000) });
|
const r = await fsGet(fsSession, baseUrl, cookies);
|
||||||
if (!resp.ok) return null;
|
if (r.status !== 200) return null;
|
||||||
const html = await resp.text();
|
html = r.html;
|
||||||
|
} else {
|
||||||
|
const headers = { 'User-Agent': userAgent || UA };
|
||||||
|
if (cookies) headers['Cookie'] = fixCookieIp(cookies);
|
||||||
|
const resp = await fetch(baseUrl, { headers, signal: AbortSignal.timeout(15000) });
|
||||||
|
if (!resp.ok) return null;
|
||||||
|
html = await resp.text();
|
||||||
|
}
|
||||||
const $ = cheerio.load(html);
|
const $ = cheerio.load(html);
|
||||||
|
|
||||||
let maxPage = 1;
|
let maxPage = 1;
|
||||||
@@ -91,6 +102,17 @@ export async function detectMaxPage(baseUrl, logFn, cookies) {
|
|||||||
if (n > maxPage && n < 10000) maxPage = n;
|
if (n > maxPage && n < 10000) maxPage = n;
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
// Final fallback: scan raw HTML for any page-N references (XenForo's
|
||||||
|
// serialized pagination sometimes only appears in href attributes that
|
||||||
|
// cheerio's class-based selectors miss).
|
||||||
|
if (maxPage === 1) {
|
||||||
|
const re = /page-(\d+)/g;
|
||||||
|
let m;
|
||||||
|
while ((m = re.exec(html)) !== null) {
|
||||||
|
const n = parseInt(m[1], 10);
|
||||||
|
if (n > maxPage && n < 10000) maxPage = n;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (maxPage > 1) {
|
if (maxPage > 1) {
|
||||||
logFn(`Detected ${maxPage} pages`);
|
logFn(`Detected ${maxPage} pages`);
|
||||||
@@ -123,7 +145,7 @@ function tryFullSizeUrl(thumbUrl) {
|
|||||||
return candidates;
|
return candidates;
|
||||||
}
|
}
|
||||||
|
|
||||||
async function downloadImage(url, outputDir, downloadedSet, logFn, cookies) {
|
async function downloadImage(url, outputDir, downloadedSet, logFn, cookies, userAgent) {
|
||||||
if (downloadedSet.has(url)) return false;
|
if (downloadedSet.has(url)) return false;
|
||||||
if (!isImageUrl(url)) return false;
|
if (!isImageUrl(url)) return false;
|
||||||
const lower = url.toLowerCase();
|
const lower = url.toLowerCase();
|
||||||
@@ -142,7 +164,7 @@ async function downloadImage(url, outputDir, downloadedSet, logFn, cookies) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
const dlHeaders = { 'User-Agent': UA };
|
const dlHeaders = { 'User-Agent': userAgent || UA };
|
||||||
if (cookies) dlHeaders['Cookie'] = fixCookieIp(cookies);
|
if (cookies) dlHeaders['Cookie'] = fixCookieIp(cookies);
|
||||||
const resp = await fetch(url, { headers: dlHeaders, signal: AbortSignal.timeout(30000) });
|
const resp = await fetch(url, { headers: dlHeaders, signal: AbortSignal.timeout(30000) });
|
||||||
if (!resp.ok) {
|
if (!resp.ok) {
|
||||||
@@ -171,10 +193,16 @@ async function downloadImage(url, outputDir, downloadedSet, logFn, cookies) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Use gallery-dl to download from external hosts (bunkr, saint, cyberdrop, etc.)
|
// Use gallery-dl to download from external hosts (bunkr, saint, cyberdrop, etc.)
|
||||||
async function downloadFromExternalHost(url, outputDir, downloadedSet, logFn) {
|
async function downloadFromExternalHost(url, outputDir, downloadedSet, logFn, userAgent, fsSession) {
|
||||||
if (downloadedSet.has(url)) return 0;
|
if (downloadedSet.has(url)) return 0;
|
||||||
downloadedSet.add(url);
|
downloadedSet.add(url);
|
||||||
|
|
||||||
|
// turbo.cr uses an obfuscated WASM player — gallery-dl can't extract the
|
||||||
|
// signed mp4 URL. Resolve via FlareSolverr (renders JS) instead.
|
||||||
|
if (isTurboUrl(url)) {
|
||||||
|
return await downloadTurbo(url, outputDir, logFn, userAgent, fsSession);
|
||||||
|
}
|
||||||
|
|
||||||
logFn(`Resolving via gallery-dl: ${url}`);
|
logFn(`Resolving via gallery-dl: ${url}`);
|
||||||
|
|
||||||
try {
|
try {
|
||||||
@@ -240,23 +268,34 @@ async function downloadFromExternalHost(url, outputDir, downloadedSet, logFn) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function scrapeForumPage(pageUrl, outputDir, downloadedSet, logFn, cookies) {
|
export async function scrapeForumPage(pageUrl, outputDir, downloadedSet, logFn, cookies, userAgent, fsSession) {
|
||||||
logFn(`Fetching page: ${pageUrl}`);
|
logFn(`Fetching page: ${pageUrl}${fsSession ? ' [via FlareSolverr]' : ''}`);
|
||||||
|
|
||||||
let html;
|
let html;
|
||||||
try {
|
try {
|
||||||
const headers = { 'User-Agent': UA };
|
if (fsSession) {
|
||||||
if (cookies) headers['Cookie'] = fixCookieIp(cookies);
|
const r = await fsGet(fsSession, pageUrl, cookies);
|
||||||
const resp = await fetch(pageUrl, { headers, signal: AbortSignal.timeout(15000) });
|
if (r.status !== 200) {
|
||||||
if (!resp.ok) {
|
if (cookies && (r.status === 404 || r.status === 403)) {
|
||||||
// SimpCity returns 404 for expired sessions, 403 for blocked
|
throw new CookieExpiredError(r.status);
|
||||||
if (cookies && (resp.status === 404 || resp.status === 403)) {
|
}
|
||||||
throw new CookieExpiredError(resp.status);
|
logFn(`Failed to fetch page (${r.status})`);
|
||||||
|
return 0;
|
||||||
}
|
}
|
||||||
logFn(`Failed to fetch page (${resp.status})`);
|
html = r.html;
|
||||||
return 0;
|
} else {
|
||||||
|
const headers = { 'User-Agent': userAgent || UA };
|
||||||
|
if (cookies) headers['Cookie'] = fixCookieIp(cookies);
|
||||||
|
const resp = await fetch(pageUrl, { headers, signal: AbortSignal.timeout(15000) });
|
||||||
|
if (!resp.ok) {
|
||||||
|
if (cookies && (resp.status === 404 || resp.status === 403)) {
|
||||||
|
throw new CookieExpiredError(resp.status);
|
||||||
|
}
|
||||||
|
logFn(`Failed to fetch page (${resp.status})`);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
html = await resp.text();
|
||||||
}
|
}
|
||||||
html = await resp.text();
|
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
if (err instanceof CookieExpiredError) throw err;
|
if (err instanceof CookieExpiredError) throw err;
|
||||||
logFn(`Failed to fetch page: ${err.message}`);
|
logFn(`Failed to fetch page: ${err.message}`);
|
||||||
@@ -359,14 +398,14 @@ export async function scrapeForumPage(pageUrl, outputDir, downloadedSet, logFn,
|
|||||||
|
|
||||||
// Download images
|
// Download images
|
||||||
for (const imgUrl of imageUrls) {
|
for (const imgUrl of imageUrls) {
|
||||||
if (await downloadImage(imgUrl, outputDir, downloadedSet, logFn, cookies)) {
|
if (await downloadImage(imgUrl, outputDir, downloadedSet, logFn, cookies, userAgent)) {
|
||||||
count++;
|
count++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Download from external hosts via gallery-dl
|
// Download from external hosts (turbo.cr handled via FlareSolverr; rest via gallery-dl)
|
||||||
for (const extUrl of externalUrls) {
|
for (const extUrl of externalUrls) {
|
||||||
const dlCount = await downloadFromExternalHost(extUrl, outputDir, downloadedSet, logFn);
|
const dlCount = await downloadFromExternalHost(extUrl, outputDir, downloadedSet, logFn, userAgent, fsSession);
|
||||||
count += dlCount;
|
count += dlCount;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1,138 @@
|
|||||||
|
import { writeFileSync, existsSync } from 'fs';
|
||||||
|
import { join, basename } from 'path';
|
||||||
|
import { fsCreateSession, fsDestroySession, fsGet } from '../flaresolverr.js';
|
||||||
|
import { upsertMediaFile } from '../db.js';
|
||||||
|
|
||||||
|
// Matches http(s) URLs whose host is "turbo.<tld>" (optionally "www."-prefixed)
// and that have a path separator after the host. Case-insensitive.
const TURBO_HOST_RE = /^https?:\/\/(?:www\.)?turbo\.\w+\//i;
|
||||||
|
// Origin used for session creation, for building /embed/<id> URLs, and as the
// Referer sent with video downloads.
const TURBO_BASE = 'https://turbo.cr';
|
||||||
|
// User-Agent used for video downloads when the caller does not pass one.
const DEFAULT_UA = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36';
|
||||||
|
|
||||||
|
/**
 * Report whether `url` points at a turbo.<tld> host.
 *
 * @param {string} url - Candidate URL.
 * @returns {boolean} True when the URL matches TURBO_HOST_RE.
 */
export function isTurboUrl(url) {
  const hostMatch = TURBO_HOST_RE.exec(url);
  return hostMatch !== null;
}
|
||||||
|
|
||||||
|
/**
 * Decode the HTML character references that can appear inside an attribute
 * value (amp, lt, gt, quot and the numeric forms of the apostrophe) back to
 * their literal characters.
 *
 * @param {string} s - Text taken from serialized HTML.
 * @returns {string} The text with those references decoded.
 */
function unescapeHtml(s) {
  const decoded = { amp: '&', lt: '<', gt: '>', quot: '"', '#39': "'", '#x27': "'" };
  // Single pass on purpose: an escaped ampersand followed by "lt;" must decode
  // to the literal text of the reference, not all the way to "<".
  return s.replace(/&(amp|lt|gt|quot|#39|#x27);/gi, (_, name) => decoded[name.toLowerCase()]);
}
|
||||||
|
|
||||||
|
/**
 * Pull the resolved mp4 URL out of a rendered embed page.
 *
 * @param {string} html - Page markup after the player has run.
 * @returns {string|null} The entity-decoded mp4 URL, or null when none is found.
 */
function extractMp4FromHtml(html) {
  // Preferred source: the src attribute the player writes onto <video>.
  const videoTag = html.match(/<video[^>]+\bsrc=["']([^"']+\.mp4[^"']*)["']/i);
  // Otherwise accept any turbocdn mp4 URL appearing anywhere in the markup.
  const rawUrl = videoTag
    ? videoTag[1]
    : html.match(/https?:\/\/[^"'\s<>]*turbocdn[^"'\s<>]*\.mp4[^"'\s<>]*/i)?.[0];
  return rawUrl ? unescapeHtml(rawUrl) : null;
}
|
||||||
|
|
||||||
|
/**
 * Choose a local filename for a resolved mp4 URL.
 *
 * Preference order: the URL's `fn` query parameter, then the last segment of
 * the URL path, then `<fallbackId>.mp4` (or `turbo.mp4` without an id).
 *
 * The name comes from a remote page and is later joined onto the output
 * directory, so it is reduced to its final path component: anything before a
 * `/` or `\` is dropped, and `.` / `..` are rejected. This keeps the download
 * from being written outside the output directory.
 *
 * @param {string} mp4Url - Resolved video URL.
 * @param {string} [fallbackId] - Video id used when the URL yields no name.
 * @returns {string} A bare filename with no directory components.
 */
function turboFilename(mp4Url, fallbackId) {
  const fallback = (fallbackId || 'turbo') + '.mp4';
  let candidate;
  try {
    const u = new URL(mp4Url);
    candidate = u.searchParams.get('fn') || basename(u.pathname);
  } catch {
    return fallback; // not a parseable URL
  }
  const safe = (candidate || '').split(/[\\/]/).pop().trim();
  if (!safe || safe === '.' || safe === '..') return fallback;
  return safe;
}
|
||||||
|
|
||||||
|
/**
 * Fetch a video and write it to disk.
 *
 * The whole response is buffered in memory before being written. Never throws:
 * every outcome is reported through the returned object.
 *
 * @param {string} url - Direct video URL.
 * @param {string} dest - Destination file path.
 * @param {string} ua - User-Agent header value.
 * @returns {Promise<{ok: boolean, size?: number, status?: number, reason?: string, error?: string}>}
 */
async function downloadVideo(url, dest, ua) {
  try {
    const requestHeaders = { 'User-Agent': ua, 'Referer': TURBO_BASE + '/' };
    // 10 minute ceiling so large videos can finish.
    const timeoutSignal = AbortSignal.timeout(600000);
    const response = await fetch(url, { headers: requestHeaders, signal: timeoutSignal });
    if (!response.ok) {
      return { ok: false, status: response.status };
    }

    const payload = Buffer.from(await response.arrayBuffer());
    const size = payload.length;
    // Bodies under 10000 bytes are rejected rather than saved as a video.
    if (size < 10000) {
      return { ok: false, reason: `too small ${size}` };
    }

    writeFileSync(dest, payload);
    return { ok: true, size };
  } catch (failure) {
    return { ok: false, error: failure.message };
  }
}
|
||||||
|
|
||||||
|
/**
 * Render an embed page through FlareSolverr and extract its mp4 URL.
 *
 * @param {string} sessionId - FlareSolverr session to fetch with.
 * @param {string} embedUrl - The /embed/<id> page URL.
 * @returns {Promise<string|null>} The mp4 URL, or null on a non-200 page or when none is found.
 */
async function resolveEmbed(sessionId, embedUrl) {
  const { status, html } = await fsGet(sessionId, embedUrl, '');
  return status === 200 ? extractMp4FromHtml(html) : null;
}
|
||||||
|
|
||||||
|
/**
 * Render an album page through FlareSolverr and list the video ids on it.
 *
 * @param {string} sessionId - FlareSolverr session to fetch with.
 * @param {string} albumUrl - The album page URL.
 * @returns {Promise<string[]>} Unique ids in first-seen order; empty on a non-200 page.
 */
async function resolveAlbumIds(sessionId, albumUrl) {
  const page = await fsGet(sessionId, albumUrl, '');
  if (page.status !== 200) {
    return [];
  }

  const markup = String(page.html);
  const found = new Set();
  // Video tiles expose their id as data-id="<videoId>" (6+ id characters).
  for (const [, tileId] of markup.matchAll(/data-id=["']([A-Za-z0-9_-]{6,})["']/g)) {
    found.add(tileId);
  }
  // Direct /embed/<id> links on the page are picked up as well.
  for (const [, linkId] of markup.matchAll(/turbo\.[a-z]+\/embed\/([A-Za-z0-9_-]+)/g)) {
    found.add(linkId);
  }
  return Array.from(found);
}
|
||||||
|
|
||||||
|
/**
 * Resolve a turbo.cr URL (embed or album) and download all videos found.
 *
 * Pass an existing FlareSolverr sessionId to reuse it across many calls;
 * otherwise one is created for this call and destroyed before returning.
 * URLs that are neither /embed/<id> nor /a/... are rejected before any
 * session is opened.
 *
 * Failures for an individual video are logged through `logFn` and skipped;
 * they do not abort the remaining videos.
 *
 * @param {string} url - turbo embed or album URL.
 * @param {string} outputDir - Directory the videos are written into.
 * @param {(msg: string) => void} logFn - Progress/error logger.
 * @param {string} [userAgent] - User-Agent for the video download; defaults to DEFAULT_UA.
 * @param {string} [fsSession] - Existing FlareSolverr session id to reuse.
 * @returns {Promise<number>} Count of videos successfully downloaded.
 */
export async function downloadTurbo(url, outputDir, logFn, userAgent, fsSession) {
  const ua = userAgent || DEFAULT_UA;
  const folderName = basename(outputDir);

  // Classify the URL first so an unrecognized one never costs a browser session.
  const embedMatch = url.match(/turbo\.[a-z]+\/embed\/([A-Za-z0-9_-]+)/i);
  const isAlbum = !embedMatch && /\/a\//i.test(url);
  if (!embedMatch && !isAlbum) {
    logFn(`turbo: unrecognized URL ${url}`);
    return 0;
  }

  let ownSession = false;
  let sessionId = fsSession;

  try {
    if (!sessionId) {
      sessionId = await fsCreateSession(TURBO_BASE + '/');
      ownSession = true;
    }

    let embedIds;
    if (embedMatch) {
      embedIds = [embedMatch[1]];
    } else {
      logFn(`turbo: resolving album ${url}`);
      embedIds = await resolveAlbumIds(sessionId, url);
      logFn(`turbo: album has ${embedIds.length} video(s)`);
    }

    let count = 0;
    for (const id of embedIds) {
      const embedUrl = `${TURBO_BASE}/embed/${id}`;
      try {
        const mp4 = await resolveEmbed(sessionId, embedUrl);
        if (!mp4) {
          logFn(`turbo: could not resolve mp4 for ${id}`);
          continue;
        }
        const filename = turboFilename(mp4, id);
        const dest = join(outputDir, filename);
        if (existsSync(dest)) {
          logFn(`turbo: already have ${filename}`);
          continue;
        }
        const dl = await downloadVideo(mp4, dest, ua);
        if (!dl.ok) {
          logFn(`turbo: download failed ${filename} - ${dl.status || dl.error || dl.reason}`);
          continue;
        }
        // Indexing is best-effort: the download still counts if the DB write
        // fails, but the failure is reported instead of being dropped.
        try {
          upsertMediaFile(folderName, filename, 'video', dl.size, Date.now(), null);
        } catch (dbErr) {
          logFn(`turbo: downloaded ${filename} but could not record it: ${dbErr.message}`);
        }
        logFn(`Downloaded: ${filename} (${(dl.size / (1024 * 1024)).toFixed(1)} MB) [video]`);
        count++;
      } catch (e) {
        logFn(`turbo: error for ${id}: ${e.message}`);
      }
    }
    return count;
  } finally {
    if (ownSession && sessionId) {
      // A cleanup failure must not replace the download count (or the
      // original error) with its own exception.
      try {
        await fsDestroySession(sessionId);
      } catch (e) {
        logFn(`turbo: failed to destroy FlareSolverr session: ${e.message}`);
      }
    }
  }
}
|
||||||
Reference in New Issue
Block a user