import { existsSync, writeFileSync, mkdirSync, unlinkSync } from 'fs';
import { basename, join, extname } from 'path';
import { load as cheerioLoad } from 'cheerio';
import { upsertMediaFile, removeMediaFile } from '../db.js';

// Browser-like User-Agent sent with every request made by this module.
const UA = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36';

// Lower-case file extensions that are classified as video; anything else is treated as an image.
const VIDEO_EXTS = new Set(['.mp4', '.mov', '.avi', '.webm', '.mkv', '.m4v']);

/**
 * Split a profile URL into the site base, the user identifier and the scraping mode.
 *
 * @param {string} url - Absolute profile URL.
 * @returns {{ base: string, userId: string, mode: 'api' | 'html' }}
 *   `mode` is 'api' for /model/{id} or /media/{id} paths and 'html' for single-segment slug paths.
 * @throws {TypeError} If `url` is not a valid absolute URL (thrown by the URL constructor).
 * @throws {Error} If the path matches neither supported layout.
 */
export function parseMediaUrl(url) {
  const parsed = new URL(url);
  // `host` (unlike `hostname`) keeps a non-default port, so the base stays reachable.
  const base = `${parsed.protocol}//${parsed.host}`;

  // Support /model/{id} or /media/{id} (fapello.to JSON API)
  const m = parsed.pathname.match(/\/(?:model|media)\/(\d+)/);
  if (m) return { base, userId: m[1], mode: 'api' };

  // Support fapello.com profile slug URLs like /josie-hamming-41/
  const slugMatch = parsed.pathname.match(/^\/([a-zA-Z0-9_-]+)\/?$/);
  if (slugMatch) return { base, userId: slugMatch[1], mode: 'html' };

  throw new Error(`Can't parse URL. Expected: https://fapello.to/model/12345 or https://fapello.com/username/`);
}

/**
 * Fetch one page of JSON from the API endpoint.
 * API: GET /api/media/{userId}/{page}/{order}
 * Requires X-Requested-With and Referer headers to avoid 403.
 *
 * @param {string} base - Site base, e.g. the `base` returned by parseMediaUrl.
 * @param {string} userId
 * @param {number} page
 * @param {number} order
 * @param {(msg: string) => void} logFn
 * @returns {Promise<any|null>} Parsed JSON body, or null on any HTTP or network error
 *   (a 404 is returned as null without logging).
 */
async function fetchApiPage(base, userId, page, order, logFn) {
  const apiUrl = `${base}/api/media/${userId}/${page}/${order}`;
  try {
    const resp = await fetch(apiUrl, {
      headers: {
        'User-Agent': UA,
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'X-Requested-With': 'XMLHttpRequest',
        'Referer': `${base}/model/${userId}`,
      },
      signal: AbortSignal.timeout(15000),
    });
    if (!resp.ok) {
      if (resp.status === 404) return null;
      logFn(`API error (${resp.status}): ${apiUrl}`);
      return null;
    }
    return await resp.json();
  } catch (err) {
    logFn(`API fetch error: ${err.message}`);
    return null;
  }
}

/**
 * Collect all media items by paginating through the API.
 * Stops at the first empty/failed page, at a page that contains only already-seen ids,
 * when `maxPages` is reached, or when `checkCancelled()` returns true.
 *
 * @param {string} base
 * @param {string} userId
 * @param {number} maxPages
 * @param {number} delay - Milliseconds to wait between page requests.
 * @param {(msg: string) => void} logFn
 * @param {() => boolean} checkCancelled
 * @returns {Promise<Array<{ id: any, url: string, thumbUrl: string|null, type: 'video'|'image' }>>}
 */
export async function fetchAllMedia(base, userId, maxPages, delay, logFn, checkCancelled) {
  const allItems = [];
  const seen = new Set();

  for (let page = 1; page <= maxPages; page++) {
    if (checkCancelled()) break;
    logFn(`Fetching page ${page}...`);

    const data = await fetchApiPage(base, userId, page, 1, logFn);
    if (!data || data.length === 0) {
      logFn(`Page ${page}: no more items — done`);
      break;
    }
    // A JSON body that is not a list (e.g. an error object) cannot be iterated; stop cleanly
    // instead of throwing from the for...of below.
    if (!Array.isArray(data)) {
      logFn(`Page ${page}: unexpected API response — stopping`);
      break;
    }

    let newCount = 0;
    for (const item of data) {
      if (seen.has(item.id)) continue;
      seen.add(item.id);
      newCount++;

      // type "2" = video (newUrl is mp4), type "1" = image (newUrl is full-size jpg)
      const isVideo = item.type === '2' || item.type === 2;
      const fullUrl = item.newUrl;
      if (!fullUrl) continue;

      allItems.push({
        id: item.id,
        url: fullUrl,
        thumbUrl: item.newUrlThumb || null,
        type: isVideo ? 'video' : 'image',
      });
    }

    if (newCount === 0) {
      logFn(`Page ${page}: all duplicates — stopping`);
      break;
    }
    logFn(`Page ${page}: ${data.length} items (${newCount} new, ${allItems.length} total)`);

    if (page < maxPages && !checkCancelled()) {
      await new Promise(r => setTimeout(r, delay));
    }
  }

  return allItems;
}

// --- HTML-based scraping (fapello.com profile pages) ---

// Helper: resolve a src attribute against the site base.
// Absolute http(s) URLs are returned untouched; everything else (root-relative,
// slash-less relative, protocol-relative "//host/…") is resolved with the URL constructor.
function toAbsoluteUrl(src, base) {
  if (/^https?:\/\//i.test(src)) return src;
  try {
    return new URL(src, base).href;
  } catch {
    // Unresolvable input: fall back to plain concatenation.
    return `${base}${src}`;
  }
}

/**
 * Extract media items from a profile page (or AJAX fragment).
 *
 * @param {string} html
 * @param {string} base - Site base used to resolve relative URLs.
 * @returns {Array<{ url: string, type: 'image'|'video' }>} Images first, then videos, in document order.
 */
function parseMediaFromHtml(html, base) {
  const $ = cheerioLoad(html);
  const items = [];

  // Find all image thumbnails in the grid
  $('img[src*="_300px."]').each((_, el) => {
    const thumbUrl = $(el).attr('src');
    if (!thumbUrl) return;
    // Convert thumbnail to full-size: remove _300px
    const fullUrl = thumbUrl.replace(/_300px\./, '.');
    items.push({ url: toAbsoluteUrl(fullUrl, base), type: 'image' });
  });

  // Find video elements (source tags with .mp4)
  $('video source[src*=".mp4"], video[src*=".mp4"]').each((_, el) => {
    const src = $(el).attr('src');
    if (!src) return;
    items.push({ url: toAbsoluteUrl(src, base), type: 'video' });
  });

  return items;
}

/**
 * Collect media items by scraping the HTML profile page and its AJAX pagination.
 * Items are de-duplicated by URL; each item's `id` is its 1-based position among unique URLs.
 *
 * @param {string} base
 * @param {string} slug - Profile slug (path segment after the base).
 * @param {number} maxPages - Upper bound on pages, even if the page advertises more.
 * @param {number} delay - Milliseconds to wait between page requests.
 * @param {(msg: string) => void} logFn
 * @param {() => boolean} checkCancelled
 * @returns {Promise<Array<{ id: number, url: string, type: 'image'|'video' }>>}
 *   Whatever was collected; an empty array if the profile page itself cannot be fetched.
 */
export async function fetchAllMediaFromHtml(base, slug, maxPages, delay, logFn, checkCancelled) {
  const allItems = [];
  const seen = new Set();
  let totalPages = maxPages;

  // Phase 1: Fetch initial profile page to get data-max
  logFn(`Fetching profile page: ${base}/${slug}/`);
  try {
    const resp = await fetch(`${base}/${slug}/`, {
      headers: { 'User-Agent': UA },
      signal: AbortSignal.timeout(15000),
    });
    if (!resp.ok) {
      logFn(`Profile page error (${resp.status})`);
      return allItems;
    }
    const html = await resp.text();
    const $ = cheerioLoad(html);

    // Get max pages from data-max attribute (never exceeding the caller's maxPages)
    const dataMax = $('#showmore').attr('data-max');
    if (dataMax) {
      totalPages = Math.min(parseInt(dataMax, 10) || maxPages, maxPages);
      logFn(`Detected ${totalPages} pages`);
    }

    // Parse initial page content
    const initialItems = parseMediaFromHtml(html, base);
    for (const item of initialItems) {
      if (!seen.has(item.url)) {
        seen.add(item.url);
        allItems.push({ ...item, id: seen.size });
      }
    }
    logFn(`Page 1: ${initialItems.length} items (${allItems.length} total)`);
  } catch (err) {
    logFn(`Error fetching profile: ${err.message}`);
    return allItems;
  }

  // Phase 2: Paginate through AJAX pages
  for (let page = 2; page <= totalPages; page++) {
    if (checkCancelled()) break;

    const ajaxUrl = `${base}/ajax/model/${slug}/page-${page}/`;
    try {
      const resp = await fetch(ajaxUrl, {
        headers: {
          'User-Agent': UA,
          'X-Requested-With': 'XMLHttpRequest',
          'Referer': `${base}/${slug}/`,
        },
        signal: AbortSignal.timeout(15000),
      });
      if (!resp.ok) {
        if (resp.status === 404) {
          logFn(`Page ${page}: 404 — done`);
          break;
        }
        logFn(`Page ${page}: error (${resp.status})`);
        continue;
      }

      const html = await resp.text();
      if (!html || html.trim().length === 0) {
        logFn(`Page ${page}: empty — done`);
        break;
      }

      const pageItems = parseMediaFromHtml(html, base);
      let newCount = 0;
      for (const item of pageItems) {
        if (!seen.has(item.url)) {
          seen.add(item.url);
          allItems.push({ ...item, id: seen.size });
          newCount++;
        }
      }

      if (newCount === 0) {
        logFn(`Page ${page}: all duplicates — stopping`);
        break;
      }
      logFn(`Page ${page}: ${pageItems.length} items (${newCount} new, ${allItems.length} total)`);
    } catch (err) {
      // A failed page is logged and skipped; pagination continues with the next page.
      logFn(`Page ${page}: error — ${err.message}`);
    }

    if (page < totalPages && !checkCancelled()) {
      await new Promise(r => setTimeout(r, delay));
    }
  }

  return allItems;
}

// Helper: derive filename from URL, with fallback to "<id>.mp4" / "<id>.jpg"
function filenameFromUrl(url, item) {
  try {
    const name = basename(new URL(url).pathname);
    if (name && name !== '/') return name;
  } catch {
    // Unparseable URL: fall through to the id-based name below.
  }
  return `${item.id}.${item.type === 'video' ? 'mp4' : 'jpg'}`;
}

// Helper: add _md suffix before extension (appended at the end when there is no extension)
function mdFilename(filename) {
  const ext = extname(filename);
  // Cut by explicit length: slice(0, -0) would return '' for extension-less names.
  const stem = filename.slice(0, filename.length - ext.length);
  return `${stem}_md${ext}`;
}

// Helper: try fetching a URL, return buffer or null.
// Returns null for a missing URL, a non-OK status, a network/timeout error,
// or a body shorter than 500 bytes.
async function tryFetch(url, referer) {
  if (!url) return null;
  try {
    const resp = await fetch(url, {
      headers: { 'User-Agent': UA, 'Referer': referer || 'https://fapello.to/' },
      signal: AbortSignal.timeout(60000),
    });
    if (!resp.ok) return null;
    const buf = Buffer.from(await resp.arrayBuffer());
    if (buf.length < 500) return null;
    return buf;
  } catch {
    return null;
  }
}

// Download all collected media items with concurrency
// Fallback: if full-res URL fails, download medium (thumbUrl) with _md suffix.
// Upgrade: if _md file exists, try full-res again; replace _md on success.
/**
 * Download every item into `outputDir` using a pool of concurrent workers.
 *
 * Per item:
 *  - full-res file already on disk: counted as skipped;
 *  - only the `_md` (medium) file on disk: retry full-res; on success replace the `_md`
 *    file and its DB row, otherwise count as skipped;
 *  - nothing on disk: try full-res, then fall back to `item.thumbUrl` saved under the
 *    `_md` name; if both fail the item is counted as an error.
 *
 * @param {Array<{ id: any, url: string, thumbUrl?: string|null, type: string }>} items
 * @param {string} outputDir - Created (recursively) if missing; its basename is the DB folder name.
 * @param {number} workers - Requested concurrency; values below 1 or non-numeric fall back to 1.
 * @param {(msg: string) => void} logFn
 * @param {(done: number, errors: number, total: number) => void} progressFn
 *   Called after each item with (completed + skipped, errors, items.length).
 * @param {() => boolean} checkCancelled - Polled before each item; a worker stops once it returns true.
 * @param {string} [referer] - Forwarded to tryFetch.
 * @returns {Promise<{ completed: number, errors: number, skipped: number, total: number }>}
 */
export async function downloadMedia(items, outputDir, workers, logFn, progressFn, checkCancelled, referer) {
  mkdirSync(outputDir, { recursive: true });

  let completed = 0;
  let errors = 0;
  let skipped = 0;
  let upgraded = 0;
  let index = 0; // shared cursor; claimed synchronously, so no two workers get the same item

  const reportProgress = () => progressFn(completed + skipped, errors, items.length);
  const kb = (buf) => (buf.length / 1024).toFixed(1);

  async function processNext() {
    while (index < items.length) {
      if (checkCancelled()) return;
      const item = items[index++];

      const filename = filenameFromUrl(item.url, item);
      const filepath = join(outputDir, filename);
      const mdName = mdFilename(filename);
      const mdPath = join(outputDir, mdName);

      // Full-res already exists — skip
      if (existsSync(filepath)) {
        skipped++;
        reportProgress();
        continue;
      }

      // Medium version exists — try to upgrade to full-res
      if (existsSync(mdPath)) {
        const buf = await tryFetch(item.url, referer);
        if (!buf) {
          skipped++;
          reportProgress();
          continue;
        }
        writeFileSync(filepath, buf);
        try {
          unlinkSync(mdPath);
        } catch {
          // Best-effort: a leftover _md file is harmless once full-res is on disk.
        }
        try {
          removeMediaFile(basename(outputDir), mdName);
        } catch {
          // Best-effort DB cleanup.
        }
        registerDownloadedFile(outputDir, filename, buf.length);
        upgraded++;
        completed++;
        logFn(`[${completed}/${items.length}] ${filename} (upgraded from _md, ${kb(buf)} KB)`);
        reportProgress();
        continue;
      }

      // Neither exists — try full-res, then fallback to medium
      const buf = await tryFetch(item.url, referer);
      if (buf) {
        writeFileSync(filepath, buf);
        registerDownloadedFile(outputDir, filename, buf.length);
        completed++;
        logFn(`[${completed}/${items.length}] ${filename} (${kb(buf)} KB)`);
        reportProgress();
        continue;
      }

      // Full-res failed — try medium (thumbUrl)
      if (item.thumbUrl) {
        const mdBuf = await tryFetch(item.thumbUrl, referer);
        if (mdBuf) {
          writeFileSync(mdPath, mdBuf);
          registerDownloadedFile(outputDir, mdName, mdBuf.length);
          completed++;
          logFn(`[${completed}/${items.length}] ${mdName} (medium fallback, ${kb(mdBuf)} KB)`);
          reportProgress();
          continue;
        }
      }

      // Both failed
      logFn(`FAILED: ${filename} — full-res and medium both unavailable`);
      errors++;
      reportProgress();
    }
  }

  // A worker count of 0 / NaN / undefined would start no worker at all and silently
  // process nothing, so always run at least one.
  const requested = Number(workers);
  const poolSize = Math.min(requested >= 1 ? requested : 1, items.length);

  const workerPromises = [];
  for (let i = 0; i < poolSize; i++) {
    workerPromises.push(processNext());
  }
  await Promise.all(workerPromises);

  if (upgraded > 0) logFn(`Upgraded ${upgraded} files from medium to full resolution`);
  return { completed, errors, skipped, total: items.length };
}

// Helper: record a file saved in outputDir in the media DB, classified by extension (best-effort).
function registerDownloadedFile(outputDir, name, size) {
  const fileType = VIDEO_EXTS.has(extname(name).toLowerCase()) ? 'video' : 'image';
  try {
    upsertMediaFile(basename(outputDir), name, fileType, size, Date.now(), null);
  } catch {
    // DB bookkeeping must never fail a download whose file is already on disk.
  }
}