- DRM video download pipeline with pywidevine subprocess for Widevine key acquisition - Scraper system: forum threads, Coomer/Kemono API, and MediaLink (Fapello) scrapers - SQLite-backed media index for instant gallery loads with startup scan - Duplicate detection and gallery filtering/sorting - HLS video component, log viewer, and scrape management UI - Dockerfile updated for Python/pywidevine, docker-compose volume for CDM Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
231 lines
6.9 KiB
JavaScript
231 lines
6.9 KiB
JavaScript
import * as cheerio from 'cheerio';
import { createWriteStream, existsSync, mkdirSync, statSync } from 'fs';
import { writeFile } from 'fs/promises';
import { basename, join, extname } from 'path';
import { pipeline } from 'stream/promises';
import { upsertMediaFile } from '../db.js';
|
|
|
|
// User-Agent header value sent with every fetch() issued from this module.
const UA = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36';
|
|
|
|
// File extensions (lower-case, with leading dot) that isImageUrl() accepts at the end of a URL pathname.
const IMAGE_EXTS = new Set(['.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp', '.tiff']);
|
|
// Substrings that make downloadImage() reject a URL; matched against the whole lower-cased URL string.
const SKIP_PATTERNS = ['avatar', 'smilie', 'emoji', 'icon', 'logo', 'button', 'sprite', 'badge', 'rank', 'star'];
|
|
|
|
/**
 * Decide whether a URL points at an image, judged purely by its path extension.
 *
 * @param {string} url - Absolute URL to inspect.
 * @returns {boolean} true when the lower-cased pathname ends with one of
 *   IMAGE_EXTS; false otherwise, including when the URL cannot be parsed.
 */
function isImageUrl(url) {
  try {
    const { pathname } = new URL(url);
    const lowered = pathname.toLowerCase();
    for (const extension of IMAGE_EXTS) {
      if (lowered.endsWith(extension)) {
        return true;
      }
    }
    return false;
  } catch {
    return false;
  }
}
|
|
|
|
/**
 * Build the URL of a specific page of a thread.
 *
 * The first `page-<digits>` token in baseUrl is swapped for `page-<pageNum>`,
 * then everything from the first `#` onward is dropped. A baseUrl without a
 * `page-<digits>` token comes back unchanged apart from the fragment removal.
 *
 * @param {string} baseUrl - Thread URL, normally containing a `page-N` token.
 * @param {number|string} pageNum - Page number to substitute.
 * @returns {string} The rewritten URL without a fragment.
 */
export function getPageUrl(baseUrl, pageNum) {
  const pageToken = /page-\d+/;
  const swapped = baseUrl.replace(pageToken, `page-${pageNum}`);
  const hashAt = swapped.indexOf('#');
  return hashAt === -1 ? swapped : swapped.slice(0, hashAt);
}
|
|
|
|
/**
 * Fetch a thread page and estimate how many pages the thread has.
 *
 * Two heuristics are combined: `page-N` numbers found in the hrefs of
 * pagination links, and any link whose visible text is a bare integer
 * below 10000.
 *
 * @param {string} baseUrl - URL of the page to inspect.
 * @param {(message: string) => void} logFn - Receives progress and error messages.
 * @returns {Promise<number|null>} The highest page number found when it is
 *   greater than 1; null when the response is not OK, when nothing above 1
 *   is found, or when the request or parsing throws (the error is logged).
 */
export async function detectMaxPage(baseUrl, logFn) {
  try {
    const resp = await fetch(baseUrl, {
      headers: { 'User-Agent': UA },
      signal: AbortSignal.timeout(15000),
    });
    if (!resp.ok) {
      return null;
    }

    const $ = cheerio.load(await resp.text());
    let highest = 1;

    // XenForo-style
    const navLinks = $('a.pageNav-page, .pageNav a[href*="page-"], .pagination a[href*="page-"]').toArray();
    for (const link of navLinks) {
      const target = $(link).attr('href') || '';
      const found = target.match(/page-(\d+)/);
      if (found) {
        highest = Math.max(highest, parseInt(found[1], 10));
      }
    }

    // Generic pagination text
    for (const link of $('a').toArray()) {
      const label = $(link).text().trim();
      if (!/^\d+$/.test(label)) {
        continue;
      }
      const value = parseInt(label, 10);
      if (value > highest && value < 10000) {
        highest = value;
      }
    }

    if (highest > 1) {
      logFn(`Detected ${highest} pages`);
      return highest;
    }
    return null;
  } catch (err) {
    logFn(`Page detection failed: ${err.message}`);
    return null;
  }
}
|
|
|
|
/**
 * Guess full-size image URLs from a thumbnail URL using common naming schemes.
 *
 * Rewrites are applied to the URL *pathname* only, so a `.th.` label in the
 * hostname or a `/thumbs/` fragment inside the query string is never touched.
 * Query string and hash are preserved on the rewritten candidates.
 *
 * Candidates are produced in this order:
 *   1. `.th.`      -> `.`            (first occurrence in the path)
 *   2. `_thumb.`   -> `.`            (case-insensitive, first occurrence)
 *   3. `/thumbs/`  -> `/images/`, then `/full/`
 *   4. last path segment starting with `thumb_` -> same segment without the prefix
 *   5. if the URL has a query string: the URL with query and hash removed
 *
 * @param {string} thumbUrl - Absolute URL of the (possible) thumbnail.
 * @returns {string[]} Distinct candidate URLs, never including the input
 *   itself; empty when no rule applies or thumbUrl is not a parseable URL.
 */
function tryFullSizeUrl(thumbUrl) {
  let parsed;
  try {
    parsed = new URL(thumbUrl);
  } catch {
    return [];
  }

  const { pathname } = parsed;
  const rewrittenPaths = [];

  if (pathname.includes('.th.')) {
    rewrittenPaths.push(pathname.replace('.th.', '.'));
  }
  if (/_thumb\./i.test(pathname)) {
    rewrittenPaths.push(pathname.replace(/_thumb\./i, '.'));
  }
  if (pathname.includes('/thumbs/')) {
    rewrittenPaths.push(pathname.replace('/thumbs/', '/images/'));
    rewrittenPaths.push(pathname.replace('/thumbs/', '/full/'));
  }

  const base = basename(pathname);
  if (base.startsWith('thumb_')) {
    // Replace the LAST "/<base>" so an identically named directory earlier in
    // the path is left alone.
    const at = pathname.lastIndexOf(`/${base}`);
    if (at !== -1) {
      const tail = pathname.slice(at + 1 + base.length);
      rewrittenPaths.push(`${pathname.slice(0, at)}/${base.slice(6)}${tail}`);
    }
  }

  const candidates = rewrittenPaths.map((path) => {
    const candidate = new URL(parsed.href);
    candidate.pathname = path;
    return candidate.href;
  });

  if (parsed.search) {
    const bare = new URL(parsed.href);
    bare.search = '';
    bare.hash = '';
    candidates.push(bare.href);
  }

  // Drop duplicates and anything that ended up identical to the input.
  return [...new Set(candidates)].filter((candidate) => candidate !== parsed.href);
}
|
|
|
|
/**
 * Choose a path inside outputDir for filename that does not exist yet,
 * appending `_1`, `_2`, ... before the extension on collision.
 *
 * @param {string} outputDir - Destination directory.
 * @param {string} filename - Preferred file name.
 * @returns {string} A path for which existsSync() was false when checked.
 */
function pickFreePath(outputDir, filename) {
  let candidate = join(outputDir, filename);
  if (!existsSync(candidate)) return candidate;

  const ext = extname(filename);
  // extname() returns '' for dot-only names such as ".jpg"; slicing with
  // -0 would then produce an empty stem and files named "_1", "_2", ...
  const stem = ext ? filename.slice(0, -ext.length) : filename;
  for (let i = 1; existsSync(candidate); i++) {
    candidate = join(outputDir, `${stem}_${i}${ext}`);
  }
  return candidate;
}

/**
 * Download one image URL into outputDir and record it in the media index.
 *
 * A URL is rejected without any request when it is already in downloadedSet,
 * does not pass isImageUrl(), or contains one of SKIP_PATTERNS.
 *
 * @param {string} url - Absolute image URL.
 * @param {string} outputDir - Directory the file is written to; its base name
 *   is passed to upsertMediaFile as the folder name.
 * @param {Set<string>} downloadedSet - URLs already attempted. The URL is
 *   added before the request and removed again only when the body turns out
 *   to be smaller than 1000 bytes.
 * @param {(message: string) => void} logFn - Receives progress and error messages.
 * @returns {Promise<boolean>} true when a file was written, false otherwise.
 */
async function downloadImage(url, outputDir, downloadedSet, logFn) {
  if (downloadedSet.has(url)) return false;
  if (!isImageUrl(url)) return false;
  const lower = url.toLowerCase();
  if (SKIP_PATTERNS.some((p) => lower.includes(p))) return false;

  downloadedSet.add(url);

  let filename;
  try {
    filename = basename(new URL(url).pathname);
  } catch {
    return false;
  }
  if (!filename) return false;

  // Store ".th." thumbnails under the name the full-size file would have.
  filename = filename.replace('.th.', '.');
  const filepath = pickFreePath(outputDir, filename);

  try {
    const resp = await fetch(url, {
      headers: { 'User-Agent': UA },
      signal: AbortSignal.timeout(30000),
    });
    if (!resp.ok) {
      logFn(`FAILED (${resp.status}): ${url}`);
      return false;
    }

    // Some hosts answer an image-looking URL with an HTML viewer or error
    // page; writing that to disk would put a broken "image" in the gallery.
    const contentType = (resp.headers.get('content-type') || '').toLowerCase();
    if (contentType.startsWith('text/html')) {
      await resp.body?.cancel();
      logFn(`FAILED (not an image, ${contentType}): ${url}`);
      return false;
    }

    // Read full body to check size
    const buf = Buffer.from(await resp.arrayBuffer());
    if (buf.length < 1000) {
      // Too small to keep; un-mark the URL so a later call may try it again.
      downloadedSet.delete(url);
      return false;
    }

    await writeFile(filepath, buf);

    const savedName = basename(filepath);
    const folderName = basename(outputDir);
    try {
      upsertMediaFile(folderName, savedName, 'image', buf.length, Date.now(), null);
    } catch (err) {
      // Indexing is best-effort: the file is on disk either way.
      logFn(`Index update failed for ${savedName}: ${err.message}`);
    }

    const sizeKb = (buf.length / 1024).toFixed(1);
    logFn(`Downloaded: ${savedName} (${sizeKb} KB)`);
    return true;
  } catch (err) {
    logFn(`FAILED: ${basename(filepath)} - ${err.message}`);
    return false;
  }
}
|
|
|
|
/**
 * Collect image download candidates from a parsed forum page.
 *
 * Each returned group is an ordered list of alternative URLs for ONE image,
 * best guess first. Identical groups are only returned once, which matters
 * because the content selectors can match nested elements and would
 * otherwise visit the same <img> several times.
 *
 * @param {import('cheerio').CheerioAPI} $ - Loaded document.
 * @param {string} pageUrl - URL of the page, used to resolve relative links.
 * @returns {string[][]} Groups of absolute URLs.
 */
function collectImageGroups($, pageUrl) {
  // Try known content selectors, fall back to whole page
  const selectors = '.message-body, .post-body, .post_body, .postcontent, .messageContent, .bbWrapper, article, .entry-content, .post_message, .post-content, #posts, .threadBody';
  let contentAreas = $(selectors).toArray();
  if (contentAreas.length === 0) {
    contentAreas = [$.root().get(0)];
  }

  const groups = [];
  const seenGroupKeys = new Set();

  const addGroup = (urls) => {
    const unique = [...new Set(urls)];
    if (unique.length === 0) return;
    const key = unique.join('\n');
    if (seenGroupKeys.has(key)) return;
    seenGroupKeys.add(key);
    groups.push(unique);
  };

  const toAbsolute = (ref) => {
    try {
      return new URL(ref, pageUrl).href;
    } catch {
      return null;
    }
  };

  for (const area of contentAreas) {
    const $area = $(area);

    // Pass 1: <img> tags
    $area.find('img').each((_, el) => {
      const $img = $(el);
      const src = $img.attr('src') || $img.attr('data-src') || $img.attr('data-url') || '';
      if (!src) return;

      const absSrc = toAbsolute(src);
      if (!absSrc) return;

      // A wrapping <a> that links straight to an image wins over the <img> itself.
      const parentHref = $img.closest('a').attr('href');
      if (parentHref) {
        const linked = toAbsolute(parentHref);
        if (linked && isImageUrl(linked)) {
          addGroup([linked]);
          return;
        }
      }

      // Full-size guesses first, the real src last so the image is not lost
      // when every guess fails.
      addGroup([...tryFullSizeUrl(absSrc), absSrc]);

      // Also check data attributes
      for (const attr of ['data-src', 'data-url', 'data-orig', 'data-original', 'data-full-url', 'data-zoom-src']) {
        const val = $img.attr(attr);
        if (val && val !== src) {
          const absVal = toAbsolute(val);
          if (absVal) addGroup([absVal]);
        }
      }
    });

    // Pass 2: <a href> pointing directly to images (no child <img>)
    $area.find('a[href]').each((_, el) => {
      const $a = $(el);
      if ($a.find('img').length) return;
      const href = toAbsolute($a.attr('href'));
      if (href && isImageUrl(href)) addGroup([href]);
    });
  }

  return groups;
}

/**
 * Fetch one forum page, extract the images it references and download them.
 *
 * For every image the alternatives are tried in order and the first one
 * that downloads successfully is kept; the remaining alternatives for that
 * image are not requested.
 *
 * @param {string} pageUrl - URL of the forum page.
 * @param {string} outputDir - Directory downloads are written to.
 * @param {Set<string>} downloadedSet - URLs already attempted; shared across
 *   calls and updated by downloadImage().
 * @param {(message: string) => void} logFn - Receives progress and error messages.
 * @returns {Promise<number>} Number of images saved from this page; 0 when
 *   the page itself could not be fetched.
 */
export async function scrapeForumPage(pageUrl, outputDir, downloadedSet, logFn) {
  logFn(`Fetching page: ${pageUrl}`);

  let html;
  try {
    const resp = await fetch(pageUrl, {
      headers: { 'User-Agent': UA },
      signal: AbortSignal.timeout(15000),
    });
    if (!resp.ok) {
      logFn(`Failed to fetch page (${resp.status})`);
      return 0;
    }
    html = await resp.text();
  } catch (err) {
    logFn(`Failed to fetch page: ${err.message}`);
    return 0;
  }

  const $ = cheerio.load(html);
  const groups = collectImageGroups($, pageUrl);

  const urlCount = groups.reduce((total, group) => total + group.length, 0);
  logFn(`Found ${urlCount} candidate URLs`);

  let count = 0;
  for (const group of groups) {
    // If any alternative was already attempted (on this page or an earlier
    // one) the image has been handled; trying the rest would only fetch a
    // lower-quality duplicate of something already saved.
    if (group.some((candidate) => downloadedSet.has(candidate))) continue;

    for (const candidate of group) {
      if (await downloadImage(candidate, outputDir, downloadedSet, logFn)) {
        count++;
        break;
      }
    }
  }

  logFn(`${count} images from this page`);
  return count;
}
|