Add DRM downloads, scrapers, gallery index, and UI improvements
- DRM video download pipeline with pywidevine subprocess for Widevine key acquisition
- Scraper system: forum threads, Coomer/Kemono API, and MediaLink (Fapello) scrapers
- SQLite-backed media index for instant gallery loads with startup scan
- Duplicate detection and gallery filtering/sorting
- HLS video component, log viewer, and scrape management UI
- Dockerfile updated for Python/pywidevine, docker-compose volume for CDM

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
322
server/scrape.js
Normal file
322
server/scrape.js
Normal file
@@ -0,0 +1,322 @@
|
||||
import { Router } from 'express';
import { mkdirSync } from 'fs';
import { join } from 'path';
import { scrapeForumPage, getPageUrl, detectMaxPage } from './scrapers/forum.js';
import { parseUserUrl, fetchAllPosts, downloadFiles } from './scrapers/coomer.js';
import { parseMediaUrl, fetchAllMedia, downloadMedia } from './scrapers/medialink.js';

const router = Router();
// Root directory under which each scrape job creates its output folder.
const MEDIA_PATH = process.env.MEDIA_PATH || './data/media';

// In-memory job registry: job id -> job record. Not persisted; all job
// state is lost on server restart.
const jobsMap = new Map();
// Monotonic suffix so two jobs started in the same millisecond get distinct ids.
let jobCounter = 0;
// At most this many finished jobs are kept in jobsMap (oldest pruned first).
const MAX_COMPLETED = 50;
// At most this many log lines are kept per job (oldest dropped first).
const MAX_LOGS = 200;
||||
/**
 * Create and register a new scrape job record.
 * @param {string} type - Scrape kind: 'forum', 'coomer', or 'medialink'.
 * @param {object} config - Normalized configuration taken from the request body.
 * @returns {object} The job record stored in jobsMap.
 */
function createJob(type, config) {
  jobCounter += 1;
  const id = `scrape_${Date.now()}_${jobCounter}`;

  const job = {
    id,
    type,
    config,
    progress: { total: 0, completed: 0, errors: 0 },
    running: true,
    cancelled: false,
    logs: [],
    startedAt: new Date().toISOString(),
    completedAt: null,
    folderName: config.folderName || 'scrape',
  };

  jobsMap.set(id, job);
  return job;
}
|
||||
|
||||
/**
 * Append a timestamped line to the job's log buffer, evicting the oldest
 * line once MAX_LOGS is exceeded.
 * @param {object} job - Job record whose logs are appended to.
 * @param {string} msg - Message to record.
 */
function addLog(job, msg) {
  const stamp = new Date().toLocaleTimeString('en-US', { hour12: false });
  job.logs.push(`[${stamp}] ${msg}`);
  if (job.logs.length > MAX_LOGS) {
    job.logs.shift();
  }
}
|
||||
|
||||
/**
 * Trim the registry so at most MAX_COMPLETED finished jobs remain,
 * discarding the ones that completed longest ago. Running jobs are
 * never removed.
 */
function pruneCompleted() {
  const finished = [];
  for (const job of jobsMap.values()) {
    if (!job.running) {
      finished.push(job);
    }
  }
  // Newest completion first; everything past MAX_COMPLETED is evicted.
  finished.sort((a, b) => new Date(b.completedAt) - new Date(a.completedAt));
  for (const stale of finished.slice(MAX_COMPLETED)) {
    jobsMap.delete(stale.id);
  }
}
|
||||
|
||||
/**
 * Serialize a job for the list endpoint: all state except the (potentially
 * large) log array, which is summarized by its length.
 * @param {object} job - Internal job record.
 * @returns {object} JSON-safe job summary.
 */
function jobToJson(job) {
  const {
    id,
    type,
    config,
    progress,
    running,
    cancelled,
    folderName,
    startedAt,
    completedAt,
  } = job;

  return {
    id,
    type,
    config,
    progress,
    running,
    cancelled,
    folderName,
    startedAt,
    completedAt,
    logCount: job.logs.length,
  };
}
|
||||
|
||||
// --- Forum Scrape ---

/**
 * Drive a forum scrape job: walk pages startPage..endPage, downloading the
 * images on each page and pacing requests with the configured delay.
 * Cancellation is checked cooperatively before each page.
 * @param {object} job - Job record created by createJob (type 'forum').
 */
async function runForumScrape(job) {
  const { url, startPage, endPage, delay, folderName } = job.config;
  const outputDir = join(MEDIA_PATH, folderName);
  mkdirSync(outputDir, { recursive: true });

  // Shared across all pages so the same image URL is never fetched twice.
  const seenImages = new Set();
  let imageTotal = 0;

  addLog(job, `Starting forum scrape: pages ${startPage}-${endPage}`);
  addLog(job, `Output: ${outputDir}`);

  // Progress is counted in pages, not images.
  job.progress.total = endPage - startPage + 1;

  try {
    for (let pageNum = startPage; pageNum <= endPage; pageNum += 1) {
      if (job.cancelled) {
        addLog(job, 'Cancelled by user');
        break;
      }

      addLog(job, `--- Page ${pageNum}/${endPage} ---`);
      const pageUrl = getPageUrl(url, pageNum);
      imageTotal += await scrapeForumPage(pageUrl, outputDir, seenImages, (msg) => addLog(job, msg));
      job.progress.completed = pageNum - startPage + 1;

      // Politeness delay between pages; config value is in seconds.
      const morePages = pageNum < endPage;
      if (morePages && !job.cancelled) {
        await new Promise((resolve) => setTimeout(resolve, delay * 1000));
      }
    }
  } catch (err) {
    addLog(job, `Error: ${err.message}`);
    job.progress.errors++;
  } finally {
    job.running = false;
    job.completedAt = new Date().toISOString();
    addLog(job, `Done! ${imageTotal} images saved to ${folderName}/`);
    pruneCompleted();
  }
}
|
||||
|
||||
// --- Coomer Scrape ---

/**
 * Drive a coomer/kemono scrape job in two phases: enumerate all post
 * attachments through the site API, then download them with a worker pool.
 * @param {object} job - Job record created by createJob (type 'coomer').
 */
async function runCoomerScrape(job) {
  const { url, pages, workers, folderName } = job.config;
  const outputDir = join(MEDIA_PATH, folderName);
  mkdirSync(outputDir, { recursive: true });

  // Shared callbacks handed to both scraper phases.
  const log = (msg) => addLog(job, msg);
  const isCancelled = () => job.cancelled;

  log(`Starting coomer scrape: ${url}`);
  log(`Pages: ${pages}, Workers: ${workers}`);

  try {
    const { base, service, userId } = parseUserUrl(url);
    log(`Site: ${base}, Service: ${service}, User: ${userId}`);

    // Phase 1: Collect files
    log(`Fetching up to ${pages} pages...`);
    const files = await fetchAllPosts(base, service, userId, pages, log, isCancelled);

    if (job.cancelled) {
      log('Cancelled by user');
      return;
    }
    if (files.length === 0) {
      log('No files found');
      return;
    }

    job.progress.total = files.length;
    log(`Found ${files.length} files. Starting downloads...`);

    // Phase 2: Download
    const onProgress = (completed, errors, total) => {
      job.progress.completed = completed;
      job.progress.errors = errors;
      job.progress.total = total;
    };
    const result = await downloadFiles(files, outputDir, workers, log, onProgress, isCancelled);

    log(`Done! ${result.completed} downloaded, ${result.errors} failed, ${result.skipped} skipped`);
  } catch (err) {
    log(`Error: ${err.message}`);
    job.progress.errors++;
  } finally {
    job.running = false;
    job.completedAt = new Date().toISOString();
    pruneCompleted();
  }
}
|
||||
|
||||
// --- MediaLink Scrape ---

/**
 * Drive a medialink (Fapello-style) scrape job: enumerate all media items
 * via the site's JSON API, then download them with a worker pool.
 * @param {object} job - Job record created by createJob (type 'medialink').
 */
async function runMediaLinkScrape(job) {
  const { url, pages, workers, delay, folderName } = job.config;
  const outputDir = join(MEDIA_PATH, folderName);
  mkdirSync(outputDir, { recursive: true });

  // Shared callbacks handed to both scraper phases.
  const log = (msg) => addLog(job, msg);
  const isCancelled = () => job.cancelled;

  log(`Starting medialink scrape: ${url}`);
  log(`Pages: ${pages}, Workers: ${workers}, Delay: ${delay}ms`);

  try {
    const { base, userId } = parseMediaUrl(url);
    log(`Site: ${base}, User ID: ${userId}`);

    // Phase 1: Collect all media via JSON API
    log(`Fetching up to ${pages} pages from API...`);
    const items = await fetchAllMedia(base, userId, pages, delay, log, isCancelled);

    if (job.cancelled) {
      log('Cancelled by user');
      return;
    }
    if (items.length === 0) {
      log('No media found');
      return;
    }

    job.progress.total = items.length;
    log(`Found ${items.length} media items. Downloading...`);

    // Phase 2: Download all media files
    const onProgress = (completed, errors, total) => {
      job.progress.completed = completed;
      job.progress.errors = errors;
      job.progress.total = total;
    };
    const result = await downloadMedia(items, outputDir, workers, log, onProgress, isCancelled);

    log(`Done! ${result.completed} downloaded, ${result.errors} failed, ${result.skipped} skipped`);
  } catch (err) {
    log(`Error: ${err.message}`);
    job.progress.errors++;
  } finally {
    job.running = false;
    job.completedAt = new Date().toISOString();
    pruneCompleted();
  }
}
|
||||
|
||||
// --- Endpoints ---
|
||||
|
||||
/**
 * Start a forum scrape job. Body: { url, folderName, startPage?, endPage?, delay? }.
 * Responds immediately with the job id; progress is polled via /api/scrape/jobs.
 */
router.post('/api/scrape/forum', (req, res) => {
  const { url, folderName, startPage, endPage, delay } = req.body;
  if (!url) return res.status(400).json({ error: 'URL is required' });
  if (!folderName) return res.status(400).json({ error: 'Folder name is required' });

  const config = {
    // Normalize to an explicit page-1 URL so getPageUrl can substitute page numbers.
    url: url.includes('page-') ? url : `${url.replace(/\/$/, '')}/page-1`,
    folderName,
    // Always pass a radix; bare parseInt can misparse some inputs.
    startPage: Number.parseInt(startPage, 10) || 1,
    endPage: Number.parseInt(endPage, 10) || 10,
    delay: Number.parseFloat(delay) || 1.0, // seconds between pages
  };

  const job = createJob('forum', config);
  // Fire-and-forget: the catch handles rejections thrown outside the
  // runner's own try/catch (e.g. mkdirSync failure before the loop).
  runForumScrape(job).catch(err => {
    addLog(job, `Fatal error: ${err.message}`);
    job.running = false;
    job.completedAt = new Date().toISOString();
    pruneCompleted(); // keep the registry bounded even on fatal failures
  });

  res.json({ jobId: job.id, message: 'Forum scrape started' });
});
|
||||
|
||||
/**
 * Start a coomer/kemono scrape job. Body: { url, folderName, pages?, workers? }.
 * Worker count is clamped to [1, 20]. Responds immediately with the job id.
 */
router.post('/api/scrape/coomer', (req, res) => {
  const { url, folderName, pages, workers } = req.body;
  if (!url) return res.status(400).json({ error: 'URL is required' });
  if (!folderName) return res.status(400).json({ error: 'Folder name is required' });

  const config = {
    url,
    folderName,
    // Always pass a radix; bare parseInt can misparse some inputs.
    pages: Number.parseInt(pages, 10) || 10,
    workers: Math.min(Math.max(Number.parseInt(workers, 10) || 10, 1), 20),
  };

  const job = createJob('coomer', config);
  // Fire-and-forget: the catch handles rejections thrown outside the
  // runner's own try/catch (e.g. mkdirSync failure before the loop).
  runCoomerScrape(job).catch(err => {
    addLog(job, `Fatal error: ${err.message}`);
    job.running = false;
    job.completedAt = new Date().toISOString();
    pruneCompleted(); // keep the registry bounded even on fatal failures
  });

  res.json({ jobId: job.id, message: 'Coomer scrape started' });
});
|
||||
|
||||
/**
 * Start a medialink scrape job. Body: { url, folderName, pages?, workers?, delay? }.
 * Worker count is clamped to [1, 10]; delay is in milliseconds between API pages.
 */
router.post('/api/scrape/medialink', (req, res) => {
  const { url, folderName, pages, workers, delay } = req.body;
  if (!url) return res.status(400).json({ error: 'URL is required' });
  if (!folderName) return res.status(400).json({ error: 'Folder name is required' });

  const config = {
    url,
    folderName,
    // Always pass a radix; bare parseInt can misparse some inputs.
    pages: Number.parseInt(pages, 10) || 50,
    workers: Math.min(Math.max(Number.parseInt(workers, 10) || 3, 1), 10),
    delay: Number.parseInt(delay, 10) || 500, // ms between API page fetches
  };

  const job = createJob('medialink', config);
  // Fire-and-forget: the catch handles rejections thrown outside the
  // runner's own try/catch (e.g. mkdirSync failure before the loop).
  runMediaLinkScrape(job).catch(err => {
    addLog(job, `Fatal error: ${err.message}`);
    job.running = false;
    job.completedAt = new Date().toISOString();
    pruneCompleted(); // keep the registry bounded even on fatal failures
  });

  res.json({ jobId: job.id, message: 'MediaLink scrape started' });
});
|
||||
|
||||
/** List every known job (newest first), without their log buffers. */
router.get('/api/scrape/jobs', (_req, res) => {
  const summaries = [...jobsMap.values()]
    .map(jobToJson)
    .sort((a, b) => new Date(b.startedAt) - new Date(a.startedAt));
  res.json(summaries);
});
|
||||
|
||||
/** Fetch a single job, including its full log buffer. */
router.get('/api/scrape/jobs/:jobId', (req, res) => {
  const job = jobsMap.get(req.params.jobId);
  if (!job) {
    return res.status(404).json({ error: 'Job not found' });
  }
  res.json({ ...jobToJson(job), logs: job.logs });
});
|
||||
|
||||
/** Request cooperative cancellation of a running job. */
router.post('/api/scrape/jobs/:jobId/cancel', (req, res) => {
  const job = jobsMap.get(req.params.jobId);
  if (!job) {
    return res.status(404).json({ error: 'Job not found' });
  }
  if (!job.running) {
    return res.status(400).json({ error: 'Job is not running' });
  }
  // Runners poll job.cancelled between units of work; this does not
  // abort an in-flight download.
  job.cancelled = true;
  addLog(job, 'Cancel requested');
  res.json({ message: 'Cancel requested' });
});
|
||||
|
||||
// Auto-detect max page for forum URLs
/**
 * Probe a forum thread URL and report its highest page number, plus the
 * probe log lines. Body: { url }.
 */
router.post('/api/scrape/forum/detect-pages', async (req, res) => {
  const { url } = req.body;
  if (!url) return res.status(400).json({ error: 'URL is required' });
  const logs = [];
  try {
    const maxPage = await detectMaxPage(url, (msg) => logs.push(msg));
    res.json({ maxPage, logs });
  } catch (err) {
    // Express 4 does not catch rejections from async handlers; without this
    // try/catch a network failure would leave the request hanging and emit
    // an unhandled promise rejection.
    res.status(500).json({ error: err.message, logs });
  }
});
|
||||
|
||||
// Mounted by the main server entry point under the app root.
export default router;
|
||||
Reference in New Issue
Block a user