基于 v0.1.0 版本修改,加入自动重命名、打包 ZIP、并发请求等功能以优化用户体验,油猴脚本如下:
// ==UserScript==
// @name HAU Finance File Downloader Crawler
// @namespace https://cwc.henau.edu.cn/
// @version 0.3.0
// @description Crawl file listings, extract download links, export CSV or download all files as ZIP.
// @match https://cwc.henau.edu.cn/plus/list.php?tid=*
// @match https://cwc.henau.edu.cn/plus/list.php*tid=*
// @grant GM_xmlhttpRequest
// @connect cwc.henau.edu.cn
// @connect cdn.jsdelivr.net
// ==/UserScript==
(function () {
'use strict';
/**
 * Colour palette used by the floating buttons and the progress overlay.
 * Frozen so that no code path can accidentally restyle the shared UI.
 */
const BRAND = Object.freeze({
  primary: '#012765',
  text: '#ffffff',
  gray: '#6b7280',
});
/**
 * CSS selectors for the page regions the crawler reads (list container,
 * list rows, pagination, breadcrumb, sidebar and detail-page content).
 * Frozen so the selectors cannot be changed by accident at runtime.
 */
const SELECTOR = Object.freeze({
  listContainer: '.list_box',
  listItems: '.list_box ul li',
  itemDate: '.datetime',
  pages: '.pages',
  pageInfoStrong: '.pages .pageinfo strong',
  breadcrumb: '.page_pos',
  sidebarCurrent: '.list_l a.cur',
  content: '.content',
});
// Lower/upper bound of a delay in milliseconds.
// NOTE(review): nothing in the visible script reads DELAY — confirm whether it is still needed.
const DELAY = { minMs: 500, maxMs: 1500 };
// URL from which JSZip is fetched on demand when a ZIP download is requested.
const JSZIP_CDN = 'https://cdn.jsdelivr.net/npm/jszip@3.10.1/dist/jszip.min.js';
/** Concurrency used while parsing detail pages (number of detail pages requested at the same time). */
const CONCURRENCY_DETAIL = 5;
/** Concurrency used while downloading files (number of files downloaded at the same time). */
const CONCURRENCY_DOWNLOAD = 6;
// Mutable run state shared by the whole script:
//   running     - a crawl/export is currently in progress
//   abort       - the user asked to cancel; worker loops check this flag
//   jszipLoaded - set once JSZip has been made available
const state = { running: false, abort: false, jszipLoaded: false };
/**
 * Concurrency pool: runs `taskFn` once for every entry of `items`, with at most
 * `concurrency` tasks in flight at the same time.
 *
 * Workers stop picking up new items as soon as `state.abort` becomes true.
 * A task that throws does not stop the pool; it still counts as "done" for
 * progress reporting.
 *
 * @template T
 * @param {T[]} items - Work items, processed in index order.
 * @param {number} concurrency - Maximum number of simultaneous tasks. Values
 *   below 1 or non-numeric values fall back to 1 instead of silently skipping
 *   all items.
 * @param {(item: T, index: number) => Promise<void>} taskFn - Task to run per item.
 * @param {{ onProgress?: (done: number, total: number) => void }} [opts]
 * @returns {Promise<void>} Resolves when every worker has finished.
 */
async function runWithConcurrency(items, concurrency, taskFn, { onProgress } = {}) {
  const total = items.length;
  // A zero, negative or NaN limit would otherwise create no workers at all, so
  // the call would resolve "successfully" without processing a single item.
  const requested = Math.floor(Number(concurrency));
  const limit = requested >= 1 ? requested : 1;
  let doneCount = 0;
  let nextIndex = 0;
  async function worker() {
    while (nextIndex < total && !state.abort) {
      // Claiming the index is synchronous, so two workers never take the same item.
      const i = nextIndex++;
      try {
        await taskFn(items[i], i);
      } catch {
        // A single failing item must not stop the pool; taskFn handles its own errors.
      } finally {
        doneCount++;
        if (onProgress) onProgress(doneCount, total);
      }
    }
  }
  const workers = Array.from({ length: Math.min(limit, total) }, () => worker());
  await Promise.all(workers);
}
/**
 * Checks whether the current location is a `/plus/list.php?...tid=<n>` URL and
 * `doc` contains both the list container and the pagination element.
 * @param {Document} [doc=document]
 * @returns {boolean}
 */
function isTargetListPage(doc = document) {
  const listPathSeen = /\/plus\/list\.php\?/.test(location.href);
  const tidSeen = listPathSeen && /[?&]tid=\d+/.test(location.href);
  if (!tidSeen) {
    return false;
  }
  if (!doc.querySelector(SELECTOR.listContainer)) {
    return false;
  }
  return Boolean(doc.querySelector(SELECTOR.pages));
}
/**
 * Returns a promise that resolves (with no value) after `ms` milliseconds.
 * @param {number} ms
 * @returns {Promise<void>}
 */
function sleep(ms) {
  return new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
/**
 * Picks a pseudo-random integer delay derived from `Math.random()` and the
 * inclusive width of the `[minMs, maxMs]` range.
 * @param {number} minMs
 * @param {number} maxMs
 * @returns {number}
 */
function randomDelayMs(minMs, maxMs) {
  const width = maxMs - minMs + 1;
  const jitter = Math.random() * width;
  return Math.floor(minMs + jitter);
}
/**
 * Normalises text: falsy input becomes '', every whitespace run collapses to a
 * single space, and leading/trailing whitespace is removed.
 * @param {*} s
 * @returns {string}
 */
function textClean(s) {
  const raw = s ? String(s) : '';
  const collapsed = raw.replace(/\s+/g, ' ');
  return collapsed.trim();
}
/**
 * Turns arbitrary text into a fragment that is safe to use inside a file name.
 *
 * The text is cleaned with `textClean`, characters that are illegal in file
 * names (`\ / : * ? " < > |`) become `-`, all whitespace is removed, and the
 * result is limited to 80 UTF-16 code units.
 *
 * @param {*} s - Text to sanitise.
 * @returns {string} At most 80 code units, never ending in half of a surrogate pair.
 */
function sanitizeFilenamePart(s) {
  const MAX_LEN = 80;
  const cleaned = textClean(s)
    .replace(/[\\/:*?"<>|]/g, '-')
    .replace(/\s+/g, '');
  if (cleaned.length <= MAX_LEN) return cleaned;
  // Cutting by code units can split an astral character (e.g. an emoji) in two;
  // drop the dangling high surrogate so the name stays valid text.
  return cleaned.slice(0, MAX_LEN).replace(/[\uD800-\uDBFF]$/, '');
}
/**
 * Extracts the lower-cased file extension (including the leading dot) from the
 * path of `url`, resolved against the current page location.
 * @param {string} url
 * @returns {string} e.g. '.pdf', or '' when there is no extension or the URL cannot be parsed.
 */
function getExtensionFromUrl(url) {
  let path;
  try {
    path = new URL(url, location.href).pathname || '';
  } catch {
    return '';
  }
  const hit = path.match(/\.([a-z0-9]+)(?:\?|$)/i);
  if (!hit) {
    return '';
  }
  return `.${hit[1].toLowerCase()}`;
}
/**
 * Parses an HTML string with `DOMParser` using the 'text/html' MIME type.
 * @param {string} html
 * @returns {Document}
 */
function parseHtml(html) {
  const parser = new DOMParser();
  return parser.parseFromString(html, 'text/html');
}
/**
 * Resolves `url` against `base` and returns the absolute URL string.
 * @param {string} url
 * @param {string} base
 * @returns {string} The absolute URL, or '' when it cannot be resolved.
 */
function absUrl(url, base) {
  let resolved = '';
  try {
    resolved = new URL(url, base).toString();
  } catch {
    // Unresolvable input is reported as an empty string.
  }
  return resolved;
}
/**
 * Reads the `charset=` parameter of the Content-Type line in a raw
 * response-header string.
 * @param {string} headersText
 * @returns {string} Lower-cased charset label, or '' when none is found.
 */
function parseCharsetFromHeaders(headersText) {
  const headerBlock = String(headersText || '');
  const found = /content-type\s*:\s*[^\n;]+;\s*charset=([^\s;\n]+)/i.exec(headerBlock);
  if (found === null) {
    return '';
  }
  return String(found[1]).trim().toLowerCase();
}
/**
 * Looks for a charset declaration in an HTML snippet: first `<meta charset=...>`,
 * then `<meta http-equiv="content-type" content="...charset=...">`.
 * @param {string} htmlSnippet
 * @returns {string} Lower-cased charset label, or '' when none is declared.
 */
function parseCharsetFromHtmlSnippet(htmlSnippet) {
  const markup = String(htmlSnippet || '');
  const declarationPatterns = [
    /<meta\s+charset\s*=\s*['"]?([^'"\s/>]+)/i,
    /<meta\s+[^>]*http-equiv\s*=\s*['"]content-type['"][^>]*content\s*=\s*['"][^'"]*charset=([^'"\s;]+)/i,
  ];
  for (const pattern of declarationPatterns) {
    const hit = markup.match(pattern);
    if (hit) {
      return String(hit[1]).trim().toLowerCase();
    }
  }
  return '';
}
/**
 * Maps a raw charset label to the label passed to `TextDecoder`.
 *
 * - UTF-8 spellings (`utf8`, `utf-8`, `utf_8`, also when embedded in a longer
 *   label) become 'utf-8'.
 * - The GBK family (`gbk`, `gb2312`, `gb-2312`) becomes 'gbk'.
 * - Anything else is returned lower-cased and unquoted, so labels such as
 *   'utf-16le' are no longer mistaken for UTF-8.
 *
 * @param {string} cs - Raw label, possibly quoted or mixed-case.
 * @returns {string} Normalised label, or '' for empty input.
 */
function normalizeCharset(cs) {
  // Header values may arrive quoted (charset="utf-8"); strip the quotes first.
  const c = String(cs || '')
    .trim()
    .toLowerCase()
    .replace(/^['"]+|['"]+$/g, '');
  if (!c) return '';
  // Match UTF-8 specifically: a bare "utf" test would also swallow utf-16/utf-32.
  if (/utf[-_]?8/.test(c)) return 'utf-8';
  if (c.includes('gbk') || c.includes('gb2312') || c.includes('gb-2312')) return 'gbk';
  return c;
}
/**
 * Decodes a response body to text, trying several charsets in order:
 * the charset from the HTTP header, the charset declared in the HTML `<meta>`,
 * then 'utf-8', then 'gbk'.
 *
 * Each candidate is first tried in strict mode (`fatal: true`), so a charset
 * that does not fit the bytes is rejected and the next one is tried. Only when
 * no candidate decodes cleanly is the first supported candidate used
 * leniently, which substitutes U+FFFD for invalid sequences.
 *
 * @param {ArrayBuffer} buffer - Raw response bytes.
 * @param {string} charsetHint - Charset taken from the response headers ('' if unknown).
 * @param {string} htmlSnippetForMeta - Start of the document, used to find a `<meta>` charset.
 * @returns {string} Decoded text, or '' when nothing could be decoded.
 */
function decodeArrayBuffer(buffer, charsetHint, htmlSnippetForMeta) {
  const headerCharset = normalizeCharset(charsetHint);
  const metaCharset = normalizeCharset(parseCharsetFromHtmlSnippet(htmlSnippetForMeta));
  // Declared charsets first, then the two encodings this site is known to use.
  const candidates = [...new Set([headerCharset, metaCharset, 'utf-8', 'gbk'].filter(Boolean))];

  // Strict pass: a non-fatal decoder never fails, which would make every
  // candidate after the first unreachable and turn undeclared GBK into garbage.
  for (const cs of candidates) {
    try {
      return new TextDecoder(cs, { fatal: true }).decode(buffer);
    } catch {
      // Unsupported label or bytes invalid for this charset: try the next one.
    }
  }

  // Lenient pass: the bytes fit no candidate exactly, so accept replacement characters.
  for (const cs of candidates) {
    try {
      return new TextDecoder(cs).decode(buffer);
    } catch {
      // Unsupported label: try the next one.
    }
  }
  return '';
}
/**
 * Fetches `url` with `GM_xmlhttpRequest` and returns the body decoded as text.
 *
 * The body is requested as an ArrayBuffer and decoded with
 * `decodeArrayBuffer`, using the charset from the response headers and a
 * UTF-8 preview of the first 4096 bytes for `<meta>` charset sniffing.
 *
 * @param {string} url
 * @returns {Promise<string>} Decoded body. Resolves with '' for a non-2xx
 *   status, an empty body, or a decoding failure. Rejects on a network error
 *   or timeout.
 */
function gmRequestText(url) {
  return new Promise((resolve, reject) => {
    GM_xmlhttpRequest({
      method: 'GET',
      url,
      responseType: 'arraybuffer',
      onload: (res) => {
        try {
          // An error page (404/500...) is not the requested document; scraping it
          // would yield bogus "no attachment" text, so treat it as an empty response.
          if (res.status < 200 || res.status >= 300) return resolve('');
          const buf = res.response;
          if (!buf) return resolve('');
          const headersCharset = parseCharsetFromHeaders(res.responseHeaders || '');
          // A lenient UTF-8 preview is enough to read an ASCII <meta charset> tag.
          let preview = '';
          try {
            preview = new TextDecoder('utf-8', { fatal: false }).decode(buf.slice(0, 4096));
          } catch {
            preview = '';
          }
          resolve(decodeArrayBuffer(buf, headersCharset, preview) || '');
        } catch {
          resolve('');
        }
      },
      onerror: (err) => reject(err),
      ontimeout: () => reject(new Error('GM_xmlhttpRequest timeout')),
    });
  });
}
/**
 * Downloads `url` with `GM_xmlhttpRequest` and returns the raw bytes.
 * @param {string} url
 * @returns {Promise<ArrayBuffer>} The response body (an empty buffer when the
 *   body is missing). Rejects with `HTTP <status>` for a non-2xx status, with
 *   the underlying error on a network failure, and with 'timeout' on a timeout.
 */
function gmRequestArrayBuffer(url) {
  return new Promise((resolve, reject) => {
    const handleLoad = (res) => {
      const succeeded = res.status >= 200 && res.status < 300;
      if (!succeeded) {
        reject(new Error(`HTTP ${res.status}`));
        return;
      }
      resolve(res.response || new ArrayBuffer(0));
    };
    const handleError = (err) => {
      reject(err);
    };
    const handleTimeout = () => {
      reject(new Error('timeout'));
    };
    GM_xmlhttpRequest({
      method: 'GET',
      url,
      responseType: 'arraybuffer',
      onload: handleLoad,
      onerror: handleError,
      ontimeout: handleTimeout,
    });
  });
}
/**
 * Fetches a script with `GM_xmlhttpRequest` and executes it by injecting an
 * inline `<script>` element, which is removed again right after insertion.
 * @param {string} url
 * @returns {Promise<void>} Resolves after injection. Rejects with
 *   `HTTP <status>` when the status is not 200, or with the request error.
 */
function loadScript(url) {
  return new Promise((resolve, reject) => {
    const inject = (code) => {
      const tag = document.createElement('script');
      tag.textContent = code;
      const host = document.head || document.documentElement;
      host.appendChild(tag);
      // The inline script has already run once appended; the element is no longer needed.
      tag.remove();
    };
    GM_xmlhttpRequest({
      method: 'GET',
      url,
      onload(res) {
        if (res.status === 200) {
          inject(res.responseText);
          resolve();
        } else {
          reject(new Error(`HTTP ${res.status}`));
        }
      },
      onerror(err) {
        reject(err);
      },
    });
  });
}
/**
 * Makes sure the global `JSZip` constructor is available, loading it from
 * `JSZIP_CDN` on first use, and records success in `state.jszipLoaded`.
 *
 * @returns {Promise<void>} Resolves once `JSZip` is defined. Rejects when the
 *   download fails or when the injected script did not define `JSZip` (for
 *   example because the page blocked the inline script), so callers get a
 *   catchable error instead of a later `ReferenceError` on `new JSZip()`.
 */
function loadJSZip() {
  if (typeof JSZip !== 'undefined') {
    state.jszipLoaded = true;
    return Promise.resolve();
  }
  return loadScript(JSZIP_CDN).then(() => {
    // loadScript resolves as soon as the element was injected; that does not
    // prove the script actually ran, so verify the global before reporting success.
    if (typeof JSZip === 'undefined') {
      throw new Error(`JSZip is not available after loading ${JSZIP_CDN}`);
    }
    state.jszipLoaded = true;
  });
}
/**
 * Derives the parent/child category names of a list page.
 *
 * `parent` is the second non-empty breadcrumb link text (or the only one when
 * there is just one). `child` is the text of the highlighted sidebar link and
 * falls back to `parent` when that is missing or empty.
 *
 * @param {Document} [doc=document]
 * @returns {{ parent: string, child: string }}
 */
function getCategoriesFromDocument(doc = document) {
  const crumbEl = doc.querySelector(SELECTOR.breadcrumb);
  const crumbLabels = crumbEl
    ? Array.from(crumbEl.querySelectorAll('a'))
        .map((link) => textClean(link.textContent))
        .filter(Boolean)
    : [];
  const parent = crumbLabels.length >= 2 ? crumbLabels[1] : crumbLabels[0] ?? '';

  const activeLink = doc.querySelector(SELECTOR.sidebarCurrent);
  const activeLabel = activeLink ? textClean(activeLink.textContent) : '';
  const child = activeLabel || parent;

  return { parent, child };
}
/**
 * Reads pagination data from a list page: the first two `<strong>` elements of
 * the page-info block are parsed as page count and item count, and the
 * `TotalResult` query parameter is taken from any pagination link that
 * carries `PageNo=`.
 *
 * @param {Document} [doc=document]
 * @returns {{ totalPages: number, totalItems: number, totalResult: string }}
 *   `totalPages` defaults to 1, `totalItems` to 0 and `totalResult` to ''.
 */
function getPaginationInfoFromDocument(doc = document) {
  const infoNodes = Array.from(doc.querySelectorAll(SELECTOR.pageInfoStrong));
  const readNumber = (node) => parseInt(textClean(node?.textContent), 10);
  const totalPages = readNumber(infoNodes[0]) || 1;
  const totalItems = readNumber(infoNodes[1]) || 0;

  let totalResult = '';
  const pagerLink = doc.querySelector('.pages a[href*="PageNo="]');
  if (pagerLink) {
    try {
      const linkUrl = new URL(pagerLink.getAttribute('href'), location.href);
      totalResult = linkUrl.searchParams.get('TotalResult') || '';
    } catch {
      // An unparsable link simply leaves totalResult empty.
    }
  }

  return { totalPages, totalItems, totalResult };
}
/**
 * Builds the URL of list page `pageNo` from the current location.
 * Existing `PageNo`/`TotalResult` parameters are dropped; for pages after the
 * first, `TotalResult` (when given) and `PageNo` are appended again.
 * @param {number} pageNo
 * @param {string} totalResult
 * @returns {string}
 */
function buildListPageUrl(pageNo, totalResult) {
  const target = new URL(location.href);
  const query = target.searchParams;
  for (const stale of ['PageNo', 'TotalResult']) {
    query.delete(stale);
  }
  if (!(pageNo > 1)) {
    return target.toString();
  }
  if (totalResult) {
    query.set('TotalResult', String(totalResult));
  }
  query.set('PageNo', String(pageNo));
  return target.toString();
}
/**
 * Collects the entries of a list page.
 *
 * For every list row with a link, the title comes from the link's `title`
 * attribute (falling back to its text), the detail URL from its `href`
 * resolved against `baseUrl`, and the date from the row's date element.
 * Rows without a resolvable URL, or with neither title nor date, are skipped.
 *
 * @param {Document} doc
 * @param {string} baseUrl
 * @returns {Array<{ title: string, publishDate: string, detailUrl: string }>}
 */
function extractListItemsFromDocument(doc, baseUrl) {
  return Array.from(doc.querySelectorAll(SELECTOR.listItems)).flatMap((row) => {
    const link = row.querySelector('a[href]');
    if (!link) {
      return [];
    }
    const title = textClean(link.getAttribute('title') || link.textContent);
    const detailUrl = absUrl(link.getAttribute('href'), baseUrl);
    const dateNode = row.querySelector(SELECTOR.itemDate);
    const publishDate = dateNode ? textClean(dateNode.textContent) : '';
    const isUsable = Boolean(detailUrl) && Boolean(title || publishDate);
    return isUsable ? [{ title, publishDate, detailUrl }] : [];
  });
}
/**
 * Finds attachment links on a detail page.
 *
 * Links are taken from the content container (or the whole document when it
 * is missing) and resolved against `baseUrl`. A link is kept when its path
 * starts with `/uploads/` or ends in a known document/archive/image extension.
 * `javascript:` links, links ending in `#`, links to the page itself, and
 * links matching the list-page or `/a/` patterns are dropped.
 *
 * @param {Document} doc
 * @param {string} baseUrl - URL of the detail page.
 * @returns {string[]} De-duplicated absolute URLs in document order.
 */
function extractDownloadLinksFromDetailDoc(doc, baseUrl) {
  const scope = doc.querySelector(SELECTOR.content) || doc;
  const resolved = [];
  for (const anchor of Array.from(scope.querySelectorAll('a[href]'))) {
    const href = absUrl(anchor.getAttribute('href'), baseUrl);
    if (href && href !== baseUrl) {
      resolved.push(href);
    }
  }

  const isAttachment = (href) => {
    const isPseudoLink = /^javascript:/i.test(href) || href.endsWith('#');
    if (isPseudoLink) {
      return false;
    }
    const isSiteNavigation = /\/plus\/list\.php\b|\/a\//i.test(href);
    if (isSiteNavigation) {
      return false;
    }
    const path = (new URL(href, baseUrl).pathname || '').toLowerCase();
    return (
      path.startsWith('/uploads/') ||
      /\.(pdf|doc|docx|xls|xlsx|ppt|pptx|zip|rar|7z|csv|txt|jpg|jpeg|png)($|\?)/i.test(path)
    );
  };

  const unique = new Set();
  for (const href of resolved) {
    if (isAttachment(href)) {
      unique.add(href);
    }
  }
  return [...unique];
}
/**
 * Returns the cleaned text of the detail page's content container, falling
 * back to `<body>` and then the root element.
 * @param {Document} doc
 * @returns {string} Cleaned text, or '' when no container exists.
 */
function extractMainTextFromDetailDoc(doc) {
  const root = doc.querySelector(SELECTOR.content) || doc.body || doc.documentElement;
  if (!root) {
    return '';
  }
  return textClean(root.textContent);
}
/**
 * Formats one value as a CSV cell: null/undefined become '', and values that
 * contain a quote, comma, CR or LF are wrapped in quotes with inner quotes doubled.
 * @param {*} v
 * @returns {string}
 */
function toCsvCell(v) {
  const cell = String(v ?? '');
  const needsQuoting = /[",\n\r]/.test(cell);
  if (!needsQuoting) {
    return cell;
  }
  const escaped = cell.split('"').join('""');
  return `"${escaped}"`;
}
/**
 * Serialises result rows to CSV text: a UTF-8 BOM, a fixed header line, then
 * one line per row, joined with CRLF.
 * @param {Array<object>} rows - Rows with parentCategory, childCategory, title,
 *   publishDate, detailUrl and downloadUrl properties.
 * @returns {string}
 */
function buildCsv(rows) {
  const toLine = (cells) => cells.map(toCsvCell).join(',');
  const output = [toLine(['所属分类(父)', '所属科室(子)', '文件名称', '发布日期', '详情页URL', '下载直链'])];
  for (const row of rows) {
    output.push(
      toLine([row.parentCategory, row.childCategory, row.title, row.publishDate, row.detailUrl, row.downloadUrl])
    );
  }
  return `\uFEFF${output.join('\r\n')}`;
}
/**
 * Triggers a browser download of `blob` under `filename` by clicking a
 * temporary `<a download>` element; the object URL is revoked 2 seconds later.
 * @param {string} filename
 * @param {Blob} blob
 */
function downloadBlob(filename, blob) {
  const objectUrl = URL.createObjectURL(blob);
  const link = Object.assign(document.createElement('a'), {
    href: objectUrl,
    download: filename,
  });
  document.body.appendChild(link);
  link.click();
  link.remove();
  setTimeout(() => {
    URL.revokeObjectURL(objectUrl);
  }, 2000);
}
/**
 * Wraps `text` in a Blob of the given MIME type and hands it to `downloadBlob`.
 * @param {string} filename
 * @param {string} text
 * @param {string} [mime='text/csv;charset=utf-8']
 */
function downloadTextFile(filename, text, mime = 'text/csv;charset=utf-8') {
  const payload = new Blob([text], { type: mime });
  downloadBlob(filename, payload);
}
/**
 * Creates a pill-shaped, fixed-position button in the brand colours that is
 * semi-transparent until hovered.
 * @param {string} text - Button label.
 * @returns {HTMLButtonElement}
 */
function createButton(text) {
  const IDLE_OPACITY = '0.7';
  const HOVER_OPACITY = '1.0';

  const button = document.createElement('button');
  button.type = 'button';
  button.textContent = text;
  button.setAttribute('data-hau-crawler', 'btn');

  const look = {
    position: 'fixed',
    right: '18px',
    bottom: '18px',
    zIndex: 99999,
    background: BRAND.primary,
    color: BRAND.text,
    border: '0',
    borderRadius: '999px',
    padding: '12px 16px',
    fontSize: '14px',
    cursor: 'pointer',
    opacity: IDLE_OPACITY,
    boxShadow: '0 8px 22px rgba(0,0,0,.18)',
  };
  Object.assign(button.style, look);

  button.addEventListener('mouseenter', () => {
    button.style.opacity = HOVER_OPACITY;
  });
  button.addEventListener('mouseleave', () => {
    button.style.opacity = IDLE_OPACITY;
  });
  return button;
}
/**
 * Builds the (initially hidden) progress panel: a title that doubles as a drag
 * handle, a status line and a progress bar.
 *
 * @returns {{
 *   el: HTMLDivElement,
 *   setVisible: (v: boolean) => void,
 *   setProgress: (p: { phase: string, pageNo: number, totalPages: number, done: number, total: number, extra: string }) => void
 * }} The panel element plus functions to show/hide it and to update its text and bar.
 */
function createProgressOverlay() {
  const makeDiv = (styles) => {
    const node = document.createElement('div');
    Object.assign(node.style, styles);
    return node;
  };

  const wrap = makeDiv({
    position: 'fixed',
    left: '50%',
    top: '22%',
    transform: 'translateX(-50%)',
    zIndex: 99999,
    width: '420px',
    maxWidth: '92vw',
    background: 'rgba(1,39,101,0.92)',
    color: BRAND.text,
    borderRadius: '12px',
    padding: '14px 16px',
    boxShadow: '0 12px 32px rgba(0,0,0,.22)',
    display: 'none',
  });
  wrap.setAttribute('data-hau-crawler', 'progress');

  const title = makeDiv({
    fontSize: '14px',
    fontWeight: '600',
    cursor: 'move',
    userSelect: 'none',
    paddingBottom: '4px',
  });
  title.textContent = '抓取进度';

  const desc = makeDiv({ fontSize: '12px', marginTop: '8px', opacity: '0.95' });
  desc.textContent = '';

  const barBg = makeDiv({
    width: '100%',
    height: '8px',
    borderRadius: '999px',
    background: 'rgba(255,255,255,0.22)',
    marginTop: '10px',
    overflow: 'hidden',
  });
  const bar = makeDiv({
    width: '0%',
    height: '100%',
    borderRadius: '999px',
    background: 'rgba(255,255,255,0.95)',
  });
  barBg.appendChild(bar);

  for (const section of [title, desc, barBg]) {
    wrap.appendChild(section);
  }

  // --- Dragging: the panel follows the pointer while the title is held down. ---
  const EDGE_MARGIN = 6;
  // Pointer offset inside the panel while a drag is active, otherwise null.
  let grab = null;
  const clamp = (value, upper) => Math.max(EDGE_MARGIN, Math.min(upper, value));

  function handlePointerMove(e) {
    if (grab === null) return;
    const maxLeft = window.innerWidth - wrap.offsetWidth - EDGE_MARGIN;
    const maxTop = window.innerHeight - wrap.offsetHeight - EDGE_MARGIN;
    wrap.style.left = `${clamp(e.clientX - grab.x, maxLeft)}px`;
    wrap.style.top = `${clamp(e.clientY - grab.y, maxTop)}px`;
    // Explicit pixel coordinates replace the initial centring transform.
    wrap.style.transform = 'none';
  }

  function handlePointerUp() {
    grab = null;
    document.removeEventListener('mousemove', handlePointerMove);
    document.removeEventListener('mouseup', handlePointerUp);
  }

  title.addEventListener('mousedown', (e) => {
    const box = wrap.getBoundingClientRect();
    grab = { x: e.clientX - box.left, y: e.clientY - box.top };
    document.addEventListener('mousemove', handlePointerMove);
    document.addEventListener('mouseup', handlePointerUp);
    e.preventDefault();
  });

  function setVisible(v) {
    wrap.style.display = v ? 'block' : 'none';
  }

  function setProgress({ phase, pageNo, totalPages, done, total, extra }) {
    const percent = total > 0 ? Math.min(100, Math.round((done / total) * 100)) : 0;
    bar.style.width = `${percent}%`;

    const pagePart = totalPages ? `页 ${pageNo}/${totalPages}` : '';
    let line = pagePart ? `${phase} · ${pagePart}` : `${phase}`;
    line += ` · ${done}/${total}`;
    if (extra) {
      line += ` · ${extra}`;
    }
    desc.textContent = line;
  }

  return { el: wrap, setVisible, setProgress };
}
/**
 * Crawls every list page of the current category and each linked detail page,
 * and returns one result row per attachment link.
 *
 * Page 1 is read from the live document; later pages are fetched with
 * `gmRequestText`. Detail pages are fetched through `runWithConcurrency` with
 * `CONCURRENCY_DETAIL` simultaneous requests. An entry without attachments
 * still yields one row with an empty `downloadUrl`, and its page text is
 * collected in `noAttachment`. When `state.abort` is set the crawl stops early
 * and returns what has been gathered so far.
 *
 * Rows are returned in list order (page order, then position on the page),
 * independent of which detail request happens to finish first.
 *
 * @param {{ setVisible: (v: boolean) => void, setProgress: (p: object) => void }} progress
 * @returns {Promise<{ rows: object[], noAttachment: object[], parent: string, child: string }>}
 */
async function crawlCollectRows(progress) {
  const { parent, child } = getCategoriesFromDocument(document);
  const { totalPages, totalItems, totalResult } = getPaginationInfoFromDocument(document);
  progress.setVisible(true);
  progress.setProgress({ phase: '准备中', pageNo: 1, totalPages, done: 0, total: totalItems || 1, extra: '' });

  const allListItems = [];
  for (let pageNo = 1; pageNo <= totalPages; pageNo += 1) {
    if (state.abort) break;
    progress.setProgress({ phase: '抓取列表', pageNo, totalPages, done: allListItems.length, total: totalItems || 1, extra: '静默请求分页' });
    if (pageNo === 1) {
      // The first page is already loaded; read it directly instead of serialising and re-parsing it.
      allListItems.push(...extractListItemsFromDocument(document, location.href));
      continue;
    }
    const pageUrl = buildListPageUrl(pageNo, totalResult);
    let html = '';
    try {
      html = await gmRequestText(pageUrl);
    } catch (err) {
      // One unreachable list page must not discard the pages already collected.
      console.warn(`[HAU crawler] failed to fetch list page ${pageNo}: ${pageUrl}`, err);
    }
    if (html) {
      allListItems.push(...extractListItemsFromDocument(parseHtml(html), pageUrl));
    }
  }

  const uniqueItems = Array.from(new Map(allListItems.map((it) => [it.detailUrl, it])).values());

  // Results are stored per item index so the output order does not depend on
  // the completion order of the concurrent detail requests.
  const rowsByItem = new Array(uniqueItems.length);
  const textByItem = new Array(uniqueItems.length);

  await runWithConcurrency(
    uniqueItems,
    CONCURRENCY_DETAIL,
    async (it, index) => {
      const baseRow = { parentCategory: parent, childCategory: child, title: it.title, publishDate: it.publishDate, detailUrl: it.detailUrl };
      let html = '';
      try {
        html = await gmRequestText(it.detailUrl);
      } catch {
        // An unreachable detail page is still reported, just without a download link.
      }
      if (!html) {
        rowsByItem[index] = [{ ...baseRow, downloadUrl: '' }];
        return;
      }
      const detailDoc = parseHtml(html);
      const downloadLinks = extractDownloadLinksFromDetailDoc(detailDoc, it.detailUrl);
      if (downloadLinks.length > 0) {
        rowsByItem[index] = downloadLinks.map((link) => ({ ...baseRow, downloadUrl: link }));
        return;
      }
      const mainText = extractMainTextFromDetailDoc(detailDoc);
      if (mainText) {
        textByItem[index] = { title: it.title, publishDate: it.publishDate, detailUrl: it.detailUrl, text: mainText };
      }
      rowsByItem[index] = [{ ...baseRow, downloadUrl: '' }];
    },
    {
      onProgress: (done, total) =>
        progress.setProgress({ phase: '解析详情(并行)', pageNo: totalPages, totalPages, done, total, extra: `${CONCURRENCY_DETAIL} 路并发` }),
    }
  );

  // Slots of items that were never processed (abort, task failure) stay empty and are dropped here.
  const rows = rowsByItem.filter(Boolean).flat();
  const noAttachment = textByItem.filter(Boolean);

  const rowUniq = new Map();
  for (const r of rows) {
    const key = `${r.detailUrl}@@${r.downloadUrl}`;
    if (!rowUniq.has(key)) rowUniq.set(key, r);
  }
  return { rows: Array.from(rowUniq.values()), noAttachment, parent, child };
}
/**
 * Downloads the crawl result as a CSV file and, when some entries had no
 * attachment, a second text file containing the body text of those entries.
 * @param {{ setProgress: (p: object) => void }} progress
 * @param {{ rows: object[], noAttachment: object[], parent: string, child: string }} data
 * @returns {Promise<void>}
 */
async function exportCsv(progress, { rows, noAttachment, parent, child }) {
  progress.setProgress({
    phase: '生成CSV',
    pageNo: 1,
    totalPages: 1,
    done: rows.length,
    total: rows.length,
    extra: state.abort ? '已取消' : '完成',
  });

  const namePrefix = `${sanitizeFilenamePart(parent)}-${sanitizeFilenamePart(child)}`;
  downloadTextFile(`${namePrefix}-文件列表.csv`, buildCsv(rows));

  if (noAttachment.length === 0) {
    return;
  }

  const count = noAttachment.length;
  const txtLines = [];
  noAttachment.forEach((entry, position) => {
    txtLines.push(
      '============================================================',
      `【${position + 1}/${count}】标题:${entry.title || ''}`,
      `日期:${entry.publishDate || ''}`,
      `URL:${entry.detailUrl || ''}`,
      '------------------------------------------------------------',
      '正文:',
      '',
      entry.text || '',
      ''
    );
  });
  downloadTextFile(`${namePrefix}-无附件正文.txt`, '\uFEFF' + txtLines.join('\r\n'), 'text/plain;charset=utf-8');
}
/**
 * Downloads every attachment in `rows`, packs the files into one ZIP archive
 * and triggers its download.
 *
 * Files are named after the entry title plus the extension taken from the
 * download URL; name clashes get `_1`, `_2`, ... suffixes. A failed download
 * is recorded in the archive as `<name>.failed.txt`. Nothing is saved when
 * there are no attachments, when JSZip cannot be loaded, or when the run was
 * aborted during the downloads.
 *
 * @param {{ setProgress: (p: object) => void }} progress
 * @param {{ rows: object[], parent: string, child: string }} data
 * @returns {Promise<void>}
 */
async function downloadAndZip(progress, { rows, parent, child }) {
  // Every update in this function reports a single "page", so only the rest varies.
  const report = (phase, done, total, extra) =>
    progress.setProgress({ phase, pageNo: 1, totalPages: 1, done, total, extra });

  const attachments = rows.filter((row) => row.downloadUrl);
  if (attachments.length === 0) {
    report('下载ZIP', 0, 0, '无附件,跳过打包');
    await sleep(400);
    return;
  }

  report('加载 JSZip', 0, 1, '');
  try {
    await loadJSZip();
  } catch {
    report('错误', 0, 1, 'JSZip 加载失败');
    await sleep(2000);
    return;
  }

  const archive = new JSZip();
  const takenNames = new Set();

  // Reserves an archive path for a row; must run synchronously so that
  // concurrently started downloads never receive the same name.
  const claimName = (row, index) => {
    const stem = sanitizeFilenamePart(row.title) || `file_${index}`;
    const ext = getExtensionFromUrl(row.downloadUrl) || '';
    let candidate = stem + ext;
    for (let suffix = 1; takenNames.has(candidate); suffix += 1) {
      candidate = `${stem}_${suffix}${ext}`;
    }
    takenNames.add(candidate);
    return candidate;
  };

  const fetchIntoArchive = async (row, index) => {
    const entryPath = claimName(row, index);
    let bytes;
    try {
      bytes = await gmRequestArrayBuffer(row.downloadUrl);
    } catch {
      archive.file(entryPath + '.failed.txt', `下载失败: ${row.downloadUrl}`);
      return;
    }
    archive.file(entryPath, bytes);
  };

  await runWithConcurrency(attachments, CONCURRENCY_DOWNLOAD, fetchIntoArchive, {
    onProgress: (done, total) => report('下载文件(并行)', done, total, `${CONCURRENCY_DOWNLOAD} 路并发`),
  });
  if (state.abort) {
    return;
  }

  report('打包ZIP', attachments.length, attachments.length, '');
  const zipBlob = await archive.generateAsync({ type: 'blob' });
  downloadBlob(`${sanitizeFilenamePart(parent)}-${sanitizeFilenamePart(child)}-附件.zip`, zipBlob);
  await sleep(400);
}
/**
 * Runs one complete crawl followed by the requested export.
 *
 * Does nothing when a run is already in progress. Otherwise it marks the run
 * as active, clears the abort flag, crawls, and (unless aborted) exports as
 * CSV for mode 'csv' or as ZIP for any other mode. The overlay is hidden and
 * the running flag cleared afterwards, even on failure.
 *
 * @param {{ setVisible: (v: boolean) => void }} progress
 * @param {string} mode - 'csv' for CSV export; anything else downloads a ZIP.
 * @returns {Promise<void>}
 */
async function runMode(progress, mode) {
  if (state.running) {
    return;
  }
  Object.assign(state, { running: true, abort: false });
  try {
    const collected = await crawlCollectRows(progress);
    if (!state.abort) {
      const exporter = mode === 'csv' ? exportCsv : downloadAndZip;
      await exporter(progress, collected);
    }
  } finally {
    progress.setVisible(false);
    state.running = false;
  }
}
/**
 * Mounts the crawler UI on a matching list page: the progress overlay plus a
 * bottom-right stack with the "download ZIP" and "export CSV" buttons.
 *
 * Does nothing when the page is not a target list page or when the UI is
 * already mounted. While a run is active both buttons read "取消" (cancel);
 * clicking either one sets `state.abort`.
 */
function mountUi() {
  if (!isTargetListPage(document)) return;
  if (document.querySelector('[data-hau-crawler="wrap"]')) return;
  const progress = createProgressOverlay();
  document.body.appendChild(progress.el);
  const wrap = document.createElement('div');
  wrap.setAttribute('data-hau-crawler', 'wrap');
  Object.assign(wrap.style, {
    position: 'fixed',
    right: '18px',
    bottom: '18px',
    zIndex: 99999,
    display: 'flex',
    flexDirection: 'column',
    gap: '8px',
  });
  const btnCsv = createButton('导出CSV');
  const btnZip = createButton('下载ZIP');
  for (const btn of [btnZip, btnCsv]) {
    // createButton pins every button to the same viewport corner with
    // position:fixed. Offsetting one of them by a hard-coded 36px left the two
    // overlapping (each is taller than that), so put them back into normal
    // flow and let the wrapper's flex column and gap do the stacking.
    btn.style.position = 'static';
  }
  function setCancel(v) {
    btnCsv.textContent = v ? '取消' : '导出CSV';
    btnZip.textContent = v ? '取消' : '下载ZIP';
  }
  function onClick(mode) {
    if (state.running) {
      state.abort = true;
      setCancel(false);
      return;
    }
    setCancel(true);
    runMode(progress, mode)
      .catch((err) => {
        // Without this handler a failed run surfaces only as an unhandled rejection.
        console.error('[HAU crawler] run failed', err);
      })
      .finally(() => setCancel(false));
  }
  btnCsv.addEventListener('click', () => onClick('csv'));
  btnZip.addEventListener('click', () => onClick('zip'));
  wrap.appendChild(btnZip);
  wrap.appendChild(btnCsv);
  document.body.appendChild(wrap);
}
mountUi();
})();
版权属于:soarli
本文链接:https://blog.soarli.top/archives/805.html
转载时须注明出处及本声明。