soarli

HAU Finance File Downloader Crawler
基于v0.1.0版本修改,加入自动重命名、打包zip、多线程等技术优化用户体验,油猴脚本如下:// ==UserS...
扫描右侧二维码阅读全文
08
2026/02

HAU Finance File Downloader Crawler

基于 v0.1.0 版本修改,加入自动重命名、打包 ZIP、并发请求等功能以优化用户体验,油猴脚本如下:

// ==UserScript==
// @name         HAU Finance File Downloader Crawler
// @namespace    https://cwc.henau.edu.cn/
// @version      0.3.0
// @description  Crawl file listings, extract download links, export CSV or download all files as ZIP.
// @match        https://cwc.henau.edu.cn/plus/list.php?tid=*
// @match        https://cwc.henau.edu.cn/plus/list.php*tid=*
// @grant        GM_xmlhttpRequest
// @connect      cwc.henau.edu.cn
// @connect      cdn.jsdelivr.net
// ==/UserScript==

(function () {
  'use strict';

  // Colour palette. `primary` and `text` are read by createButton, and `text`
  // by createProgressOverlay; `gray` is not referenced in the visible code.
  const BRAND = {
    primary: '#012765',
    text: '#ffffff',
    gray: '#6b7280',
  };

  // CSS selectors for the page elements this script reads on list and detail pages.
  const SELECTOR = {
    listContainer: '.list_box',
    listItems: '.list_box ul li',
    itemDate: '.datetime',
    pages: '.pages',
    pageInfoStrong: '.pages .pageinfo strong',
    breadcrumb: '.page_pos',
    sidebarCurrent: '.list_l a.cur',
    content: '.content',
  };

  // Lower/upper bounds (milliseconds) for a randomized delay.
  // NOTE(review): not referenced anywhere in the visible code.
  const DELAY = { minMs: 500, maxMs: 1500 };
  const JSZIP_CDN = 'https://cdn.jsdelivr.net/npm/jszip@3.10.1/dist/jszip.min.js';
  /** Concurrency used when parsing detail pages (number of detail pages requested at once). */
  const CONCURRENCY_DETAIL = 5;
  /** Concurrency used when downloading files (number of files downloaded at once). */
  const CONCURRENCY_DOWNLOAD = 6;

  // Shared run state: `running` guards against overlapping runs (runMode),
  // `abort` is set by the cancel click and polled by the crawl/download loops,
  // `jszipLoaded` is set by loadJSZip (never read in the visible code).
  const state = { running: false, abort: false, jszipLoaded: false };

  /**
   * Concurrency pool: runs taskFn over every entry of `items`, keeping at most
   * `concurrency` tasks in flight. Stops handing out new items once
   * `state.abort` is set; tasks already started are allowed to finish.
   * @param {Array} items
   * @param {number} concurrency
   * @param {(item: T, index: number) => Promise<void>} taskFn
   * @param {{ onProgress?: (done: number, total: number) => void }} opts
   */
  async function runWithConcurrency(items, concurrency, taskFn, { onProgress } = {}) {
    const total = items.length;
    const laneCount = Math.min(concurrency, total);
    let cursor = 0;
    let finished = 0;

    // Each lane repeatedly claims the next unclaimed index until none remain.
    const drainLane = async () => {
      for (;;) {
        if (cursor >= total || state.abort) return;
        const index = cursor;
        cursor += 1;
        try {
          await taskFn(items[index], index);
        } catch (_) {
          // A single failure must not stop the pool; taskFn handles its own errors.
        } finally {
          finished += 1;
          if (onProgress) onProgress(finished, total);
        }
      }
    };

    const lanes = Array.from({ length: laneCount }, () => drainLane());
    await Promise.all(lanes);
  }

  /**
   * Tells whether the current location is a `/plus/list.php?...tid=<n>` URL
   * and `doc` contains both the list container and the pagination element.
   * @param {Document} [doc=document]
   * @returns {boolean}
   */
  function isTargetListPage(doc = document) {
    const href = location.href;
    const looksLikeList = /\/plus\/list\.php\?/.test(href) && /[?&]tid=\d+/.test(href);
    if (!looksLikeList) return false;
    const listNode = doc.querySelector(SELECTOR.listContainer);
    const pagerNode = listNode && doc.querySelector(SELECTOR.pages);
    return Boolean(pagerNode);
  }

  /**
   * Returns a promise that resolves (with no value) after `ms` milliseconds.
   * @param {number} ms
   * @returns {Promise<void>}
   */
  function sleep(ms) {
    return new Promise((wake) => {
      setTimeout(wake, ms);
    });
  }

  /**
   * Picks a random whole number of milliseconds between `minMs` and `maxMs`
   * (both inclusive for integer bounds).
   * @param {number} minMs
   * @param {number} maxMs
   * @returns {number}
   */
  function randomDelayMs(minMs, maxMs) {
    const span = maxMs - minMs + 1;
    const offset = Math.random() * span;
    return Math.floor(minMs + offset);
  }

  /**
   * Stringifies `s` (falsy values become ''), collapses every whitespace run
   * to a single space, and trims both ends.
   * @param {*} s
   * @returns {string}
   */
  function textClean(s) {
    const raw = String(s || '');
    const collapsed = raw.replace(/\s+/g, ' ');
    return collapsed.trim();
  }

  /**
   * Turns arbitrary text into a safe filename fragment: characters that are
   * illegal in Windows filenames become '-', all whitespace is removed, and
   * the result is capped at 80 UTF-16 code units.
   *
   * If the cap falls between the two halves of a surrogate pair, the orphaned
   * high surrogate is dropped so the name never ends in an unpaired surrogate.
   * @param {*} s
   * @returns {string}
   */
  function sanitizeFilenamePart(s) {
    const MAX_LEN = 80;
    const cleaned = textClean(s)
      .replace(/[\\/:*?"<>|]/g, '-')
      .replace(/\s+/g, '');
    const clipped = cleaned.slice(0, MAX_LEN);

    // charCodeAt yields NaN when out of range, which makes every comparison false.
    const lastUnit = clipped.charCodeAt(clipped.length - 1);
    const nextUnit = cleaned.charCodeAt(MAX_LEN);
    const cutInsidePair =
      lastUnit >= 0xd800 && lastUnit <= 0xdbff && nextUnit >= 0xdc00 && nextUnit <= 0xdfff;
    return cutInsidePair ? clipped.slice(0, -1) : clipped;
  }

  /**
   * Extracts the lowercase file extension (with leading dot) from the path of
   * `url`, resolved against the current location. Returns '' when the path
   * has no trailing alphanumeric extension or the URL cannot be parsed.
   * @param {string} url
   * @returns {string}
   */
  function getExtensionFromUrl(url) {
    let path;
    try {
      path = new URL(url, location.href).pathname || '';
    } catch {
      return '';
    }
    const hit = /\.([a-z0-9]+)(?:\?|$)/i.exec(path);
    if (!hit) return '';
    return '.' + hit[1].toLowerCase();
  }

  /**
   * Parses an HTML string into a document via DOMParser ('text/html').
   * @param {string} html
   * @returns {Document}
   */
  function parseHtml(html) {
    const parser = new DOMParser();
    return parser.parseFromString(html, 'text/html');
  }

  /**
   * Resolves `url` against `base` and returns the absolute URL string,
   * or '' when the pair cannot be parsed as a URL.
   * @param {string} url
   * @param {string} base
   * @returns {string}
   */
  function absUrl(url, base) {
    let resolved;
    try {
      resolved = new URL(url, base).toString();
    } catch {
      resolved = '';
    }
    return resolved;
  }

  /**
   * Pulls the charset out of the Content-Type line of a raw response-header
   * string. Returns the lowercase charset label, or '' when none is declared.
   *
   * Handles a quoted parameter value (`charset="GBK"`) and a charset that is
   * not the first parameter (`text/html; foo=bar; charset=gbk`). The match is
   * confined to the Content-Type line itself.
   * @param {string} headersText
   * @returns {string}
   */
  function parseCharsetFromHeaders(headersText) {
    const m = String(headersText || '').match(
      /content-type\s*:[^\r\n]*?;\s*charset\s*=\s*["']?([^\s;"'\r\n]+)/i
    );
    return m ? String(m[1]).trim().toLowerCase() : '';
  }

  /**
   * Looks for a charset declaration in an HTML fragment: first
   * `<meta charset=...>`, then the `<meta http-equiv="content-type" ...>` form.
   * Returns the lowercase label, or '' when neither is present.
   * @param {string} htmlSnippet
   * @returns {string}
   */
  function parseCharsetFromHtmlSnippet(htmlSnippet) {
    const html = String(htmlSnippet || '');
    const patterns = [
      /<meta\s+charset\s*=\s*['"]?([^'"\s/>]+)/i,
      /<meta\s+[^>]*http-equiv\s*=\s*['"]content-type['"][^>]*content\s*=\s*['"][^'"]*charset=([^'"\s;]+)/i,
    ];
    for (const pattern of patterns) {
      const hit = html.match(pattern);
      if (hit) return String(hit[1]).trim().toLowerCase();
    }
    return '';
  }

  /**
   * Maps a charset label onto the names used for decoding: anything containing
   * 'utf' becomes 'utf-8'; gbk / gb2312 / gb-2312 become 'gbk'; any other
   * non-empty label is returned lowercased; empty input returns ''.
   * @param {string} cs
   * @returns {string}
   */
  function normalizeCharset(cs) {
    const label = String(cs || '').toLowerCase();
    if (label === '') return '';
    if (label.includes('utf')) return 'utf-8';
    const gbAliases = ['gbk', 'gb2312', 'gb-2312'];
    return gbAliases.some((alias) => label.includes(alias)) ? 'gbk' : label;
  }

  /**
   * Decodes raw response bytes to text, trying charsets in this order:
   * the header charset, the charset declared in the HTML snippet's <meta>,
   * then 'utf-8', then 'gbk' (duplicates removed).
   *
   * Each candidate is first tried in strict mode (`fatal: true`), so bytes
   * that are invalid for one charset fall through to the next. A lenient
   * decoder never throws on bad bytes, so without the strict pass the first
   * candidate would always win and the fallbacks would never run. If no
   * candidate decodes strictly, the candidates are retried leniently in the
   * same order so damaged pages still yield text.
   * @param {ArrayBuffer|ArrayBufferView} buffer
   * @param {string} charsetHint charset taken from the response headers, if any
   * @param {string} htmlSnippetForMeta leading part of the document, used to sniff <meta>
   * @returns {string} decoded text, or '' when nothing could be decoded
   */
  function decodeArrayBuffer(buffer, charsetHint, htmlSnippetForMeta) {
    const headerCharset = normalizeCharset(charsetHint);
    const metaCharset = normalizeCharset(parseCharsetFromHtmlSnippet(htmlSnippetForMeta));
    const candidates = [...new Set([headerCharset, metaCharset, 'utf-8', 'gbk'].filter(Boolean))];

    // Pass 1 is strict, pass 2 is lenient.
    for (const fatal of [true, false]) {
      for (const cs of candidates) {
        try {
          const txt = new TextDecoder(cs, { fatal }).decode(buffer);
          if (txt) return txt;
        } catch {
          // Unknown label (RangeError) or invalid bytes for this charset (TypeError):
          // move on to the next candidate.
        }
      }
    }
    return '';
  }

  /**
   * GETs `url` through GM_xmlhttpRequest as raw bytes and decodes them to text
   * with charset detection (response headers, then a <meta> sniff of the first
   * 4096 bytes). Resolves '' when the body is empty or decoding throws;
   * rejects on network error or timeout. The HTTP status is not inspected.
   * @param {string} url
   * @returns {Promise<string>}
   */
  function gmRequestText(url) {
    // Lenient UTF-8 view of the head of the document, only used to find <meta charset>.
    const sniffHead = (bytes) => {
      try {
        return new TextDecoder('utf-8', { fatal: false }).decode(bytes.slice(0, 4096));
      } catch {
        return '';
      }
    };

    const decodeResponse = (res) => {
      const body = res.response;
      if (!body) return '';
      const headerCharset = parseCharsetFromHeaders(res.responseHeaders || '');
      return decodeArrayBuffer(body, headerCharset, sniffHead(body)) || '';
    };

    return new Promise((resolve, reject) => {
      GM_xmlhttpRequest({
        method: 'GET',
        url,
        responseType: 'arraybuffer',
        onload(res) {
          let text = '';
          try {
            text = decodeResponse(res);
          } catch {
            text = '';
          }
          resolve(text);
        },
        onerror: (err) => reject(err),
        ontimeout: () => reject(new Error('GM_xmlhttpRequest timeout')),
      });
    });
  }

  /**
   * GETs `url` through GM_xmlhttpRequest and resolves with the raw response
   * bytes (an empty ArrayBuffer when the body is missing). Rejects with
   * `HTTP <status>` for non-2xx responses, and on network error or timeout.
   * @param {string} url
   * @returns {Promise<ArrayBuffer>}
   */
  function gmRequestArrayBuffer(url) {
    return new Promise((resolve, reject) => {
      const onload = (res) => {
        const isSuccess = res.status >= 200 && res.status < 300;
        if (!isSuccess) {
          reject(new Error(`HTTP ${res.status}`));
          return;
        }
        resolve(res.response || new ArrayBuffer(0));
      };
      GM_xmlhttpRequest({
        method: 'GET',
        url,
        responseType: 'arraybuffer',
        onload,
        onerror: (err) => reject(err),
        ontimeout: () => reject(new Error('timeout')),
      });
    });
  }

  /**
   * Fetches a script's source through GM_xmlhttpRequest and runs it by
   * appending an inline <script> element, which is removed again right after.
   * Resolves once the element has been injected; rejects with `HTTP <status>`
   * when the status is not exactly 200, or on network error.
   * @param {string} url
   * @returns {Promise<void>}
   */
  function loadScript(url) {
    const injectInline = (code) => {
      const tag = document.createElement('script');
      tag.textContent = code;
      const host = document.head || document.documentElement;
      host.appendChild(tag);
      tag.remove();
    };

    return new Promise((resolve, reject) => {
      GM_xmlhttpRequest({
        method: 'GET',
        url,
        onload(res) {
          if (res.status === 200) {
            injectInline(res.responseText);
            resolve();
          } else {
            reject(new Error(`HTTP ${res.status}`));
          }
        },
        onerror: (err) => reject(err),
      });
    });
  }

  /**
   * Makes sure the JSZip global is available, loading it from JSZIP_CDN when
   * it is not defined yet, and records that in `state.jszipLoaded`.
   * @returns {Promise<void>}
   */
  function loadJSZip() {
    const markLoaded = () => {
      state.jszipLoaded = true;
    };
    if (typeof JSZip === 'undefined') {
      return loadScript(JSZIP_CDN).then(markLoaded);
    }
    markLoaded();
    return Promise.resolve();
  }

  /**
   * Reads the category names for the current listing.
   * `parent` is the second non-empty breadcrumb link (or the only one when
   * there is just one); `child` is the highlighted sidebar entry, falling
   * back to `parent` when that is missing or blank.
   * @param {Document} [doc=document]
   * @returns {{ parent: string, child: string }}
   */
  function getCategoriesFromDocument(doc = document) {
    const crumbNode = doc.querySelector(SELECTOR.breadcrumb);
    const crumbLabels = crumbNode
      ? Array.from(crumbNode.querySelectorAll('a'), (link) => textClean(link.textContent)).filter(Boolean)
      : [];
    const parent = crumbLabels.length >= 2 ? crumbLabels[1] : crumbLabels[0] || '';

    const activeNode = doc.querySelector(SELECTOR.sidebarCurrent);
    const child = (activeNode && textClean(activeNode.textContent)) || parent;
    return { parent, child };
  }

  /**
   * Reads pagination data from a list page: the first two <strong> elements in
   * the page-info area give the page count (default 1) and item count
   * (default 0); `totalResult` is the TotalResult query parameter of any
   * pager link containing "PageNo=" ('' when absent or unparsable).
   * @param {Document} [doc=document]
   * @returns {{ totalPages: number, totalItems: number, totalResult: string }}
   */
  function getPaginationInfoFromDocument(doc = document) {
    const [pagesNode, itemsNode] = Array.from(doc.querySelectorAll(SELECTOR.pageInfoStrong));
    const readInt = (node, fallback) => parseInt(textClean(node?.textContent), 10) || fallback;
    const totalPages = readInt(pagesNode, 1);
    const totalItems = readInt(itemsNode, 0);

    let totalResult = '';
    const pagerLink = doc.querySelector('.pages a[href*="PageNo="]');
    if (pagerLink) {
      try {
        const linkUrl = new URL(pagerLink.getAttribute('href'), location.href);
        totalResult = linkUrl.searchParams.get('TotalResult') || '';
      } catch {}
    }
    return { totalPages, totalItems, totalResult };
  }

  /**
   * Builds the URL of list page `pageNo` from the current location. Existing
   * PageNo/TotalResult parameters are dropped; for pages after the first,
   * TotalResult (when non-empty) and PageNo are appended again.
   * @param {number} pageNo
   * @param {string} totalResult
   * @returns {string}
   */
  function buildListPageUrl(pageNo, totalResult) {
    const target = new URL(location.href);
    const params = target.searchParams;
    for (const key of ['PageNo', 'TotalResult']) params.delete(key);

    // Page 1 is addressed by the bare listing URL.
    if (!(pageNo > 1)) return target.toString();

    if (totalResult) params.set('TotalResult', String(totalResult));
    params.set('PageNo', String(pageNo));
    return target.toString();
  }

  /**
   * Collects the entries of a list page. For each list item with a link it
   * yields the title (the link's `title` attribute, else its text), the
   * publish date text, and the absolute detail URL. Items with no resolvable
   * URL, or with neither title nor date, are skipped.
   * @param {Document} doc
   * @param {string} baseUrl URL used to resolve relative links
   * @returns {Array<{ title: string, publishDate: string, detailUrl: string }>}
   */
  function extractListItemsFromDocument(doc, baseUrl) {
    return Array.from(doc.querySelectorAll(SELECTOR.listItems)).flatMap((entry) => {
      const link = entry.querySelector('a[href]');
      if (!link) return [];

      const title = textClean(link.getAttribute('title') || link.textContent);
      const detailUrl = absUrl(link.getAttribute('href'), baseUrl);
      const dateNode = entry.querySelector(SELECTOR.itemDate);
      const publishDate = dateNode ? textClean(dateNode.textContent) : '';

      const usable = detailUrl && (title || publishDate);
      return usable ? [{ title, publishDate, detailUrl }] : [];
    });
  }

  /**
   * Finds attachment links on a detail page. Looks inside the content
   * container (whole document when absent), resolves every link against
   * `baseUrl`, and keeps those whose path starts with /uploads/ or ends in a
   * known file extension. `javascript:` links, links ending in '#', list-page
   * links and links containing `/a/` are excluded. Result is de-duplicated in
   * first-seen order.
   * @param {Document} doc
   * @param {string} baseUrl
   * @returns {string[]}
   */
  function extractDownloadLinksFromDetailDoc(doc, baseUrl) {
    const scope = doc.querySelector(SELECTOR.content) || doc;
    const fileExtension = /\.(pdf|doc|docx|xls|xlsx|ppt|pptx|zip|rar|7z|csv|txt|jpg|jpeg|png)($|\?)/i;

    const isAttachment = (href) => {
      if (/^javascript:/i.test(href)) return false;
      if (href.endsWith('#')) return false;
      if (/\/plus\/list\.php\b|\/a\//i.test(href)) return false;
      const path = (new URL(href, baseUrl).pathname || '').toLowerCase();
      return path.startsWith('/uploads/') || fileExtension.test(path);
    };

    const found = new Set();
    for (const anchor of Array.from(scope.querySelectorAll('a[href]'))) {
      const href = absUrl(anchor.getAttribute('href'), baseUrl);
      if (!href || href === baseUrl) continue;
      if (isAttachment(href)) found.add(href);
    }
    return [...found];
  }

  /**
   * Returns the whitespace-normalized text of the detail page's content
   * container, falling back to <body> and then the root element; '' when
   * none of them exists.
   * @param {Document} doc
   * @returns {string}
   */
  function extractMainTextFromDetailDoc(doc) {
    const root = doc.querySelector(SELECTOR.content) || doc.body || doc.documentElement;
    if (!root) return '';
    return textClean(root.textContent);
  }

  /**
   * Formats one value as a CSV cell. null/undefined become ''; values
   * containing a quote, comma or line break are wrapped in double quotes with
   * embedded quotes doubled.
   * @param {*} v
   * @returns {string}
   */
  function toCsvCell(v) {
    const raw = String(v ?? '');
    const needsQuoting = /[",\n\r]/.test(raw);
    if (!needsQuoting) return raw;
    const escaped = raw.split('"').join('""');
    return '"' + escaped + '"';
  }

  /**
   * Serializes result rows to CSV text: a UTF-8 BOM, a fixed six-column header
   * line, then one line per row, joined with CRLF.
   * @param {Array<object>} rows rows carrying parentCategory, childCategory,
   *   title, publishDate, detailUrl and downloadUrl
   * @returns {string}
   */
  function buildCsv(rows) {
    const header = ['所属分类(父)', '所属科室(子)', '文件名称', '发布日期', '详情页URL', '下载直链'];
    const toLine = (cells) => cells.map(toCsvCell).join(',');
    const table = [header];
    for (const row of rows) {
      table.push([row.parentCategory, row.childCategory, row.title, row.publishDate, row.detailUrl, row.downloadUrl]);
    }
    return '\uFEFF' + table.map((cells) => toLine(cells)).join('\r\n');
  }

  /**
   * Triggers a browser download of `blob` under `filename` by clicking a
   * temporary <a download> element. The object URL is revoked 2 seconds later.
   * @param {string} filename
   * @param {Blob} blob
   */
  function downloadBlob(filename, blob) {
    const objectUrl = URL.createObjectURL(blob);
    const link = document.createElement('a');
    Object.assign(link, { href: objectUrl, download: filename });

    document.body.appendChild(link);
    link.click();
    link.remove();

    // Revoke later rather than immediately, as the original does.
    setTimeout(() => {
      URL.revokeObjectURL(objectUrl);
    }, 2000);
  }

  /**
   * Wraps `text` in a Blob of the given MIME type and hands it to downloadBlob.
   * @param {string} filename
   * @param {string} text
   * @param {string} [mime='text/csv;charset=utf-8']
   */
  function downloadTextFile(filename, text, mime = 'text/csv;charset=utf-8') {
    const payload = new Blob([text], { type: mime });
    downloadBlob(filename, payload);
  }

  /**
   * Creates a pill-shaped floating button with the given label. The button is
   * styled inline (fixed at the bottom-right corner) and is semi-transparent
   * until hovered.
   * @param {string} text button label
   * @returns {HTMLButtonElement}
   */
  function createButton(text) {
    const restOpacity = '0.7';
    const hoverOpacity = '1.0';

    const button = document.createElement('button');
    button.type = 'button';
    button.textContent = text;
    button.setAttribute('data-hau-crawler', 'btn');

    const css = {
      position: 'fixed',
      right: '18px',
      bottom: '18px',
      zIndex: 99999,
      background: BRAND.primary,
      color: BRAND.text,
      border: '0',
      borderRadius: '999px',
      padding: '12px 16px',
      fontSize: '14px',
      cursor: 'pointer',
      opacity: restOpacity,
      boxShadow: '0 8px 22px rgba(0,0,0,.18)',
    };
    for (const [prop, value] of Object.entries(css)) button.style[prop] = value;

    const opacitySetter = (value) => () => {
      button.style.opacity = value;
    };
    button.addEventListener('mouseenter', opacitySetter(hoverOpacity));
    button.addEventListener('mouseleave', opacitySetter(restOpacity));
    return button;
  }

  /**
   * Builds the progress overlay: a fixed-position panel containing a title,
   * a one-line description and a progress bar. The panel starts hidden and can
   * be dragged by its title. The element is created but NOT attached to the
   * document; the caller appends `el`.
   * @returns {{
   *   el: HTMLDivElement,
   *   setVisible: (v: boolean) => void,
   *   setProgress: (p: { phase: string, pageNo: number, totalPages: number, done: number, total: number, extra?: string }) => void
   * }}
   */
  function createProgressOverlay() {
    const wrap = document.createElement('div');
    wrap.setAttribute('data-hau-crawler', 'progress');
    // Horizontally centred via left:50% + translateX(-50%); hidden until setVisible(true).
    Object.assign(wrap.style, {
      position: 'fixed',
      left: '50%',
      top: '22%',
      transform: 'translateX(-50%)',
      zIndex: 99999,
      width: '420px',
      maxWidth: '92vw',
      background: 'rgba(1,39,101,0.92)',
      color: BRAND.text,
      borderRadius: '12px',
      padding: '14px 16px',
      boxShadow: '0 12px 32px rgba(0,0,0,.22)',
      display: 'none',
    });
    // Title row; doubles as the drag handle (cursor: move).
    const title = document.createElement('div');
    title.textContent = '抓取进度';
    Object.assign(title.style, { fontSize: '14px', fontWeight: '600', cursor: 'move', userSelect: 'none', paddingBottom: '4px' });
    // Status line, filled in by setProgress.
    const desc = document.createElement('div');
    desc.textContent = '';
    Object.assign(desc.style, { fontSize: '12px', marginTop: '8px', opacity: '0.95' });
    // Progress bar: `barBg` is the track, `bar` the fill whose width is set in percent.
    const barBg = document.createElement('div');
    Object.assign(barBg.style, { width: '100%', height: '8px', borderRadius: '999px', background: 'rgba(255,255,255,0.22)', marginTop: '10px', overflow: 'hidden' });
    const bar = document.createElement('div');
    Object.assign(bar.style, { width: '0%', height: '100%', borderRadius: '999px', background: 'rgba(255,255,255,0.95)' });
    barBg.appendChild(bar);
    wrap.appendChild(title);
    wrap.appendChild(desc);
    wrap.appendChild(barBg);

    // Drag state: pointer offset inside the panel, captured on mousedown.
    let dragging = false;
    let dragOffsetX = 0;
    let dragOffsetY = 0;
    function onMouseMove(e) {
      if (!dragging) return;
      // Clamp so the panel stays at least 6px inside the viewport on every side.
      wrap.style.left = Math.max(6, Math.min(window.innerWidth - wrap.offsetWidth - 6, e.clientX - dragOffsetX)) + 'px';
      wrap.style.top = Math.max(6, Math.min(window.innerHeight - wrap.offsetHeight - 6, e.clientY - dragOffsetY)) + 'px';
      // Once positioned in pixels, the centring transform must be removed.
      wrap.style.transform = 'none';
    }
    function onMouseUp() {
      dragging = false;
      // Listeners exist only for the duration of a drag.
      document.removeEventListener('mousemove', onMouseMove);
      document.removeEventListener('mouseup', onMouseUp);
    }
    title.addEventListener('mousedown', (e) => {
      dragging = true;
      const rect = wrap.getBoundingClientRect();
      dragOffsetX = e.clientX - rect.left;
      dragOffsetY = e.clientY - rect.top;
      document.addEventListener('mousemove', onMouseMove);
      document.addEventListener('mouseup', onMouseUp);
      e.preventDefault();
    });

    return {
      el: wrap,
      // Shows or hides the panel.
      setVisible(v) {
        wrap.style.display = v ? 'block' : 'none';
      },
      // Updates the bar (done/total as a percentage capped at 100; 0 when total is not positive)
      // and the status line: "<phase> · 页 <pageNo>/<totalPages> · <done>/<total>[ · <extra>]".
      // The page part is omitted when totalPages is falsy.
      setProgress({ phase, pageNo, totalPages, done, total, extra }) {
        const p = total > 0 ? Math.min(100, Math.round((done / total) * 100)) : 0;
        bar.style.width = `${p}%`;
        const pagePart = totalPages ? `页 ${pageNo}/${totalPages}` : '';
        desc.textContent = extra ? `${phase}${pagePart ? ' · ' + pagePart : ''} · ${done}/${total} · ${extra}` : `${phase}${pagePart ? ' · ' + pagePart : ''} · ${done}/${total}`;
      },
    };
  }

  /**
   * Crawls the whole listing and resolves every entry's attachments.
   *
   * Phase 1 walks list pages 1..totalPages. The live document is reused only
   * for the page the user is actually on (taken from the `PageNo` query
   * parameter, default 1); every other page is fetched. A list page that fails
   * to load is logged and skipped instead of aborting the crawl.
   *
   * Phase 2 fetches the detail pages concurrently. Results are stored per item
   * index and flattened afterwards, so output follows list order rather than
   * request-completion order. An entry yields one row per download link, or a
   * single row with an empty `downloadUrl` when the page has no links or could
   * not be fetched. Pages without links also contribute their body text to
   * `noAttachment`.
   *
   * Both phases stop early once `state.abort` is set.
   * @param {{ setVisible: Function, setProgress: Function }} progress overlay controller
   * @returns {Promise<{ rows: Array<object>, noAttachment: Array<object>, parent: string, child: string }>}
   */
  async function crawlCollectRows(progress) {
    const { parent, child } = getCategoriesFromDocument(document);
    const { totalPages, totalItems, totalResult } = getPaginationInfoFromDocument(document);

    // Work out which list page the live document is, so it is not mistaken for page 1.
    let currentPageNo = 1;
    try {
      currentPageNo = parseInt(new URL(location.href).searchParams.get('PageNo'), 10) || 1;
    } catch {
      currentPageNo = 1;
    }

    progress.setVisible(true);
    progress.setProgress({ phase: '准备中', pageNo: 1, totalPages, done: 0, total: totalItems || 1, extra: '' });

    const allListItems = [];
    for (let pageNo = 1; pageNo <= totalPages; pageNo += 1) {
      if (state.abort) break;
      progress.setProgress({ phase: '抓取列表', pageNo, totalPages, done: allListItems.length, total: totalItems || 1, extra: '静默请求分页' });

      if (pageNo === currentPageNo) {
        allListItems.push(...extractListItemsFromDocument(document, location.href));
        continue;
      }

      const pageUrl = buildListPageUrl(pageNo, totalResult);
      let html = '';
      try {
        html = await gmRequestText(pageUrl);
      } catch (err) {
        // Best effort, like the detail phase: one bad page must not lose the rest.
        console.warn('[HAU crawler] list page request failed:', pageUrl, err);
      }
      if (html) allListItems.push(...extractListItemsFromDocument(parseHtml(html), pageUrl));
    }

    // De-duplicate by detail URL (first position wins, last value wins).
    const uniqueItems = Array.from(new Map(allListItems.map((it) => [it.detailUrl, it])).values());

    // One slot per item; slots of items never started (abort) stay empty.
    const perItem = new Array(uniqueItems.length);

    await runWithConcurrency(
      uniqueItems,
      CONCURRENCY_DETAIL,
      async (it, index) => {
        const makeRow = (downloadUrl) => ({
          parentCategory: parent,
          childCategory: child,
          title: it.title,
          publishDate: it.publishDate,
          detailUrl: it.detailUrl,
          downloadUrl,
        });
        const entry = { rows: [], note: null };
        perItem[index] = entry;

        let html = '';
        try {
          html = await gmRequestText(it.detailUrl);
        } catch (err) {
          console.warn('[HAU crawler] detail page request failed:', it.detailUrl, err);
        }
        if (!html) {
          entry.rows.push(makeRow(''));
          return;
        }

        const detailDoc = parseHtml(html);
        const downloadLinks = extractDownloadLinksFromDetailDoc(detailDoc, it.detailUrl);
        if (downloadLinks.length > 0) {
          for (const link of downloadLinks) entry.rows.push(makeRow(link));
          return;
        }

        const mainText = extractMainTextFromDetailDoc(detailDoc);
        if (mainText) entry.note = { title: it.title, publishDate: it.publishDate, detailUrl: it.detailUrl, text: mainText };
        entry.rows.push(makeRow(''));
      },
      {
        onProgress: (done, total) =>
          progress.setProgress({ phase: '解析详情(并行)', pageNo: totalPages, totalPages, done, total, extra: `${CONCURRENCY_DETAIL} 路并发` }),
      }
    );

    // Flatten in list order, de-duplicating rows by (detailUrl, downloadUrl).
    const rowUniq = new Map();
    const noAttachment = [];
    for (const entry of perItem) {
      if (!entry) continue;
      for (const r of entry.rows) {
        const key = `${r.detailUrl}@@${r.downloadUrl}`;
        if (!rowUniq.has(key)) rowUniq.set(key, r);
      }
      if (entry.note) noAttachment.push(entry.note);
    }
    return { rows: Array.from(rowUniq.values()), noAttachment, parent, child };
  }

  /**
   * Downloads the crawl result as a CSV file and, when some entries had no
   * attachment, a second plain-text file with the body text of those entries.
   * Both file names are prefixed with "<parent>-<child>".
   * @param {{ setProgress: Function }} progress overlay controller
   * @param {{ rows: Array<object>, noAttachment: Array<object>, parent: string, child: string }} data
   * @returns {Promise<void>}
   */
  async function exportCsv(progress, { rows, noAttachment, parent, child }) {
    const rowCount = rows.length;
    progress.setProgress({
      phase: '生成CSV',
      pageNo: 1,
      totalPages: 1,
      done: rowCount,
      total: rowCount,
      extra: state.abort ? '已取消' : '完成',
    });

    const csvName = `${sanitizeFilenamePart(parent)}-${sanitizeFilenamePart(child)}-文件列表.csv`;
    downloadTextFile(csvName, buildCsv(rows));
    if (noAttachment.length === 0) return;

    // One nine-line section per entry without attachment.
    const noteCount = noAttachment.length;
    const txtLines = [];
    noAttachment.forEach((note, position) => {
      txtLines.push(
        '============================================================',
        `【${position + 1}/${noteCount}】标题:${note.title || ''}`,
        `日期:${note.publishDate || ''}`,
        `URL:${note.detailUrl || ''}`,
        '------------------------------------------------------------',
        '正文:',
        '',
        note.text || '',
        ''
      );
    });
    const txtName = `${sanitizeFilenamePart(parent)}-${sanitizeFilenamePart(child)}-无附件正文.txt`;
    downloadTextFile(txtName, '\uFEFF' + txtLines.join('\r\n'), 'text/plain;charset=utf-8');
  }

  /**
   * Downloads every row's attachment concurrently, packs the files into a ZIP
   * named "<parent>-<child>-附件.zip", and triggers its download.
   *
   * Entries are named after the row title plus the URL's extension; name
   * clashes get a numeric suffix. A failed download is recorded as a
   * "<name>.failed.txt" entry. Nothing is packed when no row has a download
   * URL, when JSZip cannot be loaded, or when the run was aborted.
   * @param {{ setProgress: Function }} progress overlay controller
   * @param {{ rows: Array<object>, parent: string, child: string }} data
   * @returns {Promise<void>}
   */
  async function downloadAndZip(progress, { rows, parent, child }) {
    // Every status update in this function shares pageNo/totalPages = 1.
    const report = (phase, done, total, extra) =>
      progress.setProgress({ phase, pageNo: 1, totalPages: 1, done, total, extra });

    const targets = rows.filter((row) => row.downloadUrl);
    if (targets.length === 0) {
      report('下载ZIP', 0, 0, '无附件,跳过打包');
      await sleep(400);
      return;
    }

    report('加载 JSZip', 0, 1, '');
    try {
      await loadJSZip();
    } catch (e) {
      report('错误', 0, 1, 'JSZip 加载失败');
      await sleep(2000);
      return;
    }

    const archive = new JSZip();
    const takenNames = new Set();

    // Reserves a unique entry name for a row: "<title><ext>", then "<title>_1<ext>", ...
    const claimEntryName = (row, index) => {
      const stem = sanitizeFilenamePart(row.title) || `file_${index}`;
      const ext = getExtensionFromUrl(row.downloadUrl) || '';
      let candidate = stem + ext;
      for (let suffix = 1; takenNames.has(candidate); suffix += 1) {
        candidate = `${stem}_${suffix}${ext}`;
      }
      takenNames.add(candidate);
      return candidate;
    };

    const fetchIntoArchive = async (row, index) => {
      const entryName = claimEntryName(row, index);
      try {
        archive.file(entryName, await gmRequestArrayBuffer(row.downloadUrl));
      } catch {
        archive.file(entryName + '.failed.txt', `下载失败: ${row.downloadUrl}`);
      }
    };

    await runWithConcurrency(targets, CONCURRENCY_DOWNLOAD, fetchIntoArchive, {
      onProgress: (done, total) => report('下载文件(并行)', done, total, `${CONCURRENCY_DOWNLOAD} 路并发`),
    });

    if (state.abort) return;
    report('打包ZIP', targets.length, targets.length, '');
    const blob = await archive.generateAsync({ type: 'blob' });
    downloadBlob(`${sanitizeFilenamePart(parent)}-${sanitizeFilenamePart(child)}-附件.zip`, blob);
    await sleep(400);
  }

  /**
   * Runs one full job: crawl, then either export CSV (`mode === 'csv'`) or
   * download a ZIP (any other mode). Does nothing while another run is active.
   * The export step is skipped if the run was aborted during the crawl. The
   * overlay is hidden and `state.running` cleared on every exit path.
   * @param {{ setVisible: Function, setProgress: Function }} progress overlay controller
   * @param {string} mode 'csv' or anything else for ZIP
   * @returns {Promise<void>}
   */
  async function runMode(progress, mode) {
    if (state.running) return;
    Object.assign(state, { running: true, abort: false });
    try {
      const data = await crawlCollectRows(progress);
      if (state.abort) return;
      const exportStep = mode === 'csv' ? exportCsv : downloadAndZip;
      await exportStep(progress, data);
    } finally {
      progress.setVisible(false);
      state.running = false;
    }
  }

  /**
   * Mounts the crawler UI on a matching list page: the progress overlay plus a
   * bottom-right stack with the "下载ZIP" and "导出CSV" buttons. Does nothing on
   * other pages or when the UI is already mounted.
   *
   * While a run is active both buttons read "取消"; clicking either one then
   * requests an abort instead of starting a new run.
   */
  function mountUi() {
    if (!isTargetListPage(document)) return;
    if (document.querySelector('[data-hau-crawler="wrap"]')) return;

    const progress = createProgressOverlay();
    document.body.appendChild(progress.el);

    const wrap = document.createElement('div');
    wrap.setAttribute('data-hau-crawler', 'wrap');
    Object.assign(wrap.style, {
      position: 'fixed',
      right: '18px',
      bottom: '18px',
      zIndex: 99999,
      display: 'flex',
      flexDirection: 'column',
      gap: '8px',
    });

    const btnCsv = createButton('导出CSV');
    const btnZip = createButton('下载ZIP');
    // createButton styles each button as position:fixed, which takes it out of the
    // wrapper's flex flow, so the column layout and gap above would have no effect and
    // hard-coded offsets would be needed. Put the buttons back in flow so the fixed
    // wrapper does the positioning and the gap does the spacing.
    for (const btn of [btnZip, btnCsv]) {
      Object.assign(btn.style, { position: 'static', right: 'auto', bottom: 'auto' });
    }

    function setCancel(v) {
      btnCsv.textContent = v ? '取消' : '导出CSV';
      btnZip.textContent = v ? '取消' : '下载ZIP';
    }

    function onClick(mode) {
      if (state.running) {
        // Second click during a run: request cancellation; the run winds down on its own.
        state.abort = true;
        setCancel(false);
        return;
      }
      setCancel(true);
      runMode(progress, mode)
        .catch((err) => {
          // Without a handler a failed run surfaces as an unhandled promise rejection.
          console.error('[HAU crawler] run failed:', err);
        })
        .finally(() => setCancel(false));
    }

    btnCsv.addEventListener('click', () => onClick('csv'));
    btnZip.addEventListener('click', () => onClick('zip'));
    wrap.appendChild(btnZip);
    wrap.appendChild(btnCsv);
    document.body.appendChild(wrap);
  }

  mountUi();
})();

附:油猴下载链接

最后修改:2026 年 02 月 08 日 07 : 10 AM

发表评论