把公众号内容抓取到hexo博客上

发表于 2025-07-14 | 分类于：折腾

一、下载微信公众号的文章

首先使用开源工具 wechatDownload（https://github.com/qiye45/wechatDownload）

把微信公众号的文章全部下载下来，虽然支持md格式，但是有以下问题：

1. 格式不符合 Hexo 博客的标准（文件开头缺少标题、日期等 Front Matter）
2. md 文档内的图片地址仍然指向微信公众号的服务器，需要把图片迁移到本地保存

二、转换格式

第一个问题很好解决：写一个批处理脚本，在每个 md 文件开头加入包含标题和日期的 Front Matter。脚本从文件名中提取这两项信息——文件名的前 8 位数字作为日期，其余部分作为标题；至于 tags、categories 这些就暂时留空，不考虑了。

将以下代码保存为 convert_to_hexo.bat，放到存放 md 文件的目录下运行：

@echo off
rem Converts downloaded WeChat articles in the current directory into Hexo posts.
rem Each input file must be named with an 8-digit date followed by the title and
rem the .md extension. The script prepends a Hexo front matter header and writes
rem the result to hexo_posts, and creates one empty folder per post in post_images.
rem Known limit: file names containing an exclamation mark are not supported
rem because delayed expansion is enabled.
chcp 65001 >nul
setlocal enabledelayedexpansion

echo 正在转换公众号文章为Hexo格式...
echo.

rem Create the output directories.
if not exist "hexo_posts" mkdir "hexo_posts"
if not exist "post_images" mkdir "post_images"

rem Scratch file holding the front matter. It must not end in .md, otherwise the
rem loop below could enumerate it as if it were an article.
set "temp_file=hexo_temp.tmp"

rem Number of files converted successfully, reported at the end.
set /a count=0

rem Process every .md file in the current directory.
for %%f in (*.md) do (
    set "fullname=%%f"
    set "name=%%~nf"

    rem The first 8 characters of the file name are expected to be the date.
    set "fdate=!name:~0,8!"

    rem Shape check only: it accepts months 00-19 and days 00-39.
    echo !fdate!|findstr /r "^[0-9][0-9][0-9][0-9][0-1][0-9][0-3][0-9]$">nul

    if errorlevel 1 (
        echo 文件 [!name!] 跳过 - 无效的日期格式
    ) else (
        rem Everything after the 8-digit date is the title.
        set "title=!name:~8!"

        rem Create the image folder that belongs to this post.
        if not exist "post_images\!name!" mkdir "post_images\!name!"

        rem Reformat the date as YYYY-MM-DD.
        set "hexo_date=!fdate:~0,4!-!fdate:~4,2!-!fdate:~6,2!"

        rem Write the front matter. The title is double-quoted so that names which
        rem start with a bracket or contain a hash sign stay valid YAML. A Windows
        rem file name cannot contain a double quote or a backslash, so no escaping
        rem is needed inside the quotes.
        (
            echo ---
            echo title: "!title!"
            echo date: !hexo_date! 12:00:00
            echo tags: []
            echo categories: []
            echo ---
            echo.
        ) > "%temp_file%"

        rem Binary-concatenate the front matter and the original article.
        copy /b "%temp_file%"+"!fullname!" "hexo_posts\!fullname!" >nul

        if errorlevel 1 (
            echo 文件 [!name!] 转换失败
        ) else (
            set /a count+=1
            echo 已转换: !name!
        )
    )
)

rem Remove the scratch file.
if exist "%temp_file%" del "%temp_file%"

echo.
echo 转换完成！共处理了 %count% 个文件
echo Hexo格式文章保存在 hexo_posts 文件夹
echo 图片文件夹保存在 post_images 文件夹
pause

三、保存图片

第二个问题的思路是：把每个 md 文件里引用的图片下载到本地并重新命名，然后把 md 文档里的图片地址替换成本地路径。这一步通过 Node.js 脚本来实现。

处理图片时又遇到如下问题

1.后面获取的图片会把前面的图片覆盖掉

2. 公众号头像之类的图片也会被一并抓取；由于每篇文章的图片文件夹相互独立，同一张头像会在每篇文章的文件夹里重复保存一次。下面的脚本把在多处出现的图片统一保存到 shared 文件夹，避免重复保存。

将以下代码保存为 download-images4.js，放到 md 文件所在的目录下，然后执行 node download-images4.js：

const fs = require('fs');
const path = require('path');
const https = require('https');
const http = require('http');
const crypto = require('crypto');

// Matches Markdown image syntax `![alt](http...)`. Capture group 1 is the URL,
// taken lazily up to the first `)` after the opening parenthesis.
const IMG_REGEX = /!\[.*?\]\((http.*?)\)/g;

// File extension to use for each supported image MIME type.
const mimeExtensions = {
  'image/jpeg': '.jpg',
  'image/png': '.png',
  'image/gif': '.gif',
  'image/webp': '.webp',
  'image/svg+xml': '.svg',
  'image/bmp': '.bmp',
  'image/tiff': '.tiff'
};

/**
 * Maps a MIME type (or a raw Content-Type header value) to a file extension.
 *
 * Parameters such as `; charset=binary` are ignored and the comparison is
 * case-insensitive. Only own keys of `mimeExtensions` are considered, so a
 * value like `constructor` cannot resolve to an inherited property.
 *
 * @param {string} [mime] - MIME type or Content-Type header value.
 * @returns {string} The matching extension including the dot, or `.jpg` when
 *   the type is missing or unknown.
 */
function getExtensionFromMime(mime) {
  const essence = String(mime ?? '').split(';')[0].trim().toLowerCase();
  const isKnown = Object.prototype.hasOwnProperty.call(mimeExtensions, essence);
  return isKnown ? mimeExtensions[essence] : '.jpg';
}

/**
 * Downloads an image and stores it as `savePath` plus an extension derived
 * from the response's Content-Type header.
 *
 * Redirects (3xx with a Location header) are followed up to `redirectsLeft`
 * times. On any failure the partially written file is removed and the promise
 * rejects with an `Error`.
 *
 * @param {string} url - Absolute http(s) URL of the image.
 * @param {string} savePath - Destination path WITHOUT extension.
 * @param {number} [redirectsLeft=5] - Remaining redirects that may be followed.
 * @returns {Promise<string>} Resolves with the full path of the written file.
 */
function downloadImage(url, savePath, redirectsLeft = 5) {
  return new Promise((resolve, reject) => {
    const mod = url.startsWith('https') ? https : http;

    const request = mod.get(url, (res) => {
      const { statusCode, headers } = res;

      if (statusCode >= 300 && statusCode < 400 && headers.location) {
        // Drain the body so the socket is released before following the redirect.
        res.resume();
        if (redirectsLeft <= 0) {
          reject(new Error('重定向次数过多'));
          return;
        }
        let target;
        try {
          // Location may be relative; resolve it against the current URL.
          target = new URL(headers.location, url).href;
        } catch (err) {
          reject(err);
          return;
        }
        downloadImage(target, savePath, redirectsLeft - 1).then(resolve, reject);
        return;
      }

      if (statusCode !== 200) {
        // An unread response keeps the connection busy; discard it.
        res.resume();
        reject(new Error(`状态码 ${statusCode}`));
        return;
      }

      // Strip parameters such as "; charset=..." before the extension lookup.
      const mime = (headers['content-type'] || '').split(';')[0].trim().toLowerCase();
      const filePath = savePath + getExtensionFromMime(mime);
      const fileStream = fs.createWriteStream(filePath);

      let settled = false;
      const fail = (err) => {
        if (settled) return;
        settled = true;
        res.destroy();
        fileStream.destroy();
        // Remove the incomplete file; a failure to unlink is not reported.
        fs.unlink(filePath, () => reject(err));
      };

      res.on('error', fail);
      res.on('close', () => {
        // `complete` stays false when the connection dropped mid-transfer.
        if (!res.complete) fail(new Error('连接中断'));
      });
      fileStream.on('error', fail);
      fileStream.on('finish', () => {
        if (settled) return;
        settled = true;
        fileStream.close(() => resolve(filePath));
      });

      res.pipe(fileStream);
    });

    request.on('error', reject);
  });
}

/**
 * Scans Markdown files for remote image links.
 *
 * @param {Iterable<string>} mdFiles - Paths of the Markdown files to read.
 * @returns {{ urlCountMap: Map<string, number>, fileMap: Map<string, string[]> }}
 *   `fileMap` holds, per file, every URL matched by `IMG_REGEX` in document
 *   order (duplicates kept). `urlCountMap` holds how many times each URL was
 *   matched across all files.
 */
function collectImageURLs(mdFiles) {
  const fileMap = new Map();
  const urlCountMap = new Map();

  for (const mdPath of mdFiles) {
    const text = fs.readFileSync(mdPath, 'utf8');
    const links = Array.from(text.matchAll(IMG_REGEX), (match) => match[1]);
    fileMap.set(mdPath, links);

    for (const link of links) {
      if (link.startsWith('http')) {
        const seenSoFar = urlCountMap.get(link) || 0;
        urlCountMap.set(link, seenSoFar + 1);
      }
    }
  }

  return { urlCountMap, fileMap };
}

/**
 * Lists the extension-less names of the entries already present in `dir`.
 *
 * @param {string} dir - Directory to inspect; it may not exist yet.
 * @returns {Set<string>} Names in use (empty when `dir` does not exist).
 */
function listUsedBaseNames(dir) {
  if (!fs.existsSync(dir)) return new Set();
  return new Set(fs.readdirSync(dir).map((entry) => path.parse(entry).name));
}

/**
 * Reserves the first `img<N>` name with N >= `startIndex` that is not in
 * `usedNames`, and records it there.
 *
 * @param {Set<string>} usedNames - Names already taken; mutated.
 * @param {number} startIndex - First index to try.
 * @returns {{ name: string, nextIndex: number }} The reserved name and the
 *   index to start from on the next call.
 */
function reserveImageName(usedNames, startIndex) {
  let index = startIndex;
  while (usedNames.has(`img${index}`)) index += 1;
  const name = `img${index}`;
  usedNames.add(name);
  return { name, nextIndex: index + 1 };
}

/**
 * Downloads every remote image referenced by the given Markdown files and
 * rewrites the files to point at the local copies.
 *
 * URLs counted more than once in `urlCountMap` are stored once under
 * `./shared`; all other URLs go to a folder named after the Markdown file.
 * Names that already exist on disk are never reused, so running the script
 * again after a partial failure cannot overwrite earlier downloads. A failed
 * download is logged and its URL is left untouched in the document.
 *
 * @param {Iterable<string>} mdFiles - Markdown file paths to process.
 * @param {Map<string, number>} urlCountMap - Occurrence count per URL.
 * @param {Map<string, string[]>} fileMap - URLs found in each file.
 * @returns {Promise<void>}
 */
async function processFiles(mdFiles, urlCountMap, fileMap) {
  const sharedDir = './shared';
  fs.mkdirSync(sharedDir, { recursive: true });

  const sharedMap = new Map(); // URL -> local relative path
  const sharedNames = listUsedBaseNames(sharedDir);
  let sharedIndex = 1;

  for (const file of mdFiles) {
    const content = fs.readFileSync(file, 'utf8');
    const baseName = path.basename(file, '.md');
    const folder = `./${baseName}`;
    const urls = fileMap.get(file) ?? [];

    const replacements = new Map(); // URL -> local relative path, this file only
    let folderNames = null; // filled when the first exclusive image is seen
    let uniqueIndex = 1;

    for (const url of urls) {
      if (!url.startsWith('http') || replacements.has(url)) continue;

      if (urlCountMap.get(url) > 1) {
        // Image used in several places: download once into the shared folder.
        if (!sharedMap.has(url)) {
          const reserved = reserveImageName(sharedNames, sharedIndex);
          sharedIndex = reserved.nextIndex;
          try {
            const savedPath = await downloadImage(url, path.join(sharedDir, reserved.name));
            const relPath = `shared/${path.basename(savedPath)}`;
            sharedMap.set(url, relPath);
            console.log(`✅ [共享] 下载 ${url} -> ${relPath}`);
          } catch (err) {
            console.log(`❌ 下载失败（共享）${url}: ${err}`);
            continue;
          }
        }
        replacements.set(url, sharedMap.get(url));
      } else {
        // Image used only here: store it in this article's own folder, which
        // is created only once it is actually needed.
        if (folderNames === null) {
          fs.mkdirSync(folder, { recursive: true });
          folderNames = listUsedBaseNames(folder);
        }
        const reserved = reserveImageName(folderNames, uniqueIndex);
        uniqueIndex = reserved.nextIndex;
        try {
          const savedPath = await downloadImage(url, path.join(folder, reserved.name));
          const newPath = `${baseName}/${path.basename(savedPath)}`;
          replacements.set(url, newPath);
          console.log(`✅ [独占] 下载 ${url} -> ${newPath}`);
        } catch (err) {
          console.log(`❌ 下载失败（独占）${url}: ${err}`);
          continue;
        }
      }
    }

    // Replace longer URLs first so that a URL which is a prefix of another
    // cannot corrupt the longer one.
    const ordered = [...replacements.keys()].sort((a, b) => b.length - a.length);
    let newContent = content;
    for (const url of ordered) {
      const localPath = replacements.get(url);
      // A replacer function keeps `$&`-style sequences in the path literal.
      newContent = newContent.replaceAll(url, () => localPath);
    }

    if (newContent !== content) fs.writeFileSync(file, newContent, 'utf8');
    console.log(`📄 完成处理：${file}`);
  }
}

// Entry point: process every Markdown file in the current working directory.
// Directories whose name happens to end in `.md` are skipped. Any unexpected
// failure is reported and turns into a non-zero exit code instead of an
// unhandled promise rejection.
(async () => {
  const mdFiles = fs
    .readdirSync('.', { withFileTypes: true })
    .filter((entry) => !entry.isDirectory() && entry.name.endsWith('.md'))
    .map((entry) => entry.name);
  const { urlCountMap, fileMap } = collectImageURLs(mdFiles);
  await processFiles(mdFiles, urlCountMap, fileMap);
  console.log(`✅ 全部完成！`);
})().catch((err) => {
  console.error(`❌ 处理失败: ${err}`);
  process.exitCode = 1;
});