# xhs_crawler/process_xhs_note.py

import pandas as pd
import json
import os

def excel_to_json(excel_file='input.xlsx', output_dir='sheet_notes',
                  column='笔记标题'):
    """Export one column of every worksheet in a workbook to JSON files.

    Each sheet is parsed with its second row as the header (the first row is
    skipped). For every sheet that contains ``column``, the non-null values
    of that column are written as a JSON list to
    ``<output_dir>/<sheet name>.json``. Sheets without the column are
    reported with a warning on stdout and skipped.

    Args:
        excel_file: Path of the workbook to read.
        output_dir: Directory receiving the JSON files; created if missing.
        column: Header text of the column to export.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(output_dir, exist_ok=True)

    # Open the workbook once. The context manager releases the file handle,
    # and xl.parse() reuses the opened workbook instead of re-reading the
    # file from disk for every sheet.
    with pd.ExcelFile(excel_file) as xl:
        for sheet_name in xl.sheet_names:
            # header=1: skip the first row and use the second row as header.
            df = xl.parse(sheet_name, header=1)

            if column not in df.columns:
                print(f'警告: {sheet_name} 中没有找到"{column}"列')
                continue

            # Drop empty cells so the JSON list holds only real values.
            notes = df[column].dropna().tolist()

            output_file = os.path.join(output_dir, f'{sheet_name}.json')
            with open(output_file, 'w', encoding='utf-8') as f:
                # ensure_ascii=False keeps non-ASCII text readable in the file.
                json.dump(notes, f, ensure_ascii=False, indent=4)

            print(f'已保存 {sheet_name} 的{column}到 {output_file}')

# Script entry point: run the conversion only when this file is executed
# directly, not when it is imported as a module.
if __name__ == '__main__':
    excel_to_json()