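"""Crawl Xiaohongshu (xhs) notes for each search keyword loaded from the
sheet_notes JSON files and insert new notes into the xhs_notes MySQL table.
Keywords that already have notes in the database are skipped."""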
import json
import os
from data.xhs.json.import_xhs_notes import connect_to_database, create_table, check_record_exists
# from check_downloads import check_media_files
from mysql.connector import Error
import time
import asyncio
import subprocess
from datetime import datetime
import random


async def _run_crawler(keyword):
    """Asynchronous implementation of a single crawler run."""
    try:
        process = await asyncio.create_subprocess_exec(
            'python', 'main.py',
            '--platform', 'xhs',
            '--lt', 'qrcode',
            '--keywords', keyword,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
            limit=1024*1024
        )

        # Read an output stream in chunks and report crawler progress
        async def read_stream(stream):
            buffer = ""
            while True:
                chunk = await stream.read(8192)
                if not chunk:
                    break
                text = chunk.decode('utf-8', errors='ignore')
                buffer += text

                # Emit complete lines as progress output
                while '\n' in buffer:
                    line, buffer = buffer.split('\n', 1)
                    if line.strip():
                        print(f"Crawler progress: {line.strip()}")

        # Consume stdout and stderr concurrently
        await asyncio.gather(
            read_stream(process.stdout),
            read_stream(process.stderr)
        )

        await process.wait()
        return process.returncode == 0

    except Exception as e:
        print(f"Crawler execution error: {str(e)}")
        return False


def load_search_keywords():
    """Load search keywords from the sheet_notes folder."""
    keywords_dict = {}
    json_dir = './data/xhs/json/sheet_notes'

    for json_file in os.listdir(json_dir):
        if not json_file.endswith('.json'):
            continue

        sheet_name = os.path.splitext(json_file)[0]
        with open(os.path.join(json_dir, json_file), 'r', encoding='utf-8') as f:
            keywords = json.load(f)

        # 修护.json: start from the 12th element
        # if sheet_name == '修护':
        #     keywords = keywords[11:]

        keywords_dict[sheet_name] = keywords

    return keywords_dict


def insert_note_data(connection, data, sheet_name):
    """Insert note records into the database."""
    insert_query = """
    INSERT INTO xhs_notes (
        note_id, type, title, description, video_url, time,
        last_update_time, user_id, nickname, avatar,
        liked_count, collected_count, comment_count, share_count,
        ip_location, image_list, tag_list, last_modify_ts,
        note_url, source_keyword, sheet_name, download_flag
    ) VALUES (
        %s, %s, %s, %s, %s, %s, %s, %s, %s, %s,
        %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s
    )
    """
    try:
        cursor = connection.cursor()
        inserted_count = 0
        skipped_count = 0

        for item in data:
            note_id = item.get('note_id')

            # Skip records that already exist
            if check_record_exists(cursor, note_id):
                skipped_count += 1
                continue

            values = (
                note_id,
                item.get('type'),
                item.get('title'),
                item.get('desc'),
                item.get('video_url'),
                item.get('time'),
                item.get('last_update_time'),
                item.get('user_id'),
                item.get('nickname'),
                item.get('avatar'),
                item.get('liked_count'),
                item.get('collected_count'),
                item.get('comment_count'),
                item.get('share_count'),
                item.get('ip_location'),
                item.get('image_list'),
                item.get('tag_list'),
                item.get('last_modify_ts'),
                item.get('note_url'),
                item.get('source_keyword'),
                sheet_name,
                False  # download_flag defaults to False
            )
            cursor.execute(insert_query, values)
            inserted_count += 1

        connection.commit()
        print(f'Inserted {inserted_count} new records')
        print(f'Skipped {skipped_count} existing records')

    except Error as e:
        print(f'Error inserting data: {e}')
        connection.rollback()


def search_xhs_notes(keyword):
    """Search Xiaohongshu notes for a keyword by running the crawler."""
    try:
        # Create a dedicated event loop for this crawler run
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)

        try:
            # Run the crawler
            success = loop.run_until_complete(_run_crawler(keyword))
            if not success:
                print(f"Crawler run failed: {keyword}")
                return None

            # Read the crawler output file for today
            json_path = f'./data/xhs/json/search_contents_{datetime.now().strftime("%Y-%m-%d")}.json'
            if not os.path.exists(json_path):
                print(f"Crawler result file not found: {json_path}")
                return None

            with open(json_path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            # Tag each record with the keyword it came from
            for item in data:
                item['source_keyword'] = keyword

            return data

        finally:
            loop.close()

    except Exception as e:
        print(f"Error during search: {str(e)}")
        return None


def keyword_exists_in_db(connection, keyword):
    """Check whether a keyword already has notes in the database."""
    query = "SELECT COUNT(*) FROM xhs_notes WHERE source_keyword = %s"
    cursor = connection.cursor()
    cursor.execute(query, (keyword,))
    result = cursor.fetchone()
    return result[0] > 0


def main():
    # Connect to the database
    connection = connect_to_database()
    if connection is None:
        return

    try:
        # Create the table
        create_table(connection)

        # Load search keywords
        keywords_dict = load_search_keywords()

        # Search the keywords of each sheet
        for sheet_name, keywords in keywords_dict.items():
            print(f'Processing keywords for {sheet_name}...')

            for keyword in keywords:
                # Skip keywords that have already been crawled
                if keyword_exists_in_db(connection, keyword):
                    print(f'Keyword already exists, skipping: {keyword}')
                    continue

                print(f'Searching keyword: {keyword}')

                # Search Xiaohongshu notes
                search_results = search_xhs_notes(keyword)
                if search_results:
                    # Save the search results to the database
                    insert_note_data(connection, search_results, sheet_name)
                else:
                    print(f"No search results for: {keyword}")

                # Random 10-30 second delay to avoid sending requests too quickly
                time.sleep(random.uniform(10, 30))

            print(f'Finished processing keywords for {sheet_name}')

        # Download all media files
        # print('Downloading media files...')
        # check_media_files()

    except Exception as e:
        print(f'Error during processing: {e}')
    finally:
        if connection.is_connected():
            connection.close()
            print('Database connection closed')


if __name__ == "__main__":
    main()