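"""Keyword-driven Xiaohongshu (XHS) note collector.

For each keyword listed in ./data/xhs/json/sheet_notes/*.json, this script
shells out to a crawler ("python main.py --platform xhs --lt qrcode
--keywords <keyword>"), reads the day's JSON results, and inserts any new
notes into the MySQL table xhs_notes, skipping keywords and note_ids that
are already stored.
"""
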
import json
import os
from data.xhs.json.import_xhs_notes import connect_to_database, create_table, check_record_exists
# from check_downloads import check_media_files
from mysql.connector import Error
import time
import asyncio
from datetime import datetime
import random
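
# Assumed signatures for the helpers imported above, inferred from their call
# sites in this file (not verified against import_xhs_notes itself):
#   connect_to_database() -> connection or None
#   create_table(connection) -> None
#   check_record_exists(cursor, note_id) -> bool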


async def _run_crawler(keyword):
    """Run the crawler subprocess for one keyword (async implementation)."""
    try:
        process = await asyncio.create_subprocess_exec(
            'python', 'main.py',
            '--platform', 'xhs',
            '--lt', 'qrcode',
            '--keywords', keyword,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
            limit=1024*1024  # raise the StreamReader buffer limit to 1 MiB
        )

        # Read an output stream and echo complete lines as progress
        async def read_stream(stream):
            buffer = ""
            while True:
                chunk = await stream.read(8192)
                if not chunk:
                    break
                buffer += chunk.decode('utf-8', errors='ignore')

                # Flush every complete line in the buffer
                while '\n' in buffer:
                    line, buffer = buffer.split('\n', 1)
                    if line.strip():
                        print(f"Crawler progress: {line.strip()}")

        # Drain stdout and stderr concurrently
        await asyncio.gather(
            read_stream(process.stdout),
            read_stream(process.stderr)
        )

        await process.wait()
        return process.returncode == 0

    except Exception as e:
        print(f"Crawler execution error: {e}")
        return False
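

# Expected layout of the keyword files (an assumption inferred from the code
# below): ./data/xhs/json/sheet_notes/<sheet_name>.json, where each file is a
# flat JSON array of keyword strings, e.g. ["修护面霜", "屏障修护"].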
def load_search_keywords():
    """Load the search keywords from the sheet_notes folder."""
    keywords_dict = {}
    json_dir = './data/xhs/json/sheet_notes'

    for json_file in os.listdir(json_dir):
        if not json_file.endswith('.json'):
            continue

        sheet_name = os.path.splitext(json_file)[0]
        with open(os.path.join(json_dir, json_file), 'r', encoding='utf-8') as f:
            keywords = json.load(f)

        # 修护.json: resume from the 12th element
        # if sheet_name == '修护':
        #     keywords = keywords[11:]

        keywords_dict[sheet_name] = keywords

    return keywords_dict
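

# image_list and tag_list are bound to the INSERT below as-is; they are
# assumed to arrive as flat strings in the crawler's JSON export. If a
# crawler version emits them as lists, serialize them first, e.g.
# json.dumps(item.get('image_list'), ensure_ascii=False).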
def insert_note_data(connection, data, sheet_name):
    """Insert note records into the database, skipping existing note_ids."""
    insert_query = """
        INSERT INTO xhs_notes (
            note_id, type, title, description, video_url, time,
            last_update_time, user_id, nickname, avatar,
            liked_count, collected_count, comment_count, share_count,
            ip_location, image_list, tag_list, last_modify_ts,
            note_url, source_keyword, sheet_name, download_flag
        ) VALUES (
            %s, %s, %s, %s, %s, %s, %s, %s, %s, %s,
            %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s
        )
    """
    try:
        cursor = connection.cursor()
        inserted_count = 0
        skipped_count = 0

        for item in data:
            note_id = item.get('note_id')

            # Skip records that already exist
            if check_record_exists(cursor, note_id):
                skipped_count += 1
                continue

            values = (
                note_id,
                item.get('type'),
                item.get('title'),
                item.get('desc'),
                item.get('video_url'),
                item.get('time'),
                item.get('last_update_time'),
                item.get('user_id'),
                item.get('nickname'),
                item.get('avatar'),
                item.get('liked_count'),
                item.get('collected_count'),
                item.get('comment_count'),
                item.get('share_count'),
                item.get('ip_location'),
                item.get('image_list'),
                item.get('tag_list'),
                item.get('last_modify_ts'),
                item.get('note_url'),
                item.get('source_keyword'),
                sheet_name,
                False  # download_flag defaults to False
            )
            cursor.execute(insert_query, values)
            inserted_count += 1

        connection.commit()
        print(f'Inserted {inserted_count} new records')
        print(f'Skipped {skipped_count} existing records')

    except Error as e:
        print(f'Error while inserting data: {e}')
        connection.rollback()
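

# The crawler is expected to write its results into one date-stamped file per
# day (see json_path below), so `data` may also contain notes crawled earlier
# the same day for other keywords; insert_note_data() drops those duplicates
# via check_record_exists().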
def search_xhs_notes(keyword):
    """Crawl Xiaohongshu notes for a keyword and return the parsed results."""
    try:
        # Run the crawler in a fresh event loop
        success = asyncio.run(_run_crawler(keyword))
        if not success:
            print(f"Crawler run failed: {keyword}")
            return None

        # Read the crawler's result file for today
        json_path = f'./data/xhs/json/search_contents_{datetime.now().strftime("%Y-%m-%d")}.json'
        if not os.path.exists(json_path):
            print(f"Crawler result file not found: {json_path}")
            return None

        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Tag each record with its source keyword
        for item in data:
            item['source_keyword'] = keyword

        return data

    except Exception as e:
        print(f"Error during search: {e}")
        return None


def keyword_exists_in_db(connection, keyword):
    """Check whether a keyword has already been crawled into xhs_notes."""
    query = "SELECT COUNT(*) FROM xhs_notes WHERE source_keyword = %s"
    cursor = connection.cursor()
    cursor.execute(query, (keyword,))
    result = cursor.fetchone()
    cursor.close()  # release the cursor once the count is fetched
    return result[0] > 0


def main():
    # Connect to the database
    connection = connect_to_database()
    if connection is None:
        return

    try:
        # Create the target table if needed
        create_table(connection)

        # Load the search keywords
        keywords_dict = load_search_keywords()

        # Search the keywords of each sheet
        for sheet_name, keywords in keywords_dict.items():
            print(f'Processing keywords for {sheet_name}...')

            for keyword in keywords:
                # Skip keywords that were already crawled
                if keyword_exists_in_db(connection, keyword):
                    print(f'Keyword already in database, skipping: {keyword}')
                    continue

                print(f'Searching keyword: {keyword}')

                # Crawl Xiaohongshu notes for this keyword
                search_results = search_xhs_notes(keyword)
                if search_results:
                    # Save the search results to the database
                    insert_note_data(connection, search_results, sheet_name)
                else:
                    print(f"No search results for: {keyword}")

                # Random 10-30 second delay to avoid requesting too fast
                time.sleep(random.uniform(10, 30))

            print(f'Finished keywords for {sheet_name}')

        # Download all media files
        # print('Downloading media files...')
        # check_media_files()

    except Exception as e:
        print(f'Error during processing: {e}')
    finally:
        if connection.is_connected():
            connection.close()
            print('Database connection closed')


if __name__ == "__main__":
    main()