xhs_server/check_downloads.py

159 lines
5.7 KiB
Python
Raw Permalink Normal View History

2024-12-16 02:31:07 +00:00
import os
import mysql.connector
from urllib.parse import urlparse
import requests
import time
# Database connection settings.
#
# Every credential can be overridden through an environment variable so that
# deployments do not have to edit this file; the literals are fallbacks that
# keep existing setups working unchanged.
# NOTE(review): the fallback password is committed in plain text -- rotate it
# and drop the default once every deployment sets XHS_DB_PASSWORD.
db_config = {
    'user': os.environ.get('XHS_DB_USER', 'root'),
    'password': os.environ.get('XHS_DB_PASSWORD', 'zaq12wsx@9Xin'),
    'host': os.environ.get('XHS_DB_HOST', '183.11.229.79'),
    # mysql.connector expects an integer port, environment values are strings.
    'port': int(os.environ.get('XHS_DB_PORT', '3316')),
    'database': os.environ.get('XHS_DB_NAME', '9Xin'),
    'auth_plugin': 'mysql_native_password',
}
def download_file(url, save_path):
    """Download ``url`` to ``save_path`` and report whether it succeeded.

    The response body is streamed in 8 KiB chunks into ``<save_path>.part``
    and renamed to ``save_path`` only after the whole body has been written.
    Other code in this file treats any non-empty file as a finished download,
    so a transfer that dies half-way must not leave data at ``save_path``.

    Args:
        url: Address of the file to fetch.
        save_path: Destination path; a missing parent directory is created.

    Returns:
        True when the file was written completely, False on any failure
        (the error is printed rather than raised).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    partial_path = f'{save_path}.part'
    try:
        # Using the response as a context manager releases the connection
        # even when streaming is interrupted.
        with requests.get(url, headers=headers, stream=True, timeout=10) as response:
            response.raise_for_status()
            target_dir = os.path.dirname(save_path)
            if target_dir:  # os.makedirs('') raises for a bare file name
                os.makedirs(target_dir, exist_ok=True)
            with open(partial_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
        # Publish the file only once it is complete.
        os.replace(partial_path, save_path)
        return True
    except Exception as e:
        print(f'下载文件失败 {url}: {str(e)}')
        # Best-effort removal of the partial file; it may never have been
        # created, so a failure here is not an error worth reporting.
        try:
            os.remove(partial_path)
        except OSError:
            pass
        return False
def retry_download(note_id, image_list, video_url):
    """Fetch again whichever media files of a note are missing or empty.

    Args:
        note_id: Identifier used as the name of the per-note directory.
        image_list: Comma-separated image URLs, or a falsy value for none.
        video_url: Video URL, or a falsy/blank value for none.

    Returns:
        False if any attempted download failed, otherwise True (including
        the case where nothing had to be downloaded).
    """
    # NOTE(review): check_media_files in this file inspects
    # ./data/xhs/json/media/<note_id>, while this path has no "json"
    # segment -- confirm which directory is the intended one.
    note_dir = f'./data/xhs/media/{note_id}'
    all_succeeded = True

    def present(path):
        """Tell whether ``path`` exists and holds at least one byte."""
        return os.path.exists(path) and os.path.getsize(path) != 0

    # Images: numbering follows the position in the comma-separated list, so
    # blank entries still consume a number.
    candidates = image_list.split(',') if image_list else []
    for number, raw in enumerate(candidates, start=1):
        image_url = raw.strip()
        if not image_url:
            continue
        suffix = os.path.splitext(urlparse(image_url).path)[1] or '.jpg'
        image_path = os.path.join(note_dir, f'image_{number}{suffix}')
        if present(image_path):
            continue
        print(f'重试下载图片 {note_id} - {number}')
        if not download_file(image_url, image_path):
            all_succeeded = False
        time.sleep(1)  # pause between requests to avoid hitting the server too fast

    # Video: at most one per note.
    trimmed_video = video_url.strip() if video_url else ''
    if trimmed_video:
        suffix = os.path.splitext(urlparse(trimmed_video).path)[1] or '.mp4'
        video_path = os.path.join(note_dir, f'video{suffix}')
        if not present(video_path):
            print(f'重试下载视频 {note_id}')
            if not download_file(trimmed_video, video_path):
                all_succeeded = False

    return all_succeeded
def _is_media_complete(record, media_dir):
    """Return True when every media file of ``record`` exists and is non-empty."""
    image_list = record['image_list']
    if image_list:
        for i, raw_url in enumerate(image_list.split(',')):
            # Strip before parsing so trailing whitespace cannot leak into the
            # extension; retry_download derives its file names the same way.
            url = raw_url.strip()
            if not url:
                continue
            ext = os.path.splitext(urlparse(url).path)[1] or '.jpg'
            image_path = os.path.join(media_dir, f'image_{i+1}{ext}')
            if not os.path.exists(image_path) or os.path.getsize(image_path) == 0:
                return False

    video_url = record['video_url']
    if video_url and video_url.strip():
        url = video_url.strip()
        ext = os.path.splitext(urlparse(url).path)[1] or '.mp4'
        video_path = os.path.join(media_dir, f'video{ext}')
        if not os.path.exists(video_path) or os.path.getsize(video_path) == 0:
            return False

    return True


def check_media_files():
    """Check the media files of every note, retry failures, and update the DB.

    For each row of ``xhs_notes`` the expected image/video files are looked up
    on disk. Notes with missing or empty files are handed to
    ``retry_download``, and ``download_flag`` is updated whenever the outcome
    differs from the stored value. Updates are committed every 10 records and
    once more at the end, so an error late in a long run does not discard
    earlier progress.

    Any exception is printed rather than raised; the connection is always
    closed.
    """
    conn = None
    try:
        conn = mysql.connector.connect(**db_config)
        cursor = conn.cursor(dictionary=True)

        # Make sure the download_flag column exists. MySQL has no
        # "ADD COLUMN IF NOT EXISTS" (that is MariaDB syntax), so look the
        # column up first and add it only when it is absent.
        try:
            cursor.execute("""
                SELECT COLUMN_NAME
                FROM information_schema.COLUMNS
                WHERE TABLE_SCHEMA = DATABASE()
                  AND TABLE_NAME = 'xhs_notes'
                  AND COLUMN_NAME = 'download_flag'
            """)
            if not cursor.fetchall():
                cursor.execute("""
                    ALTER TABLE xhs_notes
                    ADD COLUMN download_flag BOOLEAN DEFAULT FALSE
                """)
                conn.commit()
        except Exception as e:
            print(f"添加download_flag字段时出错: {e}")

        # Load every note.
        cursor.execute("""
            SELECT note_id, image_list, video_url, download_flag
            FROM xhs_notes
        """)
        records = cursor.fetchall()

        update_query = """
            UPDATE xhs_notes
            SET download_flag = %s
            WHERE note_id = %s
        """

        total = len(records)
        print(f"开始检查 {total} 条记录的下载状态...")

        for completed, record in enumerate(records, start=1):
            note_id = record['note_id']
            # NOTE(review): retry_download writes under
            # ./data/xhs/media/<note_id> (no "json" segment) -- confirm which
            # directory is the intended one.
            media_dir = f'./data/xhs/json/media/{note_id}'

            is_complete = _is_media_complete(record, media_dir)

            # Incomplete on disk: try to download the missing pieces again.
            if not is_complete:
                print(f"发现未完成下载的记录: {note_id},开始重试下载...")
                is_complete = retry_download(
                    note_id,
                    record['image_list'],
                    record['video_url']
                )

            # Write the flag only when it actually changed.
            if is_complete != record['download_flag']:
                cursor.execute(update_query, (is_complete, note_id))
                status = "完成" if is_complete else "未完成"
                print(f"更新记录 {note_id} 的下载状态为: {status}")

            if completed % 10 == 0:
                print(f"进度: {completed}/{total}")
                conn.commit()

        # Flush the updates made since the last periodic commit.
        conn.commit()
        print("检查和重试下载完成!")
    except Exception as e:
        print(f"发生错误: {e}")
    finally:
        if conn is not None:
            conn.close()
# Run the check-and-retry pass only when this file is executed as a script.
if __name__ == "__main__":
    check_media_files()