159 lines
5.6 KiB
Python
159 lines
5.6 KiB
Python
|
import os
|
||
|
import mysql.connector
|
||
|
from urllib.parse import urlparse
|
||
|
import requests
|
||
|
import time
|
||
|
|
||
|
# Database configuration.
# Every connection setting can be overridden through an XHS_DB_* environment
# variable; the literals below are only the fallbacks used when it is unset.
# NOTE(review): the fallback host and root password are committed to source
# control -- rotate them and supply the real values via the environment.
db_config = {
    'user': os.environ.get('XHS_DB_USER', 'root'),
    'password': os.environ.get('XHS_DB_PASSWORD', 'zaq12wsx@9Xin'),
    'host': os.environ.get('XHS_DB_HOST', '183.11.229.79'),
    # The connector expects an integer port, environment values are strings.
    'port': int(os.environ.get('XHS_DB_PORT', '3316')),
    'database': os.environ.get('XHS_DB_NAME', '9xin'),
    'auth_plugin': 'mysql_native_password',
}
|
||
|
|
||
|
def download_file(url, save_path):
    """Download ``url`` and store it at ``save_path``.

    The response body is streamed into a temporary ``<save_path>.part`` file
    which is renamed over ``save_path`` only after the whole body has been
    written. A failed or interrupted transfer therefore never leaves a
    truncated file at ``save_path`` that could be mistaken for a finished
    download.

    Args:
        url: Address of the file to fetch.
        save_path: Destination path; missing parent directories are created.

    Returns:
        True when the file was fetched and saved, False on any failure (the
        error is printed instead of being raised).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    temp_path = f'{save_path}.part'

    try:
        # The context manager releases the streamed connection even when the
        # transfer is aborted half-way.
        with requests.get(url, headers=headers, stream=True, timeout=10) as response:
            response.raise_for_status()

            # os.makedirs('') raises, so skip it for a bare file name.
            parent_dir = os.path.dirname(save_path)
            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)

            with open(temp_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)

        # Publish the file only once it is complete.
        os.replace(temp_path, save_path)
        return True
    except Exception as e:
        # Best-effort boundary: report the failure and let the caller decide.
        print(f'下载文件失败 {url}: {str(e)}')
        try:
            os.remove(temp_path)
        except OSError:
            # No partial file was created (or it is already gone); the
            # download failure itself has been reported above.
            pass
        return False
|
||
|
|
||
|
def retry_download(note_id, image_list, video_url):
    """Re-download the media files of one note that are missing or empty.

    Args:
        note_id: Identifier of the note; names the note's media directory.
        image_list: Comma-separated image URLs, or a falsy value for none.
        video_url: URL of the note's video, or a falsy/blank value for none.

    Returns:
        True when every attempted download succeeded (or nothing had to be
        downloaded), False when at least one download failed.
    """
    target_dir = f'./data/xhs/json/media/{note_id}'
    all_ok = True

    # Images: the position in the list (1-based) determines the file name,
    # blank entries still consume a position.
    raw_urls = image_list.split(',') if image_list else []
    for position, raw_url in enumerate(raw_urls, start=1):
        image_url = raw_url.strip()
        if image_url:
            suffix = os.path.splitext(urlparse(image_url).path)[1]
            if not suffix:
                suffix = '.jpg'
            image_path = os.path.join(target_dir, f'image_{position}{suffix}')

            present = os.path.exists(image_path) and os.path.getsize(image_path) != 0
            if present:
                continue

            print(f'重试下载图片 {note_id} - {position}')
            if not download_file(image_url, image_path):
                all_ok = False
            time.sleep(0.5)  # throttle so requests are not sent too quickly

    # Video: at most one per note.
    cleaned_video_url = video_url.strip() if video_url else ''
    if cleaned_video_url:
        suffix = os.path.splitext(urlparse(cleaned_video_url).path)[1]
        if not suffix:
            suffix = '.mp4'
        video_path = os.path.join(target_dir, f'video{suffix}')

        present = os.path.exists(video_path) and os.path.getsize(video_path) != 0
        if not present:
            print(f'重试下载视频 {note_id}')
            if not download_file(cleaned_video_url, video_path):
                all_ok = False

    return all_ok
|
||
|
|
||
|
def _is_media_complete(note_id, image_list, video_url):
    """Return True when every expected media file of a note exists and is non-empty."""
    media_dir = f'./data/xhs/json/media/{note_id}'
    expected_paths = []

    if image_list:
        for i, url in enumerate(image_list.split(',')):
            # Strip before deriving the extension so the expected file name
            # matches the one retry_download() writes.
            url = url.strip()
            if url:
                ext = os.path.splitext(urlparse(url).path)[1] or '.jpg'
                expected_paths.append(os.path.join(media_dir, f'image_{i+1}{ext}'))

    if video_url and video_url.strip():
        ext = os.path.splitext(urlparse(video_url.strip()).path)[1] or '.mp4'
        expected_paths.append(os.path.join(media_dir, f'video{ext}'))

    return all(
        os.path.exists(path) and os.path.getsize(path) > 0
        for path in expected_paths
    )


def check_media_files():
    """Check the download state of every note and sync it to the database.

    For each row of ``xhs_notes`` the expected image/video files are looked up
    on disk. Notes with missing or empty files are retried through
    ``retry_download``. ``download_flag`` is updated whenever the result
    differs from the stored value. Updates are committed every 10 records and
    once more at the end, so an error late in a long run does not discard the
    flags already determined.

    Errors are printed, not raised; the connection is always closed.
    """
    conn = None
    try:
        conn = mysql.connector.connect(**db_config)
        cursor = conn.cursor(dictionary=True)

        # Make sure the download_flag column exists.
        # NOTE(review): `ADD COLUMN IF NOT EXISTS` looks like MariaDB syntax --
        # confirm the target server accepts it. A failure here is only
        # printed, and the SELECT below will then fail if the column is absent.
        try:
            cursor.execute("""
                ALTER TABLE xhs_notes
                ADD COLUMN IF NOT EXISTS download_flag BOOLEAN DEFAULT FALSE
            """)
            conn.commit()
        except Exception as e:
            print(f"添加download_flag字段时出错: {e}")

        # Fetch all records.
        cursor.execute("""
            SELECT note_id, image_list, video_url, download_flag
            FROM xhs_notes
        """)
        records = cursor.fetchall()

        update_query = """
            UPDATE xhs_notes
            SET download_flag = %s
            WHERE note_id = %s
        """

        total = len(records)
        completed = 0

        print(f"开始检查 {total} 条记录的下载状态...")

        for record in records:
            note_id = record['note_id']

            is_complete = _is_media_complete(
                note_id,
                record['image_list'],
                record['video_url'],
            )

            # Incomplete on disk: try to download the missing files again.
            if not is_complete:
                print(f"发现未完成下载的记录: {note_id},开始重试下载...")
                is_complete = retry_download(
                    note_id,
                    record['image_list'],
                    record['video_url']
                )

            # Only touch the row when the stored flag is out of date.
            if is_complete != record['download_flag']:
                cursor.execute(update_query, (is_complete, note_id))
                status = "完成" if is_complete else "未完成"
                print(f"更新记录 {note_id} 的下载状态为: {status}")

            completed += 1
            if completed % 10 == 0:
                # Persist progress in batches; downloads can take a long time
                # and a later failure would otherwise roll everything back.
                conn.commit()
                print(f"进度: {completed}/{total}")

        conn.commit()
        print("检查和重试下载完成!")

    except Exception as e:
        print(f"发生错误: {e}")
    finally:
        if conn is not None:
            conn.close()
|
||
|
|
||
|
# Script entry point: run the check only on direct execution, not on import.
if __name__ == '__main__':
    check_media_files()
|