"""Check downloaded media for every row of ``xhs_notes`` and retry gaps.

For each note the script works out which image/video files should exist
under ``./data/xhs/json/media/<note_id>/``, re-downloads any that are missing
or empty, and stores the outcome in the ``download_flag`` column.
"""
import os
import time
from urllib.parse import urlparse

import mysql.connector
import requests

# Database configuration.
# NOTE(review): credentials are hard-coded in source control; consider moving
# them to environment variables or a secrets store.
db_config = {
    'user': 'root',
    'password': 'zaq12wsx@9Xin',
    'host': '183.11.229.79',
    'port': 3316,
    'database': '9xin',
    'auth_plugin': 'mysql_native_password'
}


def _iter_media_targets(note_id, image_list, video_url):
    """Yield ``(kind, index, url, path)`` for every media file of a note.

    ``kind`` is ``'image'`` or ``'video'``. ``index`` is the 1-based position
    of the URL inside the comma-separated ``image_list`` (blank entries keep
    their slot so numbering stays stable) and ``None`` for the video.
    """
    media_dir = f'./data/xhs/json/media/{note_id}'

    if image_list:
        for index, raw_url in enumerate(image_list.split(','), start=1):
            url = raw_url.strip()
            if not url:
                continue
            # Fall back to .jpg when the URL path carries no extension.
            ext = os.path.splitext(urlparse(url).path)[1] or '.jpg'
            yield 'image', index, url, os.path.join(media_dir, f'image_{index}{ext}')

    if video_url and video_url.strip():
        url = video_url.strip()
        # Fall back to .mp4 when the URL path carries no extension.
        ext = os.path.splitext(urlparse(url).path)[1] or '.mp4'
        yield 'video', None, url, os.path.join(media_dir, f'video{ext}')


def _is_downloaded(path):
    """Return True when ``path`` exists and is not an empty file."""
    return os.path.exists(path) and os.path.getsize(path) > 0


def download_file(url, save_path):
    """Download ``url`` to ``save_path``.

    The body is streamed into ``<save_path>.part`` and only renamed onto
    ``save_path`` once the transfer finished, so an interrupted download
    never leaves a truncated file that later looks like a finished one.

    Args:
        url: Address of the file to fetch.
        save_path: Destination path; missing parent directories are created.

    Returns:
        True on success, False on any failure (the error is printed).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    tmp_path = f'{save_path}.part'
    try:
        # os.makedirs('') raises, so only create a directory when there is one.
        directory = os.path.dirname(save_path)
        if directory:
            os.makedirs(directory, exist_ok=True)

        # The context manager releases the streamed connection on every path.
        with requests.get(url, headers=headers, stream=True, timeout=10) as response:
            response.raise_for_status()
            with open(tmp_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)

        os.replace(tmp_path, save_path)
        return True
    except Exception as e:
        # Deliberately broad: a single failed download must not abort the run.
        print(f'下载文件失败 {url}: {str(e)}')
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        return False


def retry_download(note_id, image_list, video_url):
    """Re-download every media file of a note that is missing or empty.

    Args:
        note_id: Identifier of the note; names the media sub-directory.
        image_list: Comma-separated image URLs, or None/empty.
        video_url: Video URL, or None/empty.

    Returns:
        True when every attempted download succeeded (or nothing needed to be
        downloaded), False when at least one download failed.
    """
    download_success = True

    for kind, index, url, path in _iter_media_targets(note_id, image_list, video_url):
        if _is_downloaded(path):
            continue

        if kind == 'image':
            print(f'重试下载图片 {note_id} - {index}')
        else:
            print(f'重试下载视频 {note_id}')

        if not download_file(url, path):
            download_success = False

        if kind == 'image':
            time.sleep(0.5)  # Throttle so image requests are not sent too quickly.

    return download_success


def _ensure_download_flag_column(conn, cursor):
    """Add the ``download_flag`` column to ``xhs_notes`` when it is missing.

    ``ADD COLUMN IF NOT EXISTS`` is not accepted by MySQL, so the column is
    looked up in ``information_schema`` first. Failures are printed and not
    raised.
    """
    try:
        cursor.execute("""
            SELECT 1
            FROM information_schema.COLUMNS
            WHERE TABLE_SCHEMA = DATABASE()
              AND TABLE_NAME = 'xhs_notes'
              AND COLUMN_NAME = 'download_flag'
        """)
        if not cursor.fetchall():
            cursor.execute("""
                ALTER TABLE xhs_notes
                ADD COLUMN download_flag BOOLEAN DEFAULT FALSE
            """)
            conn.commit()
    except mysql.connector.Error as e:
        print(f"添加download_flag字段时出错: {e}")


def check_media_files():
    """Check each note's media files, retry failed ones and update the database.

    Reads every row of ``xhs_notes``, verifies that all expected files exist
    and are non-empty, calls ``retry_download`` for incomplete notes, and
    writes the result to ``download_flag`` when it differs from the stored
    value. Updates are committed every 10 records so a late failure does not
    discard earlier progress. Errors are printed, not raised.
    """
    conn = None
    cursor = None
    try:
        conn = mysql.connector.connect(**db_config)
        cursor = conn.cursor(dictionary=True)

        # Make sure the download_flag column exists.
        _ensure_download_flag_column(conn, cursor)

        # Fetch all records.
        cursor.execute("""
            SELECT note_id, image_list, video_url, download_flag
            FROM xhs_notes
        """)
        records = cursor.fetchall()

        update_query = """
            UPDATE xhs_notes
            SET download_flag = %s
            WHERE note_id = %s
        """

        total = len(records)
        completed = 0

        print(f"开始检查 {total} 条记录的下载状态...")

        for record in records:
            note_id = record['note_id']

            # A note is complete when every expected file is present.
            is_complete = all(
                _is_downloaded(path)
                for _, _, _, path in _iter_media_targets(
                    note_id, record['image_list'], record['video_url']
                )
            )

            # Incomplete download: try to fetch the missing files again.
            if not is_complete:
                print(f"发现未完成下载的记录: {note_id},开始重试下载...")
                is_complete = retry_download(
                    note_id,
                    record['image_list'],
                    record['video_url']
                )

            # Only touch the row when the stored flag is out of date.
            if is_complete != record['download_flag']:
                cursor.execute(update_query, (is_complete, note_id))
                status = "完成" if is_complete else "未完成"
                print(f"更新记录 {note_id} 的下载状态为: {status}")

            completed += 1
            if completed % 10 == 0:
                # Persist progress in batches alongside the progress report.
                conn.commit()
                print(f"进度: {completed}/{total}")

        conn.commit()
        print("检查和重试下载完成!")

    except Exception as e:
        # Top-level boundary of the script: report and fall through to cleanup.
        print(f"发生错误: {e}")
    finally:
        if cursor is not None:
            try:
                cursor.close()
            except Exception:
                # Closing is best-effort; the connection is closed next anyway.
                pass
        if conn is not None:
            conn.close()


if __name__ == "__main__":
    check_media_files()