xhs_server/check_downloads.py

import os
import mysql.connector
from urllib.parse import urlparse
import requests
import time

# Database connection settings passed verbatim to mysql.connector.connect().
#
# Every value except the auth plugin can be overridden through an environment
# variable (XHS_DB_USER, XHS_DB_PASSWORD, XHS_DB_HOST, XHS_DB_PORT,
# XHS_DB_NAME) so credentials do not have to live in the source tree. The
# literals below are only fallbacks kept so existing deployments keep working.
# NOTE(review): a real password is committed here -- rotate it and remove the
# fallback once the environment variables are provisioned.
db_config = {
    'user': os.environ.get('XHS_DB_USER', 'root'),
    'password': os.environ.get('XHS_DB_PASSWORD', 'zaq12wsx@9Xin'),
    'host': os.environ.get('XHS_DB_HOST', '183.11.229.79'),
    # The connector expects an integer port; environment values are strings.
    'port': int(os.environ.get('XHS_DB_PORT', '3316')),
    'database': os.environ.get('XHS_DB_NAME', '9Xin'),
    'auth_plugin': 'mysql_native_password',
}

def download_file(url, save_path):
    """Download ``url`` to ``save_path`` and report whether it succeeded.

    The response body is streamed into ``<save_path>.part`` and only moved
    onto ``save_path`` once every chunk has been written. A download that
    fails half-way therefore never leaves a truncated, non-empty file at
    ``save_path`` that a later existence/size check would mistake for a
    finished download.

    Args:
        url: Address of the file to fetch.
        save_path: Destination path; missing parent directories are created.

    Returns:
        True if the file was fully written to ``save_path``; False on any
        failure (the error is printed, not raised).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    partial_path = f'{save_path}.part'

    try:
        # The context manager closes the streamed response even when
        # raise_for_status() or the chunk loop raises.
        with requests.get(url, headers=headers, stream=True, timeout=10) as response:
            response.raise_for_status()

            # dirname() is '' for a bare filename; fall back to the CWD.
            os.makedirs(os.path.dirname(save_path) or '.', exist_ok=True)

            with open(partial_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)

        # Publish the file only after the whole body has been written.
        os.replace(partial_path, save_path)
        return True
    except Exception as e:
        print(f'下载文件失败 {url}: {str(e)}')
        # Best-effort cleanup of the partial file; it may not exist at all.
        try:
            os.remove(partial_path)
        except OSError:
            pass
        return False

def _needs_download(path):
    """Return True when ``path`` is missing or is an empty file."""
    return not os.path.exists(path) or os.path.getsize(path) == 0


def retry_download(note_id, image_list, video_url, media_root='./data/xhs/media'):
    """Re-download the media files of one note that are missing or empty.

    Images are stored as ``image_<n><ext>`` where ``n`` is the 1-based
    position of the URL in ``image_list`` (blank entries still consume a
    position), and the video as ``video<ext>``. The extension comes from the
    URL path, falling back to ``.jpg`` / ``.mp4``.

    Args:
        note_id: Identifier of the note; used as the sub-directory name.
        image_list: Comma-separated image URLs, or None/empty for no images.
        video_url: Video URL, or None/blank for no video.
        media_root: Directory under which ``<note_id>/`` is placed. The
            default keeps the historical location.
            NOTE(review): check_media_files in this file inspects
            ``./data/xhs/json/media`` instead -- confirm which root is
            intended and pass it explicitly if needed.

    Returns:
        True if every file that needed downloading was fetched successfully
        (or nothing needed downloading); False if any download failed.
    """
    media_dir = f'{media_root}/{note_id}'
    download_success = True

    # Retry the images.
    if image_list:
        for i, url in enumerate(image_list.split(',')):
            url = url.strip()
            if not url:
                continue

            ext = os.path.splitext(urlparse(url).path)[1] or '.jpg'
            image_path = os.path.join(media_dir, f'image_{i+1}{ext}')

            if _needs_download(image_path):
                print(f'重试下载图片 {note_id} - {i+1}')
                if not download_file(url, image_path):
                    download_success = False
                time.sleep(1)  # throttle so requests are not sent too quickly

    # Retry the video.
    if video_url and video_url.strip():
        video_url = video_url.strip()
        ext = os.path.splitext(urlparse(video_url).path)[1] or '.mp4'
        video_path = os.path.join(media_dir, f'video{ext}')

        if _needs_download(video_path):
            print(f'重试下载视频 {note_id}')
            if not download_file(video_url, video_path):
                download_success = False

    return download_success

def _media_is_complete(media_dir, image_list, video_url):
    """Return True when every expected media file in ``media_dir`` is non-empty.

    Expected names follow the download convention: ``image_<n><ext>`` for the
    n-th (1-based) entry of the comma-separated ``image_list`` and
    ``video<ext>`` for ``video_url``, with ``.jpg`` / ``.mp4`` as fallback
    extensions when the URL path has none.
    """
    def present(path):
        return os.path.exists(path) and os.path.getsize(path) > 0

    if image_list:
        for i, url in enumerate(image_list.split(',')):
            # Strip before parsing so surrounding whitespace cannot leak into
            # the extension and make the expected filename differ from the
            # one the downloader writes.
            url = url.strip()
            if not url:
                continue
            ext = os.path.splitext(urlparse(url).path)[1] or '.jpg'
            if not present(os.path.join(media_dir, f'image_{i+1}{ext}')):
                return False

    if video_url and video_url.strip():
        url = video_url.strip()
        ext = os.path.splitext(urlparse(url).path)[1] or '.mp4'
        if not present(os.path.join(media_dir, f'video{ext}')):
            return False

    return True


def check_media_files():
    """Check each note's media files, retry missing ones and sync ``download_flag``.

    For every row of ``xhs_notes`` the expected image/video files are looked
    up on disk. Incomplete notes are passed to ``retry_download``; the row's
    ``download_flag`` is then updated whenever the outcome differs from the
    stored value. All errors are printed rather than raised.
    """
    conn = None
    try:
        conn = mysql.connector.connect(**db_config)
        cursor = conn.cursor(dictionary=True)

        # Make sure the download_flag column exists. MySQL (unlike MariaDB)
        # rejects "ADD COLUMN IF NOT EXISTS", so probe for the column first
        # and issue a plain ALTER only when it is missing.
        try:
            cursor.execute("SHOW COLUMNS FROM xhs_notes WHERE Field = 'download_flag'")
            if not cursor.fetchall():
                cursor.execute("""
                    ALTER TABLE xhs_notes
                    ADD COLUMN download_flag BOOLEAN DEFAULT FALSE
                """)
                conn.commit()
        except Exception as e:
            print(f"添加download_flag字段时出错: {e}")

        # Fetch every record.
        cursor.execute("""
            SELECT note_id, image_list, video_url, download_flag
            FROM xhs_notes
        """)
        records = cursor.fetchall()

        update_query = """
            UPDATE xhs_notes 
            SET download_flag = %s 
            WHERE note_id = %s
        """

        total = len(records)
        completed = 0

        print(f"开始检查 {total} 条记录的下载状态...")

        for record in records:
            note_id = record['note_id']
            # NOTE(review): retry_download in this file writes under
            # ./data/xhs/media/<note_id> by default, while this check looks
            # under ./data/xhs/json/media/<note_id> -- confirm which
            # directory is intended.
            media_dir = f'./data/xhs/json/media/{note_id}'

            is_complete = _media_is_complete(
                media_dir,
                record['image_list'],
                record['video_url'],
            )

            # Incomplete on disk: try to download the missing files again.
            if not is_complete:
                print(f"发现未完成下载的记录: {note_id}，开始重试下载...")
                is_complete = retry_download(
                    note_id,
                    record['image_list'],
                    record['video_url']
                )

            # Persist the status only when it actually changed.
            if is_complete != record['download_flag']:
                cursor.execute(update_query, (is_complete, note_id))
                # Commit right away: the run can take a long time (downloads
                # plus throttling sleeps), and a late failure must not roll
                # back the status of notes that were already processed.
                conn.commit()
                status = "完成" if is_complete else "未完成"
                print(f"更新记录 {note_id} 的下载状态为: {status}")

            completed += 1
            if completed % 10 == 0:
                print(f"进度: {completed}/{total}")

        conn.commit()
        print("检查和重试下载完成！")

    except Exception as e:
        print(f"发生错误: {e}")
    finally:
        # conn stays None when connect() itself failed.
        if conn is not None:
            conn.close()

# Script entry point: run the check-and-retry pass only when this file is
# executed directly, not when it is imported.
if __name__ == "__main__":
    check_media_files()
first commit 2024-12-16 02:31:07 +00:00			`import os`
			`import mysql.connector`
			`from urllib.parse import urlparse`
			`import requests`
			`import time`

			`# 数据库配置`
			`db_config = {`
			`'user': 'root',`
			`'password': 'zaq12wsx@9Xin',`
			`'host': '183.11.229.79',`
			`'port': 3316,`
			`'database': '9Xin',`
			`'auth_plugin': 'mysql_native_password'`
			`}`

			`def download_file(url, save_path):`
			`"""下载文件"""`
			`try:`
			`headers = {`
			`'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'`
			`}`

			`response = requests.get(url, headers=headers, stream=True, timeout=10)`
			`response.raise_for_status()`

			`os.makedirs(os.path.dirname(save_path), exist_ok=True)`

			`with open(save_path, 'wb') as f:`
			`for chunk in response.iter_content(chunk_size=8192):`
			`if chunk:`
			`f.write(chunk)`
			`return True`
			`except Exception as e:`
			`print(f'下载文件失败 {url}: {str(e)}')`
			`return False`

			`def retry_download(note_id, image_list, video_url):`
			`"""重试下载失败的媒体文件"""`
			`media_dir = f'./data/xhs/media/{note_id}'`
			`download_success = True`

			`# 重试下载图片`
			`if image_list:`
			`image_urls = image_list.split(',')`
			`for i, url in enumerate(image_urls):`
			`url = url.strip()`
			`if not url:`
			`continue`

			`ext = os.path.splitext(urlparse(url).path)[1] or '.jpg'`
			`image_path = os.path.join(media_dir, f'image_{i+1}{ext}')`

			`if not os.path.exists(image_path) or os.path.getsize(image_path) == 0:`
			`print(f'重试下载图片 {note_id} - {i+1}')`
			`if not download_file(url, image_path):`
			`download_success = False`
			`time.sleep(1) # 添加延时避免请求过快`

			`# 重试下载视频`
			`if video_url and video_url.strip():`
			`video_url = video_url.strip()`
			`ext = os.path.splitext(urlparse(video_url).path)[1] or '.mp4'`
			`video_path = os.path.join(media_dir, f'video{ext}')`

			`if not os.path.exists(video_path) or os.path.getsize(video_path) == 0:`
			`print(f'重试下载视频 {note_id}')`
			`if not download_file(video_url, video_path):`
			`download_success = False`

			`return download_success`

			`def check_media_files():`
			`"""检查媒体文件下载状态并更新数据库，对失败的记录进行重试下载"""`
			`try:`
			`conn = mysql.connector.connect(**db_config)`
			`cursor = conn.cursor(dictionary=True)`

			`# 确保download_flag字段存在`
			`try:`
			`cursor.execute("""`
			`ALTER TABLE xhs_notes`
			`ADD COLUMN IF NOT EXISTS download_flag BOOLEAN DEFAULT FALSE`
			`""")`
			`conn.commit()`
			`except Exception as e:`
			`print(f"添加download_flag字段时出错: {e}")`

			`# 获取所有记录`
			`cursor.execute("""`
			`SELECT note_id, image_list, video_url, download_flag`
			`FROM xhs_notes`
			`""")`
			`records = cursor.fetchall()`

			`update_query = """`
			`UPDATE xhs_notes`
			`SET download_flag = %s`
			`WHERE note_id = %s`
			`"""`

			`total = len(records)`
			`completed = 0`

			`print(f"开始检查 {total} 条记录的下载状态...")`

			`for record in records:`
			`note_id = record['note_id']`
			`is_complete = True`
			`media_dir = f'./data/xhs/json/media/{note_id}'`

			`# 检查图片和视频是否完整`
			`if record['image_list']:`
			`image_urls = record['image_list'].split(',')`
			`for i, url in enumerate(image_urls):`
			`if url.strip():`
			`ext = os.path.splitext(urlparse(url).path)[1] or '.jpg'`
			`image_path = os.path.join(media_dir, f'image_{i+1}{ext}')`
			`if not os.path.exists(image_path) or os.path.getsize(image_path) == 0:`
			`is_complete = False`
			`break`

			`if record['video_url'] and record['video_url'].strip():`
			`url = record['video_url'].strip()`
			`ext = os.path.splitext(urlparse(url).path)[1] or '.mp4'`
			`video_path = os.path.join(media_dir, f'video{ext}')`
			`if not os.path.exists(video_path) or os.path.getsize(video_path) == 0:`
			`is_complete = False`

			`# 如果下载不完整，尝试重新下载`
			`if not is_complete:`
			`print(f"发现未完成下载的记录: {note_id}，开始重试下载...")`
			`is_complete = retry_download(`
			`note_id,`
			`record['image_list'],`
			`record['video_url']`
			`)`

			`# 更新数据库状态`
			`if is_complete != record['download_flag']:`
			`cursor.execute(update_query, (is_complete, note_id))`
			`status = "完成" if is_complete else "未完成"`
			`print(f"更新记录 {note_id} 的下载状态为: {status}")`

			`completed += 1`
			`if completed % 10 == 0:`
			`print(f"进度: {completed}/{total}")`

			`conn.commit()`
			`print("检查和重试下载完成！")`

			`except Exception as e:`
			`print(f"发生错误: {e}")`
			`finally:`
			`if 'conn' in locals():`
			`conn.close()`

			`if __name__ == "__main__":`
			`check_media_files()`