diff --git a/config/base_config.py b/config/base_config.py
index d6be4ed..80d53eb 100644
--- a/config/base_config.py
+++ b/config/base_config.py
@@ -26,6 +26,14 @@ CRAWLER_MAX_NOTES_COUNT = 20
 # 并发爬虫数量控制
 MAX_CONCURRENCY_NUM = 10
 
+# Maximum number of comments to fetch per Douyin video (0 means no limit)
+DY_MAX_COMMENTS_PER_POST = 10
+
+# Douyin comment keyword filter: only comments containing at least one keyword are kept (empty list means no filtering)
+DY_COMMENT_KEYWORDS = [
+    "我",
+    # add more keywords here
+]
 
 # 指定小红书需要爬虫的笔记ID列表
 XHS_SPECIFIED_ID_LIST = [
diff --git a/media_platform/douyin/client.py b/media_platform/douyin/client.py
index de4ed6d..a37796c 100644
--- a/media_platform/douyin/client.py
+++ b/media_platform/douyin/client.py
@@ -1,7 +1,7 @@
 import asyncio
 import copy
 import urllib.parse
-from typing import Any, Callable, Dict, Optional
+from typing import Any, Callable, Dict, Optional, List
 
 import execjs
 import httpx
@@ -54,7 +54,7 @@ class DOUYINClient:
             "platform": "PC",
             "screen_width": "1920",
             "screen_height": "1200",
-            #" webid": douyin_js_obj.call("get_web_id"),
+            # " webid": douyin_js_obj.call("get_web_id"),
             # "msToken": local_storage.get("xmst"),
             # "msToken": "abL8SeUTPa9-EToD8qfC7toScSADxpg6yLh2dbNcpWHzE0bT04txM_4UwquIcRvkRb9IU8sifwgM1Kwf1Lsld81o9Irt2_yNyUbbQPSUO8EfVlZJ_78FckDFnwVBVUVK",
         }
@@ -167,30 +167,59 @@ class DOUYINClient:
             crawl_interval: float = 1.0,
             is_fetch_sub_comments=False,
             callback: Optional[Callable] = None,
+            max_comments: Optional[int] = None,  # cap on collected comments; None or 0 means no limit
+            keywords: Optional[List[str]] = None,  # keep only comments containing one of these
     ):
         """
-        get note all comments include sub comments
-        :param aweme_id:
-        :param crawl_interval:
-        :param is_fetch_sub_comments:
-        :param callback:
-        :return:
+        Fetch the comments of one post page by page (sub comments are not fetched yet).
+        :param aweme_id: ID of the post
+        :param crawl_interval: seconds to sleep between page requests
+        :param is_fetch_sub_comments: whether to fetch sub comments (not implemented yet)
+        :param callback: awaited as callback(aweme_id, comments) with each raw page of comments
+        :param max_comments: maximum number of comments to collect; None or 0 means no limit
+        :param keywords: when non-empty, only comments whose text contains at least one keyword are kept
+        :return: list of collected comments
         """
         result = []
         comments_has_more = 1
         comments_cursor = 0
-        while comments_has_more:
+        collected_comments_count = 0  # number of comments collected so far
+
+        while comments_has_more and (
+                max_comments is None or collected_comments_count < max_comments or max_comments == 0):
             comments_res = await self.get_aweme_comments(aweme_id, comments_cursor)
             comments_has_more = comments_res.get("has_more", 0)
             comments_cursor = comments_res.get("cursor", comments_cursor + 20)
-            comments = comments_res.get("comments")
+            comments = comments_res.get("comments", [])
             if not comments:
                 continue
+
+            # Keep only the comments whose text contains at least one keyword
+            if keywords:
+                filtered_comments = [comment for comment in comments if
+                                     any(keyword in (comment.get("text") or "") for keyword in keywords)]
+            else:
+                filtered_comments = comments
+
+            # With a positive limit, take only as many comments as the remaining quota allows
+            if max_comments is not None and max_comments > 0:
+                remaining_quota = max_comments - collected_comments_count
+                comments_to_add = filtered_comments[:remaining_quota]
+                result.extend(comments_to_add)
+                collected_comments_count += len(comments_to_add)
+            else:
+                result.extend(filtered_comments)
+                collected_comments_count += len(filtered_comments)
+
             if callback:  # 如果有回调函数,就执行回调函数
                 await callback(aweme_id, comments)
+
+            # Stop paging as soon as the quota (when one is set) has been filled
+            if max_comments is not None and 0 < max_comments <= collected_comments_count:
+                break
+
             await asyncio.sleep(crawl_interval)
             if not is_fetch_sub_comments:
-                result.extend(comments)
                 continue
             # todo fetch sub comments
         return result
diff --git a/media_platform/douyin/core.py b/media_platform/douyin/core.py
index 45478a3..5276418 100644
--- a/media_platform/douyin/core.py
+++ b/media_platform/douyin/core.py
@@ -128,17 +128,22 @@ class DouYinCrawler(AbstractCrawler):
         task_list: List[Task] = []
         semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
         for aweme_id in aweme_list:
-            task = asyncio.create_task(self.get_comments(aweme_id, semaphore), name=aweme_id)
+            task = asyncio.create_task(
+                self.get_comments(aweme_id, semaphore, max_comments=config.DY_MAX_COMMENTS_PER_POST), name=aweme_id)
             task_list.append(task)
         await asyncio.wait(task_list)
 
-    async def get_comments(self, aweme_id: str, semaphore: asyncio.Semaphore) -> None:
+    async def get_comments(self, aweme_id: str, semaphore: asyncio.Semaphore, max_comments: int = None) -> None:
         async with semaphore:
             try:
-                await self.dy_client.get_aweme_all_comments(
+                # Fetch the comments, capped and keyword-filtered by the client
+                comments = await self.dy_client.get_aweme_all_comments(
                     aweme_id=aweme_id,
-                    callback=douyin.batch_update_dy_aweme_comments,
+                    max_comments=max_comments,  # per-post cap
+                    keywords=config.DY_COMMENT_KEYWORDS  # keyword filter
                 )
-                utils.logger.info(f"aweme_id: {aweme_id} comments have all been obtained completed ...")
+                # Persist only the comments that passed the filter
+                await douyin.batch_update_dy_aweme_comments(aweme_id, comments)
+                utils.logger.info(f"aweme_id: {aweme_id} comments have all been obtained and filtered ...")
             except DataFetchError as e:
                 utils.logger.error(f"aweme_id: {aweme_id} get comments failed, error: {e}")