Merge pull request #89 from PeanutSplash/main

添加功能:抖音每个视频抓取评论最大条数限制,抖音评论关键词筛选
2023-12-05 12:42:32 +08:00 · 2023-12-05 12:42:32 +08:00 · 6ae511bb52
commit 6ae511bb52
parent 8f04943105 ab1a10bac1
3 changed files with 59 additions and 17 deletions
--- a/config/base_config.py
+++ b/config/base_config.py
@ -26,6 +26,14 @@ CRAWLER_MAX_NOTES_COUNT = 20
 # 并发爬虫数量控制
 MAX_CONCURRENCY_NUM = 10
 # 抖音每个视频抓取评论最大条数 (为0则不限制)
 DY_MAX_COMMENTS_PER_POST = 10
 # 抖音评论关键词筛选(只会留下包含关键词的评论,为空不限制)
 DY_COMMENT_KEYWORDS = [
    "我"
    # ........................
 ]
 # 指定小红书需要爬虫的笔记ID列表
 XHS_SPECIFIED_ID_LIST = [
--- a/media_platform/douyin/client.py
+++ b/media_platform/douyin/client.py
@ -1,7 +1,7 @@
 import asyncio
 import copy
 import urllib.parse
-from typing import Any, Callable, Dict, Optional
+from typing import Any, Callable, Dict, Optional, List
 import execjs
 import httpx
@ -54,7 +54,7 @@ class DOUYINClient:
            "platform": "PC",
            "screen_width": "1920",
            "screen_height": "1200",
-            #" webid": douyin_js_obj.call("get_web_id"),
+            # " webid": douyin_js_obj.call("get_web_id"),
            # "msToken": local_storage.get("xmst"),
            # "msToken": "abL8SeUTPa9-EToD8qfC7toScSADxpg6yLh2dbNcpWHzE0bT04txM_4UwquIcRvkRb9IU8sifwgM1Kwf1Lsld81o9Irt2_yNyUbbQPSUO8EfVlZJ_78FckDFnwVBVUVK",
        }
@ -167,30 +167,59 @@ class DOUYINClient:
            crawl_interval: float = 1.0,
            is_fetch_sub_comments=False,
            callback: Optional[Callable] = None,
            max_comments: int = None,  # 新增参数来限制评论数
            keywords: List[str] = None  # 新增参数，用于关键字筛选
    ):
        """
-        get note all comments include sub comments
+        获取帖子的所有评论，包括子评论
-        :param aweme_id:
+        :param aweme_id: 帖子ID
-        :param crawl_interval:
+        :param crawl_interval: 抓取间隔
-        :param is_fetch_sub_comments:
+        :param is_fetch_sub_comments: 是否抓取子评论
-        :param callback:
+        :param callback: 回调函数，用于处理抓取到的评论
-        :return:
+        :param max_comments: maximum number of comments to collect; None or 0 means no limit
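        :param keywords: keyword list; comments whose "text" contains any of these keywords are dropped by the filter (None or empty disables filtering)
                         NOTE(review): the comment on config.DY_COMMENT_KEYWORDS describes the opposite (keep only matching comments) - confirm intent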
        :return: 评论列表
        """
        result = []
        comments_has_more = 1
        comments_cursor = 0
-        while comments_has_more:
+        collected_comments_count = 0  # 已收集的评论数
        while comments_has_more and (
                max_comments is None or collected_comments_count < max_comments or max_comments == 0):
            comments_res = await self.get_aweme_comments(aweme_id, comments_cursor)
            comments_has_more = comments_res.get("has_more", 0)
-            comments_cursor = comments_res.get("cursor", comments_cursor + 20)
+            comments_cursor = comments_res.get("cursor", 0)
-            comments = comments_res.get("comments")
+            comments = comments_res.get("comments", [])
            if not comments:
                continue
            # 在添加评论到结果列表之前进行关键字筛选
            if keywords:
                filtered_comments = [comment for comment in comments if
                                     not any(keyword in comment.get("text", "") for keyword in keywords)]
            else:
                filtered_comments = comments
            # 如果设置了最大评论数限制，并且不为0，只添加未超过该限制的评论
            if max_comments is not None and max_comments > 0:
                remaining_quota = max_comments - collected_comments_count
                comments_to_add = filtered_comments[:remaining_quota]
                result.extend(comments_to_add)
                collected_comments_count += len(comments_to_add)
            else:
                result.extend(filtered_comments)
                collected_comments_count += len(filtered_comments)
            if callback:  # 如果有回调函数，就执行回调函数
                await callback(aweme_id, comments)
            # Stop paging once a positive max_comments limit has been reached
            if max_comments is not None and 0 < max_comments <= collected_comments_count:
                break
            await asyncio.sleep(crawl_interval)
            if not is_fetch_sub_comments:
                result.extend(comments)
                continue
            # todo fetch sub comments
        return result
--- a/media_platform/douyin/core.py
+++ b/media_platform/douyin/core.py
@ -128,18 +128,23 @@ class DouYinCrawler(AbstractCrawler):
        task_list: List[Task] = []
        semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
        for aweme_id in aweme_list:
-            task = asyncio.create_task(self.get_comments(aweme_id, semaphore), name=aweme_id)
+            task = asyncio.create_task(
                self.get_comments(aweme_id, semaphore, max_comments=config.DY_MAX_COMMENTS_PER_POST), name=aweme_id)
            task_list.append(task)
        await asyncio.wait(task_list)
-    async def get_comments(self, aweme_id: str, semaphore: asyncio.Semaphore) -> None:
+    async def get_comments(self, aweme_id: str, semaphore: asyncio.Semaphore, max_comments: int = None) -> None:
        async with semaphore:
            try:
-                await self.dy_client.get_aweme_all_comments(
+                # 将关键词列表传递给 get_aweme_all_comments 方法
                comments = await self.dy_client.get_aweme_all_comments(
                    aweme_id=aweme_id,
-                    callback=douyin.batch_update_dy_aweme_comments,
+                    max_comments=max_comments, # 最大数量
                    keywords=config.DY_COMMENT_KEYWORDS  # 关键词列表
                )
-                utils.logger.info(f"aweme_id: {aweme_id} comments have all been obtained completed ...")
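+                # NOTE(review): comments are expected to be keyword-filtered here, but get_aweme_all_comments also extends its result with the unfiltered page when is_fetch_sub_comments is False - verify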
                await douyin.batch_update_dy_aweme_comments(aweme_id, comments)
                utils.logger.info(f"aweme_id: {aweme_id} comments have all been obtained and filtered ...")
            except DataFetchError as e:
                utils.logger.error(f"aweme_id: {aweme_id} get comments failed, error: {e}")