From 0ba68809a51af50e5a159a6caff8a1e75639feeb Mon Sep 17 00:00:00 2001
From: ZuWard
Date: Wed, 29 May 2024 06:35:37 +0800
Subject: [PATCH] =?UTF-8?q?=E6=8A=96=E9=9F=B3=E4=BA=8C=E7=BA=A7=E8=AF=84?=
 =?UTF-8?q?=E8=AE=BA?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
Review notes (free text, ignored when the patch is applied):

* Purpose: add second-level (reply) comment crawling for Douyin. A new
  DOUYINClient.get_sub_comments() requests one page of replies, the loop in
  get_aweme_all_comments() pages through the replies of every comment, and
  DouYinCrawler passes config.ENABLE_GET_SUB_COMMENTS to switch it on.
* This copy restores the line structure of a flattened patch. Indentation
  and table padding were reconstructed and may not match the target files
  byte-for-byte; apply with whitespace tolerance if context does not match.
* Differences from the original commit (hunk line counts are unchanged; the
  blob ids on the "index" lines still describe the original commit):
  - "reply_comment_total" is read with "or 0", so a missing or None value
    no longer raises TypeError in the "> 0" comparison.
  - An empty reply page now breaks out of the paging loop; "continue"
    skipped the sleep and re-requested immediately while has_more was set.
  - Comments and the docstring in added lines are translated to English.
* NOTE(review): get_sub_comments() builds "headers" with a Referer but calls
  self.get(uri, params) without them, exactly like the preceding method
  shown as context. Confirm whether self.get should receive the headers.

 README.md                       | 12 +++++-----
 media_platform/douyin/client.py | 39 ++++++++++++++++++++++++++++++++-
 media_platform/douyin/core.py   |  2 +-
 3 files changed, 45 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index d6739d2..e332276 100644
--- a/README.md
+++ b/README.md
@@ -18,12 +18,12 @@
 > 下面不支持的项目,相关的代码架构已经搭建好,只需要实现对应的方法即可,欢迎大家提交PR
 
 | 平台 | 关键词搜索 | 指定帖子ID爬取 | 二级评论 | 指定创作者主页 | 登录态缓存 | IP代理池 |
-|-----|-------|----------|------|--------|-------|-------|
-| 小红书 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
-| 抖音 | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ |
-| 快手 | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ |
-| B 站 | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ |
-| 微博 | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ |
+|-----|-------|----------|-----|--------|-------|-------|
+| 小红书 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
+| 抖音 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
+| 快手 | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ |
+| B 站 | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ |
+| 微博 | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ |
 
 ## 使用方法
 
diff --git a/media_platform/douyin/client.py b/media_platform/douyin/client.py
index a8596d3..ede6049 100644
--- a/media_platform/douyin/client.py
+++ b/media_platform/douyin/client.py
@@ -165,6 +165,23 @@ class DOUYINClient(AbstractApiClient):
         headers["Referer"] = urllib.parse.quote(referer_url, safe=':/')
         return await self.get(uri, params)
 
+    async def get_sub_comments(self, comment_id: str, cursor: int = 0):
+        """
+        Fetch one page of replies (sub-comments) to comment_id, starting at cursor.
+        """
+        uri = "/aweme/v1/web/comment/list/reply/"
+        params = {
+            'comment_id': comment_id,
+            "cursor": cursor,
+            "count": 20,
+            "item_type": 0,
+        }
+        keywords = request_keyword_var.get()
+        referer_url = "https://www.douyin.com/search/" + keywords + '?aid=3a3cec5a-9e27-4040-b6aa-ef548c2c1138&publish_time=0&sort_type=0&source=search_history&type=general'
+        headers = copy.copy(self.headers)
+        headers["Referer"] = urllib.parse.quote(referer_url, safe=':/')
+        return await self.get(uri, params)
+
     async def get_aweme_all_comments(
             self,
             aweme_id: str,
@@ -197,7 +214,27 @@ class DOUYINClient(AbstractApiClient):
             await asyncio.sleep(crawl_interval)
             if not is_fetch_sub_comments:
                 continue
-            # todo fetch sub comments
+            # Fetch the second-level (reply) comments of every comment on this page
+            for comment in comments:
+                reply_comment_total = comment.get("reply_comment_total") or 0  # missing/None means no replies
+
+                if reply_comment_total > 0:
+                    comment_id = comment.get("cid")
+                    sub_comments_has_more = 1
+                    sub_comments_cursor = 0
+
+                    while sub_comments_has_more:
+                        sub_comments_res = await self.get_sub_comments(comment_id, sub_comments_cursor)
+                        sub_comments_has_more = sub_comments_res.get("has_more", 0)
+                        sub_comments_cursor = sub_comments_res.get("cursor", 0)
+                        sub_comments = sub_comments_res.get("comments", [])
+
+                        if not sub_comments:
+                            break  # empty page: stop rather than re-request with no delay
+                        result.extend(sub_comments)
+                        if callback:  # hand each page of replies to the callback, if one was given
+                            await callback(aweme_id, sub_comments)
+                        await asyncio.sleep(crawl_interval)
         return result
 
     async def get_user_info(self, sec_user_id: str):
diff --git a/media_platform/douyin/core.py b/media_platform/douyin/core.py
index d97314a..9823baa 100644
--- a/media_platform/douyin/core.py
+++ b/media_platform/douyin/core.py
@@ -177,8 +177,8 @@ class DouYinCrawler(AbstractCrawler):
                 await self.dy_client.get_aweme_all_comments(
                     aweme_id=aweme_id,
                     crawl_interval=random.random(),
+                    is_fetch_sub_comments=config.ENABLE_GET_SUB_COMMENTS,
                     callback=douyin_store.batch_update_dy_aweme_comments
-
                 )
                 utils.logger.info(
                     f"[DouYinCrawler.get_comments] aweme_id: {aweme_id} comments have all been obtained and filtered ...")