Merge pull request #89 from PeanutSplash/main

添加功能:抖音每个视频抓取评论最大条数限制,抖音评论关键词筛选
This commit is contained in:
relakkes 2023-12-05 12:42:32 +08:00 committed by GitHub
commit 6ae511bb52
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 59 additions and 17 deletions

View File

@ -26,6 +26,14 @@ CRAWLER_MAX_NOTES_COUNT = 20
# 并发爬虫数量控制 # 并发爬虫数量控制
MAX_CONCURRENCY_NUM = 10 MAX_CONCURRENCY_NUM = 10
# Maximum number of comments to collect per Douyin video (0 means no limit).
DY_MAX_COMMENTS_PER_POST = 10
# Keyword list used to filter Douyin comments; an empty list disables filtering.
# Every entry must be a non-empty string: "" is a substring of every comment
# text, so a blank entry matches ALL comments instead of meaning "no keyword".
# NOTE(review): the original comment here said only comments containing a
# keyword are kept, but the filter in get_aweme_all_comments uses
# `not any(...)`, i.e. it drops matching comments -- confirm the intended
# direction.
DY_COMMENT_KEYWORDS = [
    # e.g. "keyword1", "keyword2"
]
# 指定小红书需要爬虫的笔记ID列表 # 指定小红书需要爬虫的笔记ID列表
XHS_SPECIFIED_ID_LIST = [ XHS_SPECIFIED_ID_LIST = [

View File

@ -1,7 +1,7 @@
import asyncio import asyncio
import copy import copy
import urllib.parse import urllib.parse
from typing import Any, Callable, Dict, Optional from typing import Any, Callable, Dict, Optional, List
import execjs import execjs
import httpx import httpx
@ -167,30 +167,59 @@ class DOUYINClient:
crawl_interval: float = 1.0, crawl_interval: float = 1.0,
is_fetch_sub_comments=False, is_fetch_sub_comments=False,
callback: Optional[Callable] = None, callback: Optional[Callable] = None,
max_comments: int = None, # 新增参数来限制评论数
keywords: List[str] = None # 新增参数,用于关键字筛选
): ):
""" """
get note all comments include sub comments 获取帖子的所有评论包括子评论
:param aweme_id: :param aweme_id: 帖子ID
:param crawl_interval: :param crawl_interval: 抓取间隔
:param is_fetch_sub_comments: :param is_fetch_sub_comments: 是否抓取子评论
:param callback: :param callback: 回调函数用于处理抓取到的评论
:return: :param max_comments: 最大评论数限制如果为0则不限制评论数
:param keywords: 需要过滤的关键字列表
:return: 评论列表
""" """
result = [] result = []
comments_has_more = 1 comments_has_more = 1
comments_cursor = 0 comments_cursor = 0
while comments_has_more: collected_comments_count = 0 # 已收集的评论数
while comments_has_more and (
max_comments is None or collected_comments_count < max_comments or max_comments == 0):
comments_res = await self.get_aweme_comments(aweme_id, comments_cursor) comments_res = await self.get_aweme_comments(aweme_id, comments_cursor)
comments_has_more = comments_res.get("has_more", 0) comments_has_more = comments_res.get("has_more", 0)
comments_cursor = comments_res.get("cursor", comments_cursor + 20) comments_cursor = comments_res.get("cursor", 0)
comments = comments_res.get("comments") comments = comments_res.get("comments", [])
if not comments: if not comments:
continue continue
# 在添加评论到结果列表之前进行关键字筛选
if keywords:
filtered_comments = [comment for comment in comments if
not any(keyword in comment.get("text", "") for keyword in keywords)]
else:
filtered_comments = comments
# 如果设置了最大评论数限制并且不为0只添加未超过该限制的评论
if max_comments is not None and max_comments > 0:
remaining_quota = max_comments - collected_comments_count
comments_to_add = filtered_comments[:remaining_quota]
result.extend(comments_to_add)
collected_comments_count += len(comments_to_add)
else:
result.extend(filtered_comments)
collected_comments_count += len(filtered_comments)
if callback: # 如果有回调函数,就执行回调函数 if callback: # 如果有回调函数,就执行回调函数
await callback(aweme_id, comments) await callback(aweme_id, comments)
# 如果已经达到最大评论数且最大评论数不为0或者不需要子评论结束循环
if max_comments is not None and 0 < max_comments <= collected_comments_count:
break
await asyncio.sleep(crawl_interval) await asyncio.sleep(crawl_interval)
if not is_fetch_sub_comments: if not is_fetch_sub_comments:
result.extend(comments)
continue continue
# todo fetch sub comments # todo fetch sub comments
return result return result

View File

@ -128,18 +128,23 @@ class DouYinCrawler(AbstractCrawler):
task_list: List[Task] = [] task_list: List[Task] = []
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM) semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
for aweme_id in aweme_list: for aweme_id in aweme_list:
task = asyncio.create_task(self.get_comments(aweme_id, semaphore), name=aweme_id) task = asyncio.create_task(
self.get_comments(aweme_id, semaphore, max_comments=config.DY_MAX_COMMENTS_PER_POST), name=aweme_id)
task_list.append(task) task_list.append(task)
await asyncio.wait(task_list) await asyncio.wait(task_list)
async def get_comments(self, aweme_id: str, semaphore: asyncio.Semaphore) -> None: async def get_comments(self, aweme_id: str, semaphore: asyncio.Semaphore, max_comments: int = None) -> None:
async with semaphore: async with semaphore:
try: try:
await self.dy_client.get_aweme_all_comments( # 将关键词列表传递给 get_aweme_all_comments 方法
comments = await self.dy_client.get_aweme_all_comments(
aweme_id=aweme_id, aweme_id=aweme_id,
callback=douyin.batch_update_dy_aweme_comments, max_comments=max_comments, # 最大数量
keywords=config.DY_COMMENT_KEYWORDS # 关键词列表
) )
utils.logger.info(f"aweme_id: {aweme_id} comments have all been obtained completed ...") # 现在返回的 comments 已经是经过关键词筛选的
await douyin.batch_update_dy_aweme_comments(aweme_id, comments)
utils.logger.info(f"aweme_id: {aweme_id} comments have all been obtained and filtered ...")
except DataFetchError as e: except DataFetchError as e:
utils.logger.error(f"aweme_id: {aweme_id} get comments failed, error: {e}") utils.logger.error(f"aweme_id: {aweme_id} get comments failed, error: {e}")