Merge pull request #94 from PeanutSplash/main
添加功能:(哔哩哔哩,快手,小红书)每个视频/帖子抓取评论最大条数限制,评论关键词筛选
This commit is contained in:
commit
c4d68f868a
@ -29,11 +29,11 @@ CRAWLER_MAX_NOTES_COUNT = 20
|
|||||||
# 并发爬虫数量控制
|
# 并发爬虫数量控制
|
||||||
MAX_CONCURRENCY_NUM = 10
|
MAX_CONCURRENCY_NUM = 10
|
||||||
|
|
||||||
# 抖音每个视频抓取评论最大条数 (为0则不限制)
|
# 每个视频/帖子抓取评论最大条数 (为0则不限制)
|
||||||
DY_MAX_COMMENTS_PER_POST = 10
|
MAX_COMMENTS_PER_POST = 10
|
||||||
|
|
||||||
# 抖音评论关键词筛选(只会留下包含关键词的评论,为空不限制)
|
# 评论关键词筛选(只会留下包含关键词的评论,为空不限制)
|
||||||
DY_COMMENT_KEYWORDS = [
|
COMMENT_KEYWORDS = [
|
||||||
"我"
|
"我"
|
||||||
# ........................
|
# ........................
|
||||||
]
|
]
|
||||||
|
@ -146,18 +146,38 @@ class BilibiliCrawler(AbstractCrawler):
|
|||||||
"""
|
"""
|
||||||
async with semaphore:
|
async with semaphore:
|
||||||
try:
|
try:
|
||||||
utils.logger.info(f"[get_comments] bengin get video_id: {video_id} comments ...")
|
utils.logger.info(f"[get_comments] begin get video_id: {video_id} comments ...")
|
||||||
await self.bili_client.get_video_all_comments(
|
# Read keyword and quantity from config
|
||||||
|
keywords = config.COMMENT_KEYWORDS
|
||||||
|
max_comments = config.MAX_COMMENTS_PER_POST
|
||||||
|
|
||||||
|
# Download comments
|
||||||
|
all_comments = await self.bili_client.get_video_all_comments(
|
||||||
video_id=video_id,
|
video_id=video_id,
|
||||||
crawl_interval=random.random(),
|
crawl_interval=random.random(),
|
||||||
callback=bilibili.batch_update_bilibili_video_comments
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Filter comments by keyword
|
||||||
|
if keywords:
|
||||||
|
filtered_comments = [
|
||||||
|
comment for comment in all_comments if
|
||||||
|
any(keyword in comment["content"]["message"] for keyword in keywords)
|
||||||
|
]
|
||||||
|
else:
|
||||||
|
filtered_comments = all_comments
|
||||||
|
|
||||||
|
# Limit the number of comments
|
||||||
|
if max_comments > 0:
|
||||||
|
filtered_comments = filtered_comments[:max_comments]
|
||||||
|
|
||||||
|
# Update bilibili video comments
|
||||||
|
await bilibili.batch_update_bilibili_video_comments(video_id, filtered_comments)
|
||||||
|
|
||||||
except DataFetchError as ex:
|
except DataFetchError as ex:
|
||||||
utils.logger.error(f"[get_comments] get video_id: {video_id} comment error: {ex}")
|
utils.logger.error(f"[get_comments] get video_id: {video_id} comment error: {ex}")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
utils.logger.error(f"[get_comments] may be been blocked, err:", e)
|
utils.logger.error(f"[get_comments] may be been blocked, err:", e)
|
||||||
|
|
||||||
|
|
||||||
async def get_specified_videos(self):
|
async def get_specified_videos(self):
|
||||||
"""
|
"""
|
||||||
get specified videos info
|
get specified videos info
|
||||||
|
@ -132,7 +132,7 @@ class DouYinCrawler(AbstractCrawler):
|
|||||||
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
|
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
|
||||||
for aweme_id in aweme_list:
|
for aweme_id in aweme_list:
|
||||||
task = asyncio.create_task(
|
task = asyncio.create_task(
|
||||||
self.get_comments(aweme_id, semaphore, max_comments=config.DY_MAX_COMMENTS_PER_POST), name=aweme_id)
|
self.get_comments(aweme_id, semaphore, max_comments=config.MAX_COMMENTS_PER_POST), name=aweme_id)
|
||||||
task_list.append(task)
|
task_list.append(task)
|
||||||
await asyncio.wait(task_list)
|
await asyncio.wait(task_list)
|
||||||
|
|
||||||
@ -143,7 +143,7 @@ class DouYinCrawler(AbstractCrawler):
|
|||||||
comments = await self.dy_client.get_aweme_all_comments(
|
comments = await self.dy_client.get_aweme_all_comments(
|
||||||
aweme_id=aweme_id,
|
aweme_id=aweme_id,
|
||||||
max_comments=max_comments, # 最大数量
|
max_comments=max_comments, # 最大数量
|
||||||
keywords=config.DY_COMMENT_KEYWORDS # 关键词列表
|
keywords=config.COMMENT_KEYWORDS # 关键词列表
|
||||||
)
|
)
|
||||||
# 现在返回的 comments 已经是经过关键词筛选的
|
# 现在返回的 comments 已经是经过关键词筛选的
|
||||||
await douyin.batch_update_dy_aweme_comments(aweme_id, comments)
|
await douyin.batch_update_dy_aweme_comments(aweme_id, comments)
|
||||||
|
@ -7,6 +7,7 @@ from urllib.parse import urlencode
|
|||||||
import httpx
|
import httpx
|
||||||
from playwright.async_api import BrowserContext, Page
|
from playwright.async_api import BrowserContext, Page
|
||||||
|
|
||||||
|
import config
|
||||||
from tools import utils
|
from tools import utils
|
||||||
|
|
||||||
from .exception import DataFetchError, IPBlockError
|
from .exception import DataFetchError, IPBlockError
|
||||||
@ -124,7 +125,7 @@ class KuaiShouClient:
|
|||||||
return await self.post("", post_data)
|
return await self.post("", post_data)
|
||||||
|
|
||||||
async def get_video_all_comments(self, photo_id: str, crawl_interval: float = 1.0, is_fetch_sub_comments=False,
|
async def get_video_all_comments(self, photo_id: str, crawl_interval: float = 1.0, is_fetch_sub_comments=False,
|
||||||
callback: Optional[Callable] = None, ):
|
callback: Optional[Callable] = None):
|
||||||
"""
|
"""
|
||||||
get video all comments include sub comments
|
get video all comments include sub comments
|
||||||
:param photo_id:
|
:param photo_id:
|
||||||
@ -136,18 +137,33 @@ class KuaiShouClient:
|
|||||||
|
|
||||||
result = []
|
result = []
|
||||||
pcursor = ""
|
pcursor = ""
|
||||||
while pcursor != "no_more":
|
count = 0 # 计数器,记录已获取的评论数量
|
||||||
|
|
||||||
|
while pcursor != "no_more" and (
|
||||||
|
config.MAX_COMMENTS_PER_POST == 0 or count < config.MAX_COMMENTS_PER_POST):
|
||||||
comments_res = await self.get_video_comments(photo_id, pcursor)
|
comments_res = await self.get_video_comments(photo_id, pcursor)
|
||||||
vision_commen_list = comments_res.get("visionCommentList", {})
|
vision_commen_list = comments_res.get("visionCommentList", {})
|
||||||
pcursor = vision_commen_list.get("pcursor", "")
|
pcursor = vision_commen_list.get("pcursor", "")
|
||||||
comments = vision_commen_list.get("rootComments", [])
|
comments = vision_commen_list.get("rootComments", [])
|
||||||
|
|
||||||
if callback: # 如果有回调函数,就执行回调函数
|
filtered_comments = [] # 存储经过关键词筛选后的评论
|
||||||
await callback(photo_id, comments)
|
|
||||||
|
|
||||||
|
for comment in comments:
|
||||||
|
content = comment.get("content", "")
|
||||||
|
|
||||||
|
if not config.COMMENT_KEYWORDS or any(keyword in content for keyword in config.COMMENT_KEYWORDS):
|
||||||
|
filtered_comments.append(comment)
|
||||||
|
|
||||||
|
count += 1
|
||||||
|
if config.MAX_COMMENTS_PER_POST != 0 and count >= config.MAX_COMMENTS_PER_POST:
|
||||||
|
break
|
||||||
|
|
||||||
|
if callback: # 如果有回调函数,就执行回调函数
|
||||||
|
await callback(photo_id, filtered_comments)
|
||||||
|
|
||||||
|
result.extend(filtered_comments)
|
||||||
await asyncio.sleep(crawl_interval)
|
await asyncio.sleep(crawl_interval)
|
||||||
if not is_fetch_sub_comments:
|
if not is_fetch_sub_comments:
|
||||||
result.extend(comments)
|
|
||||||
continue
|
continue
|
||||||
# todo handle get sub comments
|
# todo handle get sub comments
|
||||||
return result
|
return result
|
||||||
|
@ -152,11 +152,27 @@ class XiaoHongShuCrawler(AbstractCrawler):
|
|||||||
await asyncio.gather(*task_list)
|
await asyncio.gather(*task_list)
|
||||||
|
|
||||||
async def get_comments(self, note_id: str, semaphore: asyncio.Semaphore):
|
async def get_comments(self, note_id: str, semaphore: asyncio.Semaphore):
|
||||||
"""Get note comments"""
|
"""Get note comments with keyword filtering and quantity limitation"""
|
||||||
async with semaphore:
|
async with semaphore:
|
||||||
utils.logger.info(f"Begin get note id comments {note_id}")
|
utils.logger.info(f"Begin get note id comments {note_id}")
|
||||||
all_comments = await self.xhs_client.get_note_all_comments(note_id=note_id, crawl_interval=random.random())
|
all_comments = await self.xhs_client.get_note_all_comments(note_id=note_id, crawl_interval=random.random())
|
||||||
|
|
||||||
|
# 从配置文件中读取关键词和数量限制
|
||||||
|
keywords = getattr(config, 'COMMENT_KEYWORDS', [])
|
||||||
|
max_comments = getattr(config, 'MAX_COMMENTS_PER_POST', 0)
|
||||||
|
|
||||||
|
# 过滤评论
|
||||||
|
filtered_comments = []
|
||||||
for comment in all_comments:
|
for comment in all_comments:
|
||||||
|
# 检查评论内容是否包含关键词
|
||||||
|
if not keywords or any(keyword in comment['content'] for keyword in keywords):
|
||||||
|
filtered_comments.append(comment)
|
||||||
|
# 如果达到最大评论数量限制,则停止添加更多评论
|
||||||
|
if max_comments and len(filtered_comments) >= max_comments:
|
||||||
|
break
|
||||||
|
|
||||||
|
# 更新或保存过滤后的评论
|
||||||
|
for comment in filtered_comments:
|
||||||
await xhs_model.update_xhs_note_comment(note_id=note_id, comment_item=comment)
|
await xhs_model.update_xhs_note_comment(note_id=note_id, comment_item=comment)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
|
Loading…
Reference in New Issue
Block a user