diff --git a/README.md b/README.md
index 9950b4e..e71f46f 100644
--- a/README.md
+++ b/README.md
@@ -7,7 +7,7 @@
 > Click for the more detailed disclaimer. [Jump to it](#disclaimer)
 # Repository Description
 
-**Xiaohongshu crawler**, **Douyin crawler**, **Kuaishou crawler**, **Bilibili crawler**, **Weibo crawler**...
+**Xiaohongshu crawler**, **Douyin crawler**, **Kuaishou crawler**, **Bilibili crawler**, **Weibo crawler**, **Baidu Tieba crawler**...
 Currently crawls videos, images, comments, likes, reposts and more from Xiaohongshu, Douyin, Kuaishou, Bilibili and Weibo.
 
 Principle: use [playwright](https://playwright.dev/) as a bridge, keep the browser context alive after a successful login, and obtain the encrypted request parameters by evaluating JS expressions inside that context.
@@ -22,6 +22,7 @@
 | Kuaishou | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
 | Bilibili | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
 | Weibo    | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ |
+| Tieba    | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ |
 
 ## Usage
 
@@ -99,14 +100,51 @@
 - [MediaCrawler: refactoring the project cache around an abstract-class design](https://articles.zsxq.com/id_4ju73oxewt9j.html)
 - [Hands-on: build your own IP proxy pool](https://articles.zsxq.com/id_38fza371ladm.html)
 
-
-
 ## Thanks to the following sponsors
-
 Registering and installing the free Sider ChatGPT browser extension earns me a small reward 💰. I have used the extension for over half a year; it is one of the most popular extensions on the Chrome Web Store and the experience is very good.
 > Keep the extension installed for one day after registering and I receive a 3 RMB referral reward. Thank you all for supporting my continued open-source work.
 
 Become a sponsor and show your product here; contact the author on WeChat: yzglan
 
+## Tips
+
+If you find the project useful, a tip is welcome. Your support is my biggest motivation!
+
+You can leave your name as a note when tipping and I will add you to the donor list.
+<!-- donation QR code images -->
+
+## Donor List
+
+PS: Please include your name as a note when tipping; if I missed you, contact me and I will add you (with many messages some get lost, my apologies).
+
+| Donor       | Amount  | Date       |
+|-------------|---------|------------|
+| *皓         | 50 RMB  | 2024-03-18 |
+| *刚         | 50 RMB  | 2024-03-18 |
+| *乐         | 20 RMB  | 2024-03-17 |
+| *木         | 20 RMB  | 2024-03-17 |
+| *诚         | 20 RMB  | 2024-03-17 |
+| Strem Gamer | 20 RMB  | 2024-03-16 |
+| *鑫         | 20 RMB  | 2024-03-14 |
+| Yuzu        | 20 RMB  | 2024-03-07 |
+| **宁        | 100 RMB | 2024-03-03 |
+| **媛        | 20 RMB  | 2024-03-03 |
+| Scarlett    | 20 RMB  | 2024-02-16 |
+| Asun        | 20 RMB  | 2024-01-30 |
+| 何*         | 100 RMB | 2024-01-21 |
+| allen       | 20 RMB  | 2024-01-10 |
+| llllll      | 20 RMB  | 2024-01-07 |
+| 邝*元       | 20 RMB  | 2023-12-29 |
+| 50chen      | 50 RMB  | 2023-12-22 |
+| xiongot     | 20 RMB  | 2023-12-17 |
+| atom.hu    | 20 RMB  | 2023-12-16 |
+| 一呆        | 20 RMB  | 2023-12-01 |
+| 坠落        | 50 RMB  | 2023-11-08 |
+
 
 ## MediaCrawler Project Discussion Group:
 > Scan my personal WeChat QR code below and include the note "github"; you will be pulled into the MediaCrawler discussion group (please do add the "github" note, a WeChat helper bot pulls you in automatically)
 
diff --git a/config/base_config.py b/config/base_config.py
index 53dc8bf..cefc711 100644
--- a/config/base_config.py
+++ b/config/base_config.py
@@ -28,7 +28,7 @@ HEADLESS = False
 SAVE_LOGIN_STATE = True
 
 # Data persistence option; three backends are supported: csv, db, json
-SAVE_DATA_OPTION = "db"  # csv or db or json
+SAVE_DATA_OPTION = "csv"  # csv or db or json
 
 # Directory for the browser's cached user-data files
 USER_DATA_DIR = "%s_user_data_dir"  # %s will be replaced by platform name
 
@@ -46,18 +46,18 @@ MAX_CONCURRENCY_NUM = 1
 ENABLE_GET_IMAGES = False
 
 # Whether to crawl comments; disabled by default
-ENABLE_GET_COMMENTS = True
+ENABLE_GET_COMMENTS = False
 
 # Whether to crawl second-level (reply) comments; disabled by default
 # Deployments that used the db backend in older versions must add the table fields per schema/tables.sql line 287
-ENABLE_GET_SUB_COMMENTS = True
+ENABLE_GET_SUB_COMMENTS = False
 
 # List of specific Xiaohongshu note IDs to crawl
 XHS_SPECIFIED_ID_LIST = [
     "6422c2750000000027000d88",
     "64ca1b73000000000b028dd2",
     "630d5b85000000001203ab41",
-    "668fe13000000000030241fa", # mixed text and images
+    "668fe13000000000030241fa",  # mixed text and images
     # ........................
 ]
 
@@ -93,6 +93,10 @@ TIEBA_SPECIFIED_ID_LIST = [
 ]
 
+# List of Tieba forum names; the posts under each listed forum will be crawled
+TIEBA_NAME_LIST = [
+    # "盗墓笔记"
+]
 
 # List of Xiaohongshu creator IDs to crawl
 XHS_CREATOR_ID_LIST = [
 
@@ -118,19 +122,18 @@ KS_CREATOR_ID_LIST = [
     # ........................
 ]
-
-#Word-cloud settings
-#Whether to generate a word cloud from the comments
+# Word-cloud settings
+# Whether to generate a word cloud from the comments
 ENABLE_GET_WORDCLOUD = False
 # Custom words and their groups
-#Rule: xx:yy, where xx is the custom word/phrase and yy is the name of the group xx is assigned to.
+# Rule: xx:yy, where xx is the custom word/phrase and yy is the name of the group xx is assigned to.
 CUSTOM_WORDS = {
     '零几': '年份',  # treat "零几" as a single token
     '高频词': '专业术语'  # example custom word
 }
-#Path of the stop-word (blocked-word) file
+# Path of the stop-word (blocked-word) file
 STOP_WORDS_FILE = "./docs/hit_stopwords.txt"
-#Path of the Chinese font file
-FONT_PATH= "./docs/STZHONGS.TTF"
+# Path of the Chinese font file
+FONT_PATH = "./docs/STZHONGS.TTF"
diff --git a/media_platform/tieba/client.py b/media_platform/tieba/client.py
index 2ae4304..daa1c4c 100644
--- a/media_platform/tieba/client.py
+++ b/media_platform/tieba/client.py
@@ -1,17 +1,15 @@
 import asyncio
 import json
-import random
 from typing import Any, Callable, Dict, List, Optional, Union
 from urllib.parse import urlencode
 
 import httpx
 from playwright.async_api import BrowserContext
-from tenacity import (RetryError, retry, stop_after_attempt,
-                      wait_fixed)
+from tenacity import RetryError, retry, stop_after_attempt, wait_fixed
 
 import config
 from base.base_crawler import AbstractApiClient
-from model.m_baidu_tieba import TiebaNote, TiebaComment
+from model.m_baidu_tieba import TiebaComment, TiebaNote
 from proxy.proxy_ip_pool import ProxyIpPool
 from tools import utils
 
@@ -103,7 +101,7 @@ class BaiduTieBaClient(AbstractApiClient):
                 return res
 
         utils.logger.error(f"[BaiduTieBaClient.get] Max retries reached and the IP has been blocked; please switch to a new proxy IP: {e}")
-        raise e
+        raise Exception(f"[BaiduTieBaClient.get] Max retries reached and the IP has been blocked; please switch to a new proxy IP: {e}")
 
     async def post(self, uri: str, data: dict, **kwargs) -> Dict:
         """
 
@@ -248,28 +246,44 @@ class BaiduTieBaClient(AbstractApiClient):
         #     raise Exception(f"[BaiduTieBaClient.pong] Cookies is empty, please login first...")
 
         all_sub_comments: List[TiebaComment] = []
-        for comment in comments:
-            if comment.sub_comment_count == 0:
+        for parent_comment in comments:
+            if parent_comment.sub_comment_count == 0:
                 continue
 
             current_page = 1
-            max_sub_page_num = comment.sub_comment_count // 10 + 1
+            max_sub_page_num = parent_comment.sub_comment_count // 10 + 1
             while max_sub_page_num >= current_page:
                 params = {
-                    "tid": comment.note_id,  # post ID
-                    "pid": comment.comment_id,  # parent comment ID
-                    "fid": comment.tieba_id,  # forum ID
+                    "tid": parent_comment.note_id,  # post ID
+                    "pid": parent_comment.comment_id,  # parent comment ID
+                    "fid": parent_comment.tieba_id,  # forum ID
                     "pn": current_page  # page number
                 }
                 page_content = await self.get(uri, params=params, return_ori_content=True)
                 sub_comments = self._page_extractor.extract_tieba_note_sub_comments(page_content,
-                                                                                    parent_comment=comment)
+                                                                                    parent_comment=parent_comment)
 
                 if not sub_comments:
                     break
                 if callback:
-                    await callback(comment.note_id, sub_comments)
+                    await callback(parent_comment.note_id, sub_comments)
 
                 all_sub_comments.extend(sub_comments)
                 await asyncio.sleep(crawl_interval)
                 current_page += 1
         return all_sub_comments
+
+    async def get_notes_by_tieba_name(self, tieba_name: str, page_num: int) -> List[TiebaNote]:
+        """
+        Get the list of posts for a given Tieba forum name
+        Args:
+            tieba_name: forum name
+            page_num: page offset (the pn query parameter; it advances 50 per list page)
+
+        Returns:
+            The TiebaNote list parsed from the forum list page
+        """
+        uri = f"/f?kw={tieba_name}&pn={page_num}"
+        page_content = await self.get(uri, return_ori_content=True)
+        return self._page_extractor.extract_tieba_note_list(page_content)
diff --git a/media_platform/tieba/core.py b/media_platform/tieba/core.py
index 2d10a0a..c8b8764 100644
--- a/media_platform/tieba/core.py
+++ b/media_platform/tieba/core.py
@@ -53,6 +53,7 @@ class TieBaCrawler(AbstractCrawler):
             if config.CRAWLER_TYPE == "search":
                 # Search for notes and retrieve their comment information.
                 await self.search()
+                await self.get_specified_tieba_notes()
             elif config.CRAWLER_TYPE == "detail":
                 # Get the information and comments of the specified post
                 await self.get_specified_notes()
@@ -92,7 +93,7 @@ class TieBaCrawler(AbstractCrawler):
                 if not notes_list:
                     utils.logger.info(f"[BaiduTieBaCrawler.search] Search note list is empty")
                     break
-                utils.logger.info(f"[BaiduTieBaCrawler.search] Note List: {notes_list}")
+                utils.logger.info(f"[BaiduTieBaCrawler.search] Note list length: {len(notes_list)}")
                 await self.get_specified_notes(note_id_list=[note_detail.note_id for note_detail in notes_list])
                 page += 1
             except Exception as ex:
                 utils.logger.error(
                     f"[BaiduTieBaCrawler.search] Search keywords error, current page: {page}, current keyword: {keyword}, err: {ex}")
                 break
 
+    async def get_specified_tieba_notes(self):
+        """
+        Crawl the posts (and their comments) of every forum configured in config.TIEBA_NAME_LIST
+        Returns:
+
+        """
+        tieba_limit_count = 50
+        if config.CRAWLER_MAX_NOTES_COUNT < tieba_limit_count:
+            config.CRAWLER_MAX_NOTES_COUNT = tieba_limit_count
+        for tieba_name in config.TIEBA_NAME_LIST:
+            utils.logger.info(
+                f"[BaiduTieBaCrawler.get_specified_tieba_notes] Begin crawling tieba forum: {tieba_name}")
+            page_number = 0
+            while page_number <= config.CRAWLER_MAX_NOTES_COUNT:
+                note_list: List[TiebaNote] = await self.tieba_client.get_notes_by_tieba_name(
+                    tieba_name=tieba_name,
+                    page_num=page_number
+                )
+                if not note_list:
+                    utils.logger.info(
+                        "[BaiduTieBaCrawler.get_specified_tieba_notes] Note list is empty, stop paging")
+                    break
+
+                utils.logger.info(
+                    f"[BaiduTieBaCrawler.get_specified_tieba_notes] Forum: {tieba_name}, note list length: {len(note_list)}")
+                await self.get_specified_notes([note.note_id for note in note_list])
+                page_number += tieba_limit_count
+
     async def get_specified_notes(self, note_id_list: List[str] = config.TIEBA_SPECIFIED_ID_LIST):
         """
         Get the information and comments of the specified post
diff --git a/media_platform/tieba/help.py b/media_platform/tieba/help.py
index b46081d..4f3fe15 100644
--- a/media_platform/tieba/help.py
+++ b/media_platform/tieba/help.py
@@ -1,13 +1,13 @@
 # -*- coding: utf-8 -*-
-import re
-import json
 import html
-from typing import List, Dict, Tuple
+import json
+import re
+from typing import Dict, List, Tuple
 
 from parsel import Selector
 
-from model.m_baidu_tieba import TiebaNote, TiebaComment
 from constant import baidu_tieba as const
+from model.m_baidu_tieba import TiebaComment, TiebaNote
 from tools import utils
 
@@ -43,6 +43,42 @@ class TieBaExtractor:
             result.append(tieba_note)
         return result
 
+    def extract_tieba_note_list(self, page_content: str) -> List[TiebaNote]:
+        """
+        Extract the post list from a Tieba forum list page
+        Args:
+            page_content: raw HTML of the forum list page
+
+        Returns:
+            The extracted TiebaNote list
+        """
+        page_content = page_content.replace('<!--', "")
+        # ... (the remainder of this hunk is truncated in the excerpt)
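
A note on the principle described in the README hunk above: rather than re-implementing each platform's parameter signing in Python, the crawler keeps a logged-in Playwright browser context alive and asks the page itself to produce the encrypted parameters. The sketch below is a minimal illustration of that idea, not code from this patch; the Xiaohongshu signing helper `window._webmsxyw` and its argument shape are assumptions to verify against the live site.

```python
import asyncio

from playwright.async_api import async_playwright


async def main() -> None:
    async with async_playwright() as p:
        # A persistent context keeps the logged-in state between runs; the
        # directory name mirrors config.USER_DATA_DIR ("%s_user_data_dir").
        context = await p.chromium.launch_persistent_context(
            user_data_dir="xhs_user_data_dir", headless=False
        )
        page = await context.new_page()
        await page.goto("https://www.xiaohongshu.com")
        # After a manual login, encrypted request parameters can be produced
        # by evaluating JS inside the page instead of porting the algorithm.
        # NOTE: the helper name below is an assumption, not part of the diff.
        sign = await page.evaluate(
            "([url, data]) => window._webmsxyw(url, data)",
            ["/api/sns/web/v1/homefeed", {}],
        )
        print(sign)
        await context.close()


if __name__ == "__main__":
    asyncio.run(main())
```

This is also why `USER_DATA_DIR` in `config/base_config.py` is templated per platform: each platform keeps its own persistent browser profile.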
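
The new forum-name flow chains together as: `TIEBA_NAME_LIST` (config) feeds `TieBaCrawler.get_specified_tieba_notes`, which advances the `pn` offset by `tieba_limit_count = 50` per list page and reuses the offset as its progress counter against `CRAWLER_MAX_NOTES_COUNT`; each page goes through `BaiduTieBaClient.get_notes_by_tieba_name` (`/f?kw=...&pn=...`) and `TieBaExtractor.extract_tieba_note_list`. The standalone sketch below approximates the fetch-and-parse step outside the project's client; the `ul#thread_list` container, the `j_th_tit` link class, and the comment-stripping step (Tieba serves the thread list wrapped in HTML comments, which is what the truncated `replace('<!--', ...)` call above appears to handle) are assumptions about the live desktop page.

```python
import asyncio
from typing import List

import httpx
from parsel import Selector


async def fetch_thread_ids(tieba_name: str, page_num: int = 0) -> List[str]:
    """Fetch one page of thread IDs from a forum list page (pn advances 50 per page)."""
    async with httpx.AsyncClient(headers={"User-Agent": "Mozilla/5.0"}) as client:
        resp = await client.get(
            "https://tieba.baidu.com/f",
            params={"kw": tieba_name, "pn": page_num},
        )
        resp.raise_for_status()
    # Tieba hides the thread list inside HTML comments; strip the markers so
    # the selector can see the markup.
    page_content = resp.text.replace("<!--", "").replace("-->", "")
    selector = Selector(text=page_content)
    links = selector.xpath("//ul[@id='thread_list']//a[@class='j_th_tit']/@href").getall()
    return [link.split("/p/")[-1] for link in links if "/p/" in link]


if __name__ == "__main__":
    print(asyncio.run(fetch_thread_ids("盗墓笔记")))
```

On the related paging arithmetic in `client.py`: sub-comment paging uses `sub_comment_count // 10 + 1` at 10 replies per page, so 25 replies yield 3 pages; an exact multiple of 10 requests one extra, empty page, which the `if not sub_comments: break` guard then absorbs.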
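
The other behavioral change in `client.py` is the retry exit path: once tenacity exhausts its attempts and the proxy fallback has failed, `BaiduTieBaClient.get` now raises a plain `Exception` carrying the readable diagnosis instead of re-raising the bare error. A minimal sketch of that pattern, with `fetch_once` as a hypothetical stand-in for the real request helper:

```python
from tenacity import RetryError, retry, stop_after_attempt, wait_fixed


@retry(stop=stop_after_attempt(3), wait=wait_fixed(1))
def fetch_once(uri: str) -> dict:
    # Hypothetical stand-in: always fails, as a blocked IP would.
    raise ConnectionError(f"blocked while fetching {uri}")


def fetch(uri: str) -> dict:
    try:
        return fetch_once(uri)
    except RetryError as e:
        # Tenacity wraps the last failure in RetryError by default; re-raise
        # as a plain Exception with a readable message, mirroring the diff.
        raise Exception(f"Max retries reached, the IP may be blocked, try a new proxy: {e}")


if __name__ == "__main__":
    try:
        fetch("/f?kw=example&pn=0")
    except Exception as exc:
        print(exc)
```

One trade-off of this pattern: a bare `Exception` erases the error type, so callers can no longer distinguish a block from other failures programmatically; `raise Exception(...) from e` would at least make the causal chain explicit.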
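
Finally, the `CUSTOM_WORDS` mapping in `base_config.py` pairs a phrase with the group it should be counted under when the comment word cloud is generated. The consumer of this mapping is not part of this diff; the snippet below is purely illustrative of how such a map is typically fed to a Chinese tokenizer such as jieba so the custom phrases survive segmentation.

```python
import jieba

# Shape mirrors config.CUSTOM_WORDS: {custom phrase: group name}
CUSTOM_WORDS = {
    '零几': '年份',
    '高频词': '专业术语',
}

# Register each phrase so the segmenter keeps it as one token instead of
# splitting it into single characters.
for word in CUSTOM_WORDS:
    jieba.add_word(word)

tokens = list(jieba.cut("零几年的高频词统计"))
print(tokens)  # '零几' and '高频词' now come out as whole tokens
```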