feat: Baidu Tieba done

Relakkes 2024-08-08 14:19:32 +08:00
parent df0f5c1113
commit 3f42368c02
10 changed files with 3800 additions and 39 deletions

View File

@@ -7,7 +7,7 @@
> Click to view the more detailed disclaimer. [Jump to disclaimer](#disclaimer)
# Repository Description
**Xiaohongshu crawler** **Douyin crawler** **Kuaishou crawler** **Bilibili crawler** **Weibo crawler**...
**Xiaohongshu crawler** **Douyin crawler** **Kuaishou crawler** **Bilibili crawler** **Weibo crawler** **Baidu Tieba crawler**...
It can currently crawl videos, images, comments, likes, reposts and related data from Xiaohongshu, Douyin, Kuaishou, Bilibili and Weibo.
How it works: [playwright](https://playwright.dev/) is used as a bridge to keep the browser context alive after a successful login, and JS expressions are evaluated in that context to obtain certain encrypted parameters.
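A minimal sketch of that idea (illustrative only; the user-data directory, URL and JS expression below are placeholders, not the project's actual values):

import asyncio
from playwright.async_api import async_playwright

async def main():
    async with async_playwright() as p:
        # Keep a persistent user-data directory so the logged-in session survives between runs.
        browser_context = await p.chromium.launch_persistent_context(
            user_data_dir="./browser_data/example_user_data_dir",  # placeholder path
            headless=False,
        )
        page = await browser_context.new_page()
        await page.goto("https://example.com")  # placeholder URL
        # Evaluate a JS expression inside the logged-in page to read a signed parameter.
        sign_value = await page.evaluate("() => window.localStorage.getItem('example_sign')")  # placeholder key
        print(sign_value)
        await browser_context.close()

asyncio.run(main())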
@@ -22,6 +22,7 @@
| Kuaishou | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| Bilibili | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| Weibo | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ |
| Tieba | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ |
## Usage
@@ -99,14 +100,51 @@
- [MediaCrawler: refactoring the project cache around an abstract-class design](https://articles.zsxq.com/id_4ju73oxewt9j.html)
- [A hands-on guide to building your own IP proxy pool](https://articles.zsxq.com/id_38fza371ladm.html)
## Thanks to the following sponsors for supporting this repository
- <a href="https://sider.ai/ad-land-redirect?source=github&p1=mi&p2=kk">Registering and installing this free Sider ChatGPT browser extension earns me a small reward 💰. I have used it for over half a year; it is one of the most popular extensions on the Chrome store and the experience is very good.</a>
> After installing and registering the extension, just keep it for one day and I will receive a 3 CNY referral reward. Thank you all for supporting my continued open-source work.
Become a sponsor and have your product displayed here; contact the author on WeChat: yzglan
## Tipping
If you find this project useful, feel free to leave a tip. Your support is my biggest motivation!
When tipping, you can add your name in the note and I will add you to the tipping list.
<p>
<img alt="Tip via WeChat" src="static/images/wechat_pay.jpeg" style="width: 200px;margin-right: 140px;" />
<img alt="Tip via Alipay" src="static/images/zfb_pay.png" style="width: 200px" />
</p>
## Donation Records
P.S. When tipping, please add a donor name in the note. If you were missed, please contact me to be added (with many messages coming in, some may slip through; my apologies).
| Donor | Amount | Date |
|-------------|-------|------------|
| *皓 | 50 CNY | 2024-03-18 |
| *刚 | 50 CNY | 2024-03-18 |
| *乐 | 20 CNY | 2024-03-17 |
| *木 | 20 CNY | 2024-03-17 |
| *诚 | 20 CNY | 2024-03-17 |
| Strem Gamer | 20 CNY | 2024-03-16 |
| *鑫 | 20 CNY | 2024-03-14 |
| Yuzu | 20 CNY | 2024-03-07 |
| **宁 | 100 CNY | 2024-03-03 |
| **媛 | 20 CNY | 2024-03-03 |
| Scarlett | 20 CNY | 2024-02-16 |
| Asun | 20 CNY | 2024-01-30 |
| 何* | 100 CNY | 2024-01-21 |
| allen | 20 CNY | 2024-01-10 |
| llllll | 20 CNY | 2024-01-07 |
| 邝*元 | 20 CNY | 2023-12-29 |
| 50chen | 50 CNY | 2023-12-22 |
| xiongot | 20 CNY | 2023-12-17 |
| atom.hu | 20 CNY | 2023-12-16 |
| 一呆 | 20 CNY | 2023-12-01 |
| 坠落 | 50 CNY | 2023-11-08 |
## MediaCrawler Project Chat Group
> Scan my personal WeChat QR code below and include the note "github" to be pulled into the MediaCrawler project chat group (please be sure to add the note "github"; a WeChat assistant will add you to the group automatically)

View File

@@ -28,7 +28,7 @@ HEADLESS = False
SAVE_LOGIN_STATE = True
# Data storage option; three types are supported: csv, db, json
SAVE_DATA_OPTION = "db" # csv or db or json
SAVE_DATA_OPTION = "csv" # csv or db or json
# Cached browser user-data directory configuration
USER_DATA_DIR = "%s_user_data_dir" # %s will be replaced by platform name
@@ -46,18 +46,18 @@ MAX_CONCURRENCY_NUM = 1
ENABLE_GET_IMAGES = False
# Whether to enable comment crawling; comments are not crawled by default
ENABLE_GET_COMMENTS = True
ENABLE_GET_COMMENTS = False
# Whether to enable second-level (sub) comment crawling; not enabled by default
# If an older version of the project used db storage, refer to schema/tables.sql line 287 to add the extra table fields
ENABLE_GET_SUB_COMMENTS = True
ENABLE_GET_SUB_COMMENTS = False
# List of Xiaohongshu note IDs to crawl
XHS_SPECIFIED_ID_LIST = [
"6422c2750000000027000d88",
"64ca1b73000000000b028dd2",
"630d5b85000000001203ab41",
"668fe13000000000030241fa", # 图文混合
"668fe13000000000030241fa", # 图文混合
# ........................
]
@@ -93,6 +93,10 @@ TIEBA_SPECIFIED_ID_LIST = [
]
# List of Tieba forum names; posts under these forums will be crawled
TIEBA_NAME_LIST = [
# "盗墓笔记"
]
# List of Xiaohongshu creator IDs to crawl
XHS_CREATOR_ID_LIST = [
@@ -118,19 +122,18 @@ KS_CREATOR_ID_LIST = [
# ........................
]
#Word cloud settings
#Whether to generate a word cloud image from comments
# Word cloud settings
# Whether to generate a word cloud image from comments
ENABLE_GET_WORDCLOUD = False
# Custom words and their groups
#Rule for adding entries: xx:yy, where xx is the custom word/phrase and yy is the group that xx is assigned to.
# Rule for adding entries: xx:yy, where xx is the custom word/phrase and yy is the group that xx is assigned to.
CUSTOM_WORDS = {
'零几': '年份', # treat “零几” as a single token
'高频词': '专业术语' # example custom word
}
#Path to the stop-word (blocked word) file
# Path to the stop-word (blocked word) file
STOP_WORDS_FILE = "./docs/hit_stopwords.txt"
#Path to a Chinese font file
FONT_PATH= "./docs/STZHONGS.TTF"
# Path to a Chinese font file
FONT_PATH = "./docs/STZHONGS.TTF"
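These settings drive the optional comment word-cloud feature; roughly, they could be consumed like this (a minimal sketch assuming the third-party jieba and wordcloud packages; build_comment_wordcloud is a hypothetical helper, not the project's actual implementation):

import jieba
from wordcloud import WordCloud

import config

def build_comment_wordcloud(comments: list, output_path: str = "comment_wordcloud.png") -> None:
    # Hypothetical helper: segment the Chinese comment text, apply the stop-word list,
    # and render the word cloud with the configured Chinese font.
    text = " ".join(jieba.cut(" ".join(comments)))
    with open(config.STOP_WORDS_FILE, "r", encoding="utf-8") as f:
        stop_words = set(line.strip() for line in f if line.strip())
    wc = WordCloud(font_path=config.FONT_PATH, stopwords=stop_words, width=800, height=600)
    wc.generate(text)
    wc.to_file(output_path)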

View File

@@ -1,17 +1,15 @@
import asyncio
import json
import random
from typing import Any, Callable, Dict, List, Optional, Union
from urllib.parse import urlencode
import httpx
from playwright.async_api import BrowserContext
from tenacity import (RetryError, retry, stop_after_attempt,
wait_fixed)
from tenacity import RetryError, retry, stop_after_attempt, wait_fixed
import config
from base.base_crawler import AbstractApiClient
from model.m_baidu_tieba import TiebaNote, TiebaComment
from model.m_baidu_tieba import TiebaComment, TiebaNote
from proxy.proxy_ip_pool import ProxyIpPool
from tools import utils
@@ -103,7 +101,7 @@ class BaiduTieBaClient(AbstractApiClient):
return res
utils.logger.error(f"[BaiduTieBaClient.get] Reached the maximum number of retries; the IP has been blocked. Please switch to a new proxy IP: {e}")
raise e
raise Exception(f"[BaiduTieBaClient.get] Reached the maximum number of retries; the IP has been blocked. Please switch to a new proxy IP: {e}")
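For reference, the retry handling above follows tenacity's decorator pattern (imported at the top of this file); a minimal standalone sketch of that pattern, not the project's exact code:

from tenacity import RetryError, retry, stop_after_attempt, wait_fixed

@retry(stop=stop_after_attempt(3), wait=wait_fixed(1))
async def request_once():
    # Hypothetical request stub: always fails here so the RetryError path below is exercised.
    raise IOError("blocked or failed response")

async def request_with_fallback():
    try:
        return await request_once()
    except RetryError as e:
        # All attempts failed: surface a clearer error instead of the bare RetryError.
        raise Exception("Reached the maximum number of retries; the IP may be blocked") from e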
async def post(self, uri: str, data: dict, **kwargs) -> Dict:
"""
@@ -248,28 +246,44 @@
# raise Exception(f"[BaiduTieBaClient.pong] Cookies is empty, please login first...")
all_sub_comments: List[TiebaComment] = []
for comment in comments:
if comment.sub_comment_count == 0:
for parment_comment in comments:
if parment_comment.sub_comment_count == 0:
continue
current_page = 1
max_sub_page_num = comment.sub_comment_count // 10 + 1
max_sub_page_num = parment_comment.sub_comment_count // 10 + 1
while max_sub_page_num >= current_page:
params = {
"tid": comment.note_id, # 帖子ID
"pid": comment.comment_id, # 父级评论ID
"fid": comment.tieba_id, # 贴吧ID
"tid": parment_comment.note_id, # 帖子ID
"pid": parment_comment.comment_id, # 父级评论ID
"fid": parment_comment.tieba_id, # 贴吧ID
"pn": current_page # 页码
}
page_content = await self.get(uri, params=params, return_ori_content=True)
sub_comments = self._page_extractor.extract_tieba_note_sub_comments(page_content,
parent_comment=comment)
parent_comment=parment_comment)
if not sub_comments:
break
if callback:
await callback(comment.note_id, sub_comments)
await callback(parment_comment.note_id, sub_comments)
all_sub_comments.extend(sub_comments)
await asyncio.sleep(crawl_interval)
current_page += 1
return all_sub_comments
async def get_notes_by_tieba_name(self, tieba_name: str, page_num: int) -> List[TiebaNote]:
"""
Get the post list for a given Tieba forum name
Args:
tieba_name: Tieba forum name
page_num: page number
Returns:
"""
uri = f"/f?kw={tieba_name}&pn={page_num}"
page_content = await self.get(uri, return_ori_content=True)
return self._page_extractor.extract_tieba_note_list(page_content)
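A minimal usage sketch for the method above (illustrative only; it assumes an already initialized, logged-in BaiduTieBaClient named client and reuses the forum name from the config example):

async def demo_list_tieba_notes(client) -> None:
    # Hypothetical usage: fetch two list pages of one forum.
    # The crawler added in this commit advances page_num in steps of 50 (tieba_limit_count).
    first_page = await client.get_notes_by_tieba_name(tieba_name="盗墓笔记", page_num=0)
    next_page = await client.get_notes_by_tieba_name(tieba_name="盗墓笔记", page_num=50)
    for note in first_page + next_page:
        print(note.note_id, note.title, note.note_url)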

View File

@@ -53,6 +53,7 @@ class TieBaCrawler(AbstractCrawler):
if config.CRAWLER_TYPE == "search":
# Search for notes and retrieve their comment information.
await self.search()
await self.get_specified_tieba_notes()
elif config.CRAWLER_TYPE == "detail":
# Get the information and comments of the specified post
await self.get_specified_notes()
@@ -92,7 +93,7 @@
if not notes_list:
utils.logger.info(f"[BaiduTieBaCrawler.search] Search note list is empty")
break
utils.logger.info(f"[BaiduTieBaCrawler.search] Note List: {notes_list}")
utils.logger.info(f"[BaiduTieBaCrawler.search] Note list len: {len(notes_list)}")
await self.get_specified_notes(note_id_list=[note_detail.note_id for note_detail in notes_list])
page += 1
except Exception as ex:
@@ -100,6 +101,34 @@
f"[BaiduTieBaCrawler.search] Search keywords error, current page: {page}, current keyword: {keyword}, err: {ex}")
break
async def get_specified_tieba_notes(self):
"""
Get the information and comments of the specified post by tieba name
Returns:
"""
tieba_limit_count = 50
if config.CRAWLER_MAX_NOTES_COUNT < tieba_limit_count:
config.CRAWLER_MAX_NOTES_COUNT = tieba_limit_count
for tieba_name in config.TIEBA_NAME_LIST:
utils.logger.info(
f"[BaiduTieBaCrawler.get_specified_tieba_notes] Begin get tieba name: {tieba_name}")
page_number = 0
while page_number <= config.CRAWLER_MAX_NOTES_COUNT:
note_list: List[TiebaNote] = await self.tieba_client.get_notes_by_tieba_name(
tieba_name=tieba_name,
page_num=page_number
)
if not note_list:
utils.logger.info(
f"[BaiduTieBaCrawler.get_specified_tieba_notes] Get note list is empty")
break
utils.logger.info(
f"[BaiduTieBaCrawler.get_specified_tieba_notes] tieba name: {tieba_name} note list len: {len(note_list)}")
await self.get_specified_notes([note.note_id for note in note_list])
page_number += tieba_limit_count
async def get_specified_notes(self, note_id_list: List[str] = config.TIEBA_SPECIFIED_ID_LIST):
"""
Get the information and comments of the specified post

View File

@@ -1,13 +1,13 @@
# -*- coding: utf-8 -*-
import re
import json
import html
from typing import List, Dict, Tuple
import json
import re
from typing import Dict, List, Tuple
from parsel import Selector
from model.m_baidu_tieba import TiebaNote, TiebaComment
from constant import baidu_tieba as const
from model.m_baidu_tieba import TiebaComment, TiebaNote
from tools import utils
@@ -43,6 +43,42 @@ class TieBaExtractor:
result.append(tieba_note)
return result
def extract_tieba_note_list(self, page_content: str) -> List[TiebaNote]:
"""
Extract the Tieba post list from a forum page
Args:
page_content:
Returns:
"""
page_content = page_content.replace('<!--', "")
content_selector = Selector(text=page_content)
xpath_selector = "//ul[@id='thread_list']/li"
post_list = content_selector.xpath(xpath_selector)
result: List[TiebaNote] = []
for post_selector in post_list:
post_field_value: Dict = self.extract_data_field_value(post_selector)
if not post_field_value:
continue
note_id = str(post_field_value.get("id"))
tieba_note = TiebaNote(
note_id=note_id,
title=post_selector.xpath(".//a[@class='j_th_tit ']/text()").get(default='').strip(),
desc=post_selector.xpath(".//div[@class='threadlist_abs threadlist_abs_onlyline ']/text()").get(
default='').strip(),
note_url=const.TIEBA_URL + f"/p/{note_id}",
user_link=const.TIEBA_URL + post_selector.xpath(
".//a[@class='frs-author-name j_user_card ']/@href").get(default='').strip(),
user_nickname=post_field_value.get("authoer_nickname") or post_field_value.get("author_name"),
tieba_name=content_selector.xpath("//a[@class='card_title_fname']/text()").get(default='').strip(),
tieba_link=const.TIEBA_URL + content_selector.xpath("//a[@class='card_title_fname']/@href").get(
default=''),
total_replay_num=post_field_value.get("reply_num", 0)
)
result.append(tieba_note)
return result
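As background on the replace('<!--', "") call above: it suggests the thread list can arrive wrapped in HTML comments, which would hide it from the XPath selectors; a tiny standalone parsel illustration with hypothetical markup (not real Tieba HTML):

from parsel import Selector

html = "<div><!--<ul id='thread_list'><li>post</li></ul>--></div>"  # hypothetical markup
hidden = Selector(text=html).xpath("//ul[@id='thread_list']/li/text()").get()
visible = Selector(text=html.replace("<!--", "")).xpath("//ul[@id='thread_list']/li/text()").get()
print(hidden, visible)  # None post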
def extract_note_detail(self, page_content: str) -> TiebaNote:
"""
Extract the details of a Tieba post
@@ -124,8 +160,7 @@
result.append(tieba_comment)
return result
def extract_tieba_note_sub_comments(self,page_content: str, parent_comment: TiebaComment) -> List[TiebaComment]:
def extract_tieba_note_sub_comments(self, page_content: str, parent_comment: TiebaComment) -> List[TiebaComment]:
"""
Extract the second-level (sub) comments of a Tieba post
Args:
@@ -144,7 +179,8 @@
if not comment_value:
continue
comment_user_a_selector = comment_ele.xpath("./a[@class='j_user_card lzl_p_p']")[0]
content = utils.extract_text_from_html(comment_ele.xpath(".//span[@class='lzl_content_main']").get(default=""))
content = utils.extract_text_from_html(
comment_ele.xpath(".//span[@class='lzl_content_main']").get(default=""))
comment = TiebaComment(
comment_id=str(comment_value.get("spid")),
content=content,
@@ -227,6 +263,7 @@ def test_extract_tieba_note_parment_comments():
result = extractor.extract_tieba_note_parment_comments(content, "123456")
print(result)
def test_extract_tieba_note_sub_comments():
with open("test_data/note_sub_comments.html", "r", encoding="utf-8") as f:
content = f.read()
@@ -244,11 +281,21 @@
tieba_id="tieba_id",
tieba_name="tieba_name",
)
result = extractor.extract_tieba_note_sub_comments(content,fake_parment_comment)
result = extractor.extract_tieba_note_sub_comments(content, fake_parment_comment)
print(result)
def test_extract_tieba_note_list():
with open("test_data/tieba_note_list.html", "r", encoding="utf-8") as f:
content = f.read()
extractor = TieBaExtractor()
result = extractor.extract_tieba_note_list(content)
print(result)
pass
if __name__ == '__main__':
# test_extract_search_note_list()
# test_extract_note_detail()
# test_extract_tieba_note_parment_comments()
test_extract_tieba_note_sub_comments()
test_extract_tieba_note_list()

File diff suppressed because one or more lines are too long

View File

@@ -1,5 +1,6 @@
# -*- coding: utf-8 -*-
from typing import Optional
from pydantic import BaseModel, Field

View File

@@ -1,7 +1,8 @@
# -*- coding: utf-8 -*-
from typing import List
from model.m_baidu_tieba import TiebaNote, TiebaComment
from model.m_baidu_tieba import TiebaComment, TiebaNote
from . import tieba_store_impl
from .tieba_store_impl import *

View File

@@ -14,6 +14,7 @@ from PIL import Image, ImageDraw
from playwright.async_api import Cookie, Page
from proxy import IpInfoModel
from . import utils

View File

@@ -10,7 +10,7 @@ def init_loging_config():
level = logging.INFO
logging.basicConfig(
level=level,
format="%(asctime)s [%(threadName)s] %(name)s %(levelname)s (%(filename)s:%(lineno)d) - %(message)s",
format="%(asctime)s %(name)s %(levelname)s (%(filename)s:%(lineno)d) - %(message)s",
datefmt='%Y-%m-%d %H:%M:%S'
)
_logger = logging.getLogger("MediaCrawler")