feat: Baidu Tieba done

Relakkes committed 2024-08-08 14:19:32 +08:00
parent df0f5c1113
commit 3f42368c02
10 changed files with 3800 additions and 39 deletions

View File

@@ -7,7 +7,7 @@
> Click for a more detailed disclaimer. [Jump to it](#disclaimer)
# Repository Description
- **Xiaohongshu crawler**, **Douyin crawler**, **Kuaishou crawler**, **Bilibili crawler**, **Weibo crawler**...
+ **Xiaohongshu crawler**, **Douyin crawler**, **Kuaishou crawler**, **Bilibili crawler**, **Weibo crawler**, **Baidu Tieba**...
It can currently crawl videos, images, comments, likes and reposts from Xiaohongshu, Douyin, Kuaishou, Bilibili and Weibo.
How it works: [playwright](https://playwright.dev/) keeps the browser context alive after a successful login, and encrypted request parameters are obtained by executing JS expressions in that context.
@@ -22,6 +22,7 @@
| Kuaishou | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| Bilibili | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| Weibo | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ |
+ | Tieba | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ |
## Usage
@@ -99,14 +100,51 @@
- [MediaCrawler: refactoring the project cache with an abstract-class design](https://articles.zsxq.com/id_4ju73oxewt9j.html)
- [Build your own IP proxy pool, step by step](https://articles.zsxq.com/id_38fza371ladm.html)
## Thanks to the following sponsors
- <a href="https://sider.ai/ad-land-redirect?source=github&p1=mi&p2=kk">Register and install this free Sider ChatGPT browser extension and I earn a small reward 💰. I have used it for over half a year; it is one of the most popular extensions on the Chrome Web Store and the experience is excellent.</a>
> Keep the extension installed and registered for one day and I receive a 3 CNY referral reward. Thank you all for supporting my open-source work.
Become a sponsor and show your product here. Contact the author on WeChat: wxyzglan
+ ## Donate
+ If you find this project useful, a tip is welcome. Your support is my biggest motivation!
+ You can leave a name with your tip and I will add you to the donor list.
+ <p>
+     <img alt="Donate - WeChat Pay" src="static/images/wechat_pay.jpeg" style="width: 200px;margin-right: 140px;" />
+     <img alt="Donate - Alipay" src="static/images/zfb_pay.png" style="width: 200px" />
+ </p>
+ ## Donor List
+ P.S. Please include a name when tipping. If you are missing from the list, contact me and I will add you; with many messages some may slip through, sorry about that.
+ | Donor       | Amount  | Date       |
+ |-------------|---------|------------|
+ | *皓         | 50 CNY  | 2024-03-18 |
+ | *刚         | 50 CNY  | 2024-03-18 |
+ | *乐         | 20 CNY  | 2024-03-17 |
+ | *木         | 20 CNY  | 2024-03-17 |
+ | *诚         | 20 CNY  | 2024-03-17 |
+ | Strem Gamer | 20 CNY  | 2024-03-16 |
+ | *鑫         | 20 CNY  | 2024-03-14 |
+ | Yuzu        | 20 CNY  | 2024-03-07 |
+ | **宁        | 100 CNY | 2024-03-03 |
+ | **媛        | 20 CNY  | 2024-03-03 |
+ | Scarlett    | 20 CNY  | 2024-02-16 |
+ | Asun        | 20 CNY  | 2024-01-30 |
+ | 何*         | 100 CNY | 2024-01-21 |
+ | allen       | 20 CNY  | 2024-01-10 |
+ | llllll      | 20 CNY  | 2024-01-07 |
+ | 邝*元       | 20 CNY  | 2023-12-29 |
+ | 50chen      | 50 CNY  | 2023-12-22 |
+ | xiongot     | 20 CNY  | 2023-12-17 |
+ | atom.hu     | 20 CNY  | 2023-12-16 |
+ | 一呆        | 20 CNY  | 2023-12-01 |
+ | 坠落        | 50 CNY  | 2023-11-08 |
## MediaCrawler Project Chat Group
> Scan my personal WeChat below and add the note "github" to be invited into the MediaCrawler project chat group (please be sure to include the note "github"; a WeChat assistant adds you to the group automatically).

View File

@@ -28,7 +28,7 @@ HEADLESS = False
SAVE_LOGIN_STATE = True
# Data storage option; three types are supported: csv, db, json
- SAVE_DATA_OPTION = "db"  # csv or db or json
+ SAVE_DATA_OPTION = "csv"  # csv or db or json
# Browser user-data directory used to cache the login state
USER_DATA_DIR = "%s_user_data_dir"  # %s will be replaced by platform name
@@ -46,11 +46,11 @@ MAX_CONCURRENCY_NUM = 1
ENABLE_GET_IMAGES = False
# Whether to crawl comments (disabled by default)
- ENABLE_GET_COMMENTS = True
+ ENABLE_GET_COMMENTS = False
# Whether to crawl second-level (sub) comments (disabled by default)
# If an older deployment uses db storage, add the extra table columns as described in schema/tables.sql line 287
- ENABLE_GET_SUB_COMMENTS = True
+ ENABLE_GET_SUB_COMMENTS = False
# List of Xiaohongshu note IDs to crawl
XHS_SPECIFIED_ID_LIST = [
@@ -93,6 +93,10 @@ TIEBA_SPECIFIED_ID_LIST = [
]
+ # List of Tieba forum names; posts under each forum will be crawled
+ TIEBA_NAME_LIST = [
+     # "盗墓笔记"
+ ]
# List of Xiaohongshu creator IDs
XHS_CREATOR_ID_LIST = [
@@ -118,19 +122,18 @@ KS_CREATOR_ID_LIST = [
    # ........................
]
- #Word-cloud settings
- #Whether to generate a word cloud from comments
+ # Word-cloud settings
+ # Whether to generate a word cloud from comments
ENABLE_GET_WORDCLOUD = False
# Custom words and their groups
- #Rule: xx:yy, where xx is the custom word/phrase and yy is the name of the group xx is assigned to
+ # Rule: xx:yy, where xx is the custom word/phrase and yy is the name of the group xx is assigned to
CUSTOM_WORDS = {
    '零几': '年份',  # treat "零几" as a single token
    '高频词': '专业术语'  # example custom word
}
- #Path of the stop-word file
+ # Path of the stop-word file
STOP_WORDS_FILE = "./docs/hit_stopwords.txt"
- #Path of the Chinese font file
+ # Path of the Chinese font file
- FONT_PATH= "./docs/STZHONGS.TTF"
+ FONT_PATH = "./docs/STZHONGS.TTF"
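For orientation, a minimal sketch (not the project's actual implementation; the helper name `build_comment_wordcloud` and its `comments` argument are hypothetical) of how the word-cloud settings above could be consumed with `jieba` and the `wordcloud` package, assuming the caller checks `ENABLE_GET_WORDCLOUD` first:

```python
import jieba
from wordcloud import WordCloud

import config


def build_comment_wordcloud(comments, output_path="comment_wordcloud.png"):
    """Hypothetical helper: turn a list of comment strings into a word-cloud image."""
    # Keep the configured custom words as single tokens during segmentation.
    for word in config.CUSTOM_WORDS:
        jieba.add_word(word)

    # Load stop words, one per line.
    with open(config.STOP_WORDS_FILE, "r", encoding="utf-8") as f:
        stop_words = {line.strip() for line in f if line.strip()}

    # Segment all comments and drop stop words and whitespace tokens.
    tokens = [
        token
        for comment in comments
        for token in jieba.lcut(comment)
        if token.strip() and token not in stop_words
    ]

    # Render with the configured Chinese font so CJK glyphs display correctly.
    wc = WordCloud(font_path=config.FONT_PATH, width=800, height=600, background_color="white")
    wc.generate(" ".join(tokens))
    wc.to_file(output_path)
```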

View File

@@ -1,17 +1,15 @@
import asyncio
import json
- import random
from typing import Any, Callable, Dict, List, Optional, Union
from urllib.parse import urlencode

import httpx
from playwright.async_api import BrowserContext
- from tenacity import (RetryError, retry, stop_after_attempt,
-                       wait_fixed)
+ from tenacity import RetryError, retry, stop_after_attempt, wait_fixed

import config
from base.base_crawler import AbstractApiClient
- from model.m_baidu_tieba import TiebaNote, TiebaComment
+ from model.m_baidu_tieba import TiebaComment, TiebaNote
from proxy.proxy_ip_pool import ProxyIpPool
from tools import utils
@@ -103,7 +101,7 @@ class BaiduTieBaClient(AbstractApiClient):
                return res
            utils.logger.error(f"[BaiduTieBaClient.get] 达到了最大重试次数IP已经被Block请尝试更换新的IP代理: {e}")
-            raise e
+            raise Exception(f"[BaiduTieBaClient.get] 达到了最大重试次数IP已经被Block请尝试更换新的IP代理: {e}")

    async def post(self, uri: str, data: dict, **kwargs) -> Dict:
        """
@@ -248,28 +246,44 @@
        # raise Exception(f"[BaiduTieBaClient.pong] Cookies is empty, please login first...")
        all_sub_comments: List[TiebaComment] = []
-        for comment in comments:
-            if comment.sub_comment_count == 0:
+        for parment_comment in comments:
+            if parment_comment.sub_comment_count == 0:
                continue
            current_page = 1
-            max_sub_page_num = comment.sub_comment_count // 10 + 1
+            max_sub_page_num = parment_comment.sub_comment_count // 10 + 1
            while max_sub_page_num >= current_page:
                params = {
-                    "tid": comment.note_id,  # post ID
-                    "pid": comment.comment_id,  # parent comment ID
-                    "fid": comment.tieba_id,  # forum ID
+                    "tid": parment_comment.note_id,  # post ID
+                    "pid": parment_comment.comment_id,  # parent comment ID
+                    "fid": parment_comment.tieba_id,  # forum ID
                    "pn": current_page  # page number
                }
                page_content = await self.get(uri, params=params, return_ori_content=True)
                sub_comments = self._page_extractor.extract_tieba_note_sub_comments(page_content,
-                                                                                    parent_comment=comment)
+                                                                                    parent_comment=parment_comment)
                if not sub_comments:
                    break
                if callback:
-                    await callback(comment.note_id, sub_comments)
+                    await callback(parment_comment.note_id, sub_comments)
                all_sub_comments.extend(sub_comments)
                await asyncio.sleep(crawl_interval)
                current_page += 1
        return all_sub_comments

+    async def get_notes_by_tieba_name(self, tieba_name: str, page_num: int) -> List[TiebaNote]:
+        """
+        Get the list of posts for the given Tieba forum name
+        Args:
+            tieba_name: Tieba forum name
+            page_num: pagination offset (pn)
+
+        Returns:
+        """
+        uri = f"/f?kw={tieba_name}&pn={page_num}"
+        page_content = await self.get(uri, return_ori_content=True)
+        return self._page_extractor.extract_tieba_note_list(page_content)
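A hedged usage sketch for the new `get_notes_by_tieba_name` API. The import path and the already-constructed `client` instance are assumptions (login, cookies and proxy setup are omitted); the forum listing paginates by post offset, which is why the offset steps by 50 here, matching the `tieba_limit_count` used in the crawler later in this commit:

```python
import asyncio
from typing import List

# Hypothetical import path; adjust to wherever BaiduTieBaClient lives in the repo.
from media_platform.tieba.client import BaiduTieBaClient
from model.m_baidu_tieba import TiebaNote


async def list_forum_posts(client: BaiduTieBaClient, tieba_name: str, max_pages: int = 3) -> List[TiebaNote]:
    """Collect post summaries from the first few pages of one Tieba forum."""
    collected: List[TiebaNote] = []
    for page_index in range(max_pages):
        # The forum page paginates by post offset: pn = 0, 50, 100, ...
        notes = await client.get_notes_by_tieba_name(tieba_name=tieba_name, page_num=page_index * 50)
        if not notes:
            break
        collected.extend(notes)
        await asyncio.sleep(1)  # be polite between page requests
    return collected
```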

View File

@@ -53,6 +53,7 @@ class TieBaCrawler(AbstractCrawler):
            if config.CRAWLER_TYPE == "search":
                # Search for notes and retrieve their comment information.
                await self.search()
+                await self.get_specified_tieba_notes()
            elif config.CRAWLER_TYPE == "detail":
                # Get the information and comments of the specified post
                await self.get_specified_notes()
@@ -92,7 +93,7 @@ class TieBaCrawler(AbstractCrawler):
                if not notes_list:
                    utils.logger.info(f"[BaiduTieBaCrawler.search] Search note list is empty")
                    break
-                utils.logger.info(f"[BaiduTieBaCrawler.search] Note List: {notes_list}")
+                utils.logger.info(f"[BaiduTieBaCrawler.search] Note list len: {len(notes_list)}")
                await self.get_specified_notes(note_id_list=[note_detail.note_id for note_detail in notes_list])
                page += 1
            except Exception as ex:
@@ -100,6 +101,34 @@
                    f"[BaiduTieBaCrawler.search] Search keywords error, current page: {page}, current keyword: {keyword}, err: {ex}")
                break

+    async def get_specified_tieba_notes(self):
+        """
+        Get the information and comments of the specified post by tieba name
+        Returns:
+        """
+        tieba_limit_count = 50
+        if config.CRAWLER_MAX_NOTES_COUNT < tieba_limit_count:
+            config.CRAWLER_MAX_NOTES_COUNT = tieba_limit_count
+        for tieba_name in config.TIEBA_NAME_LIST:
+            utils.logger.info(
+                f"[BaiduTieBaCrawler.get_specified_tieba_notes] Begin get tieba name: {tieba_name}")
+            page_number = 0
+            while page_number <= config.CRAWLER_MAX_NOTES_COUNT:
+                note_list: List[TiebaNote] = await self.tieba_client.get_notes_by_tieba_name(
+                    tieba_name=tieba_name,
+                    page_num=page_number
+                )
+                if not note_list:
+                    utils.logger.info(
+                        f"[BaiduTieBaCrawler.get_specified_tieba_notes] Get note list is empty")
+                    break
+                utils.logger.info(
+                    f"[BaiduTieBaCrawler.get_specified_tieba_notes] tieba name: {tieba_name} note list len: {len(note_list)}")
+                await self.get_specified_notes([note.note_id for note in note_list])
+                page_number += tieba_limit_count

    async def get_specified_notes(self, note_id_list: List[str] = config.TIEBA_SPECIFIED_ID_LIST):
        """
        Get the information and comments of the specified post
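A quick, self-contained illustration of the pagination in `get_specified_tieba_notes` above, using an assumed `CRAWLER_MAX_NOTES_COUNT` of 120 (not a value from this commit):

```python
# Mirrors the loop above: start at pn=0 and step by tieba_limit_count (50)
# while pn <= CRAWLER_MAX_NOTES_COUNT. With an assumed limit of 120 this
# requests pn = 0, 50 and 100, i.e. up to 150 post summaries per forum.
tieba_limit_count = 50
crawler_max_notes_count = 120  # assumed config value for the example

requested_offsets = list(range(0, crawler_max_notes_count + 1, tieba_limit_count))
assert requested_offsets == [0, 50, 100]
```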

View File

@@ -1,13 +1,13 @@
# -*- coding: utf-8 -*-
- import re
- import json
- import html
- from typing import List, Dict, Tuple
+ import html
+ import json
+ import re
+ from typing import Dict, List, Tuple

from parsel import Selector

- from model.m_baidu_tieba import TiebaNote, TiebaComment
- from constant import baidu_tieba as const
+ from constant import baidu_tieba as const
+ from model.m_baidu_tieba import TiebaComment, TiebaNote
from tools import utils
@@ -43,6 +43,42 @@ class TieBaExtractor:
            result.append(tieba_note)
        return result

+    def extract_tieba_note_list(self, page_content: str) -> List[TiebaNote]:
+        """
+        Extract the list of posts from a Tieba forum page
+        Args:
+            page_content:
+
+        Returns:
+        """
+        page_content = page_content.replace('<!--', "")
+        content_selector = Selector(text=page_content)
+        xpath_selector = "//ul[@id='thread_list']/li"
+        post_list = content_selector.xpath(xpath_selector)
+        result: List[TiebaNote] = []
+        for post_selector in post_list:
+            post_field_value: Dict = self.extract_data_field_value(post_selector)
+            if not post_field_value:
+                continue
+            note_id = str(post_field_value.get("id"))
+            tieba_note = TiebaNote(
+                note_id=note_id,
+                title=post_selector.xpath(".//a[@class='j_th_tit ']/text()").get(default='').strip(),
+                desc=post_selector.xpath(".//div[@class='threadlist_abs threadlist_abs_onlyline ']/text()").get(
+                    default='').strip(),
+                note_url=const.TIEBA_URL + f"/p/{note_id}",
+                user_link=const.TIEBA_URL + post_selector.xpath(
+                    ".//a[@class='frs-author-name j_user_card ']/@href").get(default='').strip(),
+                user_nickname=post_field_value.get("authoer_nickname") or post_field_value.get("author_name"),
+                tieba_name=content_selector.xpath("//a[@class='card_title_fname']/text()").get(default='').strip(),
+                tieba_link=const.TIEBA_URL + content_selector.xpath("//a[@class='card_title_fname']/@href").get(
+                    default=''),
+                total_replay_num=post_field_value.get("reply_num", 0)
+            )
+            result.append(tieba_note)
+        return result

    def extract_note_detail(self, page_content: str) -> TiebaNote:
        """
        Extract the details of a Tieba post
@@ -124,8 +160,7 @@ class TieBaExtractor:
            result.append(tieba_comment)
        return result

-    def extract_tieba_note_sub_comments(self,page_content: str, parent_comment: TiebaComment) -> List[TiebaComment]:
+    def extract_tieba_note_sub_comments(self, page_content: str, parent_comment: TiebaComment) -> List[TiebaComment]:
        """
        Extract the second-level (sub) comments of a Tieba post
        Args:
@@ -144,7 +179,8 @@
            if not comment_value:
                continue
            comment_user_a_selector = comment_ele.xpath("./a[@class='j_user_card lzl_p_p']")[0]
-            content = utils.extract_text_from_html(comment_ele.xpath(".//span[@class='lzl_content_main']").get(default=""))
+            content = utils.extract_text_from_html(
+                comment_ele.xpath(".//span[@class='lzl_content_main']").get(default=""))
            comment = TiebaComment(
                comment_id=str(comment_value.get("spid")),
                content=content,
@@ -227,6 +263,7 @@ def test_extract_tieba_note_parment_comments():
    result = extractor.extract_tieba_note_parment_comments(content, "123456")
    print(result)

def test_extract_tieba_note_sub_comments():
    with open("test_data/note_sub_comments.html", "r", encoding="utf-8") as f:
        content = f.read()
@@ -244,11 +281,21 @@ def test_extract_tieba_note_sub_comments():
        tieba_id="tieba_id",
        tieba_name="tieba_name",
    )
-    result = extractor.extract_tieba_note_sub_comments(content,fake_parment_comment)
+    result = extractor.extract_tieba_note_sub_comments(content, fake_parment_comment)
    print(result)

+def test_extract_tieba_note_list():
+    with open("test_data/tieba_note_list.html", "r", encoding="utf-8") as f:
+        content = f.read()
+    extractor = TieBaExtractor()
+    result = extractor.extract_tieba_note_list(content)
+    print(result)
+    pass

if __name__ == '__main__':
    # test_extract_search_note_list()
    # test_extract_note_detail()
    # test_extract_tieba_note_parment_comments()
-    test_extract_tieba_note_sub_comments()
+    test_extract_tieba_note_list()
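For context, a minimal self-contained sketch of the extraction idea behind `extract_tieba_note_list` above: each post on a Tieba forum page sits in an `<li>` whose `data-field` attribute carries JSON with the post id, reply count and author, which is presumably what `extract_data_field_value` reads. The HTML below is a simplified stand-in, not real Tieba markup:

```python
import json

from parsel import Selector

SAMPLE = """
<ul id="thread_list">
  <li data-field='{"id": 123456, "reply_num": 7, "author_name": "someone"}'>
    <a class="j_th_tit ">A sample post title</a>
  </li>
</ul>
"""

selector = Selector(text=SAMPLE)
for li in selector.xpath("//ul[@id='thread_list']/li"):
    # The data-field attribute holds the structured post metadata as JSON.
    field = json.loads(li.attrib.get("data-field", "{}"))
    title = li.xpath(".//a[@class='j_th_tit ']/text()").get(default="").strip()
    print(field.get("id"), field.get("reply_num"), title)
```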

File diff suppressed because one or more lines are too long

View File

@@ -1,5 +1,6 @@
# -*- coding: utf-8 -*-
from typing import Optional

from pydantic import BaseModel, Field

View File

@@ -1,7 +1,8 @@
# -*- coding: utf-8 -*-
from typing import List

- from model.m_baidu_tieba import TiebaNote, TiebaComment
+ from model.m_baidu_tieba import TiebaComment, TiebaNote
from . import tieba_store_impl
from .tieba_store_impl import *

View File

@@ -14,6 +14,7 @@ from PIL import Image, ImageDraw
from playwright.async_api import Cookie, Page

from proxy import IpInfoModel

from . import utils

View File

@@ -10,7 +10,7 @@ def init_loging_config():
    level = logging.INFO
    logging.basicConfig(
        level=level,
-        format="%(asctime)s [%(threadName)s] %(name)s %(levelname)s (%(filename)s:%(lineno)d) - %(message)s",
+        format="%(asctime)s %(name)s %(levelname)s (%(filename)s:%(lineno)d) - %(message)s",
        datefmt='%Y-%m-%d %H:%M:%S'
    )
    _logger = logging.getLogger("MediaCrawler")
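For illustration, with the `[%(threadName)s]` field dropped a log line now looks roughly like the sample below (the shared logger is exposed as `tools.utils.logger` elsewhere in the repo; the file name and line number are examples only):

```python
from tools import utils  # the project-wide logger configured above

# Emits something like:
# 2024-08-08 14:19:32 MediaCrawler INFO (core.py:55) - [BaiduTieBaCrawler] tieba crawler started
utils.logger.info("[BaiduTieBaCrawler] tieba crawler started")
```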