feat: Baidu Tieba done

parent df0f5c1113
commit 3f42368c02

README.md: 44 lines changed
@@ -7,7 +7,7 @@
 > Click here for the more detailed disclaimer. [Jump to it](#disclaimer)
 # Repository Description
 
-**Xiaohongshu crawler**, **Douyin crawler**, **Kuaishou crawler**, **Bilibili crawler**, **Weibo crawler**...
+**Xiaohongshu crawler**, **Douyin crawler**, **Kuaishou crawler**, **Bilibili crawler**, **Weibo crawler**, **Baidu Tieba**...
 
 Currently it can crawl videos, images, comments, likes, reposts, and other data from Xiaohongshu, Douyin, Kuaishou, Bilibili, and Weibo.
 
 How it works: it uses [playwright](https://playwright.dev/) as a bridge, keeps the browser context alive after a successful login, and obtains certain encrypted parameters by executing JS expressions.
@@ -22,6 +22,7 @@
 | Kuaishou | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
 | Bilibili | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
 | Weibo    | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ |
+| Tieba    | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ |
 
 
 ## Usage
@@ -99,14 +100,51 @@
 - [MediaCrawler: refactoring the project cache on an abstract-class design](https://articles.zsxq.com/id_4ju73oxewt9j.html)
 - [A hands-on guide to building your own IP proxy pool](https://articles.zsxq.com/id_38fza371ladm.html)
 
 
 ## Thanks to the following Sponsors for supporting this repository
 - <a href="https://sider.ai/ad-land-redirect?source=github&p1=mi&p2=kk">Registering and installing this free Sider ChatGPT plugin earns me a small reward 💰. I have used the plugin for over half a year; as one of the most popular plugins on Google, the experience is very good.</a>
 > Keep the browser plugin installed and registered for one day and I receive a 3 RMB referral reward. Thank you all for supporting my continued open-source work.
 
 Become a sponsor and show your product here; contact the author on WeChat: yzglan
 
+## Tipping
+
+If you find this project useful, feel free to leave a tip. Your support is my greatest motivation!
+
+When tipping, you can add your name in the remark and I will add you to the tipping list.
+<p>
+    <img alt="Tip via WeChat" src="static/images/wechat_pay.jpeg" style="width: 200px;margin-right: 140px;" />
+    <img alt="Tip via Alipay" src="static/images/zfb_pay.png" style="width: 200px" />
+</p>
+
+## Donation Details
+
+PS: Please include a donor name in the remark when tipping; if yours is missing, contact me and I will add it (with many messages some can slip through, my apologies).
+
+| Donor       | Amount  | Date       |
+|-------------|---------|------------|
+| *皓         | 50 RMB  | 2024-03-18 |
+| *刚         | 50 RMB  | 2024-03-18 |
+| *乐         | 20 RMB  | 2024-03-17 |
+| *木         | 20 RMB  | 2024-03-17 |
+| *诚         | 20 RMB  | 2024-03-17 |
+| Strem Gamer | 20 RMB  | 2024-03-16 |
+| *鑫         | 20 RMB  | 2024-03-14 |
+| Yuzu        | 20 RMB  | 2024-03-07 |
+| **宁        | 100 RMB | 2024-03-03 |
+| **媛        | 20 RMB  | 2024-03-03 |
+| Scarlett    | 20 RMB  | 2024-02-16 |
+| Asun        | 20 RMB  | 2024-01-30 |
+| 何*         | 100 RMB | 2024-01-21 |
+| allen       | 20 RMB  | 2024-01-10 |
+| llllll      | 20 RMB  | 2024-01-07 |
+| 邝*元       | 20 RMB  | 2023-12-29 |
+| 50chen      | 50 RMB  | 2023-12-22 |
+| xiongot     | 20 RMB  | 2023-12-17 |
+| atom.hu     | 20 RMB  | 2023-12-16 |
+| 一呆        | 20 RMB  | 2023-12-01 |
+| 坠落        | 50 RMB  | 2023-11-08 |
 
 
 ## MediaCrawler crawler project chat group:
 > Scan my personal WeChat below with the note "github" to be pulled into the MediaCrawler project chat group (please be sure to include the note "github"; a WeChat assistant will pull you into the group automatically)
@@ -28,7 +28,7 @@ HEADLESS = False
 SAVE_LOGIN_STATE = True
 
 # Data save option; three types are supported: csv, db, json
-SAVE_DATA_OPTION = "db"  # csv or db or json
+SAVE_DATA_OPTION = "csv"  # csv or db or json
 
 # Directory config for the user's cached browser files
 USER_DATA_DIR = "%s_user_data_dir"  # %s will be replaced by platform name
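Since this hunk flips the default save target from db to csv, here is a minimal sketch of how a flag like SAVE_DATA_OPTION is typically dispatched to a storage backend. CsvStore/DbStore/JsonStore are illustrative stand-ins, not the project's actual store implementations.

    # Hedged sketch: the store classes are hypothetical placeholders.
    SAVE_DATA_OPTION = "csv"  # csv or db or json

    class CsvStore: ...
    class DbStore: ...
    class JsonStore: ...

    def make_store():
        stores = {"csv": CsvStore, "db": DbStore, "json": JsonStore}
        if SAVE_DATA_OPTION not in stores:
            raise ValueError(f"unsupported SAVE_DATA_OPTION: {SAVE_DATA_OPTION}")
        return stores[SAVE_DATA_OPTION]()

    print(type(make_store()).__name__)  # CsvStore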
@@ -46,11 +46,11 @@ MAX_CONCURRENCY_NUM = 1
 ENABLE_GET_IMAGES = False
 
 # Whether to crawl comments; disabled by default
-ENABLE_GET_COMMENTS = True
+ENABLE_GET_COMMENTS = False
 
 # Whether to crawl second-level (sub-)comments; disabled by default
 # If an older version of the project used db, add the table fields per schema/tables.sql line 287
-ENABLE_GET_SUB_COMMENTS = True
+ENABLE_GET_SUB_COMMENTS = False
 
 # List of specified Xiaohongshu note IDs to crawl
 XHS_SPECIFIED_ID_LIST = [
@@ -93,6 +93,10 @@ TIEBA_SPECIFIED_ID_LIST = [
 
 ]
 
+# List of Tieba forum names; posts under these forums will be crawled
+TIEBA_NAME_LIST = [
+    # "盗墓笔记"
+]
 
 # List of specified Xiaohongshu creator IDs
 XHS_CREATOR_ID_LIST = [
@@ -118,19 +122,18 @@ KS_CREATOR_ID_LIST = [
 # ........................
 ]
 
-
-#Word cloud settings
-#Whether to generate a word cloud image from comments
+# Word cloud settings
+# Whether to generate a word cloud image from comments
 ENABLE_GET_WORDCLOUD = False
 # Custom words and their groups
-#Rule for adding entries - xx:yy, where xx is the custom phrase and yy is the name of the group xx is assigned to.
+# Rule for adding entries - xx:yy, where xx is the custom phrase and yy is the name of the group xx is assigned to.
 CUSTOM_WORDS = {
     '零几': '年份',  # treat "零几" as a single token
     '高频词': '专业术语'  # example custom word
 }
 
-#Path to the stop-word (blocked-word) file
+# Path to the stop-word (blocked-word) file
 STOP_WORDS_FILE = "./docs/hit_stopwords.txt"
 
-#Path to a Chinese font file
-FONT_PATH= "./docs/STZHONGS.TTF"
+# Path to a Chinese font file
+FONT_PATH = "./docs/STZHONGS.TTF"
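For context on how settings like these are usually consumed, a minimal sketch assuming the third-party wordcloud package and that the configured font and stop-word files exist; the project's own word-cloud code is not part of this diff.

    # Hedged sketch, not the project's code: builds a word cloud from one string.
    from wordcloud import WordCloud

    STOP_WORDS_FILE = "./docs/hit_stopwords.txt"
    FONT_PATH = "./docs/STZHONGS.TTF"

    with open(STOP_WORDS_FILE, encoding="utf-8") as f:
        stop_words = {line.strip() for line in f}

    wc = WordCloud(font_path=FONT_PATH, stopwords=stop_words,
                   width=800, height=600, background_color="white")
    wc.generate("sample comment text to visualize")
    wc.to_file("wordcloud.png")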
@@ -1,17 +1,15 @@
 import asyncio
 import json
-import random
 from typing import Any, Callable, Dict, List, Optional, Union
 from urllib.parse import urlencode
 
 import httpx
 from playwright.async_api import BrowserContext
-from tenacity import (RetryError, retry, stop_after_attempt,
-                      wait_fixed)
+from tenacity import RetryError, retry, stop_after_attempt, wait_fixed
 
 import config
 from base.base_crawler import AbstractApiClient
-from model.m_baidu_tieba import TiebaNote, TiebaComment
+from model.m_baidu_tieba import TiebaComment, TiebaNote
 from proxy.proxy_ip_pool import ProxyIpPool
 from tools import utils
 
@@ -103,7 +101,7 @@ class BaiduTieBaClient(AbstractApiClient):
                 return res
 
         utils.logger.error(f"[BaiduTieBaClient.get] Maximum retries reached and the IP has been blocked; please switch to a new proxy IP: {e}")
-        raise e
+        raise Exception(f"[BaiduTieBaClient.get] Maximum retries reached and the IP has been blocked; please switch to a new proxy IP: {e}")
 
     async def post(self, uri: str, data: dict, **kwargs) -> Dict:
         """
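The get method above wraps its request in tenacity's retry decorator (see the imports hunk earlier); once retries are exhausted tenacity raises RetryError, which this commit now converts into a single descriptive Exception instead of re-raising. A self-contained sketch of that pattern, with illustrative attempt counts rather than the project's exact values:

    # Sketch of the tenacity retry pattern; 3 attempts / 1s wait are examples.
    from tenacity import RetryError, retry, stop_after_attempt, wait_fixed

    @retry(stop=stop_after_attempt(3), wait=wait_fixed(1))
    def fetch():
        raise ConnectionError("simulated blocked IP")

    try:
        fetch()
    except RetryError as e:
        # Surface one clear error after retries are exhausted, mirroring the diff.
        print(f"max retries reached, the IP may be blocked, switch proxy: {e}")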
@@ -248,28 +246,44 @@ class BaiduTieBaClient(AbstractApiClient):
         # raise Exception(f"[BaiduTieBaClient.pong] Cookies is empty, please login first...")
 
         all_sub_comments: List[TiebaComment] = []
-        for comment in comments:
-            if comment.sub_comment_count == 0:
+        for parent_comment in comments:
+            if parent_comment.sub_comment_count == 0:
                 continue
 
             current_page = 1
-            max_sub_page_num = comment.sub_comment_count // 10 + 1
+            max_sub_page_num = parent_comment.sub_comment_count // 10 + 1
             while max_sub_page_num >= current_page:
                 params = {
-                    "tid": comment.note_id,  # post ID
-                    "pid": comment.comment_id,  # parent comment ID
-                    "fid": comment.tieba_id,  # Tieba forum ID
+                    "tid": parent_comment.note_id,  # post ID
+                    "pid": parent_comment.comment_id,  # parent comment ID
+                    "fid": parent_comment.tieba_id,  # Tieba forum ID
                     "pn": current_page  # page number
                 }
                 page_content = await self.get(uri, params=params, return_ori_content=True)
                 sub_comments = self._page_extractor.extract_tieba_note_sub_comments(page_content,
-                                                                                    parent_comment=comment)
+                                                                                    parent_comment=parent_comment)
 
                 if not sub_comments:
                     break
                 if callback:
-                    await callback(comment.note_id, sub_comments)
+                    await callback(parent_comment.note_id, sub_comments)
                 all_sub_comments.extend(sub_comments)
                 await asyncio.sleep(crawl_interval)
                 current_page += 1
         return all_sub_comments
+
+
+    async def get_notes_by_tieba_name(self, tieba_name: str, page_num: int) -> List[TiebaNote]:
+        """
+        Get the list of posts for the given Tieba forum name
+        Args:
+            tieba_name: Tieba forum name
+            page_num: page offset
+
+        Returns:
+            List[TiebaNote]: parsed post list
+        """
+        uri = f"/f?kw={tieba_name}&pn={page_num}"
+        page_content = await self.get(uri, return_ori_content=True)
+        return self._page_extractor.extract_tieba_note_list(page_content)
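One detail in the loop above: max_sub_page_num = sub_comment_count // 10 + 1 assumes ten sub-comments per page and rounds the page count up, overshooting by one page when the count is an exact multiple of ten; the `if not sub_comments: break` guard then ends the loop on the empty page. A quick check of the arithmetic:

    # Page-count arithmetic from the loop above (10 per page is the assumption).
    for count in (1, 9, 10, 11, 25, 30):
        print(count, "->", count // 10 + 1)
    # 1 -> 1, 9 -> 1, 10 -> 2, 11 -> 2, 25 -> 3, 30 -> 4
    # 10 -> 2 and 30 -> 4 overshoot; the empty-page break compensates.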
@@ -53,6 +53,7 @@ class TieBaCrawler(AbstractCrawler):
             if config.CRAWLER_TYPE == "search":
                 # Search for notes and retrieve their comment information.
                 await self.search()
+                await self.get_specified_tieba_notes()
             elif config.CRAWLER_TYPE == "detail":
                 # Get the information and comments of the specified post
                 await self.get_specified_notes()
@@ -92,7 +93,7 @@ class TieBaCrawler(AbstractCrawler):
                 if not notes_list:
                     utils.logger.info(f"[BaiduTieBaCrawler.search] Search note list is empty")
                     break
-                utils.logger.info(f"[BaiduTieBaCrawler.search] Note List: {notes_list}")
+                utils.logger.info(f"[BaiduTieBaCrawler.search] Note list len: {len(notes_list)}")
                 await self.get_specified_notes(note_id_list=[note_detail.note_id for note_detail in notes_list])
                 page += 1
             except Exception as ex:
@@ -100,6 +101,34 @@ class TieBaCrawler(AbstractCrawler):
                     f"[BaiduTieBaCrawler.search] Search keywords error, current page: {page}, current keyword: {keyword}, err: {ex}")
                 break
 
+    async def get_specified_tieba_notes(self):
+        """
+        Get the information and comments of the specified post by tieba name
+        Returns:
+
+        """
+        tieba_limit_count = 50
+        if config.CRAWLER_MAX_NOTES_COUNT < tieba_limit_count:
+            config.CRAWLER_MAX_NOTES_COUNT = tieba_limit_count
+        for tieba_name in config.TIEBA_NAME_LIST:
+            utils.logger.info(
+                f"[BaiduTieBaCrawler.get_specified_tieba_notes] Begin get tieba name: {tieba_name}")
+            page_number = 0
+            while page_number <= config.CRAWLER_MAX_NOTES_COUNT:
+                note_list: List[TiebaNote] = await self.tieba_client.get_notes_by_tieba_name(
+                    tieba_name=tieba_name,
+                    page_num=page_number
+                )
+                if not note_list:
+                    utils.logger.info(
+                        f"[BaiduTieBaCrawler.get_specified_tieba_notes] Get note list is empty")
+                    break
+
+                utils.logger.info(
+                    f"[BaiduTieBaCrawler.get_specified_tieba_notes] tieba name: {tieba_name} note list len: {len(note_list)}")
+                await self.get_specified_notes([note.note_id for note in note_list])
+                page_number += tieba_limit_count
+
     async def get_specified_notes(self, note_id_list: List[str] = config.TIEBA_SPECIFIED_ID_LIST):
         """
         Get the information and comments of the specified post
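The stride of 50 above matches how get_notes_by_tieba_name builds its URL: the pn query parameter of /f?kw=<name>&pn=<offset> is an offset in posts (50 posts per listing page), so stepping page_number by tieba_limit_count walks the forum page by page. An illustration of the offsets requested for one forum:

    # Illustrative offsets for CRAWLER_MAX_NOTES_COUNT = 150, 50 posts per page.
    tieba_name = "盗墓笔记"
    max_count, limit = 150, 50
    for offset in range(0, max_count + 1, limit):
        print(f"/f?kw={tieba_name}&pn={offset}")
    # /f?kw=盗墓笔记&pn=0, then pn=50, pn=100, pn=150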
@@ -1,13 +1,13 @@
 # -*- coding: utf-8 -*-
-import re
-import json
 import html
-from typing import List, Dict, Tuple
+import json
+import re
+from typing import Dict, List, Tuple
 
 from parsel import Selector
 
-from model.m_baidu_tieba import TiebaNote, TiebaComment
 from constant import baidu_tieba as const
+from model.m_baidu_tieba import TiebaComment, TiebaNote
 from tools import utils
 
@@ -43,6 +43,42 @@ class TieBaExtractor:
             result.append(tieba_note)
         return result
 
+    def extract_tieba_note_list(self, page_content: str) -> List[TiebaNote]:
+        """
+        Extract the list of Tieba posts
+        Args:
+            page_content: raw HTML of a forum list page
+
+        Returns:
+
+        """
+        page_content = page_content.replace('<!--', "")
+        content_selector = Selector(text=page_content)
+        xpath_selector = "//ul[@id='thread_list']/li"
+        post_list = content_selector.xpath(xpath_selector)
+        result: List[TiebaNote] = []
+        for post_selector in post_list:
+            post_field_value: Dict = self.extract_data_field_value(post_selector)
+            if not post_field_value:
+                continue
+            note_id = str(post_field_value.get("id"))
+            tieba_note = TiebaNote(
+                note_id=note_id,
+                title=post_selector.xpath(".//a[@class='j_th_tit ']/text()").get(default='').strip(),
+                desc=post_selector.xpath(".//div[@class='threadlist_abs threadlist_abs_onlyline ']/text()").get(
+                    default='').strip(),
+                note_url=const.TIEBA_URL + f"/p/{note_id}",
+                user_link=const.TIEBA_URL + post_selector.xpath(
+                    ".//a[@class='frs-author-name j_user_card ']/@href").get(default='').strip(),
+                user_nickname=post_field_value.get("author_nickname") or post_field_value.get("author_name"),
+                tieba_name=content_selector.xpath("//a[@class='card_title_fname']/text()").get(default='').strip(),
+                tieba_link=const.TIEBA_URL + content_selector.xpath("//a[@class='card_title_fname']/@href").get(
+                    default=''),
+                total_replay_num=post_field_value.get("reply_num", 0)
+            )
+            result.append(tieba_note)
+        return result
+
     def extract_note_detail(self, page_content: str) -> TiebaNote:
         """
         Extract the details of a Tieba post
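The page_content.replace('<!--', "") in the new extractor handles Tieba serving its thread list wrapped inside HTML comments (apparently so the list can be rendered client-side), which would otherwise hide it from the parser. A self-contained demonstration with parsel, using toy HTML as a stand-in for a real list page:

    # Demo of the comment-stripping trick; the HTML is a toy stand-in.
    from parsel import Selector

    html_doc = """
    <div><!--
    <ul id="thread_list"><li><a class="j_th_tit ">Hello Tieba</a></li></ul>
    --></div>
    """

    hidden = Selector(text=html_doc).xpath("//ul[@id='thread_list']/li")
    print(len(hidden))  # 0: the list sits inside a comment node

    shown = Selector(text=html_doc.replace("<!--", "")).xpath("//ul[@id='thread_list']/li")
    print(len(shown))   # 1: stripping the comment opener exposes it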
@@ -124,8 +160,7 @@ class TieBaExtractor:
             result.append(tieba_comment)
         return result
 
-
-    def extract_tieba_note_sub_comments(self,page_content: str, parent_comment: TiebaComment) -> List[TiebaComment]:
+    def extract_tieba_note_sub_comments(self, page_content: str, parent_comment: TiebaComment) -> List[TiebaComment]:
         """
         Extract the second-level (sub-)comments of a Tieba post
         Args:
@@ -144,7 +179,8 @@ class TieBaExtractor:
             if not comment_value:
                 continue
             comment_user_a_selector = comment_ele.xpath("./a[@class='j_user_card lzl_p_p']")[0]
-            content = utils.extract_text_from_html(comment_ele.xpath(".//span[@class='lzl_content_main']").get(default=""))
+            content = utils.extract_text_from_html(
+                comment_ele.xpath(".//span[@class='lzl_content_main']").get(default=""))
             comment = TiebaComment(
                 comment_id=str(comment_value.get("spid")),
                 content=content,
@@ -227,6 +263,7 @@ def test_extract_tieba_note_parment_comments():
     result = extractor.extract_tieba_note_parment_comments(content, "123456")
     print(result)
 
+
 def test_extract_tieba_note_sub_comments():
     with open("test_data/note_sub_comments.html", "r", encoding="utf-8") as f:
         content = f.read()
@@ -244,11 +281,21 @@ def test_extract_tieba_note_sub_comments():
         tieba_id="tieba_id",
         tieba_name="tieba_name",
     )
-    result = extractor.extract_tieba_note_sub_comments(content,fake_parment_comment)
+    result = extractor.extract_tieba_note_sub_comments(content, fake_parment_comment)
     print(result)
 
 
+def test_extract_tieba_note_list():
+    with open("test_data/tieba_note_list.html", "r", encoding="utf-8") as f:
+        content = f.read()
+    extractor = TieBaExtractor()
+    result = extractor.extract_tieba_note_list(content)
+    print(result)
+    pass
+
+
 if __name__ == '__main__':
     # test_extract_search_note_list()
     # test_extract_note_detail()
     # test_extract_tieba_note_parment_comments()
-    test_extract_tieba_note_sub_comments()
+    test_extract_tieba_note_list()
media_platform/tieba/test_data/tieba_note_list.html: new file, 3627 lines
(File diff suppressed because one or more lines are too long)
@@ -1,5 +1,6 @@
 # -*- coding: utf-8 -*-
 from typing import Optional
 
 from pydantic import BaseModel, Field
+
 
@@ -1,7 +1,8 @@
 # -*- coding: utf-8 -*-
 from typing import List
 
-from model.m_baidu_tieba import TiebaNote, TiebaComment
+from model.m_baidu_tieba import TiebaComment, TiebaNote
+
 
 from . import tieba_store_impl
 from .tieba_store_impl import *
@@ -14,6 +14,7 @@ from PIL import Image, ImageDraw
 from playwright.async_api import Cookie, Page
 
 from proxy import IpInfoModel
+
 from . import utils
 
@@ -10,7 +10,7 @@ def init_loging_config():
     level = logging.INFO
     logging.basicConfig(
         level=level,
-        format="%(asctime)s [%(threadName)s] %(name)s %(levelname)s (%(filename)s:%(lineno)d) - %(message)s",
+        format="%(asctime)s %(name)s %(levelname)s (%(filename)s:%(lineno)d) - %(message)s",
         datefmt='%Y-%m-%d %H:%M:%S'
     )
     _logger = logging.getLogger("MediaCrawler")
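The format change above removes %(threadName)s from every log line. A standalone illustration of the resulting output shape (logger name and message are examples):

    # Demo of the new log format; the printed shape is what matters.
    import logging

    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(name)s %(levelname)s (%(filename)s:%(lineno)d) - %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
    )
    logging.getLogger("MediaCrawler").info("hello")
    # e.g. 2024-03-18 12:00:00 MediaCrawler INFO (demo.py:10) - hello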