feat: 微博帖子支持保存到数据库中

This commit is contained in:
Relakkes 2023-12-24 18:19:26 +08:00
parent c5b64fdbf5
commit b1441ab4ae
3 changed files with 19 additions and 15 deletions

View File

@ -24,8 +24,8 @@
| 小红书 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✕ | | 小红书 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✕ |
| 抖音 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | 抖音 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| 快手 | ✅ | ✅ | ✕ | ✅ | ✅ | ✅ | ✅ | ✅ | ✕ | | 快手 | ✅ | ✅ | ✕ | ✅ | ✅ | ✅ | ✅ | ✅ | ✕ |
| B 站 | ✅ | ✅ | ✕ | ✅ | ✅ | ✅ | ✅ | ✅ | ✕ | | B 站 | ✅ | ✅ | ✕ | ✅ | ✅ | ✅ | ✅ | ✅ | ✕ |
| 微博 | ✕ | ✕ | ✕ | ✕ | ✕ | ✕ | ✕ | ✕ | ✕ | | 微博 | ✅ | ✕ | ✕ | ✅ | ✕ | ✅ | ✅ | ✅ | ✕ |
## 使用方法 ## 使用方法
@ -67,13 +67,16 @@
# 从配置文件中读取指定的帖子ID列表获取指定帖子的信息与评论信息 # 从配置文件中读取指定的帖子ID列表获取指定帖子的信息与评论信息
python3 main.py --platform xhs --lt qrcode --type detail python3 main.py --platform xhs --lt qrcode --type detail
# 打开对应APP扫二维码登录
# 其他平台爬虫使用示例, 执行下面的命令查看 # 其他平台爬虫使用示例, 执行下面的命令查看
python3 main.py --help python3 main.py --help
``` ```
打开对应APP扫二维码登录
等待爬虫程序执行完毕,数据会保存到 `data/xhs` 目录下 ### 数据保存
- 支持保存到关系型数据库(MySQL、PgSQL 等)
- 支持保存到csv中(data/目录下)
## 如何使用 IP 代理 ## 如何使用 IP 代理
➡️➡️➡️ [IP代理使用方法](docs/代理使用.md) ➡️➡️➡️ [IP代理使用方法](docs/代理使用.md)

View File

@ -2,3 +2,4 @@ from .bilibili import *
from .douyin import * from .douyin import *
from .kuaishou import * from .kuaishou import *
from .xiaohongshu import * from .xiaohongshu import *
from .weibo import *

View File

@ -35,14 +35,14 @@ class WeiboNote(WeiboBaseModel):
note_id = fields.CharField(max_length=64, index=True, description="帖子ID") note_id = fields.CharField(max_length=64, index=True, description="帖子ID")
content = fields.TextField(null=True, description="帖子正文内容") content = fields.TextField(null=True, description="帖子正文内容")
create_time = fields.BigIntField(description="帖子发布时间戳", index=True) create_time = fields.BigIntField(description="帖子发布时间戳", index=True)
create_date_time = fields.BigIntField(description="帖子发布日期时间", index=True) create_date_time = fields.CharField(description="帖子发布日期时间",max_length=32, index=True)
liked_count = fields.CharField(null=True, max_length=16, description="帖子点赞数") liked_count = fields.CharField(null=True, max_length=16, description="帖子点赞数")
comments_count = fields.CharField(null=True, max_length=16, description="帖子评论数量") comments_count = fields.CharField(null=True, max_length=16, description="帖子评论数量")
shared_count = fields.CharField(null=True, max_length=16, description="帖子转发数量") shared_count = fields.CharField(null=True, max_length=16, description="帖子转发数量")
note_url = fields.CharField(null=True, max_length=512, description="帖子详情URL") note_url = fields.CharField(null=True, max_length=512, description="帖子详情URL")
class Meta: class Meta:
table = "weibo_video" table = "weibo_note"
table_description = "微博帖子" table_description = "微博帖子"
def __str__(self): def __str__(self):
@ -54,7 +54,7 @@ class WeiboComment(WeiboBaseModel):
note_id = fields.CharField(max_length=64, index=True, description="帖子ID") note_id = fields.CharField(max_length=64, index=True, description="帖子ID")
content = fields.TextField(null=True, description="评论内容") content = fields.TextField(null=True, description="评论内容")
create_time = fields.BigIntField(description="评论时间戳") create_time = fields.BigIntField(description="评论时间戳")
create_date_time = fields.BigIntField(description="评论日期时间", index=True) create_date_time = fields.CharField(description="评论日期时间", max_length=32, index=True)
comment_like_count = fields.CharField(max_length=16, description="评论点赞数量") comment_like_count = fields.CharField(max_length=16, description="评论点赞数量")
sub_comment_count = fields.CharField(max_length=16, description="评论回复数") sub_comment_count = fields.CharField(max_length=16, description="评论回复数")
@ -75,16 +75,16 @@ async def update_weibo_note(note_item: Dict):
"note_id": note_id, "note_id": note_id,
"content": mblog.get("text"), "content": mblog.get("text"),
"create_time": utils.rfc2822_to_timestamp(mblog.get("created_at")), "create_time": utils.rfc2822_to_timestamp(mblog.get("created_at")),
"create_date_time": utils.rfc2822_to_china_datetime(mblog.get("created_at")), "create_date_time": str(utils.rfc2822_to_china_datetime(mblog.get("created_at"))),
"liked_count": mblog.get("attitudes_count", 0), "liked_count": str(mblog.get("attitudes_count", 0)),
"comments_count": mblog.get("comments_count", 0), "comments_count": str(mblog.get("comments_count", 0)),
"shared_count": mblog.get("reposts_count", 0), "shared_count": str(mblog.get("reposts_count", 0)),
"last_modify_ts": utils.get_current_timestamp(), "last_modify_ts": utils.get_current_timestamp(),
"note_url": f"https://m.weibo.cn/detail/{note_id}", "note_url": f"https://m.weibo.cn/detail/{note_id}",
"ip_location": mblog.get("region_name", "").replace("发布于 ", ""), "ip_location": mblog.get("region_name", "").replace("发布于 ", ""),
# 用户信息 # 用户信息
"user_id": user_info.get("id"), "user_id": str(user_info.get("id")),
"nickname": user_info.get("screen_name", ""), "nickname": user_info.get("screen_name", ""),
"gender": user_info.get("gender", ""), "gender": user_info.get("gender", ""),
"profile_url": user_info.get("profile_url", ""), "profile_url": user_info.get("profile_url", ""),
@ -130,7 +130,7 @@ async def update_weibo_video_comment(note_id: str, comment_item: Dict):
local_db_item = { local_db_item = {
"comment_id": comment_id, "comment_id": comment_id,
"create_time": utils.rfc2822_to_timestamp(comment_item.get("created_at")), "create_time": utils.rfc2822_to_timestamp(comment_item.get("created_at")),
"create_date_time": utils.rfc2822_to_china_datetime(comment_item.get("created_at")), "create_date_time": str(utils.rfc2822_to_china_datetime(comment_item.get("created_at"))),
"note_id": note_id, "note_id": note_id,
"content": content.get("message"), "content": content.get("message"),
"sub_comment_count": str(comment_item.get("total_number", 0)), "sub_comment_count": str(comment_item.get("total_number", 0)),
@ -139,7 +139,7 @@ async def update_weibo_video_comment(note_id: str, comment_item: Dict):
"ip_location": comment_item.get("source", "").replace("来自", ""), "ip_location": comment_item.get("source", "").replace("来自", ""),
# 用户信息 # 用户信息
"user_id": user_info.get("id"), "user_id": str(user_info.get("id")),
"nickname": user_info.get("screen_name", ""), "nickname": user_info.get("screen_name", ""),
"gender": user_info.get("gender", ""), "gender": user_info.get("gender", ""),
"profile_url": user_info.get("profile_url", ""), "profile_url": user_info.get("profile_url", ""),