From acb29add284b288455f14a369cd41d5dd955330e Mon Sep 17 00:00:00 2001 From: Relakkes Yang Date: Sat, 24 Aug 2024 11:03:23 +0800 Subject: [PATCH] =?UTF-8?q?feat:=20=E7=99=BE=E5=BA=A6=E8=B4=B4=E5=90=A7?= =?UTF-8?q?=E6=94=AF=E6=8C=81=E5=88=9B=E4=BD=9C=E8=80=85=E4=B8=BB=E9=A1=B5?= =?UTF-8?q?=E5=B8=96=E5=AD=90=E7=88=AC=E5=8F=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- config/base_config.py | 2 +- media_platform/tieba/client.py | 29 ++-- media_platform/tieba/core.py | 22 ++- media_platform/tieba/help.py | 230 ++++++++++++++++++-------------- model/m_baidu_tieba.py | 3 - schema/tables.sql | 41 ++++-- store/tieba/__init__.py | 25 ++-- store/tieba/tieba_store_impl.py | 25 ++-- 8 files changed, 210 insertions(+), 167 deletions(-) diff --git a/config/base_config.py b/config/base_config.py index 4d389e4..0a03af8 100644 --- a/config/base_config.py +++ b/config/base_config.py @@ -102,7 +102,7 @@ TIEBA_NAME_LIST = [ ] TIEBA_CREATOR_URL_LIST = [ - "https://tieba.baidu.com/home/main/?id=tb.1.6a328702.02qx9GEBmrwqYDRyOgGKXQ&fr=frs", + "https://tieba.baidu.com/home/main/?id=tb.1.7f139e2e.6CyEwxu3VJruH_-QqpCi6g&fr=frs", # ........................ ] diff --git a/media_platform/tieba/client.py b/media_platform/tieba/client.py index cb83c4a..f20b02e 100644 --- a/media_platform/tieba/client.py +++ b/media_platform/tieba/client.py @@ -9,7 +9,7 @@ from tenacity import RetryError, retry, stop_after_attempt, wait_fixed import config from base.base_crawler import AbstractApiClient -from model.m_baidu_tieba import TiebaComment, TiebaNote, TiebaCreator +from model.m_baidu_tieba import TiebaComment, TiebaCreator, TiebaNote from proxy.proxy_ip_pool import ProxyIpPool from tools import utils @@ -317,14 +317,17 @@ class BaiduTieBaClient(AbstractApiClient): } return await self.get(uri, params=params) - async def get_all_notes_by_creator_user_name(self, user_name: str,crawl_interval: float = 1.0, - callback: Optional[Callable] = None) -> List[TiebaNote]: + async def get_all_notes_by_creator_user_name(self, + user_name: str, crawl_interval: float = 1.0, + callback: Optional[Callable] = None, + max_note_count: int = 0) -> List[TiebaNote]: """ 根据创作者用户名获取创作者所有帖子 Args: - user_name: - crawl_interval: - callback: + user_name: 创作者用户名 + crawl_interval: 爬取一次笔记的延迟单位(秒) + callback: 一次笔记爬取结束后的回调函数,是一个awaitable类型的函数 + max_note_count: 帖子最大获取数量,如果为0则获取所有 Returns: @@ -332,16 +335,17 @@ class BaiduTieBaClient(AbstractApiClient): result = [] notes_has_more = 1 page_number = 1 - while notes_has_more == 1: + page_per_count = 20 + total_get_count = 0 + while notes_has_more == 1 and (max_note_count == 0 or total_get_count < max_note_count): notes_res = await self.get_notes_by_creator(user_name, page_number) if not notes_res or notes_res.get("no") != 0: utils.logger.error( f"[WeiboClient.get_notes_by_creator] got user_name:{user_name} notes failed, notes_res: {notes_res}") break - - notes_has_more = notes_res.get("has_more") - page_number += 1 - notes = notes_res["thread_list"] + notes_data = notes_res.get("data") + notes_has_more = notes_data.get("has_more") + notes = notes_data["thread_list"] utils.logger.info( f"[WeiboClient.get_all_notes_by_creator] got user_name:{user_name} notes len : {len(notes)}") @@ -351,5 +355,6 @@ class BaiduTieBaClient(AbstractApiClient): await callback(notes) await asyncio.sleep(crawl_interval) result.extend(notes) + page_number += 1 + total_get_count += page_per_count return result - diff --git a/media_platform/tieba/core.py b/media_platform/tieba/core.py index 7a78382..f10ecf5 100644 --- a/media_platform/tieba/core.py +++ b/media_platform/tieba/core.py @@ -9,7 +9,7 @@ from playwright.async_api import (BrowserContext, BrowserType, Page, import config from base.base_crawler import AbstractCrawler -from model.m_baidu_tieba import TiebaNote +from model.m_baidu_tieba import TiebaCreator, TiebaNote from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool from store import tieba as tieba_store from tools import utils @@ -226,19 +226,20 @@ class TieBaCrawler(AbstractCrawler): """ utils.logger.info("[WeiboCrawler.get_creators_and_notes] Begin get weibo creators") for creator_url in config.TIEBA_CREATOR_URL_LIST: - createor_info: Dict = await self.tieba_client.get_creator_info_by_url(creator_url=creator_url) - if createor_info: - utils.logger.info(f"[WeiboCrawler.get_creators_and_notes] creator info: {createor_info}") - if not createor_info: + creator_info: TiebaCreator = await self.tieba_client.get_creator_info_by_url(creator_url=creator_url) + if creator_info: + utils.logger.info(f"[WeiboCrawler.get_creators_and_notes] creator info: {creator_info}") + if not creator_info: raise Exception("Get creator info error") - user_id = createor_info.get("user_id") - await tieba_store.save_creator(user_id, user_info=createor_info) + + await tieba_store.save_creator(user_info=creator_info) # Get all note information of the creator all_notes_list = await self.tieba_client.get_all_notes_by_creator_user_name( - user_name=createor_info.get("user_name"), + user_name=creator_info.user_name, crawl_interval=0, - callback=tieba_store.batch_update_tieba_notes + callback=tieba_store.batch_update_tieba_notes, + max_note_count=config.CRAWLER_MAX_NOTES_COUNT ) await self.batch_get_note_comments(all_notes_list) @@ -247,9 +248,6 @@ class TieBaCrawler(AbstractCrawler): utils.logger.error( f"[WeiboCrawler.get_creators_and_notes] get creator info error, creator_url:{creator_url}") - - - async def launch_browser( self, chromium: BrowserType, diff --git a/media_platform/tieba/help.py b/media_platform/tieba/help.py index b0db7d9..2f30d35 100644 --- a/media_platform/tieba/help.py +++ b/media_platform/tieba/help.py @@ -3,16 +3,16 @@ import html import json import re from typing import Dict, List, Tuple -from urllib.parse import unquote, parse_qs +from urllib.parse import parse_qs, unquote from parsel import Selector from constant import baidu_tieba as const -from model.m_baidu_tieba import TiebaComment, TiebaNote, TiebaCreator +from model.m_baidu_tieba import TiebaComment, TiebaCreator, TiebaNote from tools import utils GENDER_MALE = "sex_male" -GENDER_FMALE = "sex_fmale" +GENDER_FEMALE = "sex_female" class TieBaExtractor: @@ -33,17 +33,19 @@ class TieBaExtractor: post_list = Selector(text=page_content).xpath(xpath_selector) result: List[TiebaNote] = [] for post in post_list: - tieba_note = TiebaNote( - note_id=post.xpath(".//span[@class='p_title']/a/@data-tid").get(default='').strip(), - title=post.xpath(".//span[@class='p_title']/a/text()").get(default='').strip(), - desc=post.xpath(".//div[@class='p_content']/text()").get(default='').strip(), - note_url=const.TIEBA_URL + post.xpath(".//span[@class='p_title']/a/@href").get(default=''), - user_nickname=post.xpath(".//a[starts-with(@href, '/home/main')]/font/text()").get(default='').strip(), - user_link=const.TIEBA_URL + post.xpath(".//a[starts-with(@href, '/home/main')]/@href").get(default=''), - tieba_name=post.xpath(".//a[@class='p_forum']/font/text()").get(default='').strip(), - tieba_link=const.TIEBA_URL + post.xpath(".//a[@class='p_forum']/@href").get(default=''), - publish_time=post.xpath(".//font[@class='p_green p_date']/text()").get(default='').strip(), - ) + tieba_note = TiebaNote(note_id=post.xpath(".//span[@class='p_title']/a/@data-tid").get(default='').strip(), + title=post.xpath(".//span[@class='p_title']/a/text()").get(default='').strip(), + desc=post.xpath(".//div[@class='p_content']/text()").get(default='').strip(), + note_url=const.TIEBA_URL + post.xpath(".//span[@class='p_title']/a/@href").get( + default=''), + user_nickname=post.xpath(".//a[starts-with(@href, '/home/main')]/font/text()").get( + default='').strip(), user_link=const.TIEBA_URL + post.xpath( + ".//a[starts-with(@href, '/home/main')]/@href").get(default=''), + tieba_name=post.xpath(".//a[@class='p_forum']/font/text()").get(default='').strip(), + tieba_link=const.TIEBA_URL + post.xpath(".//a[@class='p_forum']/@href").get( + default=''), + publish_time=post.xpath(".//font[@class='p_green p_date']/text()").get( + default='').strip(), ) result.append(tieba_note) return result @@ -66,20 +68,19 @@ class TieBaExtractor: if not post_field_value: continue note_id = str(post_field_value.get("id")) - tieba_note = TiebaNote( - note_id=note_id, - title=post_selector.xpath(".//a[@class='j_th_tit ']/text()").get(default='').strip(), - desc=post_selector.xpath(".//div[@class='threadlist_abs threadlist_abs_onlyline ']/text()").get( - default='').strip(), - note_url=const.TIEBA_URL + f"/p/{note_id}", - user_link=const.TIEBA_URL + post_selector.xpath( - ".//a[@class='frs-author-name j_user_card ']/@href").get(default='').strip(), - user_nickname=post_field_value.get("authoer_nickname") or post_field_value.get("author_name"), - tieba_name=content_selector.xpath("//a[@class='card_title_fname']/text()").get(default='').strip(), - tieba_link=const.TIEBA_URL + content_selector.xpath("//a[@class='card_title_fname']/@href").get( - default=''), - total_replay_num=post_field_value.get("reply_num", 0) - ) + tieba_note = TiebaNote(note_id=note_id, + title=post_selector.xpath(".//a[@class='j_th_tit ']/text()").get(default='').strip(), + desc=post_selector.xpath( + ".//div[@class='threadlist_abs threadlist_abs_onlyline ']/text()").get( + default='').strip(), note_url=const.TIEBA_URL + f"/p/{note_id}", + user_link=const.TIEBA_URL + post_selector.xpath( + ".//a[@class='frs-author-name j_user_card ']/@href").get(default='').strip(), + user_nickname=post_field_value.get("authoer_nickname") or post_field_value.get( + "author_name"), + tieba_name=content_selector.xpath("//a[@class='card_title_fname']/text()").get( + default='').strip(), tieba_link=const.TIEBA_URL + content_selector.xpath( + "//a[@class='card_title_fname']/@href").get(default=''), + total_replay_num=post_field_value.get("reply_num", 0)) result.append(tieba_note) return result @@ -98,28 +99,25 @@ class TieBaExtractor: note_id = only_view_author_link.split("?")[0].split("/")[-1] # 帖子回复数、回复页数 thread_num_infos = content_selector.xpath( - "//div[@id='thread_theme_5']//li[@class='l_reply_num']//span[@class='red']" - ) + "//div[@id='thread_theme_5']//li[@class='l_reply_num']//span[@class='red']") # IP地理位置、发表时间 other_info_content = content_selector.xpath(".//div[@class='post-tail-wrap']").get(default="").strip() ip_location, publish_time = self.extract_ip_and_pub_time(other_info_content) - note = TiebaNote( - note_id=note_id, - title=content_selector.xpath("//title/text()").get(default='').strip(), - desc=content_selector.xpath("//meta[@name='description']/@content").get(default='').strip(), - note_url=const.TIEBA_URL + f"/p/{note_id}", - user_link=const.TIEBA_URL + first_floor_selector.xpath(".//a[@class='p_author_face ']/@href").get( - default='').strip(), - user_nickname=first_floor_selector.xpath(".//a[@class='p_author_name j_user_card']/text()").get( - default='').strip(), - user_avatar=first_floor_selector.xpath(".//a[@class='p_author_face ']/img/@src").get(default='').strip(), - tieba_name=content_selector.xpath("//a[@class='card_title_fname']/text()").get(default='').strip(), - tieba_link=const.TIEBA_URL + content_selector.xpath("//a[@class='card_title_fname']/@href").get(default=''), - ip_location=ip_location, - publish_time=publish_time, - total_replay_num=thread_num_infos[0].xpath("./text()").get(default='').strip(), - total_replay_page=thread_num_infos[1].xpath("./text()").get(default='').strip(), - ) + note = TiebaNote(note_id=note_id, title=content_selector.xpath("//title/text()").get(default='').strip(), + desc=content_selector.xpath("//meta[@name='description']/@content").get(default='').strip(), + note_url=const.TIEBA_URL + f"/p/{note_id}", + user_link=const.TIEBA_URL + first_floor_selector.xpath( + ".//a[@class='p_author_face ']/@href").get(default='').strip(), + user_nickname=first_floor_selector.xpath( + ".//a[@class='p_author_name j_user_card']/text()").get(default='').strip(), + user_avatar=first_floor_selector.xpath(".//a[@class='p_author_face ']/img/@src").get( + default='').strip(), + tieba_name=content_selector.xpath("//a[@class='card_title_fname']/text()").get( + default='').strip(), tieba_link=const.TIEBA_URL + content_selector.xpath( + "//a[@class='card_title_fname']/@href").get(default=''), ip_location=ip_location, + publish_time=publish_time, + total_replay_num=thread_num_infos[0].xpath("./text()").get(default='').strip(), + total_replay_page=thread_num_infos[1].xpath("./text()").get(default='').strip(), ) note.title = note.title.replace(f"【{note.tieba_name}】_百度贴吧", "") return note @@ -143,24 +141,20 @@ class TieBaExtractor: tieba_name = comment_selector.xpath("//a[@class='card_title_fname']/text()").get(default='').strip() other_info_content = comment_selector.xpath(".//div[@class='post-tail-wrap']").get(default="").strip() ip_location, publish_time = self.extract_ip_and_pub_time(other_info_content) - tieba_comment = TiebaComment( - comment_id=str(comment_field_value.get("content").get("post_id")), - sub_comment_count=comment_field_value.get("content").get("comment_num"), - content=utils.extract_text_from_html(comment_field_value.get("content").get("content")), - note_url=const.TIEBA_URL + f"/p/{note_id}", - user_link=const.TIEBA_URL + comment_selector.xpath(".//a[@class='p_author_face ']/@href").get( - default='').strip(), - user_nickname=comment_selector.xpath(".//a[@class='p_author_name j_user_card']/text()").get( - default='').strip(), - user_avatar=comment_selector.xpath(".//a[@class='p_author_face ']/img/@src").get( - default='').strip(), - tieba_id=str(comment_field_value.get("content").get("forum_id", "")), - tieba_name=tieba_name, - tieba_link=f"https://tieba.baidu.com/f?kw={tieba_name}", - ip_location=ip_location, - publish_time=publish_time, - note_id=note_id, - ) + tieba_comment = TiebaComment(comment_id=str(comment_field_value.get("content").get("post_id")), + sub_comment_count=comment_field_value.get("content").get("comment_num"), + content=utils.extract_text_from_html( + comment_field_value.get("content").get("content")), + note_url=const.TIEBA_URL + f"/p/{note_id}", + user_link=const.TIEBA_URL + comment_selector.xpath( + ".//a[@class='p_author_face ']/@href").get(default='').strip(), + user_nickname=comment_selector.xpath( + ".//a[@class='p_author_name j_user_card']/text()").get(default='').strip(), + user_avatar=comment_selector.xpath( + ".//a[@class='p_author_face ']/img/@src").get(default='').strip(), + tieba_id=str(comment_field_value.get("content").get("forum_id", "")), + tieba_name=tieba_name, tieba_link=f"https://tieba.baidu.com/f?kw={tieba_name}", + ip_location=ip_location, publish_time=publish_time, note_id=note_id, ) result.append(tieba_comment) return result @@ -186,19 +180,15 @@ class TieBaExtractor: content = utils.extract_text_from_html( comment_ele.xpath(".//span[@class='lzl_content_main']").get(default="")) comment = TiebaComment( - comment_id=str(comment_value.get("spid")), - content=content, + comment_id=str(comment_value.get("spid")), content=content, user_link=comment_user_a_selector.xpath("./@href").get(default=""), user_nickname=comment_value.get("showname"), user_avatar=comment_user_a_selector.xpath("./img/@src").get(default=""), publish_time=comment_ele.xpath(".//span[@class='lzl_time']/text()").get(default="").strip(), parent_comment_id=parent_comment.comment_id, - note_id=parent_comment.note_id, - note_url=parent_comment.note_url, - tieba_id=parent_comment.tieba_id, - tieba_name=parent_comment.tieba_name, - tieba_link=parent_comment.tieba_link - ) + note_id=parent_comment.note_id, note_url=parent_comment.note_url, + tieba_id=parent_comment.tieba_id, tieba_name=parent_comment.tieba_name, + tieba_link=parent_comment.tieba_link) comments.append(comment) return comments @@ -215,23 +205,26 @@ class TieBaExtractor: selector = Selector(text=html_content) user_link_selector = selector.xpath("//p[@class='space']/a") user_link: str = user_link_selector.xpath("./@href").get(default='') - user_link_params: Dict = parse_qs(unquote(user_link)) + user_link_params: Dict = parse_qs(unquote(user_link.split("?")[-1])) user_name = user_link_params.get("un")[0] if user_link_params.get("un") else "" user_id = user_link_params.get("id")[0] if user_link_params.get("id") else "" userinfo_userdata_selector = selector.xpath("//div[@class='userinfo_userdata']") - creator = TiebaCreator( - user_id=user_id, - user_name=user_name, - nickname=selector.xpath(".//a[@class='userinfo_username']/text()").get(default='').strip(), - avatar=selector.xpath(".//div[@class='userinfo_left_head']//img/@src").get(default='').strip(), - gender=self.extract_gender(userinfo_userdata_selector.get(default='')), - ip_location=self.extract_ip(userinfo_userdata_selector.get(default='')), - follows=0, - fans=0, - follow_tieba_list="", - registration_duration="", - ) - return creator + follow_fans_selector = selector.xpath("//span[@class='concern_num']") + follows, fans = 0, 0 + if len(follow_fans_selector) == 2: + follows, fans = self.extract_follow_and_fans(follow_fans_selector) + user_content = userinfo_userdata_selector.get(default='') + return TiebaCreator(user_id=user_id, user_name=user_name, + nickname=selector.xpath(".//span[@class='userinfo_username ']/text()").get( + default='').strip(), + avatar=selector.xpath(".//div[@class='userinfo_left_head']//img/@src").get( + default='').strip(), + gender=self.extract_gender(user_content), + ip_location=self.extract_ip(user_content), + follows=follows, + fans=fans, + registration_duration=self.extract_registration_duration(user_content) + ) def extract_ip_and_pub_time(self, html_content: str) -> Tuple[str, str]: """ @@ -272,7 +265,39 @@ class TieBaExtractor: Returns: """ - pass + if GENDER_MALE in html_content: + return '男' + elif GENDER_FEMALE in html_content: + return '女' + return '未知' + + @staticmethod + def extract_follow_and_fans(selectors: List[Selector]) -> Tuple[str, str]: + """ + 提取关注数和粉丝数 + Args: + selectors: + + Returns: + + """ + pattern = re.compile(r'\(]*>(\d+)\)') + follow_match = pattern.findall(selectors[0].get()) + fans_match = pattern.findall(selectors[1].get()) + follows = follow_match[0] if follow_match else 0 + fans = fans_match[0] if fans_match else 0 + return follows, fans + + @staticmethod + def extract_registration_duration(html_content: str) -> str: + """ + "吧龄:1.9年" + Returns: 1.9年 + + """ + pattern = re.compile(r'吧龄:(\S+)') + match = pattern.search(html_content) + return match.group(1) if match else "" @staticmethod def extract_data_field_value(selector: Selector) -> Dict: @@ -325,19 +350,11 @@ def test_extract_tieba_note_sub_comments(): with open("test_data/note_sub_comments.html", "r", encoding="utf-8") as f: content = f.read() extractor = TieBaExtractor() - fake_parment_comment = TiebaComment( - comment_id="123456", - content="content", - user_link="user_link", - user_nickname="user_nickname", - user_avatar="user_avatar", - publish_time="publish_time", - parent_comment_id="parent_comment_id", - note_id="note_id", - note_url="note_url", - tieba_id="tieba_id", - tieba_name="tieba_name", - ) + fake_parment_comment = TiebaComment(comment_id="123456", content="content", user_link="user_link", + user_nickname="user_nickname", user_avatar="user_avatar", + publish_time="publish_time", parent_comment_id="parent_comment_id", + note_id="note_id", note_url="note_url", tieba_id="tieba_id", + tieba_name="tieba_name", ) result = extractor.extract_tieba_note_sub_comments(content, fake_parment_comment) print(result) @@ -351,8 +368,17 @@ def test_extract_tieba_note_list(): pass +def test_extract_creator_info(): + with open("test_data/creator_info.html", "r", encoding="utf-8") as f: + content = f.read() + extractor = TieBaExtractor() + result = extractor.extract_creator_info(content) + print(result.model_dump_json()) + + if __name__ == '__main__': # test_extract_search_note_list() # test_extract_note_detail() # test_extract_tieba_note_parment_comments() - test_extract_tieba_note_list() + # test_extract_tieba_note_list() + test_extract_creator_info() diff --git a/model/m_baidu_tieba.py b/model/m_baidu_tieba.py index 5a20bd6..efc1496 100644 --- a/model/m_baidu_tieba.py +++ b/model/m_baidu_tieba.py @@ -45,7 +45,6 @@ class TiebaComment(BaseModel): tieba_link: str = Field(..., description="贴吧链接") - class TiebaCreator(BaseModel): """ 百度贴吧创作者 @@ -58,6 +57,4 @@ class TiebaCreator(BaseModel): ip_location: Optional[str] = Field(default="", description="IP地理位置") follows: int = Field(default=0, description="关注数") fans: int = Field(default=0, description="粉丝数") - follow_tieba_list: str = Field(default="", description="关注的贴吧列表") registration_duration: str = Field(default="", description="注册时长") - diff --git a/schema/tables.sql b/schema/tables.sql index bcde31b..d23e8d8 100644 --- a/schema/tables.sql +++ b/schema/tables.sql @@ -400,12 +400,18 @@ CREATE TABLE tieba_comment ) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='贴吧评论表'; -- 增加搜索来源关键字字段 -alter table bilibili_video add column `source_keyword` varchar(255) default '' comment '搜索来源关键字'; -alter table douyin_aweme add column `source_keyword` varchar(255) default '' comment '搜索来源关键字'; -alter table kuaishou_video add column `source_keyword` varchar(255) default '' comment '搜索来源关键字'; -alter table weibo_note add column `source_keyword` varchar(255) default '' comment '搜索来源关键字'; -alter table xhs_note add column `source_keyword` varchar(255) default '' comment '搜索来源关键字'; -alter table tieba_note add column `source_keyword` varchar(255) default '' comment '搜索来源关键字'; +alter table bilibili_video + add column `source_keyword` varchar(255) default '' comment '搜索来源关键字'; +alter table douyin_aweme + add column `source_keyword` varchar(255) default '' comment '搜索来源关键字'; +alter table kuaishou_video + add column `source_keyword` varchar(255) default '' comment '搜索来源关键字'; +alter table weibo_note + add column `source_keyword` varchar(255) default '' comment '搜索来源关键字'; +alter table xhs_note + add column `source_keyword` varchar(255) default '' comment '搜索来源关键字'; +alter table tieba_note + add column `source_keyword` varchar(255) default '' comment '搜索来源关键字'; DROP TABLE IF EXISTS `weibo_creator`; @@ -419,7 +425,7 @@ CREATE TABLE `weibo_creator` `add_ts` bigint NOT NULL COMMENT '记录添加时间戳', `last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳', `desc` longtext COMMENT '用户描述', - `gender` varchar(1) DEFAULT NULL COMMENT '性别', + `gender` varchar(2) DEFAULT NULL COMMENT '性别', `follows` varchar(16) DEFAULT NULL COMMENT '关注数', `fans` varchar(16) DEFAULT NULL COMMENT '粉丝数', `tag_list` longtext COMMENT '标签列表', @@ -428,4 +434,23 @@ CREATE TABLE `weibo_creator` ALTER TABLE `xhs_note_comment` - ADD COLUMN `like_count` VARCHAR(64) DEFAULT NULL COMMENT '评论点赞数量'; \ No newline at end of file + ADD COLUMN `like_count` VARCHAR(64) DEFAULT NULL COMMENT '评论点赞数量'; + + +DROP TABLE IF EXISTS `tieba_creator`; +CREATE TABLE `tieba_creator` +( + `id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID', + `user_id` varchar(64) NOT NULL COMMENT '用户ID', + `user_name` varchar(64) NOT NULL COMMENT '用户名', + `nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称', + `avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址', + `ip_location` varchar(255) DEFAULT NULL COMMENT '评论时的IP地址', + `add_ts` bigint NOT NULL COMMENT '记录添加时间戳', + `last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳', + `gender` varchar(2) DEFAULT NULL COMMENT '性别', + `follows` varchar(16) DEFAULT NULL COMMENT '关注数', + `fans` varchar(16) DEFAULT NULL COMMENT '粉丝数', + `registration_duration` varchar(16) DEFAULT NULL COMMENT '吧龄', + PRIMARY KEY (`id`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='贴吧创作者'; \ No newline at end of file diff --git a/store/tieba/__init__.py b/store/tieba/__init__.py index e6708a5..bab343f 100644 --- a/store/tieba/__init__.py +++ b/store/tieba/__init__.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- from typing import List -from model.m_baidu_tieba import TiebaComment, TiebaNote +from model.m_baidu_tieba import TiebaComment, TiebaCreator, TiebaNote from var import source_keyword_var from . import tieba_store_impl @@ -23,6 +23,7 @@ class TieBaStoreFactory: "[TieBaStoreFactory.create_store] Invalid save option only supported csv or db or json ...") return store_class() + async def batch_update_tieba_notes(note_list: List[TiebaNote]): """ Batch update tieba notes @@ -37,6 +38,7 @@ async def batch_update_tieba_notes(note_list: List[TiebaNote]): for note_item in note_list: await update_tieba_note(note_item) + async def update_tieba_note(note_item: TiebaNote): """ Add or Update tieba note @@ -54,7 +56,7 @@ async def update_tieba_note(note_item: TiebaNote): await TieBaStoreFactory.create_store().store_content(save_note_item) -async def batch_update_tieba_note_comments(note_id:str, comments: List[TiebaComment]): +async def batch_update_tieba_note_comments(note_id: str, comments: List[TiebaComment]): """ Batch update tieba note comments Args: @@ -86,27 +88,16 @@ async def update_tieba_note_comment(note_id: str, comment_item: TiebaComment): await TieBaStoreFactory.create_store().store_comment(save_comment_item) -async def save_creator(user_id: str, user_info: Dict): +async def save_creator(user_info: TiebaCreator): """ Save creator information to local Args: - user_id: user_info: Returns: """ - local_db_item = { - 'user_id': user_id, - 'nickname': user_info.get('nickname'), - 'gender': '女' if user_info.get('gender') == "f" else '男', - 'avatar': user_info.get('avatar'), - 'ip_location': user_info.get("ip_location", ""), - 'follows': user_info.get('follow_count', ''), - 'fans': user_info.get('followers_count', ''), - 'follow_tieba_list': user_info.get("tieba_list", ''), - 'last_modify_ts': utils.get_current_timestamp(), - 'registration_duration': user_info.get("registration_duration", ""), - } + local_db_item = user_info.model_dump() + local_db_item["last_modify_ts"] = utils.get_current_timestamp() utils.logger.info(f"[store.tieba.save_creator] creator:{local_db_item}") - await TieBaStoreFactory.create_store().store_creator(local_db_item) \ No newline at end of file + await TieBaStoreFactory.create_store().store_creator(local_db_item) diff --git a/store/tieba/tieba_store_impl.py b/store/tieba/tieba_store_impl.py index fe0ccbc..7824e60 100644 --- a/store/tieba/tieba_store_impl.py +++ b/store/tieba/tieba_store_impl.py @@ -24,14 +24,14 @@ def calculate_number_of_files(file_store_path: str) -> int: if not os.path.exists(file_store_path): return 1 try: - return max([int(file_name.split("_")[0])for file_name in os.listdir(file_store_path)])+1 + return max([int(file_name.split("_")[0]) for file_name in os.listdir(file_store_path)]) + 1 except ValueError: return 1 class TieBaCsvStoreImplement(AbstractStore): csv_store_path: str = "data/tieba" - file_count:int=calculate_number_of_files(csv_store_path) + file_count: int = calculate_number_of_files(csv_store_path) def make_save_file_name(self, store_type: str) -> str: """ @@ -65,7 +65,7 @@ class TieBaCsvStoreImplement(AbstractStore): async def store_content(self, content_item: Dict): """ - Xiaohongshu content CSV storage implementation + tieba content CSV storage implementation Args: content_item: note item dict @@ -76,7 +76,7 @@ class TieBaCsvStoreImplement(AbstractStore): async def store_comment(self, comment_item: Dict): """ - Xiaohongshu comment CSV storage implementation + tieba comment CSV storage implementation Args: comment_item: comment item dict @@ -87,7 +87,7 @@ class TieBaCsvStoreImplement(AbstractStore): async def store_creator(self, creator: Dict): """ - Xiaohongshu content CSV storage implementation + tieba content CSV storage implementation Args: creator: creator dict @@ -100,7 +100,7 @@ class TieBaCsvStoreImplement(AbstractStore): class TieBaDbStoreImplement(AbstractStore): async def store_content(self, content_item: Dict): """ - Xiaohongshu content DB storage implementation + tieba content DB storage implementation Args: content_item: content item dict @@ -120,7 +120,7 @@ class TieBaDbStoreImplement(AbstractStore): async def store_comment(self, comment_item: Dict): """ - Xiaohongshu content DB storage implementation + tieba content DB storage implementation Args: comment_item: comment item dict @@ -140,7 +140,7 @@ class TieBaDbStoreImplement(AbstractStore): async def store_creator(self, creator: Dict): """ - Xiaohongshu content DB storage implementation + tieba content DB storage implementation Args: creator: creator dict @@ -163,10 +163,10 @@ class TieBaJsonStoreImplement(AbstractStore): json_store_path: str = "data/tieba/json" words_store_path: str = "data/tieba/words" lock = asyncio.Lock() - file_count:int=calculate_number_of_files(json_store_path) + file_count: int = calculate_number_of_files(json_store_path) WordCloud = words.AsyncWordCloudGenerator() - def make_save_file_name(self, store_type: str) -> (str,str): + def make_save_file_name(self, store_type: str) -> (str, str): """ make save file name by store type Args: @@ -193,7 +193,7 @@ class TieBaJsonStoreImplement(AbstractStore): """ pathlib.Path(self.json_store_path).mkdir(parents=True, exist_ok=True) pathlib.Path(self.words_store_path).mkdir(parents=True, exist_ok=True) - save_file_name,words_file_name_prefix = self.make_save_file_name(store_type=store_type) + save_file_name, words_file_name_prefix = self.make_save_file_name(store_type=store_type) save_data = [] async with self.lock: @@ -210,6 +210,7 @@ class TieBaJsonStoreImplement(AbstractStore): await self.WordCloud.generate_word_frequency_and_cloud(save_data, words_file_name_prefix) except: pass + async def store_content(self, content_item: Dict): """ content JSON storage implementation @@ -234,7 +235,7 @@ class TieBaJsonStoreImplement(AbstractStore): async def store_creator(self, creator: Dict): """ - Xiaohongshu content JSON storage implementation + tieba content JSON storage implementation Args: creator: creator dict