diff --git a/README.md b/README.md index 053250b..9950b4e 100644 --- a/README.md +++ b/README.md @@ -21,7 +21,7 @@ | 抖音 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | 快手 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | B 站 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | -| 微博 | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | +| 微博 | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ## 使用方法 @@ -109,10 +109,12 @@ ## MediaCrawler爬虫项目交流群: -> 如果二维码过期,可以直接添加我的微信号:yzglan(备注来自:github),拉你进项目交流群 +> 扫描下方我的个人微信,备注:github,拉你进MediaCrawler项目交流群(请一定备注:github,会有wx小助手自动拉群) +> +> 如果图片展示不出来,可以直接添加我的微信号:yzglan
-

图片展示不出来可以加作者微信: yzglan

+

relakkes_wechat

diff --git a/media_platform/weibo/client.py b/media_platform/weibo/client.py index 89b3ee1..5e77394 100644 --- a/media_platform/weibo/client.py +++ b/media_platform/weibo/client.py @@ -13,6 +13,7 @@ from urllib.parse import urlencode import httpx from playwright.async_api import BrowserContext, Page +import config from tools import utils from .exception import DataFetchError @@ -70,7 +71,7 @@ class WeiboClient: utils.logger.info("[WeiboClient.pong] Begin pong weibo...") ping_flag = False try: - uri = "/api/config" + uri = "/api/config" resp_data: Dict = await self.request(method="GET", url=f"{self._host}{uri}", headers=self.headers) if resp_data.get("login"): ping_flag = True @@ -129,13 +130,12 @@ class WeiboClient: return await self.get(uri, params, headers=headers) - async def get_note_all_comments(self, note_id: str, crawl_interval: float = 1.0, is_fetch_sub_comments=False, + async def get_note_all_comments(self, note_id: str, crawl_interval: float = 1.0, callback: Optional[Callable] = None, ): """ get note all comments include sub comments :param note_id: :param crawl_interval: - :param is_fetch_sub_comments: :param callback: :return: """ @@ -151,12 +151,37 @@ class WeiboClient: if callback: # 如果有回调函数,就执行回调函数 await callback(note_id, comment_list) await asyncio.sleep(crawl_interval) - if not is_fetch_sub_comments: - result.extend(comment_list) - continue - # todo handle get sub comments + result.extend(comment_list) + sub_comment_result = await self.get_comments_all_sub_comments(note_id, comment_list, callback) + result.extend(sub_comment_result) return result + @staticmethod + async def get_comments_all_sub_comments(note_id: str, comment_list: List[Dict], + callback: Optional[Callable] = None) -> List[Dict]: + """ + 获取评论的所有子评论 + Args: + note_id: + comment_list: + callback: + + Returns: + + """ + if not config.ENABLE_GET_SUB_COMMENTS: + utils.logger.info( + f"[WeiboClient.get_comments_all_sub_comments] Crawling sub_comment mode is not enabled") + return [] + + res_sub_comments = [] + for comment in comment_list: + sub_comments = comment.get("comments") + if sub_comments and isinstance(sub_comments, list): + await callback(note_id, sub_comments) + res_sub_comments.extend(sub_comments) + return res_sub_comments + async def get_note_info_by_id(self, note_id: str) -> Dict: """ 根据帖子ID获取详情 @@ -184,12 +209,12 @@ class WeiboClient: return dict() async def get_note_image(self, image_url: str) -> bytes: - image_url = image_url[8:] # 去掉 https:// + image_url = image_url[8:] # 去掉 https:// sub_url = image_url.split("/") image_url = "" for i in range(len(sub_url)): if i == 1: - image_url += "large/" #都获取高清大图 + image_url += "large/" # 都获取高清大图 elif i == len(sub_url) - 1: image_url += sub_url[i] else: @@ -203,4 +228,4 @@ class WeiboClient: utils.logger.error(f"[WeiboClient.get_note_image] request {final_uri} err, res:{response.text}") return None else: - return response.content \ No newline at end of file + return response.content diff --git a/media_platform/weibo/core.py b/media_platform/weibo/core.py index 4301574..7683385 100644 --- a/media_platform/weibo/core.py +++ b/media_platform/weibo/core.py @@ -69,8 +69,6 @@ class WeiboCrawler(AbstractCrawler): context_page=self.context_page, cookie_str=config.COOKIES ) - await self.context_page.goto(self.index_url) - await asyncio.sleep(1) await login_obj.begin() # 登录成功后重定向到手机端的网站,再更新手机端登录成功的cookie @@ -192,7 +190,7 @@ class WeiboCrawler(AbstractCrawler): utils.logger.info(f"[WeiboCrawler.get_note_comments] begin get note_id: {note_id} comments ...") await self.wb_client.get_note_all_comments( note_id=note_id, - crawl_interval=random.randint(1,10), # 微博对API的限流比较严重,所以延时提高一些 + crawl_interval=random.randint(1,3), # 微博对API的限流比较严重,所以延时提高一些 callback=weibo_store.batch_update_weibo_note_comments ) except DataFetchError as ex: diff --git a/media_platform/weibo/login.py b/media_platform/weibo/login.py index 9dc1659..30af47b 100644 --- a/media_platform/weibo/login.py +++ b/media_platform/weibo/login.py @@ -30,6 +30,7 @@ class WeiboLogin(AbstractLogin): self.context_page = context_page self.login_phone = login_phone self.cookie_str = cookie_str + self.weibo_sso_login_url = "https://passport.weibo.com/sso/signin?entry=miniblog&source=miniblog" async def begin(self): """Start login weibo""" @@ -54,45 +55,19 @@ class WeiboLogin(AbstractLogin): """ current_cookie = await self.browser_context.cookies() _, cookie_dict = utils.convert_cookies(current_cookie) + if cookie_dict.get("SSOLoginState"): + return True current_web_session = cookie_dict.get("WBPSESS") if current_web_session != no_logged_in_session: return True return False - async def popup_login_dialog(self): - """If the login dialog box does not pop up automatically, we will manually click the login button""" - dialog_selector = "xpath=//div[@class='woo-modal-main']" - try: - # check dialog box is auto popup and wait for 4 seconds - await self.context_page.wait_for_selector(dialog_selector, timeout=1000 * 4) - except Exception as e: - utils.logger.error( - f"[WeiboLogin.popup_login_dialog] login dialog box does not pop up automatically, error: {e}") - utils.logger.info( - "[WeiboLogin.popup_login_dialog] login dialog box does not pop up automatically, we will manually click the login button") - - # 向下滚动1000像素 - await self.context_page.mouse.wheel(0,500) - await asyncio.sleep(0.5) - - try: - # click login button - login_button_ele = self.context_page.locator( - "xpath=//a[text()='登录']", - ) - await login_button_ele.click() - await asyncio.sleep(0.5) - except Exception as e: - utils.logger.info(f"[WeiboLogin.popup_login_dialog] manually click the login button faield maybe login dialog Appear:{e}") - async def login_by_qrcode(self): """login weibo website and keep webdriver login state""" utils.logger.info("[WeiboLogin.login_by_qrcode] Begin login weibo by qrcode ...") - - await self.popup_login_dialog() - + await self.context_page.goto(self.weibo_sso_login_url) # find login qrcode - qrcode_img_selector = "//div[@class='woo-modal-main']//img" + qrcode_img_selector = "xpath=//img[@class='w-full h-full']" base64_qrcode_img = await utils.find_login_qrcode( self.context_page, selector=qrcode_img_selector diff --git a/schema/tables.sql b/schema/tables.sql index 7c5d53e..3530189 100644 --- a/schema/tables.sql +++ b/schema/tables.sql @@ -333,4 +333,7 @@ ADD COLUMN `parent_comment_id` VARCHAR(64) DEFAULT NULL COMMENT '父评论ID'; ALTER TABLE `bilibili_video_comment` ADD COLUMN `parent_comment_id` VARCHAR(64) DEFAULT NULL COMMENT '父评论ID'; +ALTER TABLE `weibo_note_comment` +ADD COLUMN `parent_comment_id` VARCHAR(64) DEFAULT NULL COMMENT '父评论ID'; + SET FOREIGN_KEY_CHECKS = 1; diff --git a/store/weibo/__init__.py b/store/weibo/__init__.py index 933fdd8..e3751dc 100644 --- a/store/weibo/__init__.py +++ b/store/weibo/__init__.py @@ -6,8 +6,6 @@ import re from typing import List -import config - from .weibo_store_image import * from .weibo_store_impl import * @@ -81,6 +79,7 @@ async def update_weibo_note_comment(note_id: str, comment_item: Dict): "comment_like_count": str(comment_item.get("like_count", 0)), "last_modify_ts": utils.get_current_timestamp(), "ip_location": comment_item.get("source", "").replace("来自", ""), + "parent_comment_id": comment_item.get("rootid", ""), # 用户信息 "user_id": str(user_info.get("id")), diff --git a/store/weibo/weibo_store_impl.py b/store/weibo/weibo_store_impl.py index f3476ff..188c016 100644 --- a/store/weibo/weibo_store_impl.py +++ b/store/weibo/weibo_store_impl.py @@ -27,7 +27,7 @@ def calculate_number_of_files(file_store_path: str) -> int: if not os.path.exists(file_store_path): return 1 try: - return max([int(file_name.split("_")[0])for file_name in os.listdir(file_store_path)])+1 + return max([int(file_name.split("_")[0]) for file_name in os.listdir(file_store_path)]) + 1 except ValueError: return 1 @@ -37,7 +37,7 @@ class WeiboCsvStoreImplement(AbstractStore): pass csv_store_path: str = "data/weibo" - file_count:int=calculate_number_of_files(csv_store_path) + file_count: int = calculate_number_of_files(csv_store_path) def make_save_file_name(self, store_type: str) -> str: """ @@ -93,6 +93,7 @@ class WeiboCsvStoreImplement(AbstractStore): class WeiboDbStoreImplement(AbstractStore): + async def store_content(self, content_item: Dict): """ Weibo content DB storage implementation @@ -134,16 +135,18 @@ class WeiboDbStoreImplement(AbstractStore): else: await update_comment_by_comment_id(comment_id, comment_item=comment_item) + async def store_creator(self, creator: Dict): + pass + class WeiboJsonStoreImplement(AbstractStore): json_store_path: str = "data/weibo/json" words_store_path: str = "data/weibo/words" lock = asyncio.Lock() - file_count:int=calculate_number_of_files(json_store_path) + file_count: int = calculate_number_of_files(json_store_path) WordCloud = words.AsyncWordCloudGenerator() - - def make_save_file_name(self, store_type: str) -> (str,str): + def make_save_file_name(self, store_type: str) -> (str, str): """ make save file name by store type Args: @@ -170,7 +173,7 @@ class WeiboJsonStoreImplement(AbstractStore): """ pathlib.Path(self.json_store_path).mkdir(parents=True, exist_ok=True) pathlib.Path(self.words_store_path).mkdir(parents=True, exist_ok=True) - save_file_name,words_file_name_prefix = self.make_save_file_name(store_type=store_type) + save_file_name, words_file_name_prefix = self.make_save_file_name(store_type=store_type) save_data = [] async with self.lock: @@ -209,3 +212,6 @@ class WeiboJsonStoreImplement(AbstractStore): """ await self.save_data_to_json(comment_item, "comments") + + async def store_creator(self, creator: Dict): + pass