fix: 微博登录问题修复

feat: 微博二级评论
2024-08-05 00:48:42 +08:00 · 2024-08-05 00:48:42 +08:00 · 1c2237a66f
commit 1c2237a66f
parent 7229d29123
7 changed files with 62 additions and 54 deletions
--- a/README.md
+++ b/README.md
@ -21,7 +21,7 @@
 | 抖音  | ✅     | ✅        | ✅    | ✅       | ✅     | ✅     | ✅    |
 | 快手  | ✅     | ✅        | ✅   | ✅      | ✅     | ✅     | ✅    |
 | B 站 | ✅     | ✅        | ✅   | ✅      | ✅     | ✅     | ✅    |
-| 微博  | ✅     | ✅        | ❌   | ❌      | ✅     | ✅     | ✅    |
+| 微博  | ✅     | ✅        | ✅   | ❌      | ✅     | ✅     | ✅    |


 ## 使用方法
@ -109,10 +109,12 @@


 ## MediaCrawler爬虫项目交流群：
-> 如果二维码过期，可以直接添加我的微信号：yzglan（备注来自：github），拉你进项目交流群
+> 扫描下方我的个人微信，备注：github，拉你进MediaCrawler项目交流群(请一定备注：github，会有wx小助手自动拉群)
+> 
+> 如果图片展示不出来，可以直接添加我的微信号：yzglan

 <div style="max-width: 200px">  
-<p><img alt="图片展示不出来可以加作者微信: yzglan" src="static/images/12群二维码.JPG" style="width: 200px;height: 100%" ></p>
+<p><img alt="relakkes_wechat" src="static/images/relakkes_weichat.JPG" style="width: 200px;height: 100%" ></p>
 </div>


--- a/media_platform/weibo/client.py
+++ b/media_platform/weibo/client.py
@ -13,6 +13,7 @@ from urllib.parse import urlencode
 import httpx
 from playwright.async_api import BrowserContext, Page

+import config
 from tools import utils

 from .exception import DataFetchError
@ -70,7 +71,7 @@ class WeiboClient:
        utils.logger.info("[WeiboClient.pong] Begin pong weibo...")
        ping_flag = False
        try:
-            uri  = "/api/config"
+            uri = "/api/config"
            resp_data: Dict = await self.request(method="GET", url=f"{self._host}{uri}", headers=self.headers)
            if resp_data.get("login"):
                ping_flag = True
@ -129,13 +130,12 @@ class WeiboClient:

        return await self.get(uri, params, headers=headers)

-    async def get_note_all_comments(self, note_id: str, crawl_interval: float = 1.0, is_fetch_sub_comments=False,
+    async def get_note_all_comments(self, note_id: str, crawl_interval: float = 1.0,
                                    callback: Optional[Callable] = None, ):
        """
        get note all comments include sub comments
        :param note_id:
        :param crawl_interval:
-        :param is_fetch_sub_comments:
        :param callback:
        :return:
        """
@ -151,12 +151,37 @@ class WeiboClient:
            if callback:  # 如果有回调函数，就执行回调函数
                await callback(note_id, comment_list)
            await asyncio.sleep(crawl_interval)
-            if not is_fetch_sub_comments:
-                result.extend(comment_list)
-                continue
-            # todo handle get sub comments
+            result.extend(comment_list)
+            sub_comment_result = await self.get_comments_all_sub_comments(note_id, comment_list, callback)
+            result.extend(sub_comment_result)
        return result

+    @staticmethod
+    async def get_comments_all_sub_comments(note_id: str, comment_list: List[Dict],
+                                            callback: Optional[Callable] = None) -> List[Dict]:
+        """
+        Collect the sub comments (replies) embedded in each first-level comment.
+        Args:
+            note_id: id of the note the comments belong to; forwarded to callback
+            comment_list: first-level comments; replies are read from each item's "comments" key
+            callback: optional coroutine function awaited as callback(note_id, sub_comments)
+
+        Returns:
+            Flat list of every sub comment found; empty when config.ENABLE_GET_SUB_COMMENTS is falsy
+        """
+        if not config.ENABLE_GET_SUB_COMMENTS:
+            utils.logger.info(
+                f"[WeiboClient.get_comments_all_sub_comments] Crawling sub_comment mode is not enabled")
+            return []
+        res_sub_comments = []
+        for comment in comment_list:
+            sub_comments = comment.get("comments")
+            # Only a non-empty list counts as replies; any other value under "comments" is skipped.
+            if sub_comments and isinstance(sub_comments, list):
+                if callback:  # callback defaults to None; awaiting None(...) would raise TypeError
+                    await callback(note_id, sub_comments)
+                res_sub_comments.extend(sub_comments)
+        return res_sub_comments
+
    async def get_note_info_by_id(self, note_id: str) -> Dict:
        """
        根据帖子ID获取详情
@ -184,12 +209,12 @@ class WeiboClient:
                return dict()

    async def get_note_image(self, image_url: str) -> bytes:
-        image_url = image_url[8:] # 去掉 https://
+        image_url = image_url[8:]  # 去掉 https://
        sub_url = image_url.split("/")
        image_url = ""
        for i in range(len(sub_url)):
            if i == 1:
-                image_url += "large/" #都获取高清大图
+                image_url += "large/"  # 都获取高清大图
            elif i == len(sub_url) - 1:
                image_url += sub_url[i]
            else:
--- a/media_platform/weibo/core.py
+++ b/media_platform/weibo/core.py
@ -69,8 +69,6 @@ class WeiboCrawler(AbstractCrawler):
                    context_page=self.context_page,
                    cookie_str=config.COOKIES
                )
-                await self.context_page.goto(self.index_url)
-                await asyncio.sleep(1)
                await login_obj.begin()

                # 登录成功后重定向到手机端的网站，再更新手机端登录成功的cookie
@ -192,7 +190,7 @@ class WeiboCrawler(AbstractCrawler):
                utils.logger.info(f"[WeiboCrawler.get_note_comments] begin get note_id: {note_id} comments ...")
                await self.wb_client.get_note_all_comments(
                    note_id=note_id,
-                    crawl_interval=random.randint(1,10), # 微博对API的限流比较严重，所以延时提高一些
+                    crawl_interval=random.randint(1,3), # 微博对API的限流比较严重，所以延时提高一些
                    callback=weibo_store.batch_update_weibo_note_comments
                )
            except DataFetchError as ex:
--- a/media_platform/weibo/login.py
+++ b/media_platform/weibo/login.py
@ -30,6 +30,7 @@ class WeiboLogin(AbstractLogin):
        self.context_page = context_page
        self.login_phone = login_phone
        self.cookie_str = cookie_str
+        self.weibo_sso_login_url = "https://passport.weibo.com/sso/signin?entry=miniblog&source=miniblog"

    async def begin(self):
        """Start login weibo"""
@ -54,45 +55,19 @@ class WeiboLogin(AbstractLogin):
        """
        current_cookie = await self.browser_context.cookies()
        _, cookie_dict = utils.convert_cookies(current_cookie)
+        if cookie_dict.get("SSOLoginState"):
+            return True
        current_web_session = cookie_dict.get("WBPSESS")
        if current_web_session != no_logged_in_session:
            return True
        return False

-    async def popup_login_dialog(self):
-        """If the login dialog box does not pop up automatically, we will manually click the login button"""
-        dialog_selector = "xpath=//div[@class='woo-modal-main']"
-        try:
-            # check dialog box is auto popup and wait for 4 seconds
-            await self.context_page.wait_for_selector(dialog_selector, timeout=1000 * 4)
-        except Exception as e:
-            utils.logger.error(
-                f"[WeiboLogin.popup_login_dialog] login dialog box does not pop up automatically, error: {e}")
-            utils.logger.info(
-                "[WeiboLogin.popup_login_dialog] login dialog box does not pop up automatically, we will manually click the login button")
-
-            # 向下滚动1000像素
-            await self.context_page.mouse.wheel(0,500)
-            await asyncio.sleep(0.5)
-
-            try:
-                # click login button
-                login_button_ele = self.context_page.locator(
-                    "xpath=//a[text()='登录']",
-                )
-                await login_button_ele.click()
-                await asyncio.sleep(0.5)
-            except Exception as e:
-                utils.logger.info(f"[WeiboLogin.popup_login_dialog] manually click the login button faield maybe login dialog Appear：{e}")
-
    async def login_by_qrcode(self):
        """login weibo website and keep webdriver login state"""
        utils.logger.info("[WeiboLogin.login_by_qrcode] Begin login weibo by qrcode ...")
-
-        await self.popup_login_dialog()
-
+        await self.context_page.goto(self.weibo_sso_login_url)
        # find login qrcode
-        qrcode_img_selector = "//div[@class='woo-modal-main']//img"
+        qrcode_img_selector = "xpath=//img[@class='w-full h-full']"
        base64_qrcode_img = await utils.find_login_qrcode(
            self.context_page,
            selector=qrcode_img_selector
--- a/schema/tables.sql
+++ b/schema/tables.sql
@ -333,4 +333,7 @@ ADD COLUMN `parent_comment_id` VARCHAR(64) DEFAULT NULL COMMENT '父评论ID';
 ALTER TABLE `bilibili_video_comment`
 ADD COLUMN `parent_comment_id` VARCHAR(64) DEFAULT NULL COMMENT '父评论ID';

+ALTER TABLE `weibo_note_comment`
+ADD COLUMN `parent_comment_id` VARCHAR(64) DEFAULT NULL COMMENT '父评论ID';
+
 SET FOREIGN_KEY_CHECKS = 1;
--- a/store/weibo/init.py
+++ b/store/weibo/init.py
@ -6,8 +6,6 @@
 import re
 from typing import List

-import config
-
 from .weibo_store_image import *
 from .weibo_store_impl import *

@ -81,6 +79,7 @@ async def update_weibo_note_comment(note_id: str, comment_item: Dict):
        "comment_like_count": str(comment_item.get("like_count", 0)),
        "last_modify_ts": utils.get_current_timestamp(),
        "ip_location": comment_item.get("source", "").replace("来自", ""),
+        "parent_comment_id": comment_item.get("rootid", ""),

        # 用户信息
        "user_id": str(user_info.get("id")),
--- a/store/weibo/weibo_store_impl.py
+++ b/store/weibo/weibo_store_impl.py
@ -27,7 +27,7 @@ def calculate_number_of_files(file_store_path: str) -> int:
    if not os.path.exists(file_store_path):
        return 1
    try:
-        return max([int(file_name.split("_")[0])for file_name in os.listdir(file_store_path)])+1
+        return max([int(file_name.split("_")[0]) for file_name in os.listdir(file_store_path)]) + 1
    except ValueError:
        return 1

@ -37,7 +37,7 @@ class WeiboCsvStoreImplement(AbstractStore):
        pass

    csv_store_path: str = "data/weibo"
-    file_count:int=calculate_number_of_files(csv_store_path)
+    file_count: int = calculate_number_of_files(csv_store_path)

    def make_save_file_name(self, store_type: str) -> str:
        """
@ -93,6 +93,7 @@ class WeiboCsvStoreImplement(AbstractStore):


 class WeiboDbStoreImplement(AbstractStore):
+
    async def store_content(self, content_item: Dict):
        """
        Weibo content DB storage implementation
@ -134,16 +135,18 @@ class WeiboDbStoreImplement(AbstractStore):
        else:
            await update_comment_by_comment_id(comment_id, comment_item=comment_item)

+    async def store_creator(self, creator: Dict):
+        pass  # no-op stub: this DB store does nothing with creator records
+

 class WeiboJsonStoreImplement(AbstractStore):
    json_store_path: str = "data/weibo/json"
    words_store_path: str = "data/weibo/words"
    lock = asyncio.Lock()
-    file_count:int=calculate_number_of_files(json_store_path)
+    file_count: int = calculate_number_of_files(json_store_path)
    WordCloud = words.AsyncWordCloudGenerator()

-
-    def make_save_file_name(self, store_type: str) -> (str,str):
+    def make_save_file_name(self, store_type: str) -> (str, str):
        """
        make save file name by store type
        Args:
@ -170,7 +173,7 @@ class WeiboJsonStoreImplement(AbstractStore):
        """
        pathlib.Path(self.json_store_path).mkdir(parents=True, exist_ok=True)
        pathlib.Path(self.words_store_path).mkdir(parents=True, exist_ok=True)
-        save_file_name,words_file_name_prefix = self.make_save_file_name(store_type=store_type)
+        save_file_name, words_file_name_prefix = self.make_save_file_name(store_type=store_type)
        save_data = []

        async with self.lock:
@ -209,3 +212,6 @@ class WeiboJsonStoreImplement(AbstractStore):

        """
        await self.save_data_to_json(comment_item, "comments")
+
+    async def store_creator(self, creator: Dict):
+        pass  # no-op stub: this JSON store does nothing with creator records