fix: 微博登录问题修复

feat: 微博二级评论
This commit is contained in:
Relakkes 2024-08-05 00:48:42 +08:00
parent 7229d29123
commit 1c2237a66f
7 changed files with 62 additions and 54 deletions

View File

@ -21,7 +21,7 @@
| 抖音 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| 快手 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| B 站 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| 微博 | ✅ | ✅ | | ❌ | ✅ | ✅ | ✅ |
| 微博 | ✅ | ✅ | | ❌ | ✅ | ✅ | ✅ |
## 使用方法
@ -109,10 +109,12 @@
## MediaCrawler爬虫项目交流群
> 如果二维码过期可以直接添加我的微信号yzglan备注来自github拉你进项目交流群
> 扫描下方我的个人微信备注github拉你进MediaCrawler项目交流群(请一定备注github会有wx小助手自动拉群)
>
> 如果图片展示不出来可以直接添加我的微信号yzglan
<div style="max-width: 200px">
<p><img alt="图片展示不出来可以加作者微信: yzglan" src="static/images/12群二维码.JPG" style="width: 200px;height: 100%" ></p>
<p><img alt="relakkes_wechat" src="static/images/relakkes_weichat.JPG" style="width: 200px;height: 100%" ></p>
</div>

View File

@ -13,6 +13,7 @@ from urllib.parse import urlencode
import httpx
from playwright.async_api import BrowserContext, Page
import config
from tools import utils
from .exception import DataFetchError
@ -129,13 +130,12 @@ class WeiboClient:
return await self.get(uri, params, headers=headers)
async def get_note_all_comments(self, note_id: str, crawl_interval: float = 1.0, is_fetch_sub_comments=False,
async def get_note_all_comments(self, note_id: str, crawl_interval: float = 1.0,
callback: Optional[Callable] = None, ):
"""
get note all comments include sub comments
:param note_id:
:param crawl_interval:
:param is_fetch_sub_comments:
:param callback:
:return:
"""
@ -151,12 +151,37 @@ class WeiboClient:
if callback: # 如果有回调函数,就执行回调函数
await callback(note_id, comment_list)
await asyncio.sleep(crawl_interval)
if not is_fetch_sub_comments:
result.extend(comment_list)
continue
# todo handle get sub comments
sub_comment_result = await self.get_comments_all_sub_comments(note_id, comment_list, callback)
result.extend(sub_comment_result)
return result
@staticmethod
async def get_comments_all_sub_comments(note_id: str, comment_list: List[Dict],
                                        callback: Optional[Callable] = None) -> List[Dict]:
    """
    Collect the sub-comments embedded in a batch of top-level comments.

    Args:
        note_id: ID of the note the comments belong to; forwarded to ``callback``.
        comment_list: top-level comment dicts; sub-comments are read from each
            item's ``"comments"`` key.
        callback: optional awaitable invoked as ``callback(note_id, sub_comments)``
            once for every comment that carries a non-empty list of sub-comments.

    Returns:
        A flat list with every sub-comment dict found, or an empty list when
        ``config.ENABLE_GET_SUB_COMMENTS`` is falsy.
    """
    if not config.ENABLE_GET_SUB_COMMENTS:
        utils.logger.info(
            f"[WeiboClient.get_comments_all_sub_comments] Crawling sub_comment mode is not enabled")
        return []

    res_sub_comments = []
    for comment in comment_list:
        sub_comments = comment.get("comments")
        # "comments" may be absent or hold a non-list value; only non-empty lists are used.
        if sub_comments and isinstance(sub_comments, list):
            # callback defaults to None, so it must be checked before being awaited.
            if callback:
                await callback(note_id, sub_comments)
            res_sub_comments.extend(sub_comments)
    return res_sub_comments
async def get_note_info_by_id(self, note_id: str) -> Dict:
"""
根据帖子ID获取详情
@ -189,7 +214,7 @@ class WeiboClient:
image_url = ""
for i in range(len(sub_url)):
if i == 1:
image_url += "large/" #都获取高清大图
image_url += "large/" # 都获取高清大图
elif i == len(sub_url) - 1:
image_url += sub_url[i]
else:

View File

@ -69,8 +69,6 @@ class WeiboCrawler(AbstractCrawler):
context_page=self.context_page,
cookie_str=config.COOKIES
)
await self.context_page.goto(self.index_url)
await asyncio.sleep(1)
await login_obj.begin()
# 登录成功后重定向到手机端的网站再更新手机端登录成功的cookie
@ -192,7 +190,7 @@ class WeiboCrawler(AbstractCrawler):
utils.logger.info(f"[WeiboCrawler.get_note_comments] begin get note_id: {note_id} comments ...")
await self.wb_client.get_note_all_comments(
note_id=note_id,
crawl_interval=random.randint(1,10), # 微博对API的限流比较严重所以延时提高一些
crawl_interval=random.randint(1,3), # 微博对API的限流比较严重所以延时提高一些
callback=weibo_store.batch_update_weibo_note_comments
)
except DataFetchError as ex:

View File

@ -30,6 +30,7 @@ class WeiboLogin(AbstractLogin):
self.context_page = context_page
self.login_phone = login_phone
self.cookie_str = cookie_str
self.weibo_sso_login_url = "https://passport.weibo.com/sso/signin?entry=miniblog&source=miniblog"
async def begin(self):
"""Start login weibo"""
@ -54,45 +55,19 @@ class WeiboLogin(AbstractLogin):
"""
current_cookie = await self.browser_context.cookies()
_, cookie_dict = utils.convert_cookies(current_cookie)
if cookie_dict.get("SSOLoginState"):
return True
current_web_session = cookie_dict.get("WBPSESS")
if current_web_session != no_logged_in_session:
return True
return False
async def popup_login_dialog(self):
    """
    If the login dialog box does not pop up automatically, we will manually click the login button.

    Waits for the dialog element with a 4 second timeout. When that wait raises, the
    failure is logged, the page is scrolled with the mouse wheel and the "登录" link is
    clicked. A failure of that click is logged at info level and not re-raised.
    """
    dialog_selector = "xpath=//div[@class='woo-modal-main']"
    try:
        # check dialog box is auto popup and wait for 4 seconds
        await self.context_page.wait_for_selector(dialog_selector, timeout=1000 * 4)
    except Exception as e:
        utils.logger.error(
            f"[WeiboLogin.popup_login_dialog] login dialog box does not pop up automatically, error: {e}")
        utils.logger.info(
            "[WeiboLogin.popup_login_dialog] login dialog box does not pop up automatically, we will manually click the login button")
        # Scroll down with the mouse wheel (vertical delta of 500) before clicking
        await self.context_page.mouse.wheel(0,500)
        await asyncio.sleep(0.5)
        try:
            # click login button
            login_button_ele = self.context_page.locator(
                "xpath=//a[text()='登录']",
            )
            await login_button_ele.click()
            await asyncio.sleep(0.5)
        except Exception as e:
            utils.logger.info(f"[WeiboLogin.popup_login_dialog] manually click the login button faield maybe login dialog Appear{e}")
async def login_by_qrcode(self):
"""login weibo website and keep webdriver login state"""
utils.logger.info("[WeiboLogin.login_by_qrcode] Begin login weibo by qrcode ...")
await self.popup_login_dialog()
await self.context_page.goto(self.weibo_sso_login_url)
# find login qrcode
qrcode_img_selector = "//div[@class='woo-modal-main']//img"
qrcode_img_selector = "xpath=//img[@class='w-full h-full']"
base64_qrcode_img = await utils.find_login_qrcode(
self.context_page,
selector=qrcode_img_selector

View File

@ -333,4 +333,7 @@ ADD COLUMN `parent_comment_id` VARCHAR(64) DEFAULT NULL COMMENT '父评论ID';
ALTER TABLE `bilibili_video_comment`
ADD COLUMN `parent_comment_id` VARCHAR(64) DEFAULT NULL COMMENT '父评论ID';
ALTER TABLE `weibo_note_comment`
ADD COLUMN `parent_comment_id` VARCHAR(64) DEFAULT NULL COMMENT '父评论ID';
SET FOREIGN_KEY_CHECKS = 1;

View File

@ -6,8 +6,6 @@
import re
from typing import List
import config
from .weibo_store_image import *
from .weibo_store_impl import *
@ -81,6 +79,7 @@ async def update_weibo_note_comment(note_id: str, comment_item: Dict):
"comment_like_count": str(comment_item.get("like_count", 0)),
"last_modify_ts": utils.get_current_timestamp(),
"ip_location": comment_item.get("source", "").replace("来自", ""),
"parent_comment_id": comment_item.get("rootid", ""),
# 用户信息
"user_id": str(user_info.get("id")),

View File

@ -27,7 +27,7 @@ def calculate_number_of_files(file_store_path: str) -> int:
if not os.path.exists(file_store_path):
return 1
try:
return max([int(file_name.split("_")[0])for file_name in os.listdir(file_store_path)])+1
return max([int(file_name.split("_")[0]) for file_name in os.listdir(file_store_path)]) + 1
except ValueError:
return 1
@ -37,7 +37,7 @@ class WeiboCsvStoreImplement(AbstractStore):
pass
csv_store_path: str = "data/weibo"
file_count:int=calculate_number_of_files(csv_store_path)
file_count: int = calculate_number_of_files(csv_store_path)
def make_save_file_name(self, store_type: str) -> str:
"""
@ -93,6 +93,7 @@ class WeiboCsvStoreImplement(AbstractStore):
class WeiboDbStoreImplement(AbstractStore):
async def store_content(self, content_item: Dict):
"""
Weibo content DB storage implementation
@ -134,16 +135,18 @@ class WeiboDbStoreImplement(AbstractStore):
else:
await update_comment_by_comment_id(comment_id, comment_item=comment_item)
async def store_creator(self, creator: Dict):
    """No-op: this implementation does nothing with the given creator dict."""
    pass
class WeiboJsonStoreImplement(AbstractStore):
json_store_path: str = "data/weibo/json"
words_store_path: str = "data/weibo/words"
lock = asyncio.Lock()
file_count:int=calculate_number_of_files(json_store_path)
file_count: int = calculate_number_of_files(json_store_path)
WordCloud = words.AsyncWordCloudGenerator()
def make_save_file_name(self, store_type: str) -> (str,str):
def make_save_file_name(self, store_type: str) -> (str, str):
"""
make save file name by store type
Args:
@ -170,7 +173,7 @@ class WeiboJsonStoreImplement(AbstractStore):
"""
pathlib.Path(self.json_store_path).mkdir(parents=True, exist_ok=True)
pathlib.Path(self.words_store_path).mkdir(parents=True, exist_ok=True)
save_file_name,words_file_name_prefix = self.make_save_file_name(store_type=store_type)
save_file_name, words_file_name_prefix = self.make_save_file_name(store_type=store_type)
save_data = []
async with self.lock:
@ -209,3 +212,6 @@ class WeiboJsonStoreImplement(AbstractStore):
"""
await self.save_data_to_json(comment_item, "comments")
async def store_creator(self, creator: Dict):
    """No-op: this implementation does nothing with the given creator dict."""
    pass