diff --git a/README.md b/README.md
index 053250b..9950b4e 100644
--- a/README.md
+++ b/README.md
@@ -21,7 +21,7 @@
| 抖音 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| 快手 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| B 站 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
-| 微博 | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ |
+| 微博 | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ |
## 使用方法
@@ -109,10 +109,12 @@
## MediaCrawler爬虫项目交流群:
-> 如果二维码过期,可以直接添加我的微信号:yzglan(备注来自:github),拉你进项目交流群
+> 扫描下方我的个人微信,备注:github,拉你进MediaCrawler项目交流群(请一定备注:github,会有wx小助手自动拉群)
+>
+> 如果图片展示不出来,可以直接添加我的微信号:yzglan
-
+
diff --git a/media_platform/weibo/client.py b/media_platform/weibo/client.py
index 89b3ee1..5e77394 100644
--- a/media_platform/weibo/client.py
+++ b/media_platform/weibo/client.py
@@ -13,6 +13,7 @@ from urllib.parse import urlencode
import httpx
from playwright.async_api import BrowserContext, Page
+import config
from tools import utils
from .exception import DataFetchError
@@ -70,7 +71,7 @@ class WeiboClient:
utils.logger.info("[WeiboClient.pong] Begin pong weibo...")
ping_flag = False
try:
- uri = "/api/config"
+ uri = "/api/config"
resp_data: Dict = await self.request(method="GET", url=f"{self._host}{uri}", headers=self.headers)
if resp_data.get("login"):
ping_flag = True
@@ -129,13 +130,12 @@ class WeiboClient:
return await self.get(uri, params, headers=headers)
- async def get_note_all_comments(self, note_id: str, crawl_interval: float = 1.0, is_fetch_sub_comments=False,
+ async def get_note_all_comments(self, note_id: str, crawl_interval: float = 1.0,
callback: Optional[Callable] = None, ):
"""
get note all comments include sub comments
:param note_id:
:param crawl_interval:
- :param is_fetch_sub_comments:
:param callback:
:return:
"""
@@ -151,12 +151,37 @@ class WeiboClient:
if callback: # 如果有回调函数,就执行回调函数
await callback(note_id, comment_list)
await asyncio.sleep(crawl_interval)
- if not is_fetch_sub_comments:
- result.extend(comment_list)
- continue
- # todo handle get sub comments
+ result.extend(comment_list)
+ sub_comment_result = await self.get_comments_all_sub_comments(note_id, comment_list, callback)
+ result.extend(sub_comment_result)
return result
+    @staticmethod
+    async def get_comments_all_sub_comments(note_id: str, comment_list: List[Dict],
+                                            callback: Optional[Callable] = None) -> List[Dict]:
+        """
+        Collect the sub-comments embedded in each comment of ``comment_list``.
+        Args:
+            note_id: id of the note the comments belong to; forwarded to ``callback``.
+            comment_list: comment dicts; replies are read from each item's "comments" key.
+            callback: optional coroutine awaited as ``callback(note_id, sub_comments)``.
+        Returns:
+            Flat list of every sub-comment found; empty when config.ENABLE_GET_SUB_COMMENTS is falsy.
+        """
+        if not config.ENABLE_GET_SUB_COMMENTS:
+            utils.logger.info(
+                f"[WeiboClient.get_comments_all_sub_comments] Crawling sub_comment mode is not enabled")
+            return []
+
+        res_sub_comments = []
+        for comment in comment_list:
+            sub_comments = comment.get("comments")
+            # "comments" may be missing or a non-list placeholder when a comment has no replies
+            if sub_comments and isinstance(sub_comments, list):
+                if callback:  # callback defaults to None; awaiting None would raise TypeError
+                    await callback(note_id, sub_comments)
+                res_sub_comments.extend(sub_comments)
+        return res_sub_comments
+
async def get_note_info_by_id(self, note_id: str) -> Dict:
"""
根据帖子ID获取详情
@@ -184,12 +209,12 @@ class WeiboClient:
return dict()
async def get_note_image(self, image_url: str) -> bytes:
- image_url = image_url[8:] # 去掉 https://
+ image_url = image_url[8:] # 去掉 https://
sub_url = image_url.split("/")
image_url = ""
for i in range(len(sub_url)):
if i == 1:
- image_url += "large/" #都获取高清大图
+ image_url += "large/" # 都获取高清大图
elif i == len(sub_url) - 1:
image_url += sub_url[i]
else:
@@ -203,4 +228,4 @@ class WeiboClient:
utils.logger.error(f"[WeiboClient.get_note_image] request {final_uri} err, res:{response.text}")
return None
else:
- return response.content
\ No newline at end of file
+ return response.content
diff --git a/media_platform/weibo/core.py b/media_platform/weibo/core.py
index 4301574..7683385 100644
--- a/media_platform/weibo/core.py
+++ b/media_platform/weibo/core.py
@@ -69,8 +69,6 @@ class WeiboCrawler(AbstractCrawler):
context_page=self.context_page,
cookie_str=config.COOKIES
)
- await self.context_page.goto(self.index_url)
- await asyncio.sleep(1)
await login_obj.begin()
# 登录成功后重定向到手机端的网站,再更新手机端登录成功的cookie
@@ -192,7 +190,7 @@ class WeiboCrawler(AbstractCrawler):
utils.logger.info(f"[WeiboCrawler.get_note_comments] begin get note_id: {note_id} comments ...")
await self.wb_client.get_note_all_comments(
note_id=note_id,
- crawl_interval=random.randint(1,10), # 微博对API的限流比较严重,所以延时提高一些
+ crawl_interval=random.randint(1,3), # 微博对API的限流比较严重,所以延时提高一些
callback=weibo_store.batch_update_weibo_note_comments
)
except DataFetchError as ex:
diff --git a/media_platform/weibo/login.py b/media_platform/weibo/login.py
index 9dc1659..30af47b 100644
--- a/media_platform/weibo/login.py
+++ b/media_platform/weibo/login.py
@@ -30,6 +30,7 @@ class WeiboLogin(AbstractLogin):
self.context_page = context_page
self.login_phone = login_phone
self.cookie_str = cookie_str
+ self.weibo_sso_login_url = "https://passport.weibo.com/sso/signin?entry=miniblog&source=miniblog"
async def begin(self):
"""Start login weibo"""
@@ -54,45 +55,19 @@ class WeiboLogin(AbstractLogin):
"""
current_cookie = await self.browser_context.cookies()
_, cookie_dict = utils.convert_cookies(current_cookie)
+ if cookie_dict.get("SSOLoginState"):
+ return True
current_web_session = cookie_dict.get("WBPSESS")
if current_web_session != no_logged_in_session:
return True
return False
- async def popup_login_dialog(self):
- """If the login dialog box does not pop up automatically, we will manually click the login button"""
- dialog_selector = "xpath=//div[@class='woo-modal-main']"
- try:
- # check dialog box is auto popup and wait for 4 seconds
- await self.context_page.wait_for_selector(dialog_selector, timeout=1000 * 4)
- except Exception as e:
- utils.logger.error(
- f"[WeiboLogin.popup_login_dialog] login dialog box does not pop up automatically, error: {e}")
- utils.logger.info(
- "[WeiboLogin.popup_login_dialog] login dialog box does not pop up automatically, we will manually click the login button")
-
- # 向下滚动1000像素
- await self.context_page.mouse.wheel(0,500)
- await asyncio.sleep(0.5)
-
- try:
- # click login button
- login_button_ele = self.context_page.locator(
- "xpath=//a[text()='登录']",
- )
- await login_button_ele.click()
- await asyncio.sleep(0.5)
- except Exception as e:
- utils.logger.info(f"[WeiboLogin.popup_login_dialog] manually click the login button faield maybe login dialog Appear:{e}")
-
async def login_by_qrcode(self):
"""login weibo website and keep webdriver login state"""
utils.logger.info("[WeiboLogin.login_by_qrcode] Begin login weibo by qrcode ...")
-
- await self.popup_login_dialog()
-
+ await self.context_page.goto(self.weibo_sso_login_url)
# find login qrcode
- qrcode_img_selector = "//div[@class='woo-modal-main']//img"
+ qrcode_img_selector = "xpath=//img[@class='w-full h-full']"
base64_qrcode_img = await utils.find_login_qrcode(
self.context_page,
selector=qrcode_img_selector
diff --git a/schema/tables.sql b/schema/tables.sql
index 7c5d53e..3530189 100644
--- a/schema/tables.sql
+++ b/schema/tables.sql
@@ -333,4 +333,7 @@ ADD COLUMN `parent_comment_id` VARCHAR(64) DEFAULT NULL COMMENT '父评论ID';
ALTER TABLE `bilibili_video_comment`
ADD COLUMN `parent_comment_id` VARCHAR(64) DEFAULT NULL COMMENT '父评论ID';
+ALTER TABLE `weibo_note_comment`
+ADD COLUMN `parent_comment_id` VARCHAR(64) DEFAULT NULL COMMENT '父评论ID';
+
SET FOREIGN_KEY_CHECKS = 1;
diff --git a/store/weibo/__init__.py b/store/weibo/__init__.py
index 933fdd8..e3751dc 100644
--- a/store/weibo/__init__.py
+++ b/store/weibo/__init__.py
@@ -6,8 +6,6 @@
import re
from typing import List
-import config
-
from .weibo_store_image import *
from .weibo_store_impl import *
@@ -81,6 +79,7 @@ async def update_weibo_note_comment(note_id: str, comment_item: Dict):
"comment_like_count": str(comment_item.get("like_count", 0)),
"last_modify_ts": utils.get_current_timestamp(),
"ip_location": comment_item.get("source", "").replace("来自", ""),
+ "parent_comment_id": comment_item.get("rootid", ""),
# 用户信息
"user_id": str(user_info.get("id")),
diff --git a/store/weibo/weibo_store_impl.py b/store/weibo/weibo_store_impl.py
index f3476ff..188c016 100644
--- a/store/weibo/weibo_store_impl.py
+++ b/store/weibo/weibo_store_impl.py
@@ -27,7 +27,7 @@ def calculate_number_of_files(file_store_path: str) -> int:
if not os.path.exists(file_store_path):
return 1
try:
- return max([int(file_name.split("_")[0])for file_name in os.listdir(file_store_path)])+1
+ return max([int(file_name.split("_")[0]) for file_name in os.listdir(file_store_path)]) + 1
except ValueError:
return 1
@@ -37,7 +37,7 @@ class WeiboCsvStoreImplement(AbstractStore):
pass
csv_store_path: str = "data/weibo"
- file_count:int=calculate_number_of_files(csv_store_path)
+ file_count: int = calculate_number_of_files(csv_store_path)
def make_save_file_name(self, store_type: str) -> str:
"""
@@ -93,6 +93,7 @@ class WeiboCsvStoreImplement(AbstractStore):
class WeiboDbStoreImplement(AbstractStore):
+
async def store_content(self, content_item: Dict):
"""
Weibo content DB storage implementation
@@ -134,16 +135,18 @@ class WeiboDbStoreImplement(AbstractStore):
else:
await update_comment_by_comment_id(comment_id, comment_item=comment_item)
+ async def store_creator(self, creator: Dict):
+ pass  # no-op: this store does not persist creator records
+
class WeiboJsonStoreImplement(AbstractStore):
json_store_path: str = "data/weibo/json"
words_store_path: str = "data/weibo/words"
lock = asyncio.Lock()
- file_count:int=calculate_number_of_files(json_store_path)
+ file_count: int = calculate_number_of_files(json_store_path)
WordCloud = words.AsyncWordCloudGenerator()
-
- def make_save_file_name(self, store_type: str) -> (str,str):
+ def make_save_file_name(self, store_type: str) -> (str, str):
"""
make save file name by store type
Args:
@@ -170,7 +173,7 @@ class WeiboJsonStoreImplement(AbstractStore):
"""
pathlib.Path(self.json_store_path).mkdir(parents=True, exist_ok=True)
pathlib.Path(self.words_store_path).mkdir(parents=True, exist_ok=True)
- save_file_name,words_file_name_prefix = self.make_save_file_name(store_type=store_type)
+ save_file_name, words_file_name_prefix = self.make_save_file_name(store_type=store_type)
save_data = []
async with self.lock:
@@ -209,3 +212,6 @@ class WeiboJsonStoreImplement(AbstractStore):
"""
await self.save_data_to_json(comment_item, "comments")
+
+ async def store_creator(self, creator: Dict):
+ pass  # no-op: this store does not persist creator records