diff --git a/media_platform/douyin/client.py b/media_platform/douyin/client.py index ca56dc5..d4f2f3f 100644 --- a/media_platform/douyin/client.py +++ b/media_platform/douyin/client.py @@ -34,7 +34,7 @@ class DOUYINClient: if not params: return headers = headers or self.headers - local_storage: Dict = await self.playwright_page.evaluate("() => window.localStorage") # type: ignore + local_storage: Dict = await self.playwright_page.evaluate("() => window.localStorage") # type: ignore douyin_js_obj = execjs.compile(open('libs/douyin.js').read()) common_params = { "device_platform": "webapp", @@ -53,8 +53,8 @@ class DOUYINClient: "platform": "PC", "screen_width": "1920", "screen_height": "1200", - "webid": douyin_js_obj.call("get_web_id"), - "msToken": local_storage.get("xmst"), + #" webid": douyin_js_obj.call("get_web_id"), + # "msToken": local_storage.get("xmst"), # "msToken": "abL8SeUTPa9-EToD8qfC7toScSADxpg6yLh2dbNcpWHzE0bT04txM_4UwquIcRvkRb9IU8sifwgM1Kwf1Lsld81o9Irt2_yNyUbbQPSUO8EfVlZJ_78FckDFnwVBVUVK", } params.update(common_params) @@ -142,7 +142,7 @@ class DOUYINClient: del headers["Origin"] return await self.get("/aweme/v1/web/aweme/detail/", params, headers) - async def get_aweme_comments(self, aweme_id: str, cursor: int = 0): + async def get_aweme_comments(self, aweme_id: str, cursor: int = 0, keywords: str = ""): """get note comments """ @@ -153,6 +153,9 @@ class DOUYINClient: "count": 20, "item_type": 0 } + referer_url = "https://www.douyin.com/search/" + keywords + '?aid=3a3cec5a-9e27-4040-b6aa-ef548c2c1138&publish_time=0&sort_type=0&source=search_history&type=general' + headers = copy.copy(self.headers) + headers["Referer"] = urllib.parse.quote(referer_url, safe=':/') return await self.get(uri, params) async def get_aweme_all_comments( @@ -160,7 +163,8 @@ class DOUYINClient: aweme_id: str, crawl_interval: float = 1.0, is_fetch_sub_comments=False, - callback: Optional[Callable] = None + callback: Optional[Callable] = None, + keywords: str = "" ): """ get note all comments include sub comments @@ -168,13 +172,14 @@ class DOUYINClient: :param crawl_interval: :param is_fetch_sub_comments: :param callback: + :param keywords: :return: """ result = [] comments_has_more = 1 comments_cursor = 0 while comments_has_more: - comments_res = await self.get_aweme_comments(aweme_id, comments_cursor) + comments_res = await self.get_aweme_comments(aweme_id, comments_cursor, keywords) comments_has_more = comments_res.get("has_more", 0) comments_cursor = comments_res.get("cursor", comments_cursor + 20) comments = comments_res.get("comments") diff --git a/media_platform/douyin/core.py b/media_platform/douyin/core.py index cba4aa5..19117f0 100644 --- a/media_platform/douyin/core.py +++ b/media_platform/douyin/core.py @@ -91,22 +91,23 @@ class DouYinCrawler(AbstractCrawler): aweme_list.append(aweme_info.get("aweme_id", "")) await douyin.update_douyin_aweme(aweme_item=aweme_info) utils.logger.info(f"keyword:{keyword}, aweme_list:{aweme_list}") - await self.batch_get_note_comments(aweme_list) + await self.batch_get_note_comments(aweme_list, keyword) - async def batch_get_note_comments(self, aweme_list: List[str]) -> None: + async def batch_get_note_comments(self, aweme_list: List[str], keywords: str) -> None: task_list: List[Task] = [] semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM) for aweme_id in aweme_list: - task = asyncio.create_task(self.get_comments(aweme_id, semaphore), name=aweme_id) + task = asyncio.create_task(self.get_comments(aweme_id, semaphore, keywords), name=aweme_id) task_list.append(task) await asyncio.wait(task_list) - async def get_comments(self, aweme_id: str, semaphore: asyncio.Semaphore) -> None: + async def get_comments(self, aweme_id: str, semaphore: asyncio.Semaphore, keywords: str) -> None: async with semaphore: try: await self.dy_client.get_aweme_all_comments( aweme_id=aweme_id, - callback=douyin.batch_update_dy_aweme_comments + callback=douyin.batch_update_dy_aweme_comments, + keywords=keywords ) utils.logger.info(f"aweme_id: {aweme_id} comments have all been obtained completed ...") except DataFetchError as e: diff --git a/models/douyin.py b/models/douyin.py index d2a8029..2d463dc 100644 --- a/models/douyin.py +++ b/models/douyin.py @@ -88,12 +88,13 @@ async def update_douyin_aweme(aweme_item: Dict): if config.IS_SAVED_DATABASED: if not await DouyinAweme.filter(aweme_id=aweme_id).exists(): local_db_item["add_ts"] = utils.get_current_timestamp() - douyin_aweme_pydantic = pydantic_model_creator(DouyinAweme,name='DouyinAwemeCreate',exclude=('id',)) + douyin_aweme_pydantic = pydantic_model_creator(DouyinAweme, name='DouyinAwemeCreate', exclude=('id',)) douyin_data = douyin_aweme_pydantic(**local_db_item) douyin_aweme_pydantic.validate(douyin_data) await DouyinAweme.create(**douyin_data.dict()) else: - douyin_aweme_pydantic = pydantic_model_creator(DouyinAweme, name='DouyinAwemeUpdate', exclude=('id','add_ts')) + douyin_aweme_pydantic = pydantic_model_creator(DouyinAweme, name='DouyinAwemeUpdate', + exclude=('id', 'add_ts')) douyin_data = douyin_aweme_pydantic(**local_db_item) douyin_aweme_pydantic.validate(douyin_data) await DouyinAweme.filter(aweme_id=aweme_id).update(**douyin_data.dict()) @@ -121,7 +122,6 @@ async def update_dy_aweme_comment(aweme_id: str, comment_item: Dict): "ip_location": comment_item.get("ip_label", ""), "aweme_id": aweme_id, "content": comment_item.get("text"), - "content_extra": json.dumps(comment_item.get("text_extra", [])), "user_id": user_info.get("uid"), "sec_uid": user_info.get("sec_uid"), "short_user_id": user_info.get("short_id"), @@ -136,12 +136,14 @@ async def update_dy_aweme_comment(aweme_id: str, comment_item: Dict): if config.IS_SAVED_DATABASED: if not await DouyinAwemeComment.filter(comment_id=comment_id).exists(): local_db_item["add_ts"] = utils.get_current_timestamp() - comment_pydantic = pydantic_model_creator(DouyinAwemeComment, name='DouyinAwemeCommentCreate', exclude=('id',)) + comment_pydantic = pydantic_model_creator(DouyinAwemeComment, name='DouyinAwemeCommentCreate', + exclude=('id',)) comment_data = comment_pydantic(**local_db_item) comment_pydantic.validate(comment_data) await DouyinAwemeComment.create(**comment_data.dict()) else: - comment_pydantic = pydantic_model_creator(DouyinAwemeComment, name='DouyinAwemeCommentUpdate', exclude=('id','add_ts')) + comment_pydantic = pydantic_model_creator(DouyinAwemeComment, name='DouyinAwemeCommentUpdate', + exclude=('id', 'add_ts')) comment_data = comment_pydantic(**local_db_item) comment_pydantic.validate(comment_data) await DouyinAwemeComment.filter(comment_id=comment_id).update(**comment_data.dict()) diff --git a/models/xiaohongshu.py b/models/xiaohongshu.py index cd2dfa3..aead421 100644 --- a/models/xiaohongshu.py +++ b/models/xiaohongshu.py @@ -85,12 +85,12 @@ async def update_xhs_note(note_item: Dict): if config.IS_SAVED_DATABASED: if not await XHSNote.filter(note_id=note_id).first(): local_db_item["add_ts"] = utils.get_current_timestamp() - note_pydantic = pydantic_model_creator(XHSNote, name="XHSPydanticCreate", exclude=('id', )) + note_pydantic = pydantic_model_creator(XHSNote, name="XHSPydanticCreate", exclude=('id',)) note_data = note_pydantic(**local_db_item) note_pydantic.validate(note_data) await XHSNote.create(**note_data.dict()) else: - note_pydantic = pydantic_model_creator(XHSNote, name="XHSPydanticUpdate", exclude=('id','add_ts')) + note_pydantic = pydantic_model_creator(XHSNote, name="XHSPydanticUpdate", exclude=('id', 'add_ts')) note_data = note_pydantic(**local_db_item) note_pydantic.validate(note_data) await XHSNote.filter(note_id=note_id).update(**note_data.dict()) @@ -115,16 +115,13 @@ async def update_xhs_note_comment(note_id: str, comment_item: Dict): if config.IS_SAVED_DATABASED: if not await XHSNoteComment.filter(comment_id=comment_id).first(): local_db_item["add_ts"] = utils.get_current_timestamp() - comment_pydantic = pydantic_model_creator(XHSNoteComment, name="CommentPydanticCreate", exclude=('id', )) + comment_pydantic = pydantic_model_creator(XHSNoteComment, name="CommentPydanticCreate", exclude=('id',)) comment_data = comment_pydantic(**local_db_item) comment_pydantic.validate(comment_data) await XHSNoteComment.create(**comment_data.dict()) else: - comment_pydantic = pydantic_model_creator(XHSNoteComment, name="CommentPydanticUpdate", exclude=('id','add_ts',)) + comment_pydantic = pydantic_model_creator(XHSNoteComment, name="CommentPydanticUpdate", + exclude=('id', 'add_ts',)) comment_data = comment_pydantic(**local_db_item) comment_pydantic.validate(comment_data) await XHSNoteComment.filter(comment_id=comment_id).update(**comment_data.dict()) - - - -