diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml new file mode 100644 index 0000000..705b563 --- /dev/null +++ b/.github/workflows/main.yaml @@ -0,0 +1,17 @@ +on: + push: + branches: + - main + +jobs: + contrib-readme-job: + runs-on: ubuntu-latest + name: A job to automate contrib in readme + permissions: + contents: write + pull-requests: write + steps: + - name: Contribute List + uses: akhilmhdh/contributors-readme-action@v2.3.10 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} \ No newline at end of file diff --git a/README.md b/README.md index be329d7..98c9366 100644 --- a/README.md +++ b/README.md @@ -22,12 +22,11 @@ |-----|-------|----------|-----|--------|-------|-------|-------| | 小红书 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | 抖音 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | -| 快手 | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | +| 快手 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | B 站 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | 微博 | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | - ## 使用方法 ### 创建并激活 python 虚拟环境 @@ -87,12 +86,19 @@ ## 开发者服务 - 知识星球:沉淀高质量常见问题、最佳实践文档、多年编程+爬虫经验分享,提供付费知识星球服务,主动提问,作者会定期回答问题

- 星球图片 - 星球图片 + xingqiu

- 前20个入驻星球的小伙伴,将获得新人券50元,还剩14张。 -
-- 视频课程: + + 星球精选文章: + - [【独创】使用Playwright获取某音a_bogus参数流程(包含加密参数分析)](https://articles.zsxq.com/id_u89al50jk9x0.html) + - [【独创】使用Playwright低成本获取某书X-s参数流程分析(当年的回忆录)](https://articles.zsxq.com/id_u4lcrvqakuc7.html) + - [MediaCrawler-基于抽象类设计重构项目缓存](https://articles.zsxq.com/id_4ju73oxewt9j.html) + - [手把手带你撸一个自己的IP代理池](https://articles.zsxq.com/id_38fza371ladm.html) + - 每天 1 块钱订阅我的知识服务 + + + +- MediaCrawler视频课程: + > 如果你想很快入门这个项目,或者想了解具体实现原理,我推荐你看看这个视频课程,从设计出发一步步带你如何使用,门槛大大降低,同时也是对我开源的支持,如果你能支持我的课程,我将会非常开心~
> 课程售价非常非常的便宜,几杯咖啡的事儿。
> 课程介绍飞书文档链接:https://relakkes.feishu.cn/wiki/JUgBwdhIeiSbAwkFCLkciHdAnhh @@ -115,7 +121,7 @@ > 7天有效期,自动更新, 如果人满了可以加作者wx拉进群: yzglan,备注来自github.
-

10群二维码

+

11群二维码

@@ -144,11 +150,41 @@ ## 手机号登录说明 ➡️➡️➡️ [手机号登录说明](docs/手机号登录说明.md) ## 词云图相关操作说明 ➡️➡️➡️ [词云图相关说明](docs/关于词云图相关操作.md) +## 项目贡献者 +
+- [程序员阿江-Relakkes](https://github.com/NanmiCoder)
+- [leantli](https://github.com/leantli)
+- [Bao Zhuhan](https://github.com/BaoZhuhan)
+- [zhounan](https://github.com/nelzomal)
+- [HIRO](https://github.com/Hiro-Lin)
+- [PeanutSplash](https://github.com/PeanutSplash)
+- [Ermeng](https://github.com/Ermeng98)
+- [Rosyrain](https://github.com/Rosyrain)
+- [Henry He](https://github.com/henryhyn)
+- [leonardoqiuyu](https://github.com/Akiqqqqqqq)
+- [jayeeliu](https://github.com/jayeeliu)
+- [ZuWard](https://github.com/ZuWard)
+- [Zendrix](https://github.com/Zzendrix)
+- [zhangzhenpeng](https://github.com/chunpat)
+- [Sam Tan](https://github.com/tanpenggood)
+- [xbsheng](https://github.com/xbsheng)
+- [Martin](https://github.com/yangrq1018)
+- [zhihuiio](https://github.com/zhihuiio)
+- [Ren](https://github.com/renaissancezyc)
+- [Wang Tianci](https://github.com/Tianci-King)
+- [Styunlen](https://github.com/Styunlen)
+- [Schofi](https://github.com/Schofi)
+- [Klu5ure](https://github.com/Klu5ure)
+- [Kermit](https://github.com/keeper-jie)
+- [KEXNA](https://github.com/kexinoh)
+- [Jian Chang](https://github.com/aa65535)
+- [tianqing](https://github.com/522109452)
+
## star 趋势图 - 如果该项目对你有帮助,star一下 ❤️❤️❤️ diff --git a/cmd_arg/arg.py b/cmd_arg/arg.py index 2d07675..27854f7 100644 --- a/cmd_arg/arg.py +++ b/cmd_arg/arg.py @@ -1,4 +1,5 @@ import argparse + import config from tools.utils import str2bool diff --git a/config/base_config.py b/config/base_config.py index 9b52e52..fe7d61c 100644 --- a/config/base_config.py +++ b/config/base_config.py @@ -3,8 +3,10 @@ PLATFORM = "xhs" KEYWORDS = "python,golang" LOGIN_TYPE = "qrcode" # qrcode or phone or cookie COOKIES = "" -# 具体值参见media_platform.xxx.field下的枚举值,展示只支持小红书 +# 具体值参见media_platform.xxx.field下的枚举值,暂时只支持小红书 SORT_TYPE = "popularity_descending" +# 具体值参见media_platform.xxx.field下的枚举值,暂时只支持抖音 +PUBLISH_TIME_TYPE = 0 CRAWLER_TYPE = "search" # 爬取类型,search(关键词搜索) | detail(帖子详情)| creator(创作者主页数据) # 是否开启 IP 代理 @@ -103,6 +105,13 @@ BILI_CREATOR_ID_LIST = [ # ........................ ] +# 指定快手创作者ID列表 +KS_CREATOR_ID_LIST = [ + "3x4sm73aye7jq7i", + # ........................ +] + + #词云相关 #是否开启生成评论词云图 ENABLE_GET_WORDCLOUD = False @@ -118,5 +127,3 @@ STOP_WORDS_FILE = "./docs/hit_stopwords.txt" #中文字体文件路径 FONT_PATH= "./docs/STZHONGS.TTF" - - diff --git a/media_platform/bilibili/core.py b/media_platform/bilibili/core.py index c14391d..1d08791 100644 --- a/media_platform/bilibili/core.py +++ b/media_platform/bilibili/core.py @@ -106,6 +106,7 @@ class BilibiliCrawler(AbstractCrawler): page += 1 continue + utils.logger.info(f"[BilibiliCrawler.search] search bilibili keyword: {keyword}, page: {page}") video_id_list: List[str] = [] videos_res = await self.bili_client.search_video_by_keyword( keyword=keyword, @@ -126,7 +127,6 @@ if video_item: video_id_list.append(video_item.get("View").get("aid")) await bilibili_store.update_bilibili_video(video_item) - page += 1 await self.batch_get_video_comments(video_id_list) diff --git a/media_platform/bilibili/login.py b/media_platform/bilibili/login.py index 33c929f..cc7baa2 100644 --- a/media_platform/bilibili/login.py +++ b/media_platform/bilibili/login.py @@ -12,8 +12,8 @@ from playwright.async_api import BrowserContext, Page from tenacity import (RetryError, retry, retry_if_result, stop_after_attempt, wait_fixed) -from base.base_crawler import AbstractLogin import config +from base.base_crawler import AbstractLogin from tools import utils diff --git a/media_platform/douyin/client.py b/media_platform/douyin/client.py index ede6049..ce04698 100644 --- a/media_platform/douyin/client.py +++ b/media_platform/douyin/client.py @@ -1,5 +1,6 @@ import asyncio import copy +import json import urllib.parse from typing import Any, Callable, Dict, List, Optional @@ -119,14 +120,19 @@ class DOUYINClient(AbstractApiClient): params = { "keyword": urllib.parse.quote(keyword), "search_channel": search_channel.value, - "sort_type": sort_type.value, - "publish_time": publish_time.value, "search_source": "normal_search", - "query_correct_type": "1", - "is_filter_search": "0", + "query_correct_type": 1, + "is_filter_search": 0, "offset": offset, "count": 10 # must be set to 10 } + if sort_type != SearchSortType.GENERAL or publish_time != PublishTimeType.UNLIMITED: + params["filter_selected"] = urllib.parse.quote(json.dumps({ + "sort_type": str(sort_type.value), + "publish_time": str(publish_time.value) + })) + params["is_filter_search"] = 1 + params["search_source"] = "tab_search" referer_url = "https://www.douyin.com/search/" + keyword referer_url +=
f"?publish_time={publish_time.value}&sort_type={sort_type.value}&type=general" headers = copy.copy(self.headers) diff --git a/media_platform/douyin/core.py b/media_platform/douyin/core.py index dde5d5b..64992ef 100644 --- a/media_platform/douyin/core.py +++ b/media_platform/douyin/core.py @@ -90,9 +90,10 @@ class DouYinCrawler(AbstractCrawler): page += 1 continue try: + utils.logger.info(f"[DouYinCrawler.search] search douyin keyword: {keyword}, page: {page}") posts_res = await self.dy_client.search_info_by_keyword(keyword=keyword, - offset=page * dy_limit_count, - publish_time=PublishTimeType.UNLIMITED + offset=page * dy_limit_count - dy_limit_count, + publish_time=PublishTimeType(config.PUBLISH_TIME_TYPE) ) except DataFetchError: utils.logger.error(f"[DouYinCrawler.search] search douyin keyword: {keyword} failed") diff --git a/media_platform/douyin/field.py b/media_platform/douyin/field.py index 2ac4d62..ab9da9f 100644 --- a/media_platform/douyin/field.py +++ b/media_platform/douyin/field.py @@ -12,13 +12,12 @@ class SearchChannelType(Enum): class SearchSortType(Enum): """search sort type""" GENERAL = 0 # 综合排序 - LATEST = 1 # 最新发布 - MOST_LIKE = 2 # 最多点赞 - + MOST_LIKE = 1 # 最多点赞 + LATEST = 2 # 最新发布 class PublishTimeType(Enum): """publish time type""" UNLIMITED = 0 # 不限 ONE_DAY = 1 # 一天内 - ONE_WEEK = 2 # 一周内 - SIX_MONTH = 3 # 半年内 + ONE_WEEK = 7 # 一周内 + SIX_MONTH = 180 # 半年内 diff --git a/media_platform/kuaishou/client.py b/media_platform/kuaishou/client.py index 8ad2155..f643171 100644 --- a/media_platform/kuaishou/client.py +++ b/media_platform/kuaishou/client.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- import asyncio import json -from typing import Any, Callable, Dict, Optional +from typing import Any, Callable, Dict, List, Optional from urllib.parse import urlencode import httpx @@ -67,7 +67,7 @@ class KuaiShouClient(AbstractApiClient): "variables": { "ftype": 1, }, - "query": self.graphql.get("vision_profile") + "query": self.graphql.get("vision_profile_user_list") } res = await self.post("", post_data) if res.get("visionProfileUserList", {}).get("result") == 1: @@ -129,17 +129,60 @@ class KuaiShouClient(AbstractApiClient): "pcursor": pcursor }, "query": self.graphql.get("comment_list") - } return await self.post("", post_data) - async def get_video_all_comments(self, photo_id: str, crawl_interval: float = 1.0, is_fetch_sub_comments=False, - callback: Optional[Callable] = None): + async def get_video_sub_comments( + self, photo_id: str, rootCommentId: str, pcursor: str = "" + ) -> Dict: + """get video sub comments + :param photo_id: photo id you want to fetch + :param pcursor: last you get pcursor, defaults to "" + :return: + """ + post_data = { + "operationName": "visionSubCommentList", + "variables": { + "photoId": photo_id, + "pcursor": pcursor, + "rootCommentId": rootCommentId, + }, + "query": self.graphql.get("vision_sub_comment_list"), + } + return await self.post("", post_data) + + async def get_creator_profile(self, userId: str) -> Dict: + post_data = { + "operationName": "visionProfile", + "variables": { + "userId": userId + }, + "query": self.graphql.get("vision_profile"), + } + return await self.post("", post_data) + + async def get_video_by_creater(self, userId: str, pcursor: str = "") -> Dict: + post_data = { + "operationName": "visionProfilePhotoList", + "variables": { + "page": "profile", + "pcursor": pcursor, + "userId": userId + }, + "query": self.graphql.get("vision_profile_photo_list"), + } + return await self.post("", post_data) + + async def get_video_all_comments( + 
self, + photo_id: str, + crawl_interval: float = 1.0, + callback: Optional[Callable] = None, + ): """ get video all comments include sub comments :param photo_id: :param crawl_interval: - :param is_fetch_sub_comments: :param callback: :return: """ @@ -158,7 +201,107 @@ class KuaiShouClient(AbstractApiClient): result.extend(comments) await asyncio.sleep(crawl_interval) - if not is_fetch_sub_comments: - continue - # todo handle get sub comments + sub_comments = await self.get_comments_all_sub_comments( + comments, photo_id, crawl_interval, callback + ) + result.extend(sub_comments) + return result + + async def get_comments_all_sub_comments( + self, + comments: List[Dict], + photo_id, + crawl_interval: float = 1.0, + callback: Optional[Callable] = None, + ) -> List[Dict]: + """ + 获取指定一级评论下的所有二级评论, 该方法会一直查找一级评论下的所有二级评论信息 + Args: + comments: 评论列表 + photo_id: 视频id + crawl_interval: 爬取一次评论的延迟单位(秒) + callback: 一次评论爬取结束后 + Returns: + + """ + if not config.ENABLE_GET_SUB_COMMENTS: + utils.logger.info( + f"[KuaiShouClient.get_comments_all_sub_comments] Crawling sub_comment mode is not enabled" + ) + return [] + + result = [] + for comment in comments: + sub_comments = comment.get("subComments") + if sub_comments and callback: + await callback(photo_id, sub_comments) + + sub_comment_pcursor = comment.get("subCommentsPcursor") + if sub_comment_pcursor == "no_more": + continue + + root_comment_id = comment.get("commentId") + sub_comment_pcursor = "" + + while sub_comment_pcursor != "no_more": + comments_res = await self.get_video_sub_comments( + photo_id, root_comment_id, sub_comment_pcursor + ) + vision_sub_comment_list = comments_res.get("visionSubCommentList",{}) + sub_comment_pcursor = vision_sub_comment_list.get("pcursor", "no_more") + + comments = vision_sub_comment_list.get("subComments", {}) + if callback: + await callback(photo_id, comments) + await asyncio.sleep(crawl_interval) + result.extend(comments) + return result + + async def get_creator_info(self, user_id: str) -> Dict: + """ + eg: https://www.kuaishou.com/profile/3x4jtnbfter525a + 快手用户主页 + """ + + visionProfile = await self.get_creator_profile(user_id) + return visionProfile.get("userProfile") + + async def get_all_videos_by_creator( + self, + user_id: str, + crawl_interval: float = 1.0, + callback: Optional[Callable] = None, + ) -> List[Dict]: + """ + 获取指定用户下的所有发过的帖子,该方法会一直查找一个用户下的所有帖子信息 + Args: + user_id: 用户ID + crawl_interval: 爬取一次的延迟单位(秒) + callback: 一次分页爬取结束后的更新回调函数 + Returns: + + """ + result = [] + pcursor = "" + + while pcursor != "no_more": + videos_res = await self.get_video_by_creater(user_id, pcursor) + if not videos_res: + utils.logger.error( + f"[KuaiShouClient.get_all_videos_by_creator] The current creator may have been banned by ks, so they cannot access the data." 
+ ) + break + + vision_profile_photo_list = videos_res.get("visionProfilePhotoList", {}) + pcursor = vision_profile_photo_list.get("pcursor", "") + + videos = vision_profile_photo_list.get("feeds", []) + utils.logger.info( + f"[KuaiShouClient.get_all_videos_by_creator] got user_id:{user_id} videos len : {len(videos)}" + ) + + if callback: + await callback(videos) + await asyncio.sleep(crawl_interval) + result.extend(videos) return result diff --git a/media_platform/kuaishou/core.py b/media_platform/kuaishou/core.py index d318a9c..a5eda58 100644 --- a/media_platform/kuaishou/core.py +++ b/media_platform/kuaishou/core.py @@ -65,11 +65,14 @@ class KuaishouCrawler(AbstractCrawler): crawler_type_var.set(config.CRAWLER_TYPE) if config.CRAWLER_TYPE == "search": - # Search for notes and retrieve their comment information. + # Search for videos and retrieve their comment information. await self.search() elif config.CRAWLER_TYPE == "detail": # Get the information and comments of the specified post await self.get_specified_videos() + elif config.CRAWLER_TYPE == "creator": + # Get creator's information and their videos and comments + await self.get_creators_and_videos() else: pass @@ -89,7 +92,7 @@ class KuaishouCrawler(AbstractCrawler): utils.logger.info(f"[KuaishouCrawler.search] Skip page: {page}") page += 1 continue - + utils.logger.info(f"[KuaishouCrawler.search] search kuaishou keyword: {keyword}, page: {page}") video_id_list: List[str] = [] videos_res = await self.ks_client.search_info_by_keyword( keyword=keyword, @@ -135,7 +138,7 @@ class KuaishouCrawler(AbstractCrawler): utils.logger.error(f"[KuaishouCrawler.get_video_info_task] Get video detail error: {ex}") return None except KeyError as ex: - utils.logger.error(f"[KuaishouCrawler.get_video_info_task] have not fund note detail video_id:{video_id}, err: {ex}") + utils.logger.error(f"[KuaishouCrawler.get_video_info_task] have not fund video detail video_id:{video_id}, err: {ex}") return None async def batch_get_video_comments(self, video_id_list: List[str]): @@ -145,7 +148,7 @@ class KuaishouCrawler(AbstractCrawler): :return: """ if not config.ENABLE_GET_COMMENTS: - utils.logger.info(f"[KuaishouCrawler.batch_get_note_comments] Crawling comment mode is not enabled") + utils.logger.info(f"[KuaishouCrawler.batch_get_video_comments] Crawling comment mode is not enabled") return utils.logger.info(f"[KuaishouCrawler.batch_get_video_comments] video ids:{video_id_list}") @@ -200,10 +203,10 @@ class KuaishouCrawler(AbstractCrawler): return playwright_proxy, httpx_proxy async def create_ks_client(self, httpx_proxy: Optional[str]) -> KuaiShouClient: - """Create xhs client""" + """Create ks client""" utils.logger.info("[KuaishouCrawler.create_ks_client] Begin create kuaishou API client ...") cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies()) - xhs_client_obj = KuaiShouClient( + ks_client_obj = KuaiShouClient( proxies=httpx_proxy, headers={ "User-Agent": self.user_agent, @@ -215,7 +218,7 @@ class KuaishouCrawler(AbstractCrawler): playwright_page=self.context_page, cookie_dict=cookie_dict, ) - return xhs_client_obj + return ks_client_obj async def launch_browser( self, @@ -246,6 +249,39 @@ class KuaishouCrawler(AbstractCrawler): ) return browser_context + async def get_creators_and_videos(self) -> None: + """Get creator's videos and retrieve their comment information.""" + utils.logger.info("[KuaiShouCrawler.get_creators_and_videos] Begin get kuaishou creators") + for user_id in config.KS_CREATOR_ID_LIST: + # get creator 
detail info from web html content + createor_info: Dict = await self.ks_client.get_creator_info(user_id=user_id) + if createor_info: + await kuaishou_store.save_creator(user_id, creator=createor_info) + + # Get all video information of the creator + all_video_list = await self.ks_client.get_all_videos_by_creator( + user_id = user_id, + crawl_interval = random.random(), + callback = self.fetch_creator_video_detail + ) + + video_ids = [video_item.get("photo", {}).get("id") for video_item in all_video_list] + await self.batch_get_video_comments(video_ids) + + async def fetch_creator_video_detail(self, video_list: List[Dict]): + """ + Concurrently obtain the specified post list and save the data + """ + semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM) + task_list = [ + self.get_video_info_task(post_item.get("photo", {}).get("id"), semaphore) for post_item in video_list + ] + + video_details = await asyncio.gather(*task_list) + for video_detail in video_details: + if video_detail is not None: + await kuaishou_store.update_kuaishou_video(video_detail) + async def close(self): """Close browser context""" await self.browser_context.close() diff --git a/media_platform/kuaishou/graphql.py b/media_platform/kuaishou/graphql.py index 215b57f..2d32689 100644 --- a/media_platform/kuaishou/graphql.py +++ b/media_platform/kuaishou/graphql.py @@ -11,7 +11,7 @@ class KuaiShouGraphQL: self.load_graphql_queries() def load_graphql_queries(self): - graphql_files = ["search_query.graphql", "video_detail.graphql", "comment_list.graphql", "vision_profile.graphql"] + graphql_files = ["search_query.graphql", "video_detail.graphql", "comment_list.graphql", "vision_profile.graphql","vision_profile_photo_list.graphql","vision_profile_user_list.graphql","vision_sub_comment_list.graphql"] for file in graphql_files: with open(self.graphql_dir + file, mode="r") as f: diff --git a/media_platform/kuaishou/graphql/vision_profile.graphql b/media_platform/kuaishou/graphql/vision_profile.graphql index 148165a..5499600 100644 --- a/media_platform/kuaishou/graphql/vision_profile.graphql +++ b/media_platform/kuaishou/graphql/vision_profile.graphql @@ -1,16 +1,27 @@ -query visionProfileUserList($pcursor: String, $ftype: Int) { - visionProfileUserList(pcursor: $pcursor, ftype: $ftype) { +query visionProfile($userId: String) { + visionProfile(userId: $userId) { result - fols { - user_name - headurl - user_text + hostName + userProfile { + ownerCount { + fan + photo + follow + photo_public + __typename + } + profile { + gender + user_name + user_id + headurl + user_text + user_profile_bg_url + __typename + } isFollowing - user_id __typename } - hostName - pcursor __typename } } diff --git a/media_platform/kuaishou/graphql/vision_profile_photo_list.graphql b/media_platform/kuaishou/graphql/vision_profile_photo_list.graphql new file mode 100644 index 0000000..328052e --- /dev/null +++ b/media_platform/kuaishou/graphql/vision_profile_photo_list.graphql @@ -0,0 +1,110 @@ +fragment photoContent on PhotoEntity { + __typename + id + duration + caption + originCaption + likeCount + viewCount + commentCount + realLikeCount + coverUrl + photoUrl + photoH265Url + manifest + manifestH265 + videoResource + coverUrls { + url + __typename + } + timestamp + expTag + animatedCoverUrl + distance + videoRatio + liked + stereoType + profileUserTopPhoto + musicBlocked + riskTagContent + riskTagUrl +} + +fragment recoPhotoFragment on recoPhotoEntity { + __typename + id + duration + caption + originCaption + likeCount + viewCount + commentCount + 
realLikeCount + coverUrl + photoUrl + photoH265Url + manifest + manifestH265 + videoResource + coverUrls { + url + __typename + } + timestamp + expTag + animatedCoverUrl + distance + videoRatio + liked + stereoType + profileUserTopPhoto + musicBlocked + riskTagContent + riskTagUrl +} + +fragment feedContent on Feed { + type + author { + id + name + headerUrl + following + headerUrls { + url + __typename + } + __typename + } + photo { + ...photoContent + ...recoPhotoFragment + __typename + } + canAddComment + llsid + status + currentPcursor + tags { + type + name + __typename + } + __typename +} + +query visionProfilePhotoList($pcursor: String, $userId: String, $page: String, $webPageArea: String) { + visionProfilePhotoList(pcursor: $pcursor, userId: $userId, page: $page, webPageArea: $webPageArea) { + result + llsid + webPageArea + feeds { + ...feedContent + __typename + } + hostName + pcursor + __typename + } +} diff --git a/media_platform/kuaishou/graphql/vision_profile_user_list.graphql b/media_platform/kuaishou/graphql/vision_profile_user_list.graphql new file mode 100644 index 0000000..148165a --- /dev/null +++ b/media_platform/kuaishou/graphql/vision_profile_user_list.graphql @@ -0,0 +1,16 @@ +query visionProfileUserList($pcursor: String, $ftype: Int) { + visionProfileUserList(pcursor: $pcursor, ftype: $ftype) { + result + fols { + user_name + headurl + user_text + isFollowing + user_id + __typename + } + hostName + pcursor + __typename + } +} diff --git a/media_platform/kuaishou/graphql/vision_sub_comment_list.graphql b/media_platform/kuaishou/graphql/vision_sub_comment_list.graphql new file mode 100644 index 0000000..31730fc --- /dev/null +++ b/media_platform/kuaishou/graphql/vision_sub_comment_list.graphql @@ -0,0 +1,22 @@ +mutation visionSubCommentList($photoId: String, $rootCommentId: String, $pcursor: String) { + visionSubCommentList(photoId: $photoId, rootCommentId: $rootCommentId, pcursor: $pcursor) { + pcursor + subComments { + commentId + authorId + authorName + content + headurl + timestamp + likedCount + realLikedCount + liked + status + authorLiked + replyToUserName + replyTo + __typename + } + __typename + } +} diff --git a/media_platform/kuaishou/login.py b/media_platform/kuaishou/login.py index cbd578b..68462ad 100644 --- a/media_platform/kuaishou/login.py +++ b/media_platform/kuaishou/login.py @@ -7,6 +7,7 @@ from playwright.async_api import BrowserContext, Page from tenacity import (RetryError, retry, retry_if_result, stop_after_attempt, wait_fixed) +import config from base.base_crawler import AbstractLogin from tools import utils @@ -57,7 +58,7 @@ class KuaishouLogin(AbstractLogin): # click login button login_button_ele = self.context_page.locator( - "xpath=//p[text()=' 登录 ']" + "xpath=//p[text()='登录']" ) await login_button_ele.click() diff --git a/media_platform/weibo/core.py b/media_platform/weibo/core.py index 481287e..4301574 100644 --- a/media_platform/weibo/core.py +++ b/media_platform/weibo/core.py @@ -108,7 +108,7 @@ class WeiboCrawler(AbstractCrawler): utils.logger.info(f"[WeiboCrawler.search] Skip page: {page}") page += 1 continue - + utils.logger.info(f"[WeiboCrawler.search] search weibo keyword: {keyword}, page: {page}") search_res = await self.wb_client.get_note_by_keyword( keyword=keyword, page=page, diff --git a/media_platform/weibo/login.py b/media_platform/weibo/login.py index 2c2cf38..9dc1659 100644 --- a/media_platform/weibo/login.py +++ b/media_platform/weibo/login.py @@ -12,6 +12,7 @@ from playwright.async_api import BrowserContext, Page from 
tenacity import (RetryError, retry, retry_if_result, stop_after_attempt, wait_fixed) +import config from base.base_crawler import AbstractLogin from tools import utils diff --git a/media_platform/xhs/core.py b/media_platform/xhs/core.py index d720b68..97d073f 100644 --- a/media_platform/xhs/core.py +++ b/media_platform/xhs/core.py @@ -102,6 +102,7 @@ class XiaoHongShuCrawler(AbstractCrawler): continue try: + utils.logger.info(f"[XiaoHongShuCrawler.search] search xhs keyword: {keyword}, page: {page}") note_id_list: List[str] = [] notes_res = await self.xhs_client.get_note_by_keyword( keyword=keyword, diff --git a/static/images/10群二维码.JPG b/static/images/10群二维码.JPG deleted file mode 100644 index f84e3c2..0000000 Binary files a/static/images/10群二维码.JPG and /dev/null differ diff --git a/static/images/11群二维码.JPG b/static/images/11群二维码.JPG new file mode 100644 index 0000000..4726ad1 Binary files /dev/null and b/static/images/11群二维码.JPG differ diff --git a/static/images/9群二维码.JPG b/static/images/9群二维码.JPG deleted file mode 100644 index 3fc5d24..0000000 Binary files a/static/images/9群二维码.JPG and /dev/null differ diff --git a/static/images/xingqiu.jpg b/static/images/xingqiu.jpg index 4784475..7cf0eb9 100644 Binary files a/static/images/xingqiu.jpg and b/static/images/xingqiu.jpg differ diff --git a/static/images/xingqiu_yh.png b/static/images/xingqiu_yh.png deleted file mode 100644 index 505315f..0000000 Binary files a/static/images/xingqiu_yh.png and /dev/null differ diff --git a/store/bilibili/bilibili_store_impl.py b/store/bilibili/bilibili_store_impl.py index 7b93432..4a07dff 100644 --- a/store/bilibili/bilibili_store_impl.py +++ b/store/bilibili/bilibili_store_impl.py @@ -13,9 +13,9 @@ import aiofiles import config from base.base_crawler import AbstractStore -from tools import utils +from tools import utils, words from var import crawler_type_var -from tools import words + def calculate_number_of_files(file_store_path: str) -> int: """计算数据保存文件的前部分排序数字,支持每次运行代码不写到同一个文件中 diff --git a/store/douyin/douyin_store_impl.py b/store/douyin/douyin_store_impl.py index a4672ee..ed83450 100644 --- a/store/douyin/douyin_store_impl.py +++ b/store/douyin/douyin_store_impl.py @@ -11,10 +11,10 @@ from typing import Dict import aiofiles -from base.base_crawler import AbstractStore -from tools import utils,words -from var import crawler_type_var import config +from base.base_crawler import AbstractStore +from tools import utils, words +from var import crawler_type_var def calculate_number_of_files(file_store_path: str) -> int: diff --git a/store/kuaishou/__init__.py b/store/kuaishou/__init__.py index 818c75a..cfdcd29 100644 --- a/store/kuaishou/__init__.py +++ b/store/kuaishou/__init__.py @@ -76,3 +76,22 @@ async def update_ks_video_comment(video_id: str, comment_item: Dict): utils.logger.info( f"[store.kuaishou.update_ks_video_comment] Kuaishou video comment: {comment_id}, content: {save_comment_item.get('content')}") await KuaishouStoreFactory.create_store().store_comment(comment_item=save_comment_item) + +async def save_creator(user_id: str, creator: Dict): + ownerCount = creator.get('ownerCount', {}) + profile = creator.get('profile', {}) + + local_db_item = { + 'user_id': user_id, + 'nickname': profile.get('user_name'), + 'gender': '女' if profile.get('gender') == "F" else '男', + 'avatar': profile.get('headurl'), + 'desc': profile.get('user_text'), + 'ip_location': "", + 'follows': ownerCount.get("follow"), + 'fans': ownerCount.get("fan"), + 'interaction': ownerCount.get("photo_public"), + "last_modify_ts": 
utils.get_current_timestamp(), + } + utils.logger.info(f"[store.kuaishou.save_creator] creator:{local_db_item}") + await KuaishouStoreFactory.create_store().store_creator(local_db_item) \ No newline at end of file diff --git a/store/kuaishou/kuaishou_store_impl.py b/store/kuaishou/kuaishou_store_impl.py index 4883daa..57a15ad 100644 --- a/store/kuaishou/kuaishou_store_impl.py +++ b/store/kuaishou/kuaishou_store_impl.py @@ -11,10 +11,11 @@ from typing import Dict import aiofiles -from base.base_crawler import AbstractStore -from tools import utils,words -from var import crawler_type_var import config +from base.base_crawler import AbstractStore +from tools import utils, words +from var import crawler_type_var + def calculate_number_of_files(file_store_path: str) -> int: """计算数据保存文件的前部分排序数字,支持每次运行代码不写到同一个文件中 @@ -205,3 +206,14 @@ class KuaishouJsonStoreImplement(AbstractStore): """ await self.save_data_to_json(comment_item, "comments") + + async def store_creator(self, creator: Dict): + """ + Kuaishou creator JSON storage implementation + Args: + creator: creator dict + + Returns: + + """ + await self.save_data_to_json(creator, "creator") \ No newline at end of file diff --git a/store/weibo/weibo_store_impl.py b/store/weibo/weibo_store_impl.py index fdd21d4..aa5a019 100644 --- a/store/weibo/weibo_store_impl.py +++ b/store/weibo/weibo_store_impl.py @@ -11,10 +11,11 @@ from typing import Dict import aiofiles -from base.base_crawler import AbstractStore -from tools import utils,words -from var import crawler_type_var import config +from base.base_crawler import AbstractStore +from tools import utils, words +from var import crawler_type_var + def calculate_number_of_files(file_store_path: str) -> int: """计算数据保存文件的前部分排序数字,支持每次运行代码不写到同一个文件中 diff --git a/store/xhs/__init__.py b/store/xhs/__init__.py index ab13482..6620c70 100644 --- a/store/xhs/__init__.py +++ b/store/xhs/__init__.py @@ -113,7 +113,7 @@ async def save_creator(user_id: str, creator: Dict): 'gender': '女' if user_info.get('gender') == 1 else '男', 'avatar': user_info.get('images'), 'desc': user_info.get('desc'), - 'ip_location': user_info.get('ip_location'), + 'ip_location': user_info.get('ipLocation'), 'follows': follows, 'fans': fans, 'interaction': interaction, diff --git a/store/xhs/xhs_store_impl.py b/store/xhs/xhs_store_impl.py index 3204d0c..8450de2 100644 --- a/store/xhs/xhs_store_impl.py +++ b/store/xhs/xhs_store_impl.py @@ -11,10 +11,11 @@ from typing import Dict import aiofiles -from base.base_crawler import AbstractStore -from tools import utils,words -from var import crawler_type_var import config +from base.base_crawler import AbstractStore +from tools import utils, words +from var import crawler_type_var + def calculate_number_of_files(file_store_path: str) -> int: """计算数据保存文件的前部分排序数字,支持每次运行代码不写到同一个文件中 diff --git a/tools/words.py b/tools/words.py index b7c2b00..d9f0a8c 100644 --- a/tools/words.py +++ b/tools/words.py @@ -1,10 +1,12 @@ -import aiofiles import asyncio -import jieba -from collections import Counter -from wordcloud import WordCloud import json +from collections import Counter + +import aiofiles +import jieba import matplotlib.pyplot as plt +from wordcloud import WordCloud + import config from tools import utils
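
A note on the Douyin search change in media_platform/douyin/client.py: search_info_by_keyword now switches the request into "tab_search" mode whenever a non-default sort order or publish-time window is selected, and packs both filters into a URL-encoded JSON blob. The sketch below condenses that logic into a standalone, runnable form; build_search_params is a hypothetical helper written for illustration, not a function in this PR, and it omits the unrelated request fields (offset, count, query_correct_type).

```python
# Standalone sketch of the new Douyin search-parameter logic (illustrative only).
# SearchSortType/PublishTimeType mirror media_platform/douyin/field.py after this PR.
import json
import urllib.parse
from enum import Enum


class SearchSortType(Enum):
    GENERAL = 0    # 综合排序
    MOST_LIKE = 1  # 最多点赞
    LATEST = 2     # 最新发布


class PublishTimeType(Enum):
    UNLIMITED = 0    # 不限
    ONE_DAY = 1      # 一天内
    ONE_WEEK = 7     # 一周内
    SIX_MONTH = 180  # 半年内


def build_search_params(keyword: str,
                        sort_type: SearchSortType,
                        publish_time: PublishTimeType) -> dict:
    params = {
        "keyword": urllib.parse.quote(keyword),
        "search_source": "normal_search",
        "is_filter_search": 0,
    }
    if sort_type != SearchSortType.GENERAL or publish_time != PublishTimeType.UNLIMITED:
        # Douyin expects stringified enum values inside a URL-encoded JSON blob
        params["filter_selected"] = urllib.parse.quote(json.dumps({
            "sort_type": str(sort_type.value),
            "publish_time": str(publish_time.value),
        }))
        params["is_filter_search"] = 1
        params["search_source"] = "tab_search"
    return params


print(build_search_params("python", SearchSortType.LATEST, PublishTimeType.ONE_WEEK))
```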
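The new Kuaishou client methods (get_video_all_comments with sub-comments, get_comments_all_sub_comments, and get_all_videos_by_creator) all rely on the same cursor-paging protocol: keep posting with the pcursor from the previous response until the endpoint answers "no_more". Below is a minimal, self-contained sketch of that loop under stated assumptions: drain_pcursor and fetch_page are made-up names for illustration, and the fake two-page endpoint stands in for ks_client.post("", post_data).

```python
import asyncio
from typing import Awaitable, Callable, Dict, List


async def drain_pcursor(
    fetch_page: Callable[[str], Awaitable[Dict]],
    items_key: str,
    crawl_interval: float = 1.0,
) -> List[Dict]:
    """Collect items page by page until the endpoint returns the 'no_more' cursor."""
    result: List[Dict] = []
    pcursor = ""  # an empty cursor requests the first page
    while pcursor != "no_more":
        page = await fetch_page(pcursor)
        # each response carries the cursor of the next page, or "no_more" on the last one
        pcursor = page.get("pcursor", "no_more")
        result.extend(page.get(items_key, []))
        await asyncio.sleep(crawl_interval)  # polite delay between requests
    return result


async def demo() -> None:
    # fake two-page endpoint standing in for the GraphQL API
    pages = {
        "": {"pcursor": "cursor-1", "feeds": [{"id": 1}, {"id": 2}]},
        "cursor-1": {"pcursor": "no_more", "feeds": [{"id": 3}]},
    }

    async def fetch_page(cursor: str) -> Dict:
        return pages[cursor]

    feeds = await drain_pcursor(fetch_page, "feeds", crawl_interval=0)
    print(feeds)  # [{'id': 1}, {'id': 2}, {'id': 3}]


asyncio.run(demo())
```

Defaulting the cursor lookup to "no_more" is the same defensive choice applied above to get_all_videos_by_creator: a malformed response then ends the loop instead of spinning forever.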