diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml
new file mode 100644
index 0000000..705b563
--- /dev/null
+++ b/.github/workflows/main.yaml
@@ -0,0 +1,17 @@
+on:
+  push:
+    branches:
+      - main
+
+jobs:
+  contrib-readme-job:
+    runs-on: ubuntu-latest
+    name: Update contributors list in README
+    permissions:
+      contents: write
+      pull-requests: write
+    steps:
+      - name: Contribute List
+        uses: akhilmhdh/contributors-readme-action@v2.3.10
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
\ No newline at end of file
diff --git a/README.md b/README.md
index be329d7..98c9366 100644
--- a/README.md
+++ b/README.md
@@ -22,12 +22,11 @@
|-----|-------|----------|-----|--------|-------|-------|-------|
| 小红书 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| 抖音 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
-| 快手 | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ |
+| 快手 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| B 站 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| 微博 | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ |
-
## 使用方法
### 创建并激活 python 虚拟环境
@@ -87,12 +86,19 @@
## 开发者服务
- 知识星球:沉淀高质量常见问题、最佳实践文档、多年编程+爬虫经验分享,提供付费知识星球服务,主动提问,作者会定期回答问题
-
-
+
- 前20个入驻星球的小伙伴,将获得新人券50元,还剩14张。
-
-- 视频课程:
+
+ 星球精选文章:
+ - [【独创】使用Playwright获取某音a_bogus参数流程(包含加密参数分析)](https://articles.zsxq.com/id_u89al50jk9x0.html)
+ - [【独创】使用Playwright低成本获取某书X-s参数流程分析(当年的回忆录)](https://articles.zsxq.com/id_u4lcrvqakuc7.html)
+ - [MediaCrawler-基于抽象类设计重构项目缓存](https://articles.zsxq.com/id_4ju73oxewt9j.html)
+ - [手把手带你撸一个自己的IP代理池](https://articles.zsxq.com/id_38fza371ladm.html)
+ - 每天 1 块钱订阅我的知识服务
+
+
+
+- MediaCrawler视频课程:
> 如果你想很快入门这个项目,或者想了解具体实现原理,我推荐你看看这个视频课程,从设计出发一步步带你如何使用,门槛大大降低,同时也是对我开源的支持,如果你能支持我的课程,我将会非常开心~
> 课程售价非常非常的便宜,几杯咖啡的事儿.
> 课程介绍飞书文档链接:https://relakkes.feishu.cn/wiki/JUgBwdhIeiSbAwkFCLkciHdAnhh
@@ -115,7 +121,7 @@
> 7天有效期,自动更新, 如果人满了可以加作者wx拉进群: yzglan,备注来自github.
-
+
@@ -144,11 +150,220 @@
## 手机号登录说明
➡️➡️➡️ [手机号登录说明](docs/手机号登录说明.md)
## 词云图相关操作说明
➡️➡️➡️ [词云图相关说明](docs/关于词云图相关操作.md)
+
+## 项目贡献者
+
+
+
## star 趋势图
- 如果该项目对你有帮助,star一下 ❤️❤️❤️
diff --git a/cmd_arg/arg.py b/cmd_arg/arg.py
index 2d07675..27854f7 100644
--- a/cmd_arg/arg.py
+++ b/cmd_arg/arg.py
@@ -1,4 +1,5 @@
import argparse
+
import config
from tools.utils import str2bool
diff --git a/config/base_config.py b/config/base_config.py
index 9b52e52..fe7d61c 100644
--- a/config/base_config.py
+++ b/config/base_config.py
@@ -3,8 +3,10 @@ PLATFORM = "xhs"
KEYWORDS = "python,golang"
LOGIN_TYPE = "qrcode" # qrcode or phone or cookie
COOKIES = ""
-# 具体值参见media_platform.xxx.field下的枚举值,展示只支持小红书
+# 具体值参见media_platform.xxx.field下的枚举值,暂时只支持小红书
SORT_TYPE = "popularity_descending"
+# 具体值参见media_platform.xxx.field下的枚举值,暂时只支持抖音
+PUBLISH_TIME_TYPE = 0
CRAWLER_TYPE = "search" # 爬取类型,search(关键词搜索) | detail(帖子详情)| creator(创作者主页数据)
# 是否开启 IP 代理
@@ -103,6 +105,13 @@ BILI_CREATOR_ID_LIST = [
# ........................
]
+# 指定快手创作者ID列表
+KS_CREATOR_ID_LIST = [
+ "3x4sm73aye7jq7i",
+ # ........................
+]
+
+
#词云相关
#是否开启生成评论词云图
ENABLE_GET_WORDCLOUD = False
@@ -118,5 +127,3 @@ STOP_WORDS_FILE = "./docs/hit_stopwords.txt"
#中文字体文件路径
FONT_PATH= "./docs/STZHONGS.TTF"
-
-
diff --git a/media_platform/bilibili/core.py b/media_platform/bilibili/core.py
index c14391d..1d08791 100644
--- a/media_platform/bilibili/core.py
+++ b/media_platform/bilibili/core.py
@@ -106,6 +106,7 @@ class BilibiliCrawler(AbstractCrawler):
page += 1
continue
+ utils.logger.info(f"[BilibiliCrawler.search] search bilibili keyword: {keyword}, page: {page}")
video_id_list: List[str] = []
videos_res = await self.bili_client.search_video_by_keyword(
keyword=keyword,
@@ -126,7 +127,6 @@ class BilibiliCrawler(AbstractCrawler):
if video_item:
video_id_list.append(video_item.get("View").get("aid"))
await bilibili_store.update_bilibili_video(video_item)
-
page += 1
await self.batch_get_video_comments(video_id_list)
diff --git a/media_platform/bilibili/login.py b/media_platform/bilibili/login.py
index 33c929f..cc7baa2 100644
--- a/media_platform/bilibili/login.py
+++ b/media_platform/bilibili/login.py
@@ -12,8 +12,8 @@ from playwright.async_api import BrowserContext, Page
from tenacity import (RetryError, retry, retry_if_result, stop_after_attempt,
wait_fixed)
-from base.base_crawler import AbstractLogin
import config
+from base.base_crawler import AbstractLogin
from tools import utils
diff --git a/media_platform/douyin/client.py b/media_platform/douyin/client.py
index ede6049..ce04698 100644
--- a/media_platform/douyin/client.py
+++ b/media_platform/douyin/client.py
@@ -1,5 +1,6 @@
import asyncio
import copy
+import json
import urllib.parse
from typing import Any, Callable, Dict, List, Optional
@@ -119,14 +120,19 @@ class DOUYINClient(AbstractApiClient):
params = {
"keyword": urllib.parse.quote(keyword),
"search_channel": search_channel.value,
- "sort_type": sort_type.value,
- "publish_time": publish_time.value,
"search_source": "normal_search",
- "query_correct_type": "1",
- "is_filter_search": "0",
+ "query_correct_type": 1,
+ "is_filter_search": 0,
"offset": offset,
"count": 10 # must be set to 10
}
+ if sort_type != SearchSortType.GENERAL or publish_time != PublishTimeType.UNLIMITED:
+ params["filter_selected"] = urllib.parse.quote(json.dumps({
+ "sort_type": str(sort_type.value),
+ "publish_time": str(publish_time.value)
+ }))
+ params["is_filter_search"] = 1
+ params["search_source"] = "tab_search"
referer_url = "https://www.douyin.com/search/" + keyword
referer_url += f"?publish_time={publish_time.value}&sort_type={sort_type.value}&type=general"
headers = copy.copy(self.headers)
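
Reviewer note: the hunk above moves sort and publish-time filtering out of the top-level query params and into Douyin's `filter_selected` blob. A minimal standalone sketch of the resulting parameter construction (names and values taken from this diff; sending the request is outside the sketch):

```python
import json
import urllib.parse

from media_platform.douyin.field import PublishTimeType, SearchSortType


def build_search_params(keyword: str, offset: int,
                        sort_type: SearchSortType = SearchSortType.GENERAL,
                        publish_time: PublishTimeType = PublishTimeType.UNLIMITED) -> dict:
    # Plain keyword search: no filter blob, "normal_search" source.
    params = {
        "keyword": urllib.parse.quote(keyword),
        "search_source": "normal_search",
        "query_correct_type": 1,
        "is_filter_search": 0,
        "offset": offset,
        "count": 10,  # must be set to 10
    }
    # Any non-default filter flips the request into filtered "tab_search"
    # mode, with both filters wrapped in a URL-encoded JSON string.
    if sort_type != SearchSortType.GENERAL or publish_time != PublishTimeType.UNLIMITED:
        params["filter_selected"] = urllib.parse.quote(json.dumps({
            "sort_type": str(sort_type.value),
            "publish_time": str(publish_time.value),
        }))
        params["is_filter_search"] = 1
        params["search_source"] = "tab_search"
    return params
```
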
diff --git a/media_platform/douyin/core.py b/media_platform/douyin/core.py
index dde5d5b..64992ef 100644
--- a/media_platform/douyin/core.py
+++ b/media_platform/douyin/core.py
@@ -90,9 +90,10 @@ class DouYinCrawler(AbstractCrawler):
page += 1
continue
try:
+ utils.logger.info(f"[DouYinCrawler.search] search douyin keyword: {keyword}, page: {page}")
posts_res = await self.dy_client.search_info_by_keyword(keyword=keyword,
- offset=page * dy_limit_count,
- publish_time=PublishTimeType.UNLIMITED
+ offset=page * dy_limit_count - dy_limit_count,
+ publish_time=PublishTimeType(config.PUBLISH_TIME_TYPE)
)
except DataFetchError:
utils.logger.error(f"[DouYinCrawler.search] search douyin keyword: {keyword} failed")
diff --git a/media_platform/douyin/field.py b/media_platform/douyin/field.py
index 2ac4d62..ab9da9f 100644
--- a/media_platform/douyin/field.py
+++ b/media_platform/douyin/field.py
@@ -12,13 +12,12 @@ class SearchChannelType(Enum):
class SearchSortType(Enum):
"""search sort type"""
GENERAL = 0 # 综合排序
- LATEST = 1 # 最新发布
- MOST_LIKE = 2 # 最多点赞
-
+ MOST_LIKE = 1 # 最多点赞
+ LATEST = 2 # 最新发布
+
class PublishTimeType(Enum):
"""publish time type"""
UNLIMITED = 0 # 不限
ONE_DAY = 1 # 一天内
- ONE_WEEK = 2 # 一周内
- SIX_MONTH = 3 # 半年内
+ ONE_WEEK = 7 # 一周内
+ SIX_MONTH = 180 # 半年内
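
These renumbered values match what the web client actually submits (publish_time is now expressed in days: 1, 7, 180). The new `PUBLISH_TIME_TYPE` knob from base_config.py above plugs straight into the enum, as core.py does with `PublishTimeType(config.PUBLISH_TIME_TYPE)`; a small sketch of that round trip:

```python
from media_platform.douyin.field import PublishTimeType

# PUBLISH_TIME_TYPE comes from config/base_config.py (0 by default).
# Constructing the enum from the raw int validates it: anything outside
# {0, 1, 7, 180} raises ValueError, surfacing config typos immediately.
publish_time = PublishTimeType(0)
assert publish_time is PublishTimeType.UNLIMITED

week = PublishTimeType(7)
assert week is PublishTimeType.ONE_WEEK
```
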
diff --git a/media_platform/kuaishou/client.py b/media_platform/kuaishou/client.py
index 8ad2155..f643171 100644
--- a/media_platform/kuaishou/client.py
+++ b/media_platform/kuaishou/client.py
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
import asyncio
import json
-from typing import Any, Callable, Dict, Optional
+from typing import Any, Callable, Dict, List, Optional
from urllib.parse import urlencode
import httpx
@@ -67,7 +67,7 @@ class KuaiShouClient(AbstractApiClient):
"variables": {
"ftype": 1,
},
- "query": self.graphql.get("vision_profile")
+ "query": self.graphql.get("vision_profile_user_list")
}
res = await self.post("", post_data)
if res.get("visionProfileUserList", {}).get("result") == 1:
@@ -129,17 +129,60 @@ class KuaiShouClient(AbstractApiClient):
"pcursor": pcursor
},
"query": self.graphql.get("comment_list")
-
}
return await self.post("", post_data)
- async def get_video_all_comments(self, photo_id: str, crawl_interval: float = 1.0, is_fetch_sub_comments=False,
- callback: Optional[Callable] = None):
+ async def get_video_sub_comments(
+ self, photo_id: str, rootCommentId: str, pcursor: str = ""
+ ) -> Dict:
+ """get video sub comments
+ :param photo_id: photo id you want to fetch
+ :param pcursor: last you get pcursor, defaults to ""
+ :return:
+ """
+ post_data = {
+ "operationName": "visionSubCommentList",
+ "variables": {
+ "photoId": photo_id,
+ "pcursor": pcursor,
+ "rootCommentId": rootCommentId,
+ },
+ "query": self.graphql.get("vision_sub_comment_list"),
+ }
+ return await self.post("", post_data)
+
+ async def get_creator_profile(self, userId: str) -> Dict:
+ post_data = {
+ "operationName": "visionProfile",
+ "variables": {
+ "userId": userId
+ },
+ "query": self.graphql.get("vision_profile"),
+ }
+ return await self.post("", post_data)
+
+ async def get_video_by_creator(self, userId: str, pcursor: str = "") -> Dict:
+ post_data = {
+ "operationName": "visionProfilePhotoList",
+ "variables": {
+ "page": "profile",
+ "pcursor": pcursor,
+ "userId": userId
+ },
+ "query": self.graphql.get("vision_profile_photo_list"),
+ }
+ return await self.post("", post_data)
+
+ async def get_video_all_comments(
+ self,
+ photo_id: str,
+ crawl_interval: float = 1.0,
+ callback: Optional[Callable] = None,
+ ):
"""
get video all comments include sub comments
:param photo_id:
:param crawl_interval:
- :param is_fetch_sub_comments:
:param callback:
:return:
"""
@@ -158,7 +201,107 @@ class KuaiShouClient(AbstractApiClient):
result.extend(comments)
await asyncio.sleep(crawl_interval)
- if not is_fetch_sub_comments:
- continue
- # todo handle get sub comments
+ sub_comments = await self.get_comments_all_sub_comments(
+ comments, photo_id, crawl_interval, callback
+ )
+ result.extend(sub_comments)
+ return result
+
+ async def get_comments_all_sub_comments(
+ self,
+ comments: List[Dict],
+ photo_id,
+ crawl_interval: float = 1.0,
+ callback: Optional[Callable] = None,
+ ) -> List[Dict]:
+ """
+ 获取指定一级评论下的所有二级评论, 该方法会一直查找一级评论下的所有二级评论信息
+ Args:
+ comments: 评论列表
+ photo_id: 视频id
+ crawl_interval: 爬取一次评论的延迟单位(秒)
+ callback: 一次评论爬取结束后的回调函数
+ Returns:
+
+ """
+ if not config.ENABLE_GET_SUB_COMMENTS:
+ utils.logger.info(
+ f"[KuaiShouClient.get_comments_all_sub_comments] Crawling sub_comment mode is not enabled"
+ )
+ return []
+
+ result = []
+ for comment in comments:
+ sub_comments = comment.get("subComments")
+ if sub_comments and callback:
+ await callback(photo_id, sub_comments)
+
+ sub_comment_pcursor = comment.get("subCommentsPcursor")
+ if sub_comment_pcursor == "no_more":
+ continue
+
+ root_comment_id = comment.get("commentId")
+ sub_comment_pcursor = ""
+
+ while sub_comment_pcursor != "no_more":
+ comments_res = await self.get_video_sub_comments(
+ photo_id, root_comment_id, sub_comment_pcursor
+ )
+ vision_sub_comment_list = comments_res.get("visionSubCommentList", {})
+ sub_comment_pcursor = vision_sub_comment_list.get("pcursor", "no_more")
+
+ sub_comments = vision_sub_comment_list.get("subComments", [])
+ if callback:
+ await callback(photo_id, sub_comments)
+ await asyncio.sleep(crawl_interval)
+ result.extend(sub_comments)
+ return result
+
+ async def get_creator_info(self, user_id: str) -> Dict:
+ """
+ eg: https://www.kuaishou.com/profile/3x4jtnbfter525a
+ 快手用户主页
+ """
+
+ vision_profile = await self.get_creator_profile(user_id)
+ return vision_profile.get("userProfile")
+
+ async def get_all_videos_by_creator(
+ self,
+ user_id: str,
+ crawl_interval: float = 1.0,
+ callback: Optional[Callable] = None,
+ ) -> List[Dict]:
+ """
+ 获取指定用户发布过的所有帖子,该方法会一直分页查找该用户的所有帖子信息
+ Args:
+ user_id: 用户ID
+ crawl_interval: 爬取一次的延迟单位(秒)
+ callback: 一次分页爬取结束后的更新回调函数
+ Returns:
+
+ """
+ result = []
+ pcursor = ""
+
+ while pcursor != "no_more":
+ videos_res = await self.get_video_by_creator(user_id, pcursor)
+ if not videos_res:
+ utils.logger.error(
+ f"[KuaiShouClient.get_all_videos_by_creator] The current creator may have been banned by ks, so they cannot access the data."
+ )
+ break
+
+ vision_profile_photo_list = videos_res.get("visionProfilePhotoList", {})
+ pcursor = vision_profile_photo_list.get("pcursor", "")
+
+ videos = vision_profile_photo_list.get("feeds", [])
+ utils.logger.info(
+ f"[KuaiShouClient.get_all_videos_by_creator] got user_id:{user_id} videos len : {len(videos)}"
+ )
+
+ if callback:
+ await callback(videos)
+ await asyncio.sleep(crawl_interval)
+ result.extend(videos)
return result
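
All of the new paginated client methods share one contract: every response carries a `pcursor` to feed into the next request, and the literal string `"no_more"` signals the last page. A condensed sketch of that loop, where `fetch_page`, `payload_key`, and `items_key` stand in for any of the concrete endpoint/field pairs above (e.g. `get_video_by_creator` with `"visionProfilePhotoList"` / `"feeds"`):

```python
import asyncio
from typing import Awaitable, Callable, Dict, List


async def drain_pcursor(fetch_page: Callable[[str], Awaitable[Dict]],
                        payload_key: str, items_key: str,
                        crawl_interval: float = 1.0) -> List[Dict]:
    """Walk a Kuaishou pcursor-paginated endpoint until 'no_more'."""
    result: List[Dict] = []
    pcursor = ""  # an empty cursor requests the first page
    while pcursor != "no_more":
        res = await fetch_page(pcursor)
        payload = res.get(payload_key, {})
        pcursor = payload.get("pcursor", "no_more")
        result.extend(payload.get(items_key, []))
        await asyncio.sleep(crawl_interval)  # stay polite between pages
    return result
```
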
diff --git a/media_platform/kuaishou/core.py b/media_platform/kuaishou/core.py
index d318a9c..a5eda58 100644
--- a/media_platform/kuaishou/core.py
+++ b/media_platform/kuaishou/core.py
@@ -65,11 +65,14 @@ class KuaishouCrawler(AbstractCrawler):
crawler_type_var.set(config.CRAWLER_TYPE)
if config.CRAWLER_TYPE == "search":
- # Search for notes and retrieve their comment information.
+ # Search for videos and retrieve their comment information.
await self.search()
elif config.CRAWLER_TYPE == "detail":
# Get the information and comments of the specified post
await self.get_specified_videos()
+ elif config.CRAWLER_TYPE == "creator":
+ # Get creator's information and their videos and comments
+ await self.get_creators_and_videos()
else:
pass
@@ -89,7 +92,7 @@ class KuaishouCrawler(AbstractCrawler):
utils.logger.info(f"[KuaishouCrawler.search] Skip page: {page}")
page += 1
continue
-
+ utils.logger.info(f"[KuaishouCrawler.search] search kuaishou keyword: {keyword}, page: {page}")
video_id_list: List[str] = []
videos_res = await self.ks_client.search_info_by_keyword(
keyword=keyword,
@@ -135,7 +138,7 @@ class KuaishouCrawler(AbstractCrawler):
utils.logger.error(f"[KuaishouCrawler.get_video_info_task] Get video detail error: {ex}")
return None
except KeyError as ex:
- utils.logger.error(f"[KuaishouCrawler.get_video_info_task] have not fund note detail video_id:{video_id}, err: {ex}")
+ utils.logger.error(f"[KuaishouCrawler.get_video_info_task] have not fund video detail video_id:{video_id}, err: {ex}")
return None
async def batch_get_video_comments(self, video_id_list: List[str]):
@@ -145,7 +148,7 @@ class KuaishouCrawler(AbstractCrawler):
:return:
"""
if not config.ENABLE_GET_COMMENTS:
- utils.logger.info(f"[KuaishouCrawler.batch_get_note_comments] Crawling comment mode is not enabled")
+ utils.logger.info(f"[KuaishouCrawler.batch_get_video_comments] Crawling comment mode is not enabled")
return
utils.logger.info(f"[KuaishouCrawler.batch_get_video_comments] video ids:{video_id_list}")
@@ -200,10 +203,10 @@ class KuaishouCrawler(AbstractCrawler):
return playwright_proxy, httpx_proxy
async def create_ks_client(self, httpx_proxy: Optional[str]) -> KuaiShouClient:
- """Create xhs client"""
+ """Create ks client"""
utils.logger.info("[KuaishouCrawler.create_ks_client] Begin create kuaishou API client ...")
cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies())
- xhs_client_obj = KuaiShouClient(
+ ks_client_obj = KuaiShouClient(
proxies=httpx_proxy,
headers={
"User-Agent": self.user_agent,
@@ -215,7 +218,7 @@ class KuaishouCrawler(AbstractCrawler):
playwright_page=self.context_page,
cookie_dict=cookie_dict,
)
- return xhs_client_obj
+ return ks_client_obj
async def launch_browser(
self,
@@ -246,6 +249,39 @@ class KuaishouCrawler(AbstractCrawler):
)
return browser_context
+ async def get_creators_and_videos(self) -> None:
+ """Get creator's videos and retrieve their comment information."""
+ utils.logger.info("[KuaiShouCrawler.get_creators_and_videos] Begin get kuaishou creators")
+ for user_id in config.KS_CREATOR_ID_LIST:
+ # get creator detail info from web html content
+ creator_info: Dict = await self.ks_client.get_creator_info(user_id=user_id)
+ if creator_info:
+ await kuaishou_store.save_creator(user_id, creator=creator_info)
+
+ # Get all video information of the creator
+ all_video_list = await self.ks_client.get_all_videos_by_creator(
+ user_id=user_id,
+ crawl_interval=random.random(),
+ callback=self.fetch_creator_video_detail
+ )
+
+ video_ids = [video_item.get("photo", {}).get("id") for video_item in all_video_list]
+ await self.batch_get_video_comments(video_ids)
+
+ async def fetch_creator_video_detail(self, video_list: List[Dict]):
+ """
+ Concurrently obtain the specified post list and save the data
+ """
+ semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
+ task_list = [
+ self.get_video_info_task(post_item.get("photo", {}).get("id"), semaphore) for post_item in video_list
+ ]
+
+ video_details = await asyncio.gather(*task_list)
+ for video_detail in video_details:
+ if video_detail is not None:
+ await kuaishou_store.update_kuaishou_video(video_detail)
+
async def close(self):
"""Close browser context"""
await self.browser_context.close()
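
With the new creator branch in `start()`, exercising the Kuaishou creator mode is purely a config change; a sketch of the relevant base_config.py settings (keys come from this diff; the `"ks"` platform code is an assumption based on the project's CLI):

```python
# config/base_config.py — the settings the new creator code path reads
PLATFORM = "ks"                  # assumed platform code for Kuaishou
CRAWLER_TYPE = "creator"         # routes start() into get_creators_and_videos()
KS_CREATOR_ID_LIST = [
    "3x4sm73aye7jq7i",           # id from https://www.kuaishou.com/profile/<user_id>
]
ENABLE_GET_COMMENTS = True       # also crawl comments for each creator video
ENABLE_GET_SUB_COMMENTS = False  # second-level comments stay opt-in
```
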
diff --git a/media_platform/kuaishou/graphql.py b/media_platform/kuaishou/graphql.py
index 215b57f..2d32689 100644
--- a/media_platform/kuaishou/graphql.py
+++ b/media_platform/kuaishou/graphql.py
@@ -11,7 +11,7 @@ class KuaiShouGraphQL:
self.load_graphql_queries()
def load_graphql_queries(self):
- graphql_files = ["search_query.graphql", "video_detail.graphql", "comment_list.graphql", "vision_profile.graphql"]
+ graphql_files = ["search_query.graphql", "video_detail.graphql", "comment_list.graphql", "vision_profile.graphql","vision_profile_photo_list.graphql","vision_profile_user_list.graphql","vision_sub_comment_list.graphql"]
for file in graphql_files:
with open(self.graphql_dir + file, mode="r") as f:
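
The loader maps each filename to its query text, which is why client code resolves queries with `self.graphql.get("vision_profile_user_list")` and so on. Since the hard-coded list has to grow with every new .graphql file, a glob-based variant (a sketch, not the project's implementation) avoids that maintenance:

```python
from pathlib import Path
from typing import Dict


class KuaiShouGraphQL:
    """Load every *.graphql file in the directory, keyed by file stem."""

    def __init__(self, graphql_dir: str = "media_platform/kuaishou/graphql/"):
        self.graphql_dir = Path(graphql_dir)
        self.graphql_queries: Dict[str, str] = {}
        self.load_graphql_queries()

    def load_graphql_queries(self):
        # Discover queries instead of maintaining a hard-coded file list.
        for file in sorted(self.graphql_dir.glob("*.graphql")):
            self.graphql_queries[file.stem] = file.read_text(encoding="utf-8")

    def get(self, query_name: str) -> str:
        return self.graphql_queries.get(query_name, "")
```
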
diff --git a/media_platform/kuaishou/graphql/vision_profile.graphql b/media_platform/kuaishou/graphql/vision_profile.graphql
index 148165a..5499600 100644
--- a/media_platform/kuaishou/graphql/vision_profile.graphql
+++ b/media_platform/kuaishou/graphql/vision_profile.graphql
@@ -1,16 +1,27 @@
-query visionProfileUserList($pcursor: String, $ftype: Int) {
- visionProfileUserList(pcursor: $pcursor, ftype: $ftype) {
+query visionProfile($userId: String) {
+ visionProfile(userId: $userId) {
result
- fols {
- user_name
- headurl
- user_text
+ hostName
+ userProfile {
+ ownerCount {
+ fan
+ photo
+ follow
+ photo_public
+ __typename
+ }
+ profile {
+ gender
+ user_name
+ user_id
+ headurl
+ user_text
+ user_profile_bg_url
+ __typename
+ }
isFollowing
- user_id
__typename
}
- hostName
- pcursor
__typename
}
}
diff --git a/media_platform/kuaishou/graphql/vision_profile_photo_list.graphql b/media_platform/kuaishou/graphql/vision_profile_photo_list.graphql
new file mode 100644
index 0000000..328052e
--- /dev/null
+++ b/media_platform/kuaishou/graphql/vision_profile_photo_list.graphql
@@ -0,0 +1,110 @@
+fragment photoContent on PhotoEntity {
+ __typename
+ id
+ duration
+ caption
+ originCaption
+ likeCount
+ viewCount
+ commentCount
+ realLikeCount
+ coverUrl
+ photoUrl
+ photoH265Url
+ manifest
+ manifestH265
+ videoResource
+ coverUrls {
+ url
+ __typename
+ }
+ timestamp
+ expTag
+ animatedCoverUrl
+ distance
+ videoRatio
+ liked
+ stereoType
+ profileUserTopPhoto
+ musicBlocked
+ riskTagContent
+ riskTagUrl
+}
+
+fragment recoPhotoFragment on recoPhotoEntity {
+ __typename
+ id
+ duration
+ caption
+ originCaption
+ likeCount
+ viewCount
+ commentCount
+ realLikeCount
+ coverUrl
+ photoUrl
+ photoH265Url
+ manifest
+ manifestH265
+ videoResource
+ coverUrls {
+ url
+ __typename
+ }
+ timestamp
+ expTag
+ animatedCoverUrl
+ distance
+ videoRatio
+ liked
+ stereoType
+ profileUserTopPhoto
+ musicBlocked
+ riskTagContent
+ riskTagUrl
+}
+
+fragment feedContent on Feed {
+ type
+ author {
+ id
+ name
+ headerUrl
+ following
+ headerUrls {
+ url
+ __typename
+ }
+ __typename
+ }
+ photo {
+ ...photoContent
+ ...recoPhotoFragment
+ __typename
+ }
+ canAddComment
+ llsid
+ status
+ currentPcursor
+ tags {
+ type
+ name
+ __typename
+ }
+ __typename
+}
+
+query visionProfilePhotoList($pcursor: String, $userId: String, $page: String, $webPageArea: String) {
+ visionProfilePhotoList(pcursor: $pcursor, userId: $userId, page: $page, webPageArea: $webPageArea) {
+ result
+ llsid
+ webPageArea
+ feeds {
+ ...feedContent
+ __typename
+ }
+ hostName
+ pcursor
+ __typename
+ }
+}
diff --git a/media_platform/kuaishou/graphql/vision_profile_user_list.graphql b/media_platform/kuaishou/graphql/vision_profile_user_list.graphql
new file mode 100644
index 0000000..148165a
--- /dev/null
+++ b/media_platform/kuaishou/graphql/vision_profile_user_list.graphql
@@ -0,0 +1,16 @@
+query visionProfileUserList($pcursor: String, $ftype: Int) {
+ visionProfileUserList(pcursor: $pcursor, ftype: $ftype) {
+ result
+ fols {
+ user_name
+ headurl
+ user_text
+ isFollowing
+ user_id
+ __typename
+ }
+ hostName
+ pcursor
+ __typename
+ }
+}
diff --git a/media_platform/kuaishou/graphql/vision_sub_comment_list.graphql b/media_platform/kuaishou/graphql/vision_sub_comment_list.graphql
new file mode 100644
index 0000000..31730fc
--- /dev/null
+++ b/media_platform/kuaishou/graphql/vision_sub_comment_list.graphql
@@ -0,0 +1,22 @@
+mutation visionSubCommentList($photoId: String, $rootCommentId: String, $pcursor: String) {
+ visionSubCommentList(photoId: $photoId, rootCommentId: $rootCommentId, pcursor: $pcursor) {
+ pcursor
+ subComments {
+ commentId
+ authorId
+ authorName
+ content
+ headurl
+ timestamp
+ likedCount
+ realLikedCount
+ liked
+ status
+ authorLiked
+ replyToUserName
+ replyTo
+ __typename
+ }
+ __typename
+ }
+}
diff --git a/media_platform/kuaishou/login.py b/media_platform/kuaishou/login.py
index cbd578b..68462ad 100644
--- a/media_platform/kuaishou/login.py
+++ b/media_platform/kuaishou/login.py
@@ -7,6 +7,7 @@ from playwright.async_api import BrowserContext, Page
from tenacity import (RetryError, retry, retry_if_result, stop_after_attempt,
wait_fixed)
+import config
from base.base_crawler import AbstractLogin
from tools import utils
@@ -57,7 +58,7 @@ class KuaishouLogin(AbstractLogin):
# click login button
login_button_ele = self.context_page.locator(
- "xpath=//p[text()=' 登录 ']"
+ "xpath=//p[text()='登录']"
)
await login_button_ele.click()
diff --git a/media_platform/weibo/core.py b/media_platform/weibo/core.py
index 481287e..4301574 100644
--- a/media_platform/weibo/core.py
+++ b/media_platform/weibo/core.py
@@ -108,7 +108,7 @@ class WeiboCrawler(AbstractCrawler):
utils.logger.info(f"[WeiboCrawler.search] Skip page: {page}")
page += 1
continue
-
+ utils.logger.info(f"[WeiboCrawler.search] search weibo keyword: {keyword}, page: {page}")
search_res = await self.wb_client.get_note_by_keyword(
keyword=keyword,
page=page,
diff --git a/media_platform/weibo/login.py b/media_platform/weibo/login.py
index 2c2cf38..9dc1659 100644
--- a/media_platform/weibo/login.py
+++ b/media_platform/weibo/login.py
@@ -12,6 +12,7 @@ from playwright.async_api import BrowserContext, Page
from tenacity import (RetryError, retry, retry_if_result, stop_after_attempt,
wait_fixed)
+import config
from base.base_crawler import AbstractLogin
from tools import utils
diff --git a/media_platform/xhs/core.py b/media_platform/xhs/core.py
index d720b68..97d073f 100644
--- a/media_platform/xhs/core.py
+++ b/media_platform/xhs/core.py
@@ -102,6 +102,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
continue
try:
+ utils.logger.info(f"[XiaoHongShuCrawler.search] search xhs keyword: {keyword}, page: {page}")
note_id_list: List[str] = []
notes_res = await self.xhs_client.get_note_by_keyword(
keyword=keyword,
diff --git a/static/images/10群二维码.JPG b/static/images/10群二维码.JPG
deleted file mode 100644
index f84e3c2..0000000
Binary files a/static/images/10群二维码.JPG and /dev/null differ
diff --git a/static/images/11群二维码.JPG b/static/images/11群二维码.JPG
new file mode 100644
index 0000000..4726ad1
Binary files /dev/null and b/static/images/11群二维码.JPG differ
diff --git a/static/images/9群二维码.JPG b/static/images/9群二维码.JPG
deleted file mode 100644
index 3fc5d24..0000000
Binary files a/static/images/9群二维码.JPG and /dev/null differ
diff --git a/static/images/xingqiu.jpg b/static/images/xingqiu.jpg
index 4784475..7cf0eb9 100644
Binary files a/static/images/xingqiu.jpg and b/static/images/xingqiu.jpg differ
diff --git a/static/images/xingqiu_yh.png b/static/images/xingqiu_yh.png
deleted file mode 100644
index 505315f..0000000
Binary files a/static/images/xingqiu_yh.png and /dev/null differ
diff --git a/store/bilibili/bilibili_store_impl.py b/store/bilibili/bilibili_store_impl.py
index 7b93432..4a07dff 100644
--- a/store/bilibili/bilibili_store_impl.py
+++ b/store/bilibili/bilibili_store_impl.py
@@ -13,9 +13,9 @@ import aiofiles
import config
from base.base_crawler import AbstractStore
-from tools import utils
+from tools import utils, words
from var import crawler_type_var
-from tools import words
+
def calculate_number_of_files(file_store_path: str) -> int:
"""计算数据保存文件的前部分排序数字,支持每次运行代码不写到同一个文件中
diff --git a/store/douyin/douyin_store_impl.py b/store/douyin/douyin_store_impl.py
index a4672ee..ed83450 100644
--- a/store/douyin/douyin_store_impl.py
+++ b/store/douyin/douyin_store_impl.py
@@ -11,10 +11,10 @@ from typing import Dict
import aiofiles
-from base.base_crawler import AbstractStore
-from tools import utils,words
-from var import crawler_type_var
import config
+from base.base_crawler import AbstractStore
+from tools import utils, words
+from var import crawler_type_var
def calculate_number_of_files(file_store_path: str) -> int:
diff --git a/store/kuaishou/__init__.py b/store/kuaishou/__init__.py
index 818c75a..cfdcd29 100644
--- a/store/kuaishou/__init__.py
+++ b/store/kuaishou/__init__.py
@@ -76,3 +76,22 @@ async def update_ks_video_comment(video_id: str, comment_item: Dict):
utils.logger.info(
f"[store.kuaishou.update_ks_video_comment] Kuaishou video comment: {comment_id}, content: {save_comment_item.get('content')}")
await KuaishouStoreFactory.create_store().store_comment(comment_item=save_comment_item)
+
+async def save_creator(user_id: str, creator: Dict):
+ ownerCount = creator.get('ownerCount', {})
+ profile = creator.get('profile', {})
+
+ local_db_item = {
+ 'user_id': user_id,
+ 'nickname': profile.get('user_name'),
+ 'gender': '女' if profile.get('gender') == "F" else '男',
+ 'avatar': profile.get('headurl'),
+ 'desc': profile.get('user_text'),
+ 'ip_location': "",
+ 'follows': ownerCount.get("follow"),
+ 'fans': ownerCount.get("fan"),
+ 'interaction': ownerCount.get("photo_public"),
+ "last_modify_ts": utils.get_current_timestamp(),
+ }
+ utils.logger.info(f"[store.kuaishou.save_creator] creator:{local_db_item}")
+ await KuaishouStoreFactory.create_store().store_creator(local_db_item)
\ No newline at end of file
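
For reference, `save_creator` flattens the nested `visionProfile` payload (see the vision_profile.graphql hunk above) into one flat record per creator; with the fields above it comes out roughly like this (all values invented for illustration):

```python
# Hypothetical shape of one stored creator record (values invented):
example_creator_item = {
    "user_id": "3x4sm73aye7jq7i",
    "nickname": "某位快手创作者",           # profile.user_name
    "gender": "女",                         # "F" -> 女, anything else -> 男
    "avatar": "https://example.com/a.jpg",  # profile.headurl
    "desc": "创作者简介",                   # profile.user_text
    "ip_location": "",                      # not exposed by this endpoint
    "follows": 100,                         # ownerCount.follow
    "fans": 2000,                           # ownerCount.fan
    "interaction": 350,                     # ownerCount.photo_public
    "last_modify_ts": 1700000000000,        # utils.get_current_timestamp()
}
```
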
diff --git a/store/kuaishou/kuaishou_store_impl.py b/store/kuaishou/kuaishou_store_impl.py
index 4883daa..57a15ad 100644
--- a/store/kuaishou/kuaishou_store_impl.py
+++ b/store/kuaishou/kuaishou_store_impl.py
@@ -11,10 +11,11 @@ from typing import Dict
import aiofiles
-from base.base_crawler import AbstractStore
-from tools import utils,words
-from var import crawler_type_var
import config
+from base.base_crawler import AbstractStore
+from tools import utils, words
+from var import crawler_type_var
+
def calculate_number_of_files(file_store_path: str) -> int:
"""计算数据保存文件的前部分排序数字,支持每次运行代码不写到同一个文件中
@@ -205,3 +206,14 @@ class KuaishouJsonStoreImplement(AbstractStore):
"""
await self.save_data_to_json(comment_item, "comments")
+
+ async def store_creator(self, creator: Dict):
+ """
+ Kuaishou creator JSON storage implementation
+ Args:
+ creator: creator dict
+
+ Returns:
+
+ """
+ await self.save_data_to_json(creator, "creator")
\ No newline at end of file
diff --git a/store/weibo/weibo_store_impl.py b/store/weibo/weibo_store_impl.py
index fdd21d4..aa5a019 100644
--- a/store/weibo/weibo_store_impl.py
+++ b/store/weibo/weibo_store_impl.py
@@ -11,10 +11,11 @@ from typing import Dict
import aiofiles
-from base.base_crawler import AbstractStore
-from tools import utils,words
-from var import crawler_type_var
import config
+from base.base_crawler import AbstractStore
+from tools import utils, words
+from var import crawler_type_var
+
def calculate_number_of_files(file_store_path: str) -> int:
"""计算数据保存文件的前部分排序数字,支持每次运行代码不写到同一个文件中
diff --git a/store/xhs/__init__.py b/store/xhs/__init__.py
index ab13482..6620c70 100644
--- a/store/xhs/__init__.py
+++ b/store/xhs/__init__.py
@@ -113,7 +113,7 @@ async def save_creator(user_id: str, creator: Dict):
'gender': '女' if user_info.get('gender') == 1 else '男',
'avatar': user_info.get('images'),
'desc': user_info.get('desc'),
- 'ip_location': user_info.get('ip_location'),
+ 'ip_location': user_info.get('ipLocation'),
'follows': follows,
'fans': fans,
'interaction': interaction,
diff --git a/store/xhs/xhs_store_impl.py b/store/xhs/xhs_store_impl.py
index 3204d0c..8450de2 100644
--- a/store/xhs/xhs_store_impl.py
+++ b/store/xhs/xhs_store_impl.py
@@ -11,10 +11,11 @@ from typing import Dict
import aiofiles
-from base.base_crawler import AbstractStore
-from tools import utils,words
-from var import crawler_type_var
import config
+from base.base_crawler import AbstractStore
+from tools import utils, words
+from var import crawler_type_var
+
def calculate_number_of_files(file_store_path: str) -> int:
"""计算数据保存文件的前部分排序数字,支持每次运行代码不写到同一个文件中
diff --git a/tools/words.py b/tools/words.py
index b7c2b00..d9f0a8c 100644
--- a/tools/words.py
+++ b/tools/words.py
@@ -1,10 +1,12 @@
-import aiofiles
import asyncio
-import jieba
-from collections import Counter
-from wordcloud import WordCloud
import json
+from collections import Counter
+
+import aiofiles
+import jieba
import matplotlib.pyplot as plt
+from wordcloud import WordCloud
+
import config
from tools import utils