# Conflicts:
#	README.md
This commit is contained in:
Rosyrain 2024-06-27 21:26:15 +08:00
commit 71c07c7d36
33 changed files with 693 additions and 71 deletions

17
.github/workflows/main.yaml vendored Normal file
View File

@ -0,0 +1,17 @@
# Regenerate the contributors table in README.md on every push to main.
# (Indentation reconstructed — the diff dump had flattened it.)
on:
  push:
    branches:
      - main

jobs:
  contrib-readme-job:
    runs-on: ubuntu-latest
    name: A job to automate contrib in readme
    # The action commits the regenerated contributor table back to the repo,
    # so it needs write access to contents and pull requests.
    permissions:
      contents: write
      pull-requests: write
    steps:
      - name: Contribute List
        uses: akhilmhdh/contributors-readme-action@v2.3.10
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

231
README.md
View File

@ -22,12 +22,11 @@
|-----|-------|----------|-----|--------|-------|-------|-------|
| 小红书 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| 抖音 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| 快手 | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ |
| 快手 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| B 站 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| 微博 | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ |
## 使用方法
### 创建并激活 python 虚拟环境
@ -87,12 +86,19 @@
## 开发者服务
- 知识星球:沉淀高质量常见问题、最佳实践文档、多年编程+爬虫经验分享,提供付费知识星球服务,主动提问,作者会定期回答问题
<p>
<img alt="星球图片" src="static/images/xingqiu.jpg" style="width: auto;height: 400px" >
<img alt="星球图片" src="static/images/xingqiu_yh.png" style="width: auto;height: 400px" >
<img alt="xingqiu" src="https://nm.zizhi1.com/static/img/8e1312d1f52f2e0ff436ea7196b4e27b.15555424244122T1.webp" style="width: auto;height: 400px" >
</p>
前20个入驻星球的小伙伴将获得新人券50元,还剩14张。
<br>
- 视频课程:
星球精选文章:
- [【独创】使用Playwright获取某音a_bogus参数流程包含加密参数分析](https://articles.zsxq.com/id_u89al50jk9x0.html)
- [【独创】使用Playwright低成本获取某书X-s参数流程分析当年的回忆录](https://articles.zsxq.com/id_u4lcrvqakuc7.html)
- [ MediaCrawler-基于抽象类设计重构项目缓存](https://articles.zsxq.com/id_4ju73oxewt9j.html)
- [ 手把手带你撸一个自己的IP代理池](https://articles.zsxq.com/id_38fza371ladm.html)
- 每天 1 块钱订阅我的知识服务
- MediaCrawler视频课程
> 如果你想很快入门这个项目,或者想了解具体实现原理,我推荐你看看这个视频课程,从设计出发一步步带你如何使用,门槛大大降低,同时也是对我开源的支持,如果你能支持我的课程,我将会非常开心~<br>
> 课程售价非常非常的便宜,几杯咖啡的事儿.<br>
> 课程介绍飞书文档链接https://relakkes.feishu.cn/wiki/JUgBwdhIeiSbAwkFCLkciHdAnhh
@ -115,7 +121,7 @@
> 7天有效期自动更新, 如果人满了可以加作者wx拉进群: yzglan备注来自github.
<div style="max-width: 200px">
<p><img alt="10群二维码" src="static/images/10群二维码.JPG" style="width: 200px;height: 100%" ></p>
<p><img alt="11群二维码" src="static/images/11群二维码.JPG" style="width: 200px;height: 100%" ></p>
</div>
@ -144,11 +150,220 @@
## 手机号登录说明
➡️➡️➡️ [手机号登录说明](docs/手机号登录说明.md)
## 词云图相关操作说明
➡️➡️➡️ [词云图相关说明](docs/关于词云图相关操作.md)
## 项目贡献者
<!-- readme: contributors -start -->
<table>
<tbody>
<tr>
<td align="center">
<a href="https://github.com/NanmiCoder">
<img src="https://avatars.githubusercontent.com/u/47178017?v=4" width="100;" alt="NanmiCoder"/>
<br />
<sub><b>程序员阿江-Relakkes</b></sub>
</a>
</td>
<td align="center">
<a href="https://github.com/leantli">
<img src="https://avatars.githubusercontent.com/u/117699758?v=4" width="100;" alt="leantli"/>
<br />
<sub><b>leantli</b></sub>
</a>
</td>
<td align="center">
<a href="https://github.com/BaoZhuhan">
<img src="https://avatars.githubusercontent.com/u/140676370?v=4" width="100;" alt="BaoZhuhan"/>
<br />
<sub><b>Bao Zhuhan</b></sub>
</a>
</td>
<td align="center">
<a href="https://github.com/nelzomal">
<img src="https://avatars.githubusercontent.com/u/8512926?v=4" width="100;" alt="nelzomal"/>
<br />
<sub><b>zhounan</b></sub>
</a>
</td>
<td align="center">
<a href="https://github.com/Hiro-Lin">
<img src="https://avatars.githubusercontent.com/u/40111864?v=4" width="100;" alt="Hiro-Lin"/>
<br />
<sub><b>HIRO</b></sub>
</a>
</td>
<td align="center">
<a href="https://github.com/PeanutSplash">
<img src="https://avatars.githubusercontent.com/u/98582625?v=4" width="100;" alt="PeanutSplash"/>
<br />
<sub><b>PeanutSplash</b></sub>
</a>
</td>
</tr>
<tr>
<td align="center">
<a href="https://github.com/Ermeng98">
<img src="https://avatars.githubusercontent.com/u/55784769?v=4" width="100;" alt="Ermeng98"/>
<br />
<sub><b>Ermeng</b></sub>
</a>
</td>
<td align="center">
<a href="https://github.com/Rosyrain">
<img src="https://avatars.githubusercontent.com/u/116946548?v=4" width="100;" alt="Rosyrain"/>
<br />
<sub><b>Rosyrain</b></sub>
</a>
</td>
<td align="center">
<a href="https://github.com/henryhyn">
<img src="https://avatars.githubusercontent.com/u/5162443?v=4" width="100;" alt="henryhyn"/>
<br />
<sub><b>Henry He</b></sub>
</a>
</td>
<td align="center">
<a href="https://github.com/Akiqqqqqqq">
<img src="https://avatars.githubusercontent.com/u/51102894?v=4" width="100;" alt="Akiqqqqqqq"/>
<br />
<sub><b>leonardoqiuyu</b></sub>
</a>
</td>
<td align="center">
<a href="https://github.com/jayeeliu">
<img src="https://avatars.githubusercontent.com/u/77389?v=4" width="100;" alt="jayeeliu"/>
<br />
<sub><b>jayeeliu</b></sub>
</a>
</td>
<td align="center">
<a href="https://github.com/ZuWard">
<img src="https://avatars.githubusercontent.com/u/38209256?v=4" width="100;" alt="ZuWard"/>
<br />
<sub><b>ZuWard</b></sub>
</a>
</td>
</tr>
<tr>
<td align="center">
<a href="https://github.com/Zzendrix">
<img src="https://avatars.githubusercontent.com/u/154900254?v=4" width="100;" alt="Zzendrix"/>
<br />
<sub><b>Zendrix</b></sub>
</a>
</td>
<td align="center">
<a href="https://github.com/chunpat">
<img src="https://avatars.githubusercontent.com/u/19848304?v=4" width="100;" alt="chunpat"/>
<br />
<sub><b>zhangzhenpeng</b></sub>
</a>
</td>
<td align="center">
<a href="https://github.com/tanpenggood">
<img src="https://avatars.githubusercontent.com/u/37927946?v=4" width="100;" alt="tanpenggood"/>
<br />
<sub><b>Sam Tan</b></sub>
</a>
</td>
<td align="center">
<a href="https://github.com/xbsheng">
<img src="https://avatars.githubusercontent.com/u/56357338?v=4" width="100;" alt="xbsheng"/>
<br />
<sub><b>xbsheng</b></sub>
</a>
</td>
<td align="center">
<a href="https://github.com/yangrq1018">
<img src="https://avatars.githubusercontent.com/u/25074163?v=4" width="100;" alt="yangrq1018"/>
<br />
<sub><b>Martin</b></sub>
</a>
</td>
<td align="center">
<a href="https://github.com/zhihuiio">
<img src="https://avatars.githubusercontent.com/u/165655688?v=4" width="100;" alt="zhihuiio"/>
<br />
<sub><b>zhihuiio</b></sub>
</a>
</td>
</tr>
<tr>
<td align="center">
<a href="https://github.com/renaissancezyc">
<img src="https://avatars.githubusercontent.com/u/118403818?v=4" width="100;" alt="renaissancezyc"/>
<br />
<sub><b>Ren</b></sub>
</a>
</td>
<td align="center">
<a href="https://github.com/Tianci-King">
<img src="https://avatars.githubusercontent.com/u/109196852?v=4" width="100;" alt="Tianci-King"/>
<br />
<sub><b>Wang Tianci</b></sub>
</a>
</td>
<td align="center">
<a href="https://github.com/Styunlen">
<img src="https://avatars.githubusercontent.com/u/30810222?v=4" width="100;" alt="Styunlen"/>
<br />
<sub><b>Styunlen</b></sub>
</a>
</td>
<td align="center">
<a href="https://github.com/Schofi">
<img src="https://avatars.githubusercontent.com/u/33537727?v=4" width="100;" alt="Schofi"/>
<br />
<sub><b>Schofi</b></sub>
</a>
</td>
<td align="center">
<a href="https://github.com/Klu5ure">
<img src="https://avatars.githubusercontent.com/u/166240879?v=4" width="100;" alt="Klu5ure"/>
<br />
<sub><b>Klu5ure</b></sub>
</a>
</td>
<td align="center">
<a href="https://github.com/keeper-jie">
<img src="https://avatars.githubusercontent.com/u/33612777?v=4" width="100;" alt="keeper-jie"/>
<br />
<sub><b>Kermit</b></sub>
</a>
</td>
</tr>
<tr>
<td align="center">
<a href="https://github.com/kexinoh">
<img src="https://avatars.githubusercontent.com/u/91727108?v=4" width="100;" alt="kexinoh"/>
<br />
<sub><b>KEXNA</b></sub>
</a>
</td>
<td align="center">
<a href="https://github.com/aa65535">
<img src="https://avatars.githubusercontent.com/u/5417786?v=4" width="100;" alt="aa65535"/>
<br />
<sub><b>Jian Chang</b></sub>
</a>
</td>
<td align="center">
<a href="https://github.com/522109452">
<img src="https://avatars.githubusercontent.com/u/16929874?v=4" width="100;" alt="522109452"/>
<br />
<sub><b>tianqing</b></sub>
</a>
</td>
</tr>
<tbody>
</table>
<!-- readme: contributors -end -->
## star 趋势图
- 如果该项目对你有帮助star一下 ❤️❤️❤️

View File

@ -1,4 +1,5 @@
import argparse
import config
from tools.utils import str2bool

View File

@ -3,8 +3,10 @@ PLATFORM = "xhs"
KEYWORDS = "python,golang"
LOGIN_TYPE = "qrcode" # qrcode or phone or cookie
COOKIES = ""
# 具体值参见media_platform.xxx.field下的枚举值展示只支持小红书
# 具体值参见media_platform.xxx.field下的枚举值暂时只支持小红书
SORT_TYPE = "popularity_descending"
# 具体值参见media_platform.xxx.field下的枚举值暂时只支持抖音
PUBLISH_TIME_TYPE = 0
CRAWLER_TYPE = "search" # 爬取类型search(关键词搜索) | detail(帖子详情)| creator(创作者主页数据)
# 是否开启 IP 代理
@ -103,6 +105,13 @@ BILI_CREATOR_ID_LIST = [
# ........................
]
# 指定快手创作者ID列表
KS_CREATOR_ID_LIST = [
"3x4sm73aye7jq7i",
# ........................
]
#词云相关
#是否开启生成评论词云图
ENABLE_GET_WORDCLOUD = False
@ -118,5 +127,3 @@ STOP_WORDS_FILE = "./docs/hit_stopwords.txt"
#中文字体文件路径
FONT_PATH= "./docs/STZHONGS.TTF"

View File

@ -106,6 +106,7 @@ class BilibiliCrawler(AbstractCrawler):
page += 1
continue
utils.logger.info(f"[BilibiliCrawler.search] search bilibili keyword: {keyword}, page: {page}")
video_id_list: List[str] = []
videos_res = await self.bili_client.search_video_by_keyword(
keyword=keyword,
@ -126,7 +127,6 @@ class BilibiliCrawler(AbstractCrawler):
if video_item:
video_id_list.append(video_item.get("View").get("aid"))
await bilibili_store.update_bilibili_video(video_item)
page += 1
await self.batch_get_video_comments(video_id_list)

View File

@ -12,8 +12,8 @@ from playwright.async_api import BrowserContext, Page
from tenacity import (RetryError, retry, retry_if_result, stop_after_attempt,
wait_fixed)
from base.base_crawler import AbstractLogin
import config
from base.base_crawler import AbstractLogin
from tools import utils

View File

@ -1,5 +1,6 @@
import asyncio
import copy
import json
import urllib.parse
from typing import Any, Callable, Dict, List, Optional
@ -119,14 +120,19 @@ class DOUYINClient(AbstractApiClient):
params = {
"keyword": urllib.parse.quote(keyword),
"search_channel": search_channel.value,
"sort_type": sort_type.value,
"publish_time": publish_time.value,
"search_source": "normal_search",
"query_correct_type": "1",
"is_filter_search": "0",
"query_correct_type": 1,
"is_filter_search": 0,
"offset": offset,
"count": 10 # must be set to 10
}
if sort_type != SearchSortType.GENERAL or publish_time != PublishTimeType.UNLIMITED:
params["filter_selected"] = urllib.parse.quote(json.dumps({
"sort_type": str(sort_type.value),
"publish_time": str(publish_time.value)
}))
params["is_filter_search"] = 1
params["search_source"] = "tab_search"
referer_url = "https://www.douyin.com/search/" + keyword
referer_url += f"?publish_time={publish_time.value}&sort_type={sort_type.value}&type=general"
headers = copy.copy(self.headers)

View File

@ -90,9 +90,10 @@ class DouYinCrawler(AbstractCrawler):
page += 1
continue
try:
utils.logger.info(f"[DouYinCrawler.search] search douyin keyword: {keyword}, page: {page}")
posts_res = await self.dy_client.search_info_by_keyword(keyword=keyword,
offset=page * dy_limit_count,
publish_time=PublishTimeType.UNLIMITED
offset=page * dy_limit_count - dy_limit_count,
publish_time=PublishTimeType(config.PUBLISH_TIME_TYPE)
)
except DataFetchError:
utils.logger.error(f"[DouYinCrawler.search] search douyin keyword: {keyword} failed")

View File

@ -12,13 +12,12 @@ class SearchChannelType(Enum):
class SearchSortType(Enum):
"""search sort type"""
GENERAL = 0 # 综合排序
LATEST = 1 # 最新发布
MOST_LIKE = 2 # 最多点赞
MOST_LIKE = 1 # 最多点赞
LATEST = 2 # 最新发布
class PublishTimeType(Enum):
"""publish time type"""
UNLIMITED = 0 # 不限
ONE_DAY = 1 # 一天内
ONE_WEEK = 2 # 一周内
SIX_MONTH = 3 # 半年内
ONE_WEEK = 7 # 一周内
SIX_MONTH = 180 # 半年内

View File

@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
import asyncio
import json
from typing import Any, Callable, Dict, Optional
from typing import Any, Callable, Dict, List, Optional
from urllib.parse import urlencode
import httpx
@ -67,7 +67,7 @@ class KuaiShouClient(AbstractApiClient):
"variables": {
"ftype": 1,
},
"query": self.graphql.get("vision_profile")
"query": self.graphql.get("vision_profile_user_list")
}
res = await self.post("", post_data)
if res.get("visionProfileUserList", {}).get("result") == 1:
@ -129,17 +129,60 @@ class KuaiShouClient(AbstractApiClient):
"pcursor": pcursor
},
"query": self.graphql.get("comment_list")
}
return await self.post("", post_data)
async def get_video_all_comments(self, photo_id: str, crawl_interval: float = 1.0, is_fetch_sub_comments=False,
callback: Optional[Callable] = None):
async def get_video_sub_comments(
    self, photo_id: str, rootCommentId: str, pcursor: str = ""
) -> Dict:
    """Fetch one page of second-level (reply) comments for a video.

    :param photo_id: id of the video whose sub comments are wanted
    :param rootCommentId: id of the first-level comment whose replies to page
    :param pcursor: pagination cursor from the previous page, "" for the first page
    :return: raw GraphQL response dict
    """
    variables = {
        "photoId": photo_id,
        "pcursor": pcursor,
        "rootCommentId": rootCommentId,
    }
    payload = {
        "operationName": "visionSubCommentList",
        "variables": variables,
        "query": self.graphql.get("vision_sub_comment_list"),
    }
    return await self.post("", payload)
async def get_creator_profile(self, userId: str) -> Dict:
    """Fetch a creator's profile card via the ``visionProfile`` GraphQL query.

    :param userId: kuaishou user id (the id in /profile/<id> URLs)
    :return: raw GraphQL response dict
    """
    payload = {
        "operationName": "visionProfile",
        "variables": {"userId": userId},
        "query": self.graphql.get("vision_profile"),
    }
    return await self.post("", payload)
async def get_video_by_creater(self, userId: str, pcursor: str = "") -> Dict:
    """Fetch one page of videos posted by a creator.

    (Method name kept as-is for callers; "creater" is a historical spelling.)

    :param userId: kuaishou user id
    :param pcursor: pagination cursor from the previous page, "" for the first page
    :return: raw GraphQL response dict
    """
    variables = {
        "page": "profile",
        "pcursor": pcursor,
        "userId": userId,
    }
    payload = {
        "operationName": "visionProfilePhotoList",
        "variables": variables,
        "query": self.graphql.get("vision_profile_photo_list"),
    }
    return await self.post("", payload)
async def get_video_all_comments(
self,
photo_id: str,
crawl_interval: float = 1.0,
callback: Optional[Callable] = None,
):
"""
get video all comments include sub comments
:param photo_id:
:param crawl_interval:
:param is_fetch_sub_comments:
:param callback:
:return:
"""
@ -158,7 +201,107 @@ class KuaiShouClient(AbstractApiClient):
result.extend(comments)
await asyncio.sleep(crawl_interval)
if not is_fetch_sub_comments:
continue
# todo handle get sub comments
sub_comments = await self.get_comments_all_sub_comments(
comments, photo_id, crawl_interval, callback
)
result.extend(sub_comments)
return result
async def get_comments_all_sub_comments(
    self,
    comments: List[Dict],
    photo_id,
    crawl_interval: float = 1.0,
    callback: Optional[Callable] = None,
) -> List[Dict]:
    """Fetch every second-level comment under the given first-level comments.

    Pages each comment's reply thread until the server reports "no_more".

    Args:
        comments: list of first-level comment dicts
        photo_id: id of the video the comments belong to
        crawl_interval: delay (seconds) between successive page fetches
        callback: optional coroutine invoked with each batch of sub comments

    Returns:
        list of sub-comment dicts fetched by paging (the ``subComments``
        preloaded inline on each first-level comment are only passed to
        ``callback``, matching the original behavior)
    """
    if not config.ENABLE_GET_SUB_COMMENTS:
        utils.logger.info(
            f"[KuaiShouClient.get_comments_all_sub_comments] Crawling sub_comment mode is not enabled"
        )
        return []

    result: List[Dict] = []
    for comment in comments:
        inline_sub_comments = comment.get("subComments")
        if inline_sub_comments and callback:
            await callback(photo_id, inline_sub_comments)

        if comment.get("subCommentsPcursor") == "no_more":
            # server already says this thread has no further replies to page
            continue

        root_comment_id = comment.get("commentId")
        sub_comment_pcursor = ""
        while sub_comment_pcursor != "no_more":
            comments_res = await self.get_video_sub_comments(
                photo_id, root_comment_id, sub_comment_pcursor
            )
            vision_sub_comment_list = comments_res.get("visionSubCommentList", {})
            sub_comment_pcursor = vision_sub_comment_list.get("pcursor", "no_more")
            # BUG FIX: the original rebound the ``comments`` parameter here,
            # clobbering the list being iterated; use a dedicated local, and a
            # list (not dict) default so extend() cannot splice in dict keys.
            page_sub_comments = vision_sub_comment_list.get("subComments", [])
            if callback:
                await callback(photo_id, page_sub_comments)
            await asyncio.sleep(crawl_interval)
            result.extend(page_sub_comments)
    return result
async def get_creator_info(self, user_id: str) -> Dict:
    """Return the ``userProfile`` section of a creator's home page.

    e.g. https://www.kuaishou.com/profile/3x4jtnbfter525a (kuaishou user home page)

    NOTE(review): this reads ``userProfile`` directly off the response of
    get_creator_profile — presumably ``self.post`` already unwraps the
    ``visionProfile`` envelope; confirm against the actual API response shape.
    """
    profile_res = await self.get_creator_profile(user_id)
    return profile_res.get("userProfile")
async def get_all_videos_by_creator(
    self,
    user_id: str,
    crawl_interval: float = 1.0,
    callback: Optional[Callable] = None,
) -> List[Dict]:
    """Collect every video a creator has posted, paging until exhausted.

    Args:
        user_id: creator's user id
        crawl_interval: delay (seconds) between successive page fetches
        callback: optional coroutine invoked with each fetched page of videos

    Returns:
        list of raw video (feed) dicts
    """
    all_videos: List[Dict] = []
    cursor = ""
    while cursor != "no_more":
        videos_res = await self.get_video_by_creater(user_id, cursor)
        if not videos_res:
            # an empty response usually means the account is inaccessible
            utils.logger.error(
                f"[KuaiShouClient.get_all_videos_by_creator] The current creator may have been banned by ks, so they cannot access the data."
            )
            break
        vision_profile_photo_list = videos_res.get("visionProfilePhotoList", {})
        cursor = vision_profile_photo_list.get("pcursor", "")
        videos = vision_profile_photo_list.get("feeds", [])
        utils.logger.info(
            f"[KuaiShouClient.get_all_videos_by_creator] got user_id:{user_id} videos len : {len(videos)}"
        )
        if callback:
            await callback(videos)
        await asyncio.sleep(crawl_interval)
        all_videos.extend(videos)
    return all_videos

View File

@ -65,11 +65,14 @@ class KuaishouCrawler(AbstractCrawler):
crawler_type_var.set(config.CRAWLER_TYPE)
if config.CRAWLER_TYPE == "search":
# Search for notes and retrieve their comment information.
# Search for videos and retrieve their comment information.
await self.search()
elif config.CRAWLER_TYPE == "detail":
# Get the information and comments of the specified post
await self.get_specified_videos()
elif config.CRAWLER_TYPE == "creator":
# Get creator's information and their videos and comments
await self.get_creators_and_videos()
else:
pass
@ -89,7 +92,7 @@ class KuaishouCrawler(AbstractCrawler):
utils.logger.info(f"[KuaishouCrawler.search] Skip page: {page}")
page += 1
continue
utils.logger.info(f"[KuaishouCrawler.search] search kuaishou keyword: {keyword}, page: {page}")
video_id_list: List[str] = []
videos_res = await self.ks_client.search_info_by_keyword(
keyword=keyword,
@ -135,7 +138,7 @@ class KuaishouCrawler(AbstractCrawler):
utils.logger.error(f"[KuaishouCrawler.get_video_info_task] Get video detail error: {ex}")
return None
except KeyError as ex:
utils.logger.error(f"[KuaishouCrawler.get_video_info_task] have not fund note detail video_id:{video_id}, err: {ex}")
utils.logger.error(f"[KuaishouCrawler.get_video_info_task] have not fund video detail video_id:{video_id}, err: {ex}")
return None
async def batch_get_video_comments(self, video_id_list: List[str]):
@ -145,7 +148,7 @@ class KuaishouCrawler(AbstractCrawler):
:return:
"""
if not config.ENABLE_GET_COMMENTS:
utils.logger.info(f"[KuaishouCrawler.batch_get_note_comments] Crawling comment mode is not enabled")
utils.logger.info(f"[KuaishouCrawler.batch_get_video_comments] Crawling comment mode is not enabled")
return
utils.logger.info(f"[KuaishouCrawler.batch_get_video_comments] video ids:{video_id_list}")
@ -200,10 +203,10 @@ class KuaishouCrawler(AbstractCrawler):
return playwright_proxy, httpx_proxy
async def create_ks_client(self, httpx_proxy: Optional[str]) -> KuaiShouClient:
"""Create xhs client"""
"""Create ks client"""
utils.logger.info("[KuaishouCrawler.create_ks_client] Begin create kuaishou API client ...")
cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies())
xhs_client_obj = KuaiShouClient(
ks_client_obj = KuaiShouClient(
proxies=httpx_proxy,
headers={
"User-Agent": self.user_agent,
@ -215,7 +218,7 @@ class KuaishouCrawler(AbstractCrawler):
playwright_page=self.context_page,
cookie_dict=cookie_dict,
)
return xhs_client_obj
return ks_client_obj
async def launch_browser(
self,
@ -246,6 +249,39 @@ class KuaishouCrawler(AbstractCrawler):
)
return browser_context
async def get_creators_and_videos(self) -> None:
    """Get creator's videos and retrieve their comment information.

    For every creator id in ``config.KS_CREATOR_ID_LIST``:
    1. fetch and persist the creator's profile,
    2. fetch all of the creator's videos (each page saved via callback),
    3. batch-fetch comments for the collected video ids.
    """
    utils.logger.info("[KuaiShouCrawler.get_creators_and_videos] Begin get kuaishou creators")
    for user_id in config.KS_CREATOR_ID_LIST:
        # get creator detail info from web html content
        # (fixed misspelled local ``createor_info``)
        creator_info: Dict = await self.ks_client.get_creator_info(user_id=user_id)
        if creator_info:
            await kuaishou_store.save_creator(user_id, creator=creator_info)

        # Get all video information of the creator
        all_video_list = await self.ks_client.get_all_videos_by_creator(
            user_id=user_id,
            crawl_interval=random.random(),
            callback=self.fetch_creator_video_detail,
        )

        video_ids = [video_item.get("photo", {}).get("id") for video_item in all_video_list]
        await self.batch_get_video_comments(video_ids)
async def fetch_creator_video_detail(self, video_list: List[Dict]):
    """
    Concurrently obtain the specified post list and save the data
    """
    # bound concurrency with a shared semaphore passed to each task
    semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
    tasks = [
        self.get_video_info_task(item.get("photo", {}).get("id"), semaphore)
        for item in video_list
    ]
    details = await asyncio.gather(*tasks)
    for detail in details:
        if detail is not None:
            await kuaishou_store.update_kuaishou_video(detail)
async def close(self):
"""Close browser context"""
await self.browser_context.close()

View File

@ -11,7 +11,7 @@ class KuaiShouGraphQL:
self.load_graphql_queries()
def load_graphql_queries(self):
graphql_files = ["search_query.graphql", "video_detail.graphql", "comment_list.graphql", "vision_profile.graphql"]
graphql_files = ["search_query.graphql", "video_detail.graphql", "comment_list.graphql", "vision_profile.graphql","vision_profile_photo_list.graphql","vision_profile_user_list.graphql","vision_sub_comment_list.graphql"]
for file in graphql_files:
with open(self.graphql_dir + file, mode="r") as f:

View File

@ -1,16 +1,27 @@
query visionProfileUserList($pcursor: String, $ftype: Int) {
visionProfileUserList(pcursor: $pcursor, ftype: $ftype) {
query visionProfile($userId: String) {
visionProfile(userId: $userId) {
result
fols {
hostName
userProfile {
ownerCount {
fan
photo
follow
photo_public
__typename
}
profile {
gender
user_name
user_id
headurl
user_text
user_profile_bg_url
__typename
}
isFollowing
user_id
__typename
}
hostName
pcursor
__typename
}
}

View File

@ -0,0 +1,110 @@
# Field selection for a regular photo (video) entity: ids, caption, counters,
# playback URLs/manifests and cover/risk metadata.
fragment photoContent on PhotoEntity {
__typename
id
duration
caption
originCaption
likeCount
viewCount
commentCount
realLikeCount
coverUrl
photoUrl
photoH265Url
manifest
manifestH265
videoResource
coverUrls {
url
__typename
}
timestamp
expTag
animatedCoverUrl
distance
videoRatio
liked
stereoType
profileUserTopPhoto
musicBlocked
riskTagContent
riskTagUrl
}
# Same selection for recommended-photo entities; feeds may carry either type,
# so the photo field below spreads both fragments.
fragment recoPhotoFragment on recoPhotoEntity {
__typename
id
duration
caption
originCaption
likeCount
viewCount
commentCount
realLikeCount
coverUrl
photoUrl
photoH265Url
manifest
manifestH265
videoResource
coverUrls {
url
__typename
}
timestamp
expTag
animatedCoverUrl
distance
videoRatio
liked
stereoType
profileUserTopPhoto
musicBlocked
riskTagContent
riskTagUrl
}
# One feed entry: the author, the photo payload, and feed-level metadata.
fragment feedContent on Feed {
type
author {
id
name
headerUrl
following
headerUrls {
url
__typename
}
__typename
}
photo {
...photoContent
...recoPhotoFragment
__typename
}
canAddComment
llsid
status
currentPcursor
tags {
type
name
__typename
}
__typename
}
# Page through the videos posted by one creator; pcursor drives pagination
# ("no_more" marks the last page).
query visionProfilePhotoList($pcursor: String, $userId: String, $page: String, $webPageArea: String) {
visionProfilePhotoList(pcursor: $pcursor, userId: $userId, page: $page, webPageArea: $webPageArea) {
result
llsid
webPageArea
feeds {
...feedContent
__typename
}
hostName
pcursor
__typename
}
}

View File

@ -0,0 +1,16 @@
# Page through the accounts the logged-in user follows (ftype selects which
# relationship list). The client uses result == 1 on this query as a
# logged-in/session check — see KuaiShouClient's pong logic.
query visionProfileUserList($pcursor: String, $ftype: Int) {
visionProfileUserList(pcursor: $pcursor, ftype: $ftype) {
result
fols {
user_name
headurl
user_text
isFollowing
user_id
__typename
}
hostName
pcursor
__typename
}
}

View File

@ -0,0 +1,22 @@
# Page through the second-level (reply) comments under one first-level
# comment of a video; pcursor "no_more" marks the last page.
# NOTE(review): declared as a mutation although it only reads data —
# presumably mirroring the upstream kuaishou API schema; confirm before
# changing to `query`.
mutation visionSubCommentList($photoId: String, $rootCommentId: String, $pcursor: String) {
visionSubCommentList(photoId: $photoId, rootCommentId: $rootCommentId, pcursor: $pcursor) {
pcursor
subComments {
commentId
authorId
authorName
content
headurl
timestamp
likedCount
realLikedCount
liked
status
authorLiked
replyToUserName
replyTo
__typename
}
__typename
}
}

View File

@ -7,6 +7,7 @@ from playwright.async_api import BrowserContext, Page
from tenacity import (RetryError, retry, retry_if_result, stop_after_attempt,
wait_fixed)
import config
from base.base_crawler import AbstractLogin
from tools import utils

View File

@ -108,7 +108,7 @@ class WeiboCrawler(AbstractCrawler):
utils.logger.info(f"[WeiboCrawler.search] Skip page: {page}")
page += 1
continue
utils.logger.info(f"[WeiboCrawler.search] search weibo keyword: {keyword}, page: {page}")
search_res = await self.wb_client.get_note_by_keyword(
keyword=keyword,
page=page,

View File

@ -12,6 +12,7 @@ from playwright.async_api import BrowserContext, Page
from tenacity import (RetryError, retry, retry_if_result, stop_after_attempt,
wait_fixed)
import config
from base.base_crawler import AbstractLogin
from tools import utils

View File

@ -102,6 +102,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
continue
try:
utils.logger.info(f"[XiaoHongShuCrawler.search] search xhs keyword: {keyword}, page: {page}")
note_id_list: List[str] = []
notes_res = await self.xhs_client.get_note_by_keyword(
keyword=keyword,

Binary file not shown.

Before

Width:  |  Height:  |  Size: 169 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 171 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 175 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 242 KiB

After

Width:  |  Height:  |  Size: 241 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 115 KiB

View File

@ -13,9 +13,9 @@ import aiofiles
import config
from base.base_crawler import AbstractStore
from tools import utils
from tools import utils, words
from var import crawler_type_var
from tools import words
def calculate_number_of_files(file_store_path: str) -> int:
"""计算数据保存文件的前部分排序数字,支持每次运行代码不写到同一个文件中

View File

@ -11,10 +11,10 @@ from typing import Dict
import aiofiles
import config
from base.base_crawler import AbstractStore
from tools import utils, words
from var import crawler_type_var
import config
def calculate_number_of_files(file_store_path: str) -> int:

View File

@ -76,3 +76,22 @@ async def update_ks_video_comment(video_id: str, comment_item: Dict):
utils.logger.info(
f"[store.kuaishou.update_ks_video_comment] Kuaishou video comment: {comment_id}, content: {save_comment_item.get('content')}")
await KuaishouStoreFactory.create_store().store_comment(comment_item=save_comment_item)
async def save_creator(user_id: str, creator: Dict):
    """Normalize a kuaishou creator profile and persist it via the store factory.

    Args:
        user_id: kuaishou user id the profile belongs to
        creator: raw creator dict holding 'ownerCount' and 'profile' sections
    """
    owner_count = creator.get('ownerCount', {})
    profile = creator.get('profile', {})
    local_db_item = {
        'user_id': user_id,
        'nickname': profile.get('user_name'),
        # BUG FIX: both ternary branches were '' (the 女/男 labels were lost),
        # so the stored gender never distinguished anything; "F" marks female.
        'gender': '女' if profile.get('gender') == "F" else '男',
        'avatar': profile.get('headurl'),
        'desc': profile.get('user_text'),
        'ip_location': "",
        'follows': owner_count.get("follow"),
        'fans': owner_count.get("fan"),
        'interaction': owner_count.get("photo_public"),
        "last_modify_ts": utils.get_current_timestamp(),
    }
    utils.logger.info(f"[store.kuaishou.save_creator] creator:{local_db_item}")
    await KuaishouStoreFactory.create_store().store_creator(local_db_item)

View File

@ -11,10 +11,11 @@ from typing import Dict
import aiofiles
import config
from base.base_crawler import AbstractStore
from tools import utils, words
from var import crawler_type_var
import config
def calculate_number_of_files(file_store_path: str) -> int:
"""计算数据保存文件的前部分排序数字,支持每次运行代码不写到同一个文件中
@ -205,3 +206,14 @@ class KuaishouJsonStoreImplement(AbstractStore):
"""
await self.save_data_to_json(comment_item, "comments")
async def store_creator(self, creator: Dict):
    """
    Kuaishou creator JSON storage implementation
    (docstring fixed: it was copy-pasted from the content store)
    Args:
        creator: creator dict

    Returns:

    """
    await self.save_data_to_json(creator, "creator")

View File

@ -11,10 +11,11 @@ from typing import Dict
import aiofiles
import config
from base.base_crawler import AbstractStore
from tools import utils, words
from var import crawler_type_var
import config
def calculate_number_of_files(file_store_path: str) -> int:
"""计算数据保存文件的前部分排序数字,支持每次运行代码不写到同一个文件中

View File

@ -113,7 +113,7 @@ async def save_creator(user_id: str, creator: Dict):
'gender': '' if user_info.get('gender') == 1 else '',
'avatar': user_info.get('images'),
'desc': user_info.get('desc'),
'ip_location': user_info.get('ip_location'),
'ip_location': user_info.get('ipLocation'),
'follows': follows,
'fans': fans,
'interaction': interaction,

View File

@ -11,10 +11,11 @@ from typing import Dict
import aiofiles
import config
from base.base_crawler import AbstractStore
from tools import utils, words
from var import crawler_type_var
import config
def calculate_number_of_files(file_store_path: str) -> int:
"""计算数据保存文件的前部分排序数字,支持每次运行代码不写到同一个文件中

View File

@ -1,10 +1,12 @@
import aiofiles
import asyncio
import jieba
from collections import Counter
from wordcloud import WordCloud
import json
from collections import Counter
import aiofiles
import jieba
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import config
from tools import utils