refactor: 代码优化

This commit is contained in:
Relakkes 2024-01-16 00:40:07 +08:00
parent e490123fcd
commit e0f9a487e4
9 changed files with 163 additions and 114 deletions

View File

@ -11,9 +11,6 @@ ENABLE_IP_PROXY = False
# 代理IP池数量 # 代理IP池数量
IP_PROXY_POOL_COUNT = 2 IP_PROXY_POOL_COUNT = 2
# 重试时间
RETRY_INTERVAL = 60 * 30 # 30 minutes
# 设置为True不会打开浏览器无头浏览器设置False会打开一个浏览器小红书如果一直扫码登录不通过打开浏览器手动过一下滑动验证码 # 设置为True不会打开浏览器无头浏览器设置False会打开一个浏览器小红书如果一直扫码登录不通过打开浏览器手动过一下滑动验证码
HEADLESS = True HEADLESS = True

View File

@ -1,6 +1,6 @@
import asyncio import asyncio
import json import json
from typing import Dict from typing import Callable, Dict, List, Optional
from urllib.parse import urlencode from urllib.parse import urlencode
import httpx import httpx
@ -34,7 +34,16 @@ class XHSClient:
self.playwright_page = playwright_page self.playwright_page = playwright_page
self.cookie_dict = cookie_dict self.cookie_dict = cookie_dict
async def _pre_headers(self, url: str, data=None): async def _pre_headers(self, url: str, data=None) -> Dict:
"""
请求头参数签名
Args:
url:
data:
Returns:
"""
encrypt_params = await self.playwright_page.evaluate("([url, data]) => window._webmsxyw(url,data)", [url, data]) encrypt_params = await self.playwright_page.evaluate("([url, data]) => window._webmsxyw(url,data)", [url, data])
local_storage = await self.playwright_page.evaluate("() => window.localStorage") local_storage = await self.playwright_page.evaluate("() => window.localStorage")
signs = sign( signs = sign(
@ -54,6 +63,16 @@ class XHSClient:
return self.headers return self.headers
async def request(self, method, url, **kwargs) -> Dict: async def request(self, method, url, **kwargs) -> Dict:
"""
封装httpx的公共请求方法对请求响应做一些处理
Args:
method: 请求方法
url: 请求的URL
**kwargs: 其他请求参数例如请求头请求体等
Returns:
"""
async with httpx.AsyncClient(proxies=self.proxies) as client: async with httpx.AsyncClient(proxies=self.proxies) as client:
response = await client.request( response = await client.request(
method, url, timeout=self.timeout, method, url, timeout=self.timeout,
@ -68,6 +87,15 @@ class XHSClient:
raise DataFetchError(data.get("msg", None)) raise DataFetchError(data.get("msg", None))
async def get(self, uri: str, params=None) -> Dict: async def get(self, uri: str, params=None) -> Dict:
"""
GET请求对请求头签名
Args:
uri: 请求路由
params: 请求参数
Returns:
"""
final_uri = uri final_uri = uri
if isinstance(params, dict): if isinstance(params, dict):
final_uri = (f"{uri}?" final_uri = (f"{uri}?"
@ -76,12 +104,26 @@ class XHSClient:
return await self.request(method="GET", url=f"{self._host}{final_uri}", headers=headers) return await self.request(method="GET", url=f"{self._host}{final_uri}", headers=headers)
async def post(self, uri: str, data: dict) -> Dict: async def post(self, uri: str, data: dict) -> Dict:
"""
POST请求对请求头签名
Args:
uri: 请求路由
data: 请求体参数
Returns:
"""
headers = await self._pre_headers(uri, data) headers = await self._pre_headers(uri, data)
json_str = json.dumps(data, separators=(',', ':'), ensure_ascii=False) json_str = json.dumps(data, separators=(',', ':'), ensure_ascii=False)
return await self.request(method="POST", url=f"{self._host}{uri}", return await self.request(method="POST", url=f"{self._host}{uri}",
data=json_str, headers=headers) data=json_str, headers=headers)
async def pong(self) -> bool: async def pong(self) -> bool:
"""
用于检查登录态是否失效了
Returns:
"""
"""get a note to check if login state is ok""" """get a note to check if login state is ok"""
utils.logger.info("[XHSClient.pong] Begin to pong xhs...") utils.logger.info("[XHSClient.pong] Begin to pong xhs...")
ping_flag = False ping_flag = False
@ -95,6 +137,14 @@ class XHSClient:
return ping_flag return ping_flag
async def update_cookies(self, browser_context: BrowserContext): async def update_cookies(self, browser_context: BrowserContext):
"""
API客户端提供的更新cookies方法一般情况下登录成功后会调用此方法
Args:
browser_context: 浏览器上下文对象
Returns:
"""
cookie_str, cookie_dict = utils.convert_cookies(await browser_context.cookies()) cookie_str, cookie_dict = utils.convert_cookies(await browser_context.cookies())
self.headers["Cookie"] = cookie_str self.headers["Cookie"] = cookie_str
self.cookie_dict = cookie_dict self.cookie_dict = cookie_dict
@ -105,14 +155,17 @@ class XHSClient:
sort: SearchSortType = SearchSortType.GENERAL, sort: SearchSortType = SearchSortType.GENERAL,
note_type: SearchNoteType = SearchNoteType.ALL note_type: SearchNoteType = SearchNoteType.ALL
) -> Dict: ) -> Dict:
"""search note by keyword """
根据关键词搜索笔记
Args:
keyword: 关键词参数
page: 分页第几页
page_size: 分页数据长度
sort: 搜索结果排序指定
note_type: 搜索的笔记类型
Returns:
:param keyword: what notes you want to search
:param page: page number, defaults to 1
:param page_size: page size, defaults to 20
:param sort: sort ordering, defaults to SearchSortType.GENERAL
:param note_type: note type, defaults to SearchNoteType.ALL
:return: {has_more: true, items: []}
""" """
uri = "/api/sns/web/v1/search/notes" uri = "/api/sns/web/v1/search/notes"
data = { data = {
@ -127,8 +180,12 @@ class XHSClient:
async def get_note_by_id(self, note_id: str) -> Dict: async def get_note_by_id(self, note_id: str) -> Dict:
""" """
:param note_id: note_id you want to fetch 获取笔记详情API
:return: {"time":1679019883000,"user":{"nickname":"nickname","avatar":"avatar","user_id":"user_id"},"image_list":[{"url":"https://sns-img-qc.xhscdn.com/c8e505ca-4e5f-44be-fe1c-ca0205a38bad","trace_id":"1000g00826s57r6cfu0005ossb1e9gk8c65d0c80","file_id":"c8e505ca-4e5f-44be-fe1c-ca0205a38bad","height":1920,"width":1440}],"tag_list":[{"id":"5be78cdfdb601f000100d0bc","name":"jk","type":"topic"}],"desc":"裙裙","interact_info":{"followed":false,"liked":false,"liked_count":"1732","collected":false,"collected_count":"453","comment_count":"30","share_count":"41"},"at_user_list":[],"last_update_time":1679019884000,"note_id":"6413cf6b00000000270115b5","type":"normal","title":"title"} Args:
note_id:笔记ID
Returns:
""" """
data = {"source_note_id": note_id} data = {"source_note_id": note_id}
uri = "/api/sns/web/v1/feed" uri = "/api/sns/web/v1/feed"
@ -140,10 +197,14 @@ class XHSClient:
return dict() return dict()
async def get_note_comments(self, note_id: str, cursor: str = "") -> Dict: async def get_note_comments(self, note_id: str, cursor: str = "") -> Dict:
"""get note comments """
:param note_id: note id you want to fetch 获取一级评论的API
:param cursor: last you get cursor, defaults to "" Args:
:return: {"has_more": true,"cursor": "6422442d000000000700dcdb",comments: [],"user_id": "63273a77000000002303cc9b","time": 1681566542930} note_id: 笔记ID
cursor: 分页游标
Returns:
""" """
uri = "/api/sns/web/v2/comment/page" uri = "/api/sns/web/v2/comment/page"
params = { params = {
@ -152,18 +213,17 @@ class XHSClient:
} }
return await self.get(uri, params) return await self.get(uri, params)
async def get_note_sub_comments( async def get_note_sub_comments(self, note_id: str, root_comment_id: str, num: int = 30, cursor: str = ""):
self, note_id: str,
root_comment_id: str,
num: int = 30, cursor: str = ""
):
""" """
get note sub comments 获取指定父评论下的子评论的API
:param note_id: note id you want to fetch Args:
:param root_comment_id: parent comment id note_id: 子评论的帖子ID
:param num: recommend 30, if num greater 30, it only return 30 comments root_comment_id: 根评论ID
:param cursor: last you get cursor, defaults to "" num: 分页数量
:return: {"has_more": true,"cursor": "6422442d000000000700dcdb",comments: [],"user_id": "63273a77000000002303cc9b","time": 1681566542930} cursor: 分页游标
Returns:
""" """
uri = "/api/sns/web/v2/comment/sub/page" uri = "/api/sns/web/v2/comment/sub/page"
params = { params = {
@ -174,15 +234,18 @@ class XHSClient:
} }
return await self.get(uri, params) return await self.get(uri, params)
async def get_note_all_comments(self, note_id: str, crawl_interval: float = 1.0, is_fetch_sub_comments=False): async def get_note_all_comments(self, note_id: str, crawl_interval: float = 1.0,
""" callback: Optional[Callable] = None) -> List[Dict]:
get note all comments include sub comments
:param note_id:
:param crawl_interval:
:param is_fetch_sub_comments:
:return:
""" """
获取指定笔记下的所有一级评论该方法会一直查找一个帖子下的所有评论信息
Args:
note_id: 笔记ID
crawl_interval: 爬取一次笔记的延迟单位
callback: 一次笔记爬取结束后
Returns:
"""
result = [] result = []
comments_has_more = True comments_has_more = True
comments_cursor = "" comments_cursor = ""
@ -190,34 +253,13 @@ class XHSClient:
comments_res = await self.get_note_comments(note_id, comments_cursor) comments_res = await self.get_note_comments(note_id, comments_cursor)
comments_has_more = comments_res.get("has_more", False) comments_has_more = comments_res.get("has_more", False)
comments_cursor = comments_res.get("cursor", "") comments_cursor = comments_res.get("cursor", "")
# Check if 'comments' key exists in the response
if "comments" not in comments_res: if "comments" not in comments_res:
# Handle the absence of 'comments' key appropriately utils.logger.info(
# For example, log an error message, break from the loop, etc. f"[XHSClient.get_note_all_comments] No 'comments' key found in response: {comments_res}")
# This is just an example:
utils.logger.info(f"[XHSClient.get_note_all_comments] No 'comments' key found in response: {comments_res}")
break break
comments = comments_res["comments"] comments = comments_res["comments"]
if not is_fetch_sub_comments: if callback:
await callback(note_id, comments)
await asyncio.sleep(crawl_interval)
result.extend(comments) result.extend(comments)
continue
# handle get sub comments
for comment in comments:
result.append(comment)
cur_sub_comment_count = int(comment["sub_comment_count"])
cur_sub_comments = comment["sub_comments"]
result.extend(cur_sub_comments)
sub_comments_has_more = comment["sub_comment_has_more"] and len(
cur_sub_comments) < cur_sub_comment_count
sub_comment_cursor = comment["sub_comment_cursor"]
while sub_comments_has_more:
page_num = 30
sub_comments_res = await self.get_note_sub_comments(note_id, comment["id"], num=page_num,
cursor=sub_comment_cursor)
sub_comments = sub_comments_res["comments"]
sub_comments_has_more = sub_comments_res["has_more"] and len(sub_comments) == page_num
sub_comment_cursor = sub_comments_res["cursor"]
result.extend(sub_comments)
await asyncio.sleep(crawl_interval)
await asyncio.sleep(crawl_interval)
return result return result

View File

@ -139,12 +139,14 @@ class XiaoHongShuCrawler(AbstractCrawler):
utils.logger.error(f"[XiaoHongShuCrawler.get_note_detail] Get note detail error: {ex}") utils.logger.error(f"[XiaoHongShuCrawler.get_note_detail] Get note detail error: {ex}")
return None return None
except KeyError as ex: except KeyError as ex:
utils.logger.error(f"[XiaoHongShuCrawler.get_note_detail] have not fund note detail note_id:{note_id}, err: {ex}") utils.logger.error(
f"[XiaoHongShuCrawler.get_note_detail] have not fund note detail note_id:{note_id}, err: {ex}")
return None return None
async def batch_get_note_comments(self, note_list: List[str]): async def batch_get_note_comments(self, note_list: List[str]):
"""Batch get note comments""" """Batch get note comments"""
utils.logger.info(f"[XiaoHongShuCrawler.batch_get_note_comments] Begin batch get note comments, note list: {note_list}") utils.logger.info(
f"[XiaoHongShuCrawler.batch_get_note_comments] Begin batch get note comments, note list: {note_list}")
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM) semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
task_list: List[Task] = [] task_list: List[Task] = []
for note_id in note_list: for note_id in note_list:
@ -156,25 +158,11 @@ class XiaoHongShuCrawler(AbstractCrawler):
"""Get note comments with keyword filtering and quantity limitation""" """Get note comments with keyword filtering and quantity limitation"""
async with semaphore: async with semaphore:
utils.logger.info(f"[XiaoHongShuCrawler.get_comments] Begin get note id comments {note_id}") utils.logger.info(f"[XiaoHongShuCrawler.get_comments] Begin get note id comments {note_id}")
all_comments = await self.xhs_client.get_note_all_comments(note_id=note_id, crawl_interval=random.random()) await self.xhs_client.get_note_all_comments(
note_id=note_id,
# 从配置文件中读取关键词和数量限制 crawl_interval=random.random(),
keywords = getattr(config, 'COMMENT_KEYWORDS', []) callback=xhs_store.batch_update_xhs_note_comments
max_comments = getattr(config, 'MAX_COMMENTS_PER_POST', 0) )
# 过滤评论
filtered_comments = []
for comment in all_comments:
# 检查评论内容是否包含关键词
if not keywords or any(keyword in comment['content'] for keyword in keywords):
filtered_comments.append(comment)
# 如果达到最大评论数量限制,则停止添加更多评论
if max_comments and len(filtered_comments) >= max_comments:
break
# 更新或保存过滤后的评论
for comment in filtered_comments:
await xhs_store.update_xhs_note_comment(note_id=note_id, comment_item=comment)
@staticmethod @staticmethod
def format_proxy_info(ip_proxy_info: IpInfoModel) -> Tuple[Optional[Dict], Optional[Dict]]: def format_proxy_info(ip_proxy_info: IpInfoModel) -> Tuple[Optional[Dict], Optional[Dict]]:

View File

@ -2,6 +2,7 @@
# @Author : relakkes@gmail.com # @Author : relakkes@gmail.com
# @Time : 2024/1/14 19:34 # @Time : 2024/1/14 19:34
# @Desc : B站存储实现类 # @Desc : B站存储实现类
import asyncio
import csv import csv
import json import json
import os import os
@ -124,6 +125,7 @@ class BiliDbStoreImplement(AbstractStore):
class BiliJsonStoreImplement(AbstractStore): class BiliJsonStoreImplement(AbstractStore):
json_store_path: str = "data/bilibili" json_store_path: str = "data/bilibili"
lock = asyncio.Lock()
def make_save_file_name(self, store_type: str) -> str: def make_save_file_name(self, store_type: str) -> str:
""" """
@ -150,6 +152,7 @@ class BiliJsonStoreImplement(AbstractStore):
save_file_name = self.make_save_file_name(store_type=store_type) save_file_name = self.make_save_file_name(store_type=store_type)
save_data = [] save_data = []
async with self.lock:
if os.path.exists(save_file_name): if os.path.exists(save_file_name):
async with aiofiles.open(save_file_name, 'r', encoding='utf-8') as file: async with aiofiles.open(save_file_name, 'r', encoding='utf-8') as file:
save_data = json.loads(await file.read()) save_data = json.loads(await file.read())

View File

@ -2,6 +2,7 @@
# @Author : relakkes@gmail.com # @Author : relakkes@gmail.com
# @Time : 2024/1/14 18:46 # @Time : 2024/1/14 18:46
# @Desc : 抖音存储实现类 # @Desc : 抖音存储实现类
import asyncio
import csv import csv
import json import json
import os import os
@ -124,6 +125,7 @@ class DouyinDbStoreImplement(AbstractStore):
class DouyinJsonStoreImplement(AbstractStore): class DouyinJsonStoreImplement(AbstractStore):
json_store_path: str = "data/douyin" json_store_path: str = "data/douyin"
lock = asyncio.Lock()
def make_save_file_name(self, store_type: str) -> str: def make_save_file_name(self, store_type: str) -> str:
""" """
@ -150,6 +152,7 @@ class DouyinJsonStoreImplement(AbstractStore):
save_file_name = self.make_save_file_name(store_type=store_type) save_file_name = self.make_save_file_name(store_type=store_type)
save_data = [] save_data = []
async with self.lock:
if os.path.exists(save_file_name): if os.path.exists(save_file_name):
async with aiofiles.open(save_file_name, 'r', encoding='utf-8') as file: async with aiofiles.open(save_file_name, 'r', encoding='utf-8') as file:
save_data = json.loads(await file.read()) save_data = json.loads(await file.read())

View File

@ -2,6 +2,7 @@
# @Author : relakkes@gmail.com # @Author : relakkes@gmail.com
# @Time : 2024/1/14 20:03 # @Time : 2024/1/14 20:03
# @Desc : 快手存储实现类 # @Desc : 快手存储实现类
import asyncio
import csv import csv
import json import json
import os import os
@ -124,6 +125,7 @@ class KuaishouDbStoreImplement(AbstractStore):
class KuaishouJsonStoreImplement(AbstractStore): class KuaishouJsonStoreImplement(AbstractStore):
json_store_path: str = "data/kuaishou" json_store_path: str = "data/kuaishou"
lock = asyncio.Lock()
def make_save_file_name(self, store_type: str) -> str: def make_save_file_name(self, store_type: str) -> str:
""" """
@ -150,6 +152,7 @@ class KuaishouJsonStoreImplement(AbstractStore):
save_file_name = self.make_save_file_name(store_type=store_type) save_file_name = self.make_save_file_name(store_type=store_type)
save_data = [] save_data = []
async with self.lock:
if os.path.exists(save_file_name): if os.path.exists(save_file_name):
async with aiofiles.open(save_file_name, 'r', encoding='utf-8') as file: async with aiofiles.open(save_file_name, 'r', encoding='utf-8') as file:
save_data = json.loads(await file.read()) save_data = json.loads(await file.read())
@ -158,6 +161,7 @@ class KuaishouJsonStoreImplement(AbstractStore):
async with aiofiles.open(save_file_name, 'w', encoding='utf-8') as file: async with aiofiles.open(save_file_name, 'w', encoding='utf-8') as file:
await file.write(json.dumps(save_data, ensure_ascii=False)) await file.write(json.dumps(save_data, ensure_ascii=False))
async def store_content(self, content_item: Dict): async def store_content(self, content_item: Dict):
""" """
content JSON storage implementation content JSON storage implementation

View File

@ -2,6 +2,7 @@
# @Author : relakkes@gmail.com # @Author : relakkes@gmail.com
# @Time : 2024/1/14 21:35 # @Time : 2024/1/14 21:35
# @Desc : 微博存储实现类 # @Desc : 微博存储实现类
import asyncio
import csv import csv
import json import json
import os import os
@ -124,6 +125,7 @@ class WeiboDbStoreImplement(AbstractStore):
class WeiboJsonStoreImplement(AbstractStore): class WeiboJsonStoreImplement(AbstractStore):
json_store_path: str = "data/weibo" json_store_path: str = "data/weibo"
lock = asyncio.Lock()
def make_save_file_name(self, store_type: str) -> str: def make_save_file_name(self, store_type: str) -> str:
""" """
@ -150,6 +152,7 @@ class WeiboJsonStoreImplement(AbstractStore):
save_file_name = self.make_save_file_name(store_type=store_type) save_file_name = self.make_save_file_name(store_type=store_type)
save_data = [] save_data = []
async with self.lock:
if os.path.exists(save_file_name): if os.path.exists(save_file_name):
async with aiofiles.open(save_file_name, 'r', encoding='utf-8') as file: async with aiofiles.open(save_file_name, 'r', encoding='utf-8') as file:
save_data = json.loads(await file.read()) save_data = json.loads(await file.read())
@ -158,6 +161,7 @@ class WeiboJsonStoreImplement(AbstractStore):
async with aiofiles.open(save_file_name, 'w', encoding='utf-8') as file: async with aiofiles.open(save_file_name, 'w', encoding='utf-8') as file:
await file.write(json.dumps(save_data, ensure_ascii=False)) await file.write(json.dumps(save_data, ensure_ascii=False))
async def store_content(self, content_item: Dict): async def store_content(self, content_item: Dict):
""" """
content JSON storage implementation content JSON storage implementation

View File

@ -54,6 +54,11 @@ async def update_xhs_note(note_item: Dict):
utils.logger.info(f"[store.xhs.update_xhs_note] xhs note: {local_db_item}") utils.logger.info(f"[store.xhs.update_xhs_note] xhs note: {local_db_item}")
await XhsStoreFactory.create_store().store_content(local_db_item) await XhsStoreFactory.create_store().store_content(local_db_item)
async def batch_update_xhs_note_comments(note_id: str, comments: List[Dict]):
if not comments:
return
for comment_item in comments:
await update_xhs_note_comment(note_id, comment_item)
async def update_xhs_note_comment(note_id: str, comment_item: Dict): async def update_xhs_note_comment(note_id: str, comment_item: Dict):
user_info = comment_item.get("user_info", {}) user_info = comment_item.get("user_info", {})

View File

@ -2,6 +2,7 @@
# @Author : relakkes@gmail.com # @Author : relakkes@gmail.com
# @Time : 2024/1/14 16:58 # @Time : 2024/1/14 16:58
# @Desc : 小红书存储实现类 # @Desc : 小红书存储实现类
import asyncio
import csv import csv
import json import json
import os import os
@ -123,6 +124,7 @@ class XhsDbStoreImplement(AbstractStore):
class XhsJsonStoreImplement(AbstractStore): class XhsJsonStoreImplement(AbstractStore):
json_store_path: str = "data/xhs" json_store_path: str = "data/xhs"
lock = asyncio.Lock()
def make_save_file_name(self, store_type: str) -> str: def make_save_file_name(self, store_type: str) -> str:
""" """
@ -149,6 +151,7 @@ class XhsJsonStoreImplement(AbstractStore):
save_file_name = self.make_save_file_name(store_type=store_type) save_file_name = self.make_save_file_name(store_type=store_type)
save_data = [] save_data = []
async with self.lock:
if os.path.exists(save_file_name): if os.path.exists(save_file_name):
async with aiofiles.open(save_file_name, 'r', encoding='utf-8') as file: async with aiofiles.open(save_file_name, 'r', encoding='utf-8') as file:
save_data = json.loads(await file.read()) save_data = json.loads(await file.read())