diff --git a/README.md b/README.md index 2288bfb..4ce21da 100644 --- a/README.md +++ b/README.md @@ -76,6 +76,7 @@ ### 数据保存 - 支持保存到关系型数据库(Mysql、PgSQL等) - 支持保存到csv中(data/目录下) +- 支持保存到json中(data/目录下) ## 如何使用 IP 代理 ➡️➡️➡️ [IP代理使用方法](docs/代理使用.md) diff --git a/config/base_config.py b/config/base_config.py index ab5ec4e..990ab25 100644 --- a/config/base_config.py +++ b/config/base_config.py @@ -21,7 +21,7 @@ HEADLESS = True SAVE_LOGIN_STATE = True # 数据保存类型选项配置,支持三种类型:csv、db、json -SAVE_DATA_OPTION = "csv" # csv or db or json +SAVE_DATA_OPTION = "json" # csv or db or json # 用户浏览器缓存的浏览器文件配置 USER_DATA_DIR = "%s_user_data_dir" # %s will be replaced by platform name diff --git a/store/bilibili/bilibili_store_impl.py b/store/bilibili/bilibili_store_impl.py index 1006bd5..8516937 100644 --- a/store/bilibili/bilibili_store_impl.py +++ b/store/bilibili/bilibili_store_impl.py @@ -2,8 +2,9 @@ # @Author : relakkes@gmail.com # @Time : 2024/1/14 19:34 # @Desc : B站存储实现类 - import csv +import json +import os import pathlib from typing import Dict @@ -122,8 +123,59 @@ class BiliDbStoreImplement(AbstractStore): class BiliJsonStoreImplement(AbstractStore): + json_store_path: str = "data/bilibili" + + def make_save_file_name(self, store_type: str) -> str: + """ + make save file name by store type + Args: + store_type: Save type contains content and comments(contents | comments) + + Returns: + + """ + return f"{self.json_store_path}/{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}.json" + + async def save_data_to_json(self, save_item: Dict, store_type: str): + """ + Below is a simple way to save it in json format. 
+ Args: + save_item: save content dict info + store_type: Save type contains content and comments(contents | comments) + + Returns: + + """ + pathlib.Path(self.json_store_path).mkdir(parents=True, exist_ok=True) + save_file_name = self.make_save_file_name(store_type=store_type) + save_data = [] + + if os.path.exists(save_file_name): + async with aiofiles.open(save_file_name, 'r', encoding='utf-8') as file: + save_data = json.loads(await file.read()) + + save_data.append(save_item) + async with aiofiles.open(save_file_name, 'w', encoding='utf-8') as file: + await file.write(json.dumps(save_data, ensure_ascii=False)) + async def store_content(self, content_item: Dict): - pass + """ + content JSON storage implementation + Args: + content_item: + + Returns: + + """ + await self.save_data_to_json(content_item, "contents") async def store_comment(self, comment_item: Dict): - pass + """ + comment JSON storage implementation + Args: + comment_item: + + Returns: + + """ + await self.save_data_to_json(comment_item, "comments") diff --git a/store/douyin/douyin_store_impl.py b/store/douyin/douyin_store_impl.py index 023897d..671f5fa 100644 --- a/store/douyin/douyin_store_impl.py +++ b/store/douyin/douyin_store_impl.py @@ -2,8 +2,9 @@ # @Author : relakkes@gmail.com # @Time : 2024/1/14 18:46 # @Desc : 抖音存储实现类 - import csv +import json +import os import pathlib from typing import Dict @@ -122,9 +123,59 @@ class DouyinDbStoreImplement(AbstractStore): class DouyinJsonStoreImplement(AbstractStore): + json_store_path: str = "data/douyin" + + def make_save_file_name(self, store_type: str) -> str: + """ + make save file name by store type + Args: + store_type: Save type contains content and comments(contents | comments) + + Returns: + + """ + return f"{self.json_store_path}/{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}.json" + + async def save_data_to_json(self, save_item: Dict, store_type: str): + """ + Below is a simple way to save it in json format. 
+ Args: + save_item: save content dict info + store_type: Save type contains content and comments(contents | comments) + + Returns: + + """ + pathlib.Path(self.json_store_path).mkdir(parents=True, exist_ok=True) + save_file_name = self.make_save_file_name(store_type=store_type) + save_data = [] + + if os.path.exists(save_file_name): + async with aiofiles.open(save_file_name, 'r', encoding='utf-8') as file: + save_data = json.loads(await file.read()) + + save_data.append(save_item) + async with aiofiles.open(save_file_name, 'w', encoding='utf-8') as file: + await file.write(json.dumps(save_data, ensure_ascii=False)) + async def store_content(self, content_item: Dict): - pass + """ + content JSON storage implementation + Args: + content_item: + + Returns: + + """ + await self.save_data_to_json(content_item, "contents") async def store_comment(self, comment_item: Dict): - pass + """ + comment JSON storage implementation + Args: + comment_item: + Returns: + + """ + await self.save_data_to_json(comment_item, "comments") diff --git a/store/kuaishou/kuaishou_store_impl.py b/store/kuaishou/kuaishou_store_impl.py index cf29433..e68e635 100644 --- a/store/kuaishou/kuaishou_store_impl.py +++ b/store/kuaishou/kuaishou_store_impl.py @@ -3,6 +3,8 @@ # @Time : 2024/1/14 20:03 # @Desc : 快手存储实现类 import csv +import json +import os import pathlib from typing import Dict @@ -121,8 +123,59 @@ class KuaishouDbStoreImplement(AbstractStore): class KuaishouJsonStoreImplement(AbstractStore): + json_store_path: str = "data/kuaishou" + + def make_save_file_name(self, store_type: str) -> str: + """ + make save file name by store type + Args: + store_type: Save type contains content and comments(contents | comments) + + Returns: + + """ + return f"{self.json_store_path}/{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}.json" + + async def save_data_to_json(self, save_item: Dict, store_type: str): + """ + Below is a simple way to save it in json format. 
+ Args: + save_item: save content dict info + store_type: Save type contains content and comments(contents | comments) + + Returns: + + """ + pathlib.Path(self.json_store_path).mkdir(parents=True, exist_ok=True) + save_file_name = self.make_save_file_name(store_type=store_type) + save_data = [] + + if os.path.exists(save_file_name): + async with aiofiles.open(save_file_name, 'r', encoding='utf-8') as file: + save_data = json.loads(await file.read()) + + save_data.append(save_item) + async with aiofiles.open(save_file_name, 'w', encoding='utf-8') as file: + await file.write(json.dumps(save_data, ensure_ascii=False)) + async def store_content(self, content_item: Dict): - pass + """ + content JSON storage implementation + Args: + content_item: + + Returns: + + """ + await self.save_data_to_json(content_item, "contents") async def store_comment(self, comment_item: Dict): - pass + """ + comment JSON storage implementation + Args: + comment_item: + + Returns: + + """ + await self.save_data_to_json(comment_item, "comments") diff --git a/store/weibo/__init__.py b/store/weibo/__init__.py index 0e1ca56..3424dcb 100644 --- a/store/weibo/__init__.py +++ b/store/weibo/__init__.py @@ -15,7 +15,7 @@ class WeibostoreFactory: STORES = { "csv": WeiboCsvStoreImplement, "db": WeiboDbStoreImplement, - "json": BiliJsonStoreImplement + "json": WeiboJsonStoreImplement } @staticmethod diff --git a/store/weibo/weibo_store_impl.py b/store/weibo/weibo_store_impl.py index 65dabf0..3a6caa8 100644 --- a/store/weibo/weibo_store_impl.py +++ b/store/weibo/weibo_store_impl.py @@ -3,6 +3,8 @@ # @Time : 2024/1/14 21:35 # @Desc : 微博存储实现类 import csv +import json +import os import pathlib from typing import Dict @@ -120,9 +122,60 @@ class WeiboDbStoreImplement(AbstractStore): await WeiboComment.filter(comment_id=comment_id).update(**comment_data.model_dump()) -class BiliJsonStoreImplement(AbstractStore): +class WeiboJsonStoreImplement(AbstractStore): + json_store_path: str = "data/weibo" + + def 
make_save_file_name(self, store_type: str) -> str: + """ + make save file name by store type + Args: + store_type: Save type contains content and comments(contents | comments) + + Returns: + + """ + return f"{self.json_store_path}/{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}.json" + + async def save_data_to_json(self, save_item: Dict, store_type: str): + """ + Below is a simple way to save it in json format. + Args: + save_item: save content dict info + store_type: Save type contains content and comments(contents | comments) + + Returns: + + """ + pathlib.Path(self.json_store_path).mkdir(parents=True, exist_ok=True) + save_file_name = self.make_save_file_name(store_type=store_type) + save_data = [] + + if os.path.exists(save_file_name): + async with aiofiles.open(save_file_name, 'r', encoding='utf-8') as file: + save_data = json.loads(await file.read()) + + save_data.append(save_item) + async with aiofiles.open(save_file_name, 'w', encoding='utf-8') as file: + await file.write(json.dumps(save_data, ensure_ascii=False)) + async def store_content(self, content_item: Dict): - pass + """ + content JSON storage implementation + Args: + content_item: + + Returns: + + """ + await self.save_data_to_json(content_item, "contents") async def store_comment(self, comment_item: Dict): - pass + """ + comment JSON storage implementation + Args: + comment_item: + + Returns: + + """ + await self.save_data_to_json(comment_item, "comments") diff --git a/store/xhs/xhs_store_impl.py b/store/xhs/xhs_store_impl.py index 82fd97c..d1e245d 100644 --- a/store/xhs/xhs_store_impl.py +++ b/store/xhs/xhs_store_impl.py @@ -3,6 +3,8 @@ # @Time : 2024/1/14 16:58 # @Desc : 小红书存储实现类 import csv +import json +import os import pathlib from typing import Dict @@ -120,8 +122,59 @@ class XhsDbStoreImplement(AbstractStore): class XhsJsonStoreImplement(AbstractStore): + json_store_path: str = "data/xhs" + + def make_save_file_name(self, store_type: str) -> str: + """ + make save file name 
by store type + Args: + store_type: Save type contains content and comments(contents | comments) + + Returns: + + """ + return f"{self.json_store_path}/{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}.json" + + async def save_data_to_json(self, save_item: Dict, store_type: str): + """ + Below is a simple way to save it in json format. + Args: + save_item: save content dict info + store_type: Save type contains content and comments(contents | comments) + + Returns: + + """ + pathlib.Path(self.json_store_path).mkdir(parents=True, exist_ok=True) + save_file_name = self.make_save_file_name(store_type=store_type) + save_data = [] + + if os.path.exists(save_file_name): + async with aiofiles.open(save_file_name, 'r', encoding='utf-8') as file: + save_data = json.loads(await file.read()) + + save_data.append(save_item) + async with aiofiles.open(save_file_name, 'w', encoding='utf-8') as file: + await file.write(json.dumps(save_data, ensure_ascii=False)) + async def store_content(self, content_item: Dict): - pass + """ + content JSON storage implementation + Args: + content_item: + + Returns: + + """ + await self.save_data_to_json(content_item, "contents") async def store_comment(self, comment_item: Dict): - pass + """ + comment JSON storage implementation + Args: + comment_item: + + Returns: + + """ + await self.save_data_to_json(comment_item, "comments")