From 9f8ffe18407860c92f62d626649fca5c2f402b0f Mon Sep 17 00:00:00 2001 From: KEXNA <91727108+kexinoh@users.noreply.github.com> Date: Fri, 26 Apr 2024 10:57:44 +0800 Subject: [PATCH] Update weibo_store_impl.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update bilibili_store_impl.py 新增id Update bilibili_store_impl.py 新增id,解决同一天内的不同查询写入同一个文件的问题 Update douyin_store_impl.py 新增id,解决同一天内的不同查询写入同一个文件的问题 Update kuaishou_store_impl.py Update weibo_store_impl.py Update xhs_store_impl.py Update weibo_store_impl.py Update kuaishou_store_impl.py Update bilibili_store_impl.py Update douyin_store_impl.py Update kuaishou_store_impl.py Update xhs_store_impl.py --- store/bilibili/bilibili_store_impl.py | 20 ++++++++++++++++---- store/douyin/douyin_store_impl.py | 19 ++++++++++++++++--- store/kuaishou/kuaishou_store_impl.py | 21 ++++++++++++++++++--- store/weibo/weibo_store_impl.py | 18 +++++++++++++++++- store/xhs/xhs_store_impl.py | 20 +++++++++++++++++--- 5 files changed, 84 insertions(+), 14 deletions(-) diff --git a/store/bilibili/bilibili_store_impl.py b/store/bilibili/bilibili_store_impl.py index 3f69aa1..eea2e81 100644 --- a/store/bilibili/bilibili_store_impl.py +++ b/store/bilibili/bilibili_store_impl.py @@ -14,11 +14,20 @@ import aiofiles from base.base_crawler import AbstractStore from tools import utils from var import crawler_type_var - +def calculatet_number_of_files(file_store_path: str) -> int: + """计算数据保存文件的前部分排序数字,支持每次运行代码不写到同一个文件中 + Args: + file_store_path; + Returns: + file nums + """ + if not os.path.exists(file_store_path): + return 1 + return max([int(file_name.split("_")[0])for file_name in os.listdir(file_store_path)])+1 class BiliCsvStoreImplement(AbstractStore): csv_store_path: str = "data/bilibili" - + file_count:int=calculatet_number_of_files(csv_store_path) def make_save_file_name(self, store_type: str) -> str: """ make save file name by store type @@ -28,7 +37,7 @@ class BiliCsvStoreImplement(AbstractStore): 
Returns: eg: data/bilibili/search_comments_20240114.csv ... """ - return f"{self.csv_store_path}/{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}.csv" + return f"{self.csv_store_path}/{self.file_count}_{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}.csv" async def save_data_to_csv(self, save_item: Dict, store_type: str): """ @@ -118,6 +127,8 @@ class BiliDbStoreImplement(AbstractStore): class BiliJsonStoreImplement(AbstractStore): json_store_path: str = "data/bilibili" lock = asyncio.Lock() + file_count:int=calculatet_number_of_files(json_store_path) + def make_save_file_name(self, store_type: str) -> str: """ @@ -128,7 +139,8 @@ class BiliJsonStoreImplement(AbstractStore): Returns: """ - return f"{self.json_store_path}/{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}.json" + + return f"{self.json_store_path}/{self.file_count}_{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}.json" async def save_data_to_json(self, save_item: Dict, store_type: str): """ diff --git a/store/douyin/douyin_store_impl.py b/store/douyin/douyin_store_impl.py index da26c0a..02e61e0 100644 --- a/store/douyin/douyin_store_impl.py +++ b/store/douyin/douyin_store_impl.py @@ -14,10 +14,20 @@ import aiofiles from base.base_crawler import AbstractStore from tools import utils from var import crawler_type_var - +def calculatet_number_of_files(file_store_path: str) -> int: + """计算数据保存文件的前部分排序数字,支持每次运行代码不写到同一个文件中 + Args: + file_store_path; + Returns: + file nums + """ + if not os.path.exists(file_store_path): + return 1 + return max([int(file_name.split("_")[0])for file_name in os.listdir(file_store_path)])+1 class DouyinCsvStoreImplement(AbstractStore): csv_store_path: str = "data/douyin" + file_count:int=calculatet_number_of_files(csv_store_path) def make_save_file_name(self, store_type: str) -> str: """ @@ -28,7 +38,7 @@ class DouyinCsvStoreImplement(AbstractStore): Returns: eg: data/douyin/search_comments_20240114.csv ... 
""" - return f"{self.csv_store_path}/{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}.csv" + return f"{self.csv_store_path}/{self.file_count}_{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}.csv" async def save_data_to_csv(self, save_item: Dict, store_type: str): """ @@ -119,6 +129,7 @@ class DouyinDbStoreImplement(AbstractStore): class DouyinJsonStoreImplement(AbstractStore): json_store_path: str = "data/douyin" lock = asyncio.Lock() + file_count:int=calculatet_number_of_files(json_store_path) def make_save_file_name(self, store_type: str) -> str: """ @@ -129,7 +140,9 @@ class DouyinJsonStoreImplement(AbstractStore): Returns: """ - return f"{self.json_store_path}/{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}.json" + + + return f"{self.json_store_path}/{self.file_count}_{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}.json" async def save_data_to_json(self, save_item: Dict, store_type: str): """ diff --git a/store/kuaishou/kuaishou_store_impl.py b/store/kuaishou/kuaishou_store_impl.py index 99ae1f4..c70d7f9 100644 --- a/store/kuaishou/kuaishou_store_impl.py +++ b/store/kuaishou/kuaishou_store_impl.py @@ -14,10 +14,21 @@ import aiofiles from base.base_crawler import AbstractStore from tools import utils from var import crawler_type_var +def calculatet_number_of_files(file_store_path: str) -> int: + """计算数据保存文件的前部分排序数字,支持每次运行代码不写到同一个文件中 + Args: + file_store_path; + Returns: + file nums + """ + if not os.path.exists(file_store_path): + return 1 + return max([int(file_name.split("_")[0])for file_name in os.listdir(file_store_path)])+1 class KuaishouCsvStoreImplement(AbstractStore): csv_store_path: str = "data/kuaishou" + file_count:int=calculatet_number_of_files(csv_store_path) def make_save_file_name(self, store_type: str) -> str: """ @@ -25,10 +36,10 @@ class KuaishouCsvStoreImplement(AbstractStore): Args: store_type: contents or comments - Returns: eg: data/kuaishou/search_comments_20240114.csv ... 
+        Returns: eg: data/kuaishou/search_comments_20240114.csv ...
 
         """
-        return f"{self.csv_store_path}/{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}.csv"
+        return f"{self.csv_store_path}/{self.file_count}_{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}.csv"
 
     async def save_data_to_csv(self, save_item: Dict, store_type: str):
         """
@@ -117,6 +128,8 @@ class KuaishouDbStoreImplement(AbstractStore):
 class KuaishouJsonStoreImplement(AbstractStore):
     json_store_path: str = "data/kuaishou"
     lock = asyncio.Lock()
+    file_count:int=calculatet_number_of_files(json_store_path)
+
 
     def make_save_file_name(self, store_type: str) -> str:
         """
@@ -127,7 +140,9 @@ class KuaishouJsonStoreImplement(AbstractStore):
 
         Returns:
 
         """
-        return f"{self.json_store_path}/{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}.json"
+
+
+        return f"{self.json_store_path}/{self.file_count}_{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}.json"
 
     async def save_data_to_json(self, save_item: Dict, store_type: str):
         """
diff --git a/store/weibo/weibo_store_impl.py b/store/weibo/weibo_store_impl.py
index e2df846..d0247cb 100644
--- a/store/weibo/weibo_store_impl.py
+++ b/store/weibo/weibo_store_impl.py
@@ -15,9 +15,21 @@ from base.base_crawler import AbstractStore
 from tools import utils
 from var import crawler_type_var
 
+def calculatet_number_of_files(file_store_path: str) -> int:
+    """计算数据保存文件的前部分排序数字,支持每次运行代码不写到同一个文件中
+    Args:
+        file_store_path:
+    Returns:
+        file nums
+    """
+    if not os.path.exists(file_store_path):
+        return 1
+    return max([int(file_name.split("_")[0])for file_name in os.listdir(file_store_path)])+1
+
 class WeiboCsvStoreImplement(AbstractStore):
     csv_store_path: str = "data/weibo"
+    file_count:int=calculatet_number_of_files(csv_store_path)
 
     def make_save_file_name(self, store_type: str) -> str:
         """
@@ -28,8 +40,8 @@ class WeiboCsvStoreImplement(AbstractStore):
 
         Returns: eg: data/bilibili/search_comments_20240114.csv ...
 
         """
-        return f"{self.csv_store_path}/{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}.csv"
+        return f"{self.csv_store_path}/{self.file_count}_{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}.csv"
 
     async def save_data_to_csv(self, save_item: Dict, store_type: str):
         """
@@ -117,6 +129,8 @@ class WeiboDbStoreImplement(AbstractStore):
 class WeiboJsonStoreImplement(AbstractStore):
     json_store_path: str = "data/weibo"
     lock = asyncio.Lock()
+    file_count:int=calculatet_number_of_files(json_store_path)
+
 
     def make_save_file_name(self, store_type: str) -> str:
         """
@@ -127,7 +141,8 @@ class WeiboJsonStoreImplement(AbstractStore):
 
         Returns:
 
         """
-        return f"{self.json_store_path}/{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}.json"
+
+        return f"{self.json_store_path}/{self.file_count}_{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}.json"
 
     async def save_data_to_json(self, save_item: Dict, store_type: str):
         """
diff --git a/store/xhs/xhs_store_impl.py b/store/xhs/xhs_store_impl.py
index f7a8bcf..8ca5ab0 100644
--- a/store/xhs/xhs_store_impl.py
+++ b/store/xhs/xhs_store_impl.py
@@ -15,9 +15,21 @@ from base.base_crawler import AbstractStore
 from tools import utils
 from var import crawler_type_var
 
+def calculatet_number_of_files(file_store_path: str) -> int:
+    """计算数据保存文件的前部分排序数字,支持每次运行代码不写到同一个文件中
+    Args:
+        file_store_path:
+    Returns:
+        file nums
+    """
+    if not os.path.exists(file_store_path):
+        return 1
+    return max([int(file_name.split("_")[0])for file_name in os.listdir(file_store_path)])+1
+
 class XhsCsvStoreImplement(AbstractStore):
     csv_store_path: str = "data/xhs"
+    file_count:int=calculatet_number_of_files(csv_store_path)
 
     def make_save_file_name(self, store_type: str) -> str:
         """
@@ -28,7 +40,7 @@ class XhsCsvStoreImplement(AbstractStore):
 
         Returns: eg: data/xhs/search_comments_20240114.csv ...
 
""" - return f"{self.csv_store_path}/{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}.csv" + return f"{self.csv_store_path}/{self.file_count}_{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}.csv" async def save_data_to_csv(self, save_item: Dict, store_type: str): """ @@ -147,6 +159,7 @@ class XhsDbStoreImplement(AbstractStore): class XhsJsonStoreImplement(AbstractStore): json_store_path: str = "data/xhs" lock = asyncio.Lock() + file_count:int=calculatet_number_of_files(json_store_path) def make_save_file_name(self, store_type: str) -> str: """ @@ -157,7 +170,8 @@ class XhsJsonStoreImplement(AbstractStore): Returns: """ - return f"{self.json_store_path}/{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}.json" + + return f"{self.json_store_path}/{self.file_count}_{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}.json" async def save_data_to_json(self, save_item: Dict, store_type: str): """ @@ -213,4 +227,4 @@ class XhsJsonStoreImplement(AbstractStore): Returns: """ - await self.save_data_to_json(creator, "creator") \ No newline at end of file + await self.save_data_to_json(creator, "creator")