diff --git a/README.md b/README.md index ad3335b..9ad0002 100644 --- a/README.md +++ b/README.md @@ -17,13 +17,15 @@ ## 功能列表 > 下面不支持的项目,相关的代码架构已经搭建好,只需要实现对应的方法即可,欢迎大家提交PR -| 平台 | 关键词搜索 | 指定帖子ID爬取 | 二级评论 | 指定创作者主页 | 登录态缓存 | IP代理池 | -|-----|-------|----------|-----|--------|-------|-------| -| 小红书 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | -| 抖音 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | -| 快手 | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | -| B 站 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | -| 微博 | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | + +| 平台 | 关键词搜索 | 指定帖子ID爬取 | 二级评论 | 指定创作者主页 | 登录态缓存 | IP代理池 | 生成评论词云图 | +|-----|-------|----------|-----|--------|-------|-------|-------| +| 小红书 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| 抖音 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| 快手 | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | +| B 站 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| 微博 | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | + ## 使用方法 @@ -186,4 +188,3 @@ - diff --git a/config/base_config.py b/config/base_config.py index 7c1fbe2..9b52e52 100644 --- a/config/base_config.py +++ b/config/base_config.py @@ -102,3 +102,21 @@ BILI_CREATOR_ID_LIST = [ "20813884", # ........................ ] + +#词云相关 +#是否开启生成评论词云图 +ENABLE_GET_WORDCLOUD = False +# 自定义词语及其分组 +#添加规则:xx:yy 其中xx为自定义添加的词组,yy为将xx该词组分到的组名。 +CUSTOM_WORDS = { + '零几': '年份', # 将“零几”识别为一个整体 + '高频词': '专业术语' # 示例自定义词 +} + +#停用(禁用)词文件路径 +STOP_WORDS_FILE = "./docs/hit_stopwords.txt" + +#中文字体文件路径 +FONT_PATH= "./docs/STZHONGS.TTF" + + diff --git a/docs/STZHONGS.TTF b/docs/STZHONGS.TTF new file mode 100644 index 0000000..412dfcd Binary files /dev/null and b/docs/STZHONGS.TTF differ diff --git a/docs/hit_stopwords.txt b/docs/hit_stopwords.txt new file mode 100644 index 0000000..1d1818e --- /dev/null +++ b/docs/hit_stopwords.txt @@ -0,0 +1,768 @@ +\n +——— +》), +)÷(1- +”, +)、 +=( +: +→ +℃ +& +* +一一 +~~~~ +’ +. +『 +.一 +./ +-- +』 +=″ +【 +[*] +}> +[⑤]] +[①D] +c] +ng昉 +* +// +[ +] +[②e] +[②g] +={ +} +,也 +‘ +A +[①⑥] +[②B] +[①a] +[④a] +[①③] +[③h] +③] +1. 
+-- +[②b] +’‘ +××× +[①⑧] +0:2 +=[ +[⑤b] +[②c] +[④b] +[②③] +[③a] +[④c] +[①⑤] +[①⑦] +[①g] +∈[ +[①⑨] +[①④] +[①c] +[②f] +[②⑧] +[②①] +[①C] +[③c] +[③g] +[②⑤] +[②②] +一. +[①h] +.数 +[] +[①B] +数/ +[①i] +[③e] +[①①] +[④d] +[④e] +[③b] +[⑤a] +[①A] +[②⑧] +[②⑦] +[①d] +[②j] +〕〔 +][ +:// +′∈ +[②④ +[⑤e] +12% +b] +... +................... +…………………………………………………③ +ZXFITL +[③F] +」 +[①o] +]∧′=[ +∪φ∈ +′| +{- +②c +} +[③①] +R.L. +[①E] +Ψ +-[*]- +↑ +.日 +[②d] +[② +[②⑦] +[②②] +[③e] +[①i] +[①B] +[①h] +[①d] +[①g] +[①②] +[②a] +f] +[⑩] +a] +[①e] +[②h] +[②⑥] +[③d] +[②⑩] +e] +〉 +】 +元/吨 +[②⑩] +2.3% +5:0 +[①] +:: +[②] +[③] +[④] +[⑤] +[⑥] +[⑦] +[⑧] +[⑨] +…… +—— +? +、 +。 +“ +” +《 +》 +! +, +: +; +? +. +, +. +' +? +· +——— +── +? +— +< +> +( +) +〔 +〕 +[ +] +( +) +- ++ +~ +× +/ +/ +① +② +③ +④ +⑤ +⑥ +⑦ +⑧ +⑨ +⑩ +Ⅲ +В +" +; +# +@ +γ +μ +φ +φ. +× +Δ +■ +▲ +sub +exp +sup +sub +Lex +# +% +& +' ++ ++ξ +++ +- +-β +< +<± +<Δ +<λ +<φ +<< += += +=☆ +=- +> +>λ +_ +~± +~+ +[⑤f] +[⑤d] +[②i] +≈ +[②G] +[①f] +LI +㈧ +[- +...... +〉 +[③⑩] +第二 +一番 +一直 +一个 +一些 +许多 +种 +有的是 +也就是说 +末##末 +啊 +阿 +哎 +哎呀 +哎哟 +唉 +俺 +俺们 +按 +按照 +吧 +吧哒 +把 +罢了 +被 +本 +本着 +比 +比方 +比如 +鄙人 +彼 +彼此 +边 +别 +别的 +别说 +并 +并且 +不比 +不成 +不单 +不但 +不独 +不管 +不光 +不过 +不仅 +不拘 +不论 +不怕 +不然 +不如 +不特 +不惟 +不问 +不只 +朝 +朝着 +趁 +趁着 +乘 +冲 +除 +除此之外 +除非 +除了 +此 +此间 +此外 +从 +从而 +打 +待 +但 +但是 +当 +当着 +到 +得 +的 +的话 +等 +等等 +地 +第 +叮咚 +对 +对于 +多 +多少 +而 +而况 +而且 +而是 +而外 +而言 +而已 +尔后 +反过来 +反过来说 +反之 +非但 +非徒 +否则 +嘎 +嘎登 +该 +赶 +个 +各 +各个 +各位 +各种 +各自 +给 +根据 +跟 +故 +故此 +固然 +关于 +管 +归 +果然 +果真 +过 +哈 +哈哈 +呵 +和 +何 +何处 +何况 +何时 +嘿 +哼 +哼唷 +呼哧 +乎 +哗 +还是 +还有 +换句话说 +换言之 +或 +或是 +或者 +极了 +及 +及其 +及至 +即 +即便 +即或 +即令 +即若 +即使 +几 +几时 +己 +既 +既然 +既是 +继而 +加之 +假如 +假若 +假使 +鉴于 +将 +较 +较之 +叫 +接着 +结果 +借 +紧接着 +进而 +尽 +尽管 +经 +经过 +就 +就是 +就是说 +据 +具体地说 +具体说来 +开始 +开外 +靠 +咳 +可 +可见 +可是 +可以 +况且 +啦 +来 +来着 +离 +例如 +哩 +连 +连同 +两者 +了 +临 +另 +另外 +另一方面 +论 +嘛 +吗 +慢说 +漫说 +冒 +么 +每 +每当 +们 +莫若 +某 +某个 +某些 +拿 +哪 +哪边 +哪儿 +哪个 +哪里 +哪年 +哪怕 +哪天 +哪些 +哪样 +那 +那边 +那儿 +那个 +那会儿 +那里 +那么 +那么些 +那么样 +那时 +那些 +那样 +乃 +乃至 +呢 +能 +你 +你们 +您 +宁 +宁可 +宁肯 +宁愿 +哦 +呕 +啪达 +旁人 +呸 +凭 +凭借 +其 +其次 +其二 +其他 
+其它 +其一 +其余 +其中 +起 +起见 +起见 +岂但 +恰恰相反 +前后 +前者 +且 +然而 +然后 +然则 +让 +人家 +任 +任何 +任凭 +如 +如此 +如果 +如何 +如其 +如若 +如上所述 +若 +若非 +若是 +啥 +上下 +尚且 +设若 +设使 +甚而 +甚么 +甚至 +省得 +时候 +什么 +什么样 +使得 +是 +是的 +首先 +谁 +谁知 +顺 +顺着 +似的 +虽 +虽然 +虽说 +虽则 +随 +随着 +所 +所以 +他 +他们 +他人 +它 +它们 +她 +她们 +倘 +倘或 +倘然 +倘若 +倘使 +腾 +替 +通过 +同 +同时 +哇 +万一 +往 +望 +为 +为何 +为了 +为什么 +为着 +喂 +嗡嗡 +我 +我们 +呜 +呜呼 +乌乎 +无论 +无宁 +毋宁 +嘻 +吓 +相对而言 +像 +向 +向着 +嘘 +呀 +焉 +沿 +沿着 +要 +要不 +要不然 +要不是 +要么 +要是 +也 +也罢 +也好 +一 +一般 +一旦 +一方面 +一来 +一切 +一样 +一则 +依 +依照 +矣 +以 +以便 +以及 +以免 +以至 +以至于 +以致 +抑或 +因 +因此 +因而 +因为 +哟 +用 +由 +由此可见 +由于 +有 +有的 +有关 +有些 +又 +于 +于是 +于是乎 +与 +与此同时 +与否 +与其 +越是 +云云 +哉 +再说 +再者 +在 +在下 +咱 +咱们 +则 +怎 +怎么 +怎么办 +怎么样 +怎样 +咋 +照 +照着 +者 +这 +这边 +这儿 +这个 +这会儿 +这就是说 +这里 +这么 +这么点儿 +这么些 +这么样 +这时 +这些 +这样 +正如 +吱 +之 +之类 +之所以 +之一 +只是 +只限 +只要 +只有 +至 +至于 +诸位 +着 +着呢 +自 +自从 +自个儿 +自各儿 +自己 +自家 +自身 +综上所述 +总的来看 +总的来说 +总的说来 +总而言之 +总之 +纵 +纵令 +纵然 +纵使 +遵照 +作为 +兮 +呃 +呗 +咚 +咦 +喏 +啐 +喔唷 +嗬 +嗯 +嗳 diff --git a/docs/常见问题.md b/docs/常见问题.md index e01fd37..1b749d4 100644 --- a/docs/常见问题.md +++ b/docs/常见问题.md @@ -22,4 +22,10 @@ Q: 报错 `playwright._impl._api_types.TimeoutError: Timeout 30000ms exceeded.`< A: 出现这种情况检查下开梯子没有
Q: 小红书扫码登录成功后如何手动验证? -A: 打开 config/base_config.py 文件, 找到 HEADLESS 配置项, 将其设置为 False, 此时重启项目, 在浏览器中手动通过验证码 +A: 打开 config/base_config.py 文件, 找到 HEADLESS 配置项, 将其设置为 False, 此时重启项目, 在浏览器中手动通过验证码
+ +Q: 如何配置词云图的生成? +A: 打开 config/base_config.py 文件, 找到 `ENABLE_GET_WORDCLOUD` 以及 `ENABLE_GET_COMMENTS` 两个配置项, 将其都设为 True 即可使用该功能。
+ +Q: 如何给词云图添加禁用词和自定义词组? +A: 打开 `docs/hit_stopwords.txt` 输入禁用词(注意一个词语一行)。打开 `config/base_config.py` 文件, 找到 `CUSTOM_WORDS` 并按格式添加自定义词组即可。
diff --git a/docs/项目代码结构.md b/docs/项目代码结构.md index baaa71f..ca076ed 100644 --- a/docs/项目代码结构.md +++ b/docs/项目代码结构.md @@ -29,7 +29,8 @@ MediaCrawler │ ├── crawler_util.py # 爬虫相关的工具函数 │ ├── slider_util.py # 滑块相关的工具函数 │ ├── time_util.py # 时间相关的工具函数 -│ └── easing.py # 模拟滑动轨迹相关的函数 +│ ├── easing.py # 模拟滑动轨迹相关的函数 +| └── words.py # 生成词云图相关的函数 ├── db.py # DB ORM ├── main.py # 程序入口 ├── var.py # 上下文变量定义 diff --git a/store/bilibili/bilibili_store_impl.py b/store/bilibili/bilibili_store_impl.py index 018244d..7b93432 100644 --- a/store/bilibili/bilibili_store_impl.py +++ b/store/bilibili/bilibili_store_impl.py @@ -11,10 +11,11 @@ from typing import Dict import aiofiles +import config from base.base_crawler import AbstractStore from tools import utils from var import crawler_type_var - +from tools import words def calculate_number_of_files(file_store_path: str) -> int: """计算数据保存文件的前部分排序数字,支持每次运行代码不写到同一个文件中 @@ -130,12 +131,14 @@ class BiliDbStoreImplement(AbstractStore): class BiliJsonStoreImplement(AbstractStore): - json_store_path: str = "data/bilibili" + json_store_path: str = "data/bilibili/json" + words_store_path: str = "data/bilibili/words" lock = asyncio.Lock() file_count:int=calculate_number_of_files(json_store_path) + WordCloud = words.AsyncWordCloudGenerator() - def make_save_file_name(self, store_type: str) -> str: + def make_save_file_name(self, store_type: str) -> (str,str): """ make save file name by store type Args: @@ -145,7 +148,10 @@ class BiliJsonStoreImplement(AbstractStore): """ - return f"{self.json_store_path}/{self.file_count}_{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}.json" + return ( + f"{self.json_store_path}/{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}.json", + f"{self.words_store_path}/{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}" + ) async def save_data_to_json(self, save_item: Dict, store_type: str): """ @@ -158,7 +164,8 @@ class BiliJsonStoreImplement(AbstractStore): """ 
pathlib.Path(self.json_store_path).mkdir(parents=True, exist_ok=True) - save_file_name = self.make_save_file_name(store_type=store_type) + pathlib.Path(self.words_store_path).mkdir(parents=True, exist_ok=True) + save_file_name,words_file_name_prefix = self.make_save_file_name(store_type=store_type) save_data = [] async with self.lock: @@ -170,6 +177,12 @@ class BiliJsonStoreImplement(AbstractStore): async with aiofiles.open(save_file_name, 'w', encoding='utf-8') as file: await file.write(json.dumps(save_data, ensure_ascii=False)) + if config.ENABLE_GET_COMMENTS and config.ENABLE_GET_WORDCLOUD: + try: + await self.WordCloud.generate_word_frequency_and_cloud(save_data, words_file_name_prefix) + except: + pass + async def store_content(self, content_item: Dict): """ content JSON storage implementation diff --git a/store/douyin/douyin_store_impl.py b/store/douyin/douyin_store_impl.py index 6277b4c..a4672ee 100644 --- a/store/douyin/douyin_store_impl.py +++ b/store/douyin/douyin_store_impl.py @@ -12,8 +12,9 @@ from typing import Dict import aiofiles from base.base_crawler import AbstractStore -from tools import utils +from tools import utils,words from var import crawler_type_var +import config def calculate_number_of_files(file_store_path: str) -> int: @@ -162,11 +163,14 @@ class DouyinDbStoreImplement(AbstractStore): await update_creator_by_user_id(user_id, creator) class DouyinJsonStoreImplement(AbstractStore): - json_store_path: str = "data/douyin" + json_store_path: str = "data/douyin/json" + words_store_path: str = "data/douyin/words" + lock = asyncio.Lock() file_count: int = calculate_number_of_files(json_store_path) + WordCloud = words.AsyncWordCloudGenerator() - def make_save_file_name(self, store_type: str) -> str: + def make_save_file_name(self, store_type: str) -> (str,str): """ make save file name by store type Args: @@ -176,8 +180,10 @@ class DouyinJsonStoreImplement(AbstractStore): """ - return 
f"{self.json_store_path}/{self.file_count}_{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}.json" - + return ( + f"{self.json_store_path}/{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}.json", + f"{self.words_store_path}/{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}" + ) async def save_data_to_json(self, save_item: Dict, store_type: str): """ Below is a simple way to save it in json format. @@ -189,7 +195,8 @@ class DouyinJsonStoreImplement(AbstractStore): """ pathlib.Path(self.json_store_path).mkdir(parents=True, exist_ok=True) - save_file_name = self.make_save_file_name(store_type=store_type) + pathlib.Path(self.words_store_path).mkdir(parents=True, exist_ok=True) + save_file_name,words_file_name_prefix = self.make_save_file_name(store_type=store_type) save_data = [] async with self.lock: @@ -201,6 +208,12 @@ class DouyinJsonStoreImplement(AbstractStore): async with aiofiles.open(save_file_name, 'w', encoding='utf-8') as file: await file.write(json.dumps(save_data, ensure_ascii=False)) + if config.ENABLE_GET_COMMENTS and config.ENABLE_GET_WORDCLOUD: + try: + await self.WordCloud.generate_word_frequency_and_cloud(save_data, words_file_name_prefix) + except: + pass + async def store_content(self, content_item: Dict): """ content JSON storage implementation diff --git a/store/kuaishou/kuaishou_store_impl.py b/store/kuaishou/kuaishou_store_impl.py index 14b477a..4883daa 100644 --- a/store/kuaishou/kuaishou_store_impl.py +++ b/store/kuaishou/kuaishou_store_impl.py @@ -12,9 +12,9 @@ from typing import Dict import aiofiles from base.base_crawler import AbstractStore -from tools import utils +from tools import utils,words from var import crawler_type_var - +import config def calculate_number_of_files(file_store_path: str) -> int: """计算数据保存文件的前部分排序数字,支持每次运行代码不写到同一个文件中 @@ -131,12 +131,15 @@ class KuaishouDbStoreImplement(AbstractStore): class KuaishouJsonStoreImplement(AbstractStore): - json_store_path: str = 
"data/kuaishou" + json_store_path: str = "data/kuaishou/json" + words_store_path: str = "data/kuaishou/words" lock = asyncio.Lock() file_count:int=calculate_number_of_files(json_store_path) + WordCloud = words.AsyncWordCloudGenerator() - def make_save_file_name(self, store_type: str) -> str: + + def make_save_file_name(self, store_type: str) -> (str,str): """ make save file name by store type Args: @@ -146,8 +149,10 @@ class KuaishouJsonStoreImplement(AbstractStore): """ - - return f"{self.json_store_path}/{self.file_count}_{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}.json" + return ( + f"{self.json_store_path}/{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}.json", + f"{self.words_store_path}/{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}" + ) async def save_data_to_json(self, save_item: Dict, store_type: str): """ @@ -160,7 +165,8 @@ class KuaishouJsonStoreImplement(AbstractStore): """ pathlib.Path(self.json_store_path).mkdir(parents=True, exist_ok=True) - save_file_name = self.make_save_file_name(store_type=store_type) + pathlib.Path(self.words_store_path).mkdir(parents=True, exist_ok=True) + save_file_name,words_file_name_prefix = self.make_save_file_name(store_type=store_type) save_data = [] async with self.lock: @@ -172,6 +178,12 @@ class KuaishouJsonStoreImplement(AbstractStore): async with aiofiles.open(save_file_name, 'w', encoding='utf-8') as file: await file.write(json.dumps(save_data, ensure_ascii=False)) + if config.ENABLE_GET_COMMENTS and config.ENABLE_GET_WORDCLOUD: + try: + await self.WordCloud.generate_word_frequency_and_cloud(save_data, words_file_name_prefix) + except: + pass + async def store_content(self, content_item: Dict): """ content JSON storage implementation diff --git a/store/weibo/weibo_store_impl.py b/store/weibo/weibo_store_impl.py index 8bf09b4..fdd21d4 100644 --- a/store/weibo/weibo_store_impl.py +++ b/store/weibo/weibo_store_impl.py @@ -12,9 +12,9 @@ from typing import Dict 
import aiofiles from base.base_crawler import AbstractStore -from tools import utils +from tools import utils,words from var import crawler_type_var - +import config def calculate_number_of_files(file_store_path: str) -> int: """计算数据保存文件的前部分排序数字,支持每次运行代码不写到同一个文件中 @@ -132,12 +132,14 @@ class WeiboDbStoreImplement(AbstractStore): class WeiboJsonStoreImplement(AbstractStore): - json_store_path: str = "data/weibo" + json_store_path: str = "data/weibo/json" + words_store_path: str = "data/weibo/words" lock = asyncio.Lock() file_count:int=calculate_number_of_files(json_store_path) + WordCloud = words.AsyncWordCloudGenerator() - def make_save_file_name(self, store_type: str) -> str: + def make_save_file_name(self, store_type: str) -> (str,str): """ make save file name by store type Args: @@ -147,7 +149,10 @@ class WeiboJsonStoreImplement(AbstractStore): """ - return f"{self.json_store_path}/{self.file_count}_{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}.json" + return ( + f"{self.json_store_path}/{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}.json", + f"{self.words_store_path}/{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}" + ) async def save_data_to_json(self, save_item: Dict, store_type: str): """ @@ -160,7 +165,8 @@ class WeiboJsonStoreImplement(AbstractStore): """ pathlib.Path(self.json_store_path).mkdir(parents=True, exist_ok=True) - save_file_name = self.make_save_file_name(store_type=store_type) + pathlib.Path(self.words_store_path).mkdir(parents=True, exist_ok=True) + save_file_name,words_file_name_prefix = self.make_save_file_name(store_type=store_type) save_data = [] async with self.lock: @@ -172,6 +178,12 @@ class WeiboJsonStoreImplement(AbstractStore): async with aiofiles.open(save_file_name, 'w', encoding='utf-8') as file: await file.write(json.dumps(save_data, ensure_ascii=False)) + if config.ENABLE_GET_COMMENTS and config.ENABLE_GET_WORDCLOUD: + try: + await 
self.WordCloud.generate_word_frequency_and_cloud(save_data, words_file_name_prefix) + except: + pass + async def store_content(self, content_item: Dict): """ content JSON storage implementation diff --git a/store/xhs/xhs_store_impl.py b/store/xhs/xhs_store_impl.py index 63b5217..3204d0c 100644 --- a/store/xhs/xhs_store_impl.py +++ b/store/xhs/xhs_store_impl.py @@ -12,9 +12,9 @@ from typing import Dict import aiofiles from base.base_crawler import AbstractStore -from tools import utils +from tools import utils,words from var import crawler_type_var - +import config def calculate_number_of_files(file_store_path: str) -> int: """计算数据保存文件的前部分排序数字,支持每次运行代码不写到同一个文件中 @@ -161,11 +161,13 @@ class XhsDbStoreImplement(AbstractStore): class XhsJsonStoreImplement(AbstractStore): - json_store_path: str = "data/xhs" + json_store_path: str = "data/xhs/json" + words_store_path: str = "data/xhs/words" lock = asyncio.Lock() file_count:int=calculate_number_of_files(json_store_path) + WordCloud = words.AsyncWordCloudGenerator() - def make_save_file_name(self, store_type: str) -> str: + def make_save_file_name(self, store_type: str) -> (str,str): """ make save file name by store type Args: @@ -175,7 +177,10 @@ class XhsJsonStoreImplement(AbstractStore): """ - return f"{self.json_store_path}/{self.file_count}_{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}.json" + return ( + f"{self.json_store_path}/{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}.json", + f"{self.words_store_path}/{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}" + ) async def save_data_to_json(self, save_item: Dict, store_type: str): """ @@ -188,7 +193,8 @@ class XhsJsonStoreImplement(AbstractStore): """ pathlib.Path(self.json_store_path).mkdir(parents=True, exist_ok=True) - save_file_name = self.make_save_file_name(store_type=store_type) + pathlib.Path(self.words_store_path).mkdir(parents=True, exist_ok=True) + save_file_name,words_file_name_prefix = 
class AsyncWordCloudGenerator:
    """Produce a word-frequency JSON file and a word-cloud PNG from crawled items.

    The generator segments the ``content`` text of the given items with jieba,
    drops stop words, writes the resulting frequencies to
    ``<prefix>_word_freq.json`` and renders the most frequent words to
    ``<prefix>_word_cloud.png``.

    Rendering goes through ``matplotlib.pyplot``, whose current-figure state is
    global, so all instances serialize rendering on the module-level
    ``plot_lock``.
    """

    def __init__(self):
        self.stop_words_file = config.STOP_WORDS_FILE
        # Not used by this class; kept so existing attribute access keeps working.
        self.lock = asyncio.Lock()
        self.stop_words = self.load_stop_words()
        self.custom_words = config.CUSTOM_WORDS
        # Only the words themselves are registered with jieba; the group label
        # (the dict value) is not consumed here.
        for word in self.custom_words:
            jieba.add_word(word)

    def load_stop_words(self):
        """Read the stop-word file and return its lines as a set.

        Returns:
            set[str]: one entry per line of ``self.stop_words_file`` (leading and
            trailing whitespace of the whole file is stripped first).
        """
        with open(self.stop_words_file, 'r', encoding='utf-8') as f:
            return set(f.read().strip().split('\n'))

    @staticmethod
    def _collect_text(data):
        """Join the non-empty string ``content`` values of ``data`` with spaces.

        Items without a ``content`` key, or whose ``content`` is not a string,
        are skipped instead of raising ``KeyError`` / ``TypeError``.
        """
        texts = []
        for item in data:
            text = item.get('content') if hasattr(item, 'get') else None
            if isinstance(text, str) and text:
                texts.append(text)
        return ' '.join(texts)

    @staticmethod
    def _filter_words(tokens, stop_words):
        """Return ``tokens`` without stop words and without whitespace-only tokens.

        The whitespace check matters because the texts are joined with spaces and
        the tokenizer hands those separators back as tokens.
        """
        return [word for word in tokens if word.strip() and word not in stop_words]

    @staticmethod
    def _top_frequencies(word_freq, limit):
        """Return the ``limit`` highest-count entries of ``word_freq`` as a dict."""
        ranked = sorted(word_freq.items(), key=lambda item: item[1], reverse=True)
        return dict(ranked[:limit])

    async def generate_word_frequency_and_cloud(self, data, save_words_prefix):
        """Write the word-frequency file for ``data`` and render its word cloud.

        Args:
            data: iterable of dict-like items; the ``content`` string of each
                item is the text that gets analysed.
            save_words_prefix: path prefix for the output files
                (``_word_freq.json`` / ``_word_cloud.png`` are appended).

        Nothing is written when no usable word remains after filtering. The
        word cloud is skipped (the frequency file is still written) when another
        rendering currently holds ``plot_lock``.
        """
        all_text = self._collect_text(data)
        words = self._filter_words(jieba.lcut(all_text), self.stop_words)
        if not words:
            utils.logger.info("Skipping word frequency and cloud generation as no words were found.")
            return
        word_freq = Counter(words)

        # Save word frequency to file
        freq_file = f"{save_words_prefix}_word_freq.json"
        async with aiofiles.open(freq_file, 'w', encoding='utf-8') as file:
            await file.write(json.dumps(word_freq, ensure_ascii=False, indent=4))

        # Do not queue behind a rendering that is already in progress.
        if plot_lock.locked():
            utils.logger.info("Skipping word cloud generation as the lock is held.")
            return

        await self.generate_word_cloud(word_freq, save_words_prefix)

    async def generate_word_cloud(self, word_freq, save_words_prefix, top_n=20):
        """Render the ``top_n`` most frequent words to ``<prefix>_word_cloud.png``.

        Args:
            word_freq: mapping of word -> count.
            save_words_prefix: path prefix of the PNG file to write.
            top_n: how many of the most frequent words to draw (default 20).

        ``plot_lock`` is held only for the duration of the rendering and is
        released even when rendering fails, so one failure cannot block every
        later word cloud.
        """
        top_word_freq = self._top_frequencies(word_freq, top_n)
        if not top_word_freq:
            # generate_from_frequencies rejects an empty mapping.
            utils.logger.info("Skipping word cloud generation as there are no words to plot.")
            return

        async with plot_lock:
            wordcloud = WordCloud(
                font_path=config.FONT_PATH,
                width=800,
                height=400,
                background_color='white',
                max_words=200,
                stopwords=self.stop_words,
                colormap='viridis',
                contour_color='steelblue',
                contour_width=1
            ).generate_from_frequencies(top_word_freq)

            # Save word cloud image
            plt.figure(figsize=(10, 5), facecolor='white')
            try:
                plt.imshow(wordcloud, interpolation='bilinear')

                plt.axis('off')
                plt.tight_layout(pad=0)
                plt.savefig(f"{save_words_prefix}_word_cloud.png", format='png', dpi=300)
            finally:
                # Always drop the figure so a failed save does not leak it.
                plt.close()