xhs_crawler/tools/words.py
2024-08-09 18:12:15 +08:00

72 lines
2.5 KiB
Python

import asyncio
import json
import logging
from collections import Counter
import aiofiles
import jieba
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import config
from tools import utils
plot_lock = asyncio.Lock()
class AsyncWordCloudGenerator:
def __init__(self):
logging.getLogger('jieba').setLevel(logging.WARNING)
self.stop_words_file = config.STOP_WORDS_FILE
self.lock = asyncio.Lock()
self.stop_words = self.load_stop_words()
self.custom_words = config.CUSTOM_WORDS
for word, group in self.custom_words.items():
jieba.add_word(word)
def load_stop_words(self):
with open(self.stop_words_file, 'r', encoding='utf-8') as f:
return set(f.read().strip().split('\n'))
async def generate_word_frequency_and_cloud(self, data, save_words_prefix):
all_text = ' '.join(item['content'] for item in data)
words = [word for word in jieba.lcut(all_text) if word not in self.stop_words and len(word.strip()) > 0]
word_freq = Counter(words)
# Save word frequency to file
freq_file = f"{save_words_prefix}_word_freq.json"
async with aiofiles.open(freq_file, 'w', encoding='utf-8') as file:
await file.write(json.dumps(word_freq, ensure_ascii=False, indent=4))
# Try to acquire the plot lock without waiting
if plot_lock.locked():
utils.logger.info("Skipping word cloud generation as the lock is held.")
return
await self.generate_word_cloud(word_freq, save_words_prefix)
async def generate_word_cloud(self, word_freq, save_words_prefix):
await plot_lock.acquire()
top_20_word_freq = {word: freq for word, freq in
sorted(word_freq.items(), key=lambda item: item[1], reverse=True)[:20]}
wordcloud = WordCloud(
font_path=config.FONT_PATH,
width=800,
height=400,
background_color='white',
max_words=200,
stopwords=self.stop_words,
colormap='viridis',
contour_color='steelblue',
contour_width=1
).generate_from_frequencies(top_20_word_freq)
# Save word cloud image
plt.figure(figsize=(10, 5), facecolor='white')
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.tight_layout(pad=0)
plt.savefig(f"{save_words_prefix}_word_cloud.png", format='png', dpi=300)
plt.close()
plot_lock.release()