xhs_crawler/tools/words.py

72 lines
2.5 KiB
Python
Raw Normal View History

import asyncio
import json
import logging
2024-06-16 16:25:48 +00:00
from collections import Counter
import aiofiles
import jieba
import matplotlib.pyplot as plt
2024-06-16 16:25:48 +00:00
from wordcloud import WordCloud
import config
from tools import utils
plot_lock = asyncio.Lock()
class AsyncWordCloudGenerator:
def __init__(self):
logging.getLogger('jieba').setLevel(logging.WARNING)
self.stop_words_file = config.STOP_WORDS_FILE
self.lock = asyncio.Lock()
self.stop_words = self.load_stop_words()
self.custom_words = config.CUSTOM_WORDS
for word, group in self.custom_words.items():
jieba.add_word(word)
def load_stop_words(self):
with open(self.stop_words_file, 'r', encoding='utf-8') as f:
return set(f.read().strip().split('\n'))
async def generate_word_frequency_and_cloud(self, data, save_words_prefix):
all_text = ' '.join(item['content'] for item in data)
2024-08-06 07:24:23 +00:00
words = [word for word in jieba.lcut(all_text) if word not in self.stop_words and len(word.strip()) > 0]
word_freq = Counter(words)
# Save word frequency to file
freq_file = f"{save_words_prefix}_word_freq.json"
async with aiofiles.open(freq_file, 'w', encoding='utf-8') as file:
await file.write(json.dumps(word_freq, ensure_ascii=False, indent=4))
# Try to acquire the plot lock without waiting
if plot_lock.locked():
utils.logger.info("Skipping word cloud generation as the lock is held.")
return
await self.generate_word_cloud(word_freq, save_words_prefix)
async def generate_word_cloud(self, word_freq, save_words_prefix):
await plot_lock.acquire()
top_20_word_freq = {word: freq for word, freq in
sorted(word_freq.items(), key=lambda item: item[1], reverse=True)[:20]}
wordcloud = WordCloud(
font_path=config.FONT_PATH,
width=800,
height=400,
background_color='white',
max_words=200,
stopwords=self.stop_words,
colormap='viridis',
contour_color='steelblue',
contour_width=1
).generate_from_frequencies(top_20_word_freq)
# Save word cloud image
plt.figure(figsize=(10, 5), facecolor='white')
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.tight_layout(pad=0)
plt.savefig(f"{save_words_prefix}_word_cloud.png", format='png', dpi=300)
plt.close()
plot_lock.release()