web_search/utils.py

import requests
from bs4 import BeautifulSoup

# Set of file extensions treated as downloadable documents rather than webpages
file_extensions = {'.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx'}


def is_file(url):
    # True if the URL ends with a known document extension
    return any(url.lower().endswith(ext) for ext in file_extensions)


def is_webpage(url):
    # Heuristic: treat the URL as a webpage if the server reports an HTML content type
    try:
        response = requests.get(url, timeout=5)
        content_type = response.headers.get('Content-Type', '')
        return 'text/html' in content_type
    except requests.RequestException:
        return False


def can_crawl_webpage(url):
    # Example criteria: treat the page as non-crawlable if a meta robots tag
    # asks crawlers not to index or follow it
    try:
        response = requests.get(url, timeout=5)
        soup = BeautifulSoup(response.text, 'html.parser')
        meta_robots = soup.find('meta', attrs={'name': 'robots'})
        content = meta_robots.get('content', '') if meta_robots else ''
        if 'noindex' in content or 'nofollow' in content:
            return False
        return True
    except requests.RequestException:
        return False
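

# A complementary sketch, not part of the original module: can_crawl_webpage
# above only inspects the on-page <meta name="robots"> tag, while sites also
# publish crawl rules in /robots.txt, which the standard library can evaluate.
# The helper name and the '*' default user agent here are assumptions.
def robots_txt_allows(url, user_agent='*'):
    from urllib.parse import urlparse
    from urllib.robotparser import RobotFileParser
    parsed = urlparse(url)
    robots_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"
    rp = RobotFileParser()
    rp.set_url(robots_url)
    try:
        rp.read()  # fetch and parse robots.txt
    except OSError:
        return True  # assume allowed when robots.txt is unreachable
    return rp.can_fetch(user_agent, url)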


def is_cosmetic_related(snippet):
    # Keywords: cosmetics, beauty makeup, color cosmetics, skincare
    if snippet is None:
        return False
    keywords = ['化妆品', '美妆', '彩妆', '护肤']
    return any(keyword in snippet for keyword in keywords)


def parse_search(result):
    # Classify a single search result by the kind of resource its link points to
    link = result.get('link')
    if link is None:
        return "Unknown type: no link in result"
    if is_file(link):
        ret_str = f"File: {link}"
    elif is_webpage(link):
        if can_crawl_webpage(link):
            ret_str = f"Webpage crawlable: {link}"
        else:
            ret_str = f"Webpage not crawlable: {link}"
    else:
        ret_str = f"Unknown type: {link}"
    return ret_str
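

# A minimal usage sketch, not part of the original module: sample_result mimics
# the dict shape parse_search expects (e.g. one item from a search API response
# with 'link' and 'snippet' keys); the URL and snippet are hypothetical.
if __name__ == '__main__':
    sample_result = {
        'link': 'https://example.com/report.pdf',
        'snippet': '化妆品行业年度报告',  # "annual report on the cosmetics industry"
    }
    print(parse_search(sample_result))  # -> File: https://example.com/report.pdf
    print(is_cosmetic_related(sample_result.get('snippet')))  # -> True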