import requests

from bs4 import BeautifulSoup

# File extensions treated as downloadable documents rather than web pages
file_extensions = {'.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx'}


def is_file(url):
    """Return True if the URL points to a known document file type."""
    return any(url.lower().endswith(ext) for ext in file_extensions)


def is_webpage(url):
    """Return True if the URL responds with an HTML content type."""
    try:
        response = requests.get(url, timeout=5)
        content_type = response.headers.get('Content-Type', '')
        return 'text/html' in content_type
    except requests.RequestException:
        return False


def can_crawl_webpage(url):
    """Best-effort check that a page does not opt out of crawling.

    Only the <meta name="robots"> tag is inspected; robots.txt is not consulted here.
    """
    try:
        response = requests.get(url, timeout=5)
        soup = BeautifulSoup(response.text, 'html.parser')
        # Example criteria: reject pages whose robots meta tag contains noindex or nofollow
        meta_robots = soup.find('meta', attrs={'name': 'robots'})
        if meta_robots:
            # Use .get() so a meta tag without a content attribute cannot raise KeyError
            content = meta_robots.get('content', '').lower()
            if 'noindex' in content or 'nofollow' in content:
                return False
        return True
    except requests.RequestException:
        return False
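

# A fuller politeness check would also consult robots.txt. The sketch below is an
# addition, not part of the original script: it uses only the standard library's
# urllib.robotparser, and the helper name can_fetch_per_robots_txt is hypothetical.
from urllib.parse import urlparse
from urllib.robotparser import RobotFileParser


def can_fetch_per_robots_txt(url, user_agent='*'):
    """Return True if the host's robots.txt allows fetching this URL."""
    parsed = urlparse(url)
    robots_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"
    parser = RobotFileParser()
    parser.set_url(robots_url)
    try:
        parser.read()
    except OSError:
        # robots.txt unreachable: treat as unrestricted, a common crawler convention
        return True
    return parser.can_fetch(user_agent, url)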


def is_cosmetic_related(snippet):
    """Return True if the snippet mentions any cosmetics-related Chinese keyword."""
    if snippet is None:
        return False
    # Keywords: cosmetics (化妆品), beauty (美妆), color makeup (彩妆), skincare (护肤)
    keywords = ['化妆品', '美妆', '彩妆', '护肤']
    return any(keyword in snippet for keyword in keywords)
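

# Quick sanity checks for the keyword matcher; the sample snippets are made-up
# illustrations, not data from the original script.
# is_cosmetic_related('这款护肤霜很好用')  -> True   (contains 护肤, "skincare")
# is_cosmetic_related('今天天气不错')      -> False  (no cosmetics keywords)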


def parse_search(result):
    """Classify a search result's link as a file, crawlable webpage, or unknown."""
    link = result.get('link')
    if is_file(link):
        ret_str = f"File: {link}"
    elif is_webpage(link):
        if can_crawl_webpage(link):
            ret_str = f"Webpage crawlable: {link}"
        else:
            ret_str = f"Webpage not crawlable: {link}"
    else:
        ret_str = f"Unknown type: {link}"
    return ret_str
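

# Minimal usage sketch, assuming a search-API result dict with a 'link' field;
# the sample URL is hypothetical and classifies as a file without any network call.
if __name__ == '__main__':
    sample_result = {'link': 'https://example.com/report.pdf'}
    print(parse_search(sample_result))  # -> File: https://example.com/report.pdf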