import requests
from bs4 import BeautifulSoup

# Set of file extensions treated as downloadable documents rather than webpages
file_extensions = {'.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx'}

def is_file(url):
    """Return True if the URL points to a document file based on its extension."""
    return any(url.lower().endswith(ext) for ext in file_extensions)

def is_webpage(url):
    """Return True if the URL serves HTML content."""
    try:
        response = requests.get(url, timeout=5)
        content_type = response.headers.get('Content-Type', '')
        return 'text/html' in content_type
    except requests.RequestException:
        return False

def can_crawl_webpage(url):
    """Return False if the page opts out of crawling via a robots meta tag."""
    try:
        response = requests.get(url, timeout=5)
        soup = BeautifulSoup(response.text, 'html.parser')
        # Example criterion: a <meta name="robots"> tag containing noindex or nofollow
        meta_robots = soup.find('meta', attrs={'name': 'robots'})
        if meta_robots:
            content = meta_robots.get('content', '')
            if 'noindex' in content or 'nofollow' in content:
                return False
        return True
    except requests.RequestException:
        return False

def is_cosmetic_related(snippet):
    """Return True if the snippet mentions cosmetics-related keywords."""
    if snippet is None:
        return False
    # Chinese keywords: cosmetics, beauty makeup, color cosmetics, skincare
    keywords = ['化妆品', '美妆', '彩妆', '护肤']
    return any(keyword in snippet for keyword in keywords)

def parse_search(result):
    """Classify a single search result by its link: file, crawlable webpage, or unknown."""
    link = result.get('link', '')
    if is_file(link):
        ret_str = f"File: {link}"
    elif is_webpage(link):
        if can_crawl_webpage(link):
            ret_str = f"Webpage crawlable: {link}"
        else:
            ret_str = f"Webpage not crawlable: {link}"
    else:
        ret_str = f"Unknown type: {link}"
    return ret_str
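
# Minimal usage sketch. Assumption: each `result` is a dict taken from a search
# API response carrying 'link' and 'snippet' keys (the exact result structure is
# not defined in this module); the sample URLs below are hypothetical.
if __name__ == '__main__':
    sample_results = [
        {'link': 'https://example.com/report.pdf', 'snippet': '护肤成分研究报告'},
        {'link': 'https://example.com/blog/makeup-trends', 'snippet': '今年流行的彩妆趋势'},
    ]
    for result in sample_results:
        # Only classify results whose snippet mentions cosmetics-related keywords.
        if is_cosmetic_related(result.get('snippet')):
            print(parse_search(result))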