web_search/utils.py

import requests
from bs4 import BeautifulSoup

# Set of file extensions treated as downloadable documents rather than webpages
file_extensions = {'.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx'}


def is_file(url):
    # True if the URL ends with a known document extension
    return any(url.lower().endswith(ext) for ext in file_extensions)


def is_webpage(url):
    # Heuristic: treat the URL as a webpage if the server reports an HTML content type
    try:
        response = requests.get(url, timeout=5)
        content_type = response.headers.get('Content-Type', '')
        return 'text/html' in content_type
    except requests.RequestException:
        return False


def can_crawl_webpage(url):
    # Example criteria: treat the page as non-crawlable if a meta robots tag
    # asks crawlers not to index or follow it
    try:
        response = requests.get(url, timeout=5)
        soup = BeautifulSoup(response.text, 'html.parser')
        meta_robots = soup.find('meta', attrs={'name': 'robots'})
        content = meta_robots.get('content', '') if meta_robots else ''
        if 'noindex' in content or 'nofollow' in content:
            return False
        return True
    except requests.RequestException:
        return False
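

# A complementary sketch, not part of the original module: can_crawl_webpage
# above only inspects the on-page <meta name="robots"> tag, while sites also
# publish crawl rules in /robots.txt, which the standard library can evaluate.
# The helper name and the '*' default user agent here are assumptions.
def robots_txt_allows(url, user_agent='*'):
    from urllib.parse import urlparse
    from urllib.robotparser import RobotFileParser
    parsed = urlparse(url)
    robots_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"
    rp = RobotFileParser()
    rp.set_url(robots_url)
    try:
        rp.read()  # fetch and parse robots.txt
    except OSError:
        return True  # assume allowed when robots.txt is unreachable
    return rp.can_fetch(user_agent, url)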


def is_cosmetic_related(snippet):
    # Keywords: cosmetics, beauty makeup, color cosmetics, skincare
    if snippet is None:
        return False
    keywords = ['化妆品', '美妆', '彩妆', '护肤']
    return any(keyword in snippet for keyword in keywords)


def parse_search(result):
    # Classify a single search result by the kind of resource its link points to
    link = result.get('link')
    if link is None:
        return "Unknown type: no link in result"
    if is_file(link):
        ret_str = f"File: {link}"
    elif is_webpage(link):
        if can_crawl_webpage(link):
            ret_str = f"Webpage crawlable: {link}"
        else:
            ret_str = f"Webpage not crawlable: {link}"
    else:
        ret_str = f"Unknown type: {link}"
    return ret_str
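

# A minimal usage sketch, not part of the original module: sample_result mimics
# the dict shape parse_search expects (e.g. one item from a search API response
# with 'link' and 'snippet' keys); the URL and snippet are hypothetical.
if __name__ == '__main__':
    sample_result = {
        'link': 'https://example.com/report.pdf',
        'snippet': '化妆品行业年度报告',  # "annual report on the cosmetics industry"
    }
    print(parse_search(sample_result))  # -> File: https://example.com/report.pdf
    print(is_cosmetic_related(sample_result.get('snippet')))  # -> True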