feat: 帖子搜索 & 移除登录代码使用IP代理

This commit is contained in:
Relakkes 2024-08-06 03:37:55 +08:00
parent a87094f2fd
commit d347cf5a2c
8 changed files with 600 additions and 389 deletions

View File

@ -1,67 +1,77 @@
import asyncio
import json import json
import re import random
from typing import Any, Callable, Dict, List, Optional, Union from typing import Any, Callable, Dict, List, Optional, Union
from urllib.parse import urlencode from urllib.parse import urlencode
import httpx import httpx
from playwright.async_api import BrowserContext, Page from playwright.async_api import BrowserContext
from tenacity import (RetryError, retry, stop_after_attempt,
wait_fixed)
import config
from base.base_crawler import AbstractApiClient from base.base_crawler import AbstractApiClient
from proxy.proxy_ip_pool import ProxyIpPool
from tools import utils from tools import utils
from .field import SearchNoteType, SearchSortType from .field import SearchNoteType, SearchSortType
from .help import TieBaExtractor
class BaiduTieBaClient(AbstractApiClient): class BaiduTieBaClient(AbstractApiClient):
def __init__( def __init__(
self, self,
timeout=10, timeout=10,
proxies=None, ip_pool=None,
*, default_ip_proxy=None,
headers: Dict[str, str],
playwright_page: Page,
cookie_dict: Dict[str, str],
): ):
self.proxies = proxies self.ip_pool: Optional[ProxyIpPool] = ip_pool
self.timeout = timeout self.timeout = timeout
self.headers = headers self.headers = utils.get_user_agent()
self.playwright_page = playwright_page
self.cookie_dict = cookie_dict
self._host = "https://tieba.baidu.com" self._host = "https://tieba.baidu.com"
self._page_extractor = TieBaExtractor()
self.default_ip_proxy = default_ip_proxy
async def request(self, method, url, **kwargs) -> Union[str, Any]: @retry(stop=stop_after_attempt(3), wait=wait_fixed(1))
async def request(self, method, url, return_ori_content=False, proxies=None, **kwargs) -> Union[str, Any]:
""" """
封装httpx的公共请求方法对请求响应做一些处理 封装httpx的公共请求方法对请求响应做一些处理
Args: Args:
method: 请求方法 method: 请求方法
url: 请求的URL url: 请求的URL
return_ori_content: 是否返回原始内容
proxies: 代理IP
**kwargs: 其他请求参数例如请求头请求体等 **kwargs: 其他请求参数例如请求头请求体等
Returns: Returns:
""" """
# return response.text actual_proxies = proxies if proxies else self.default_ip_proxy
return_response = kwargs.pop('return_response', False) async with httpx.AsyncClient(proxies=actual_proxies) as client:
async with httpx.AsyncClient(proxies=self.proxies) as client:
response = await client.request( response = await client.request(
method, url, timeout=self.timeout, method, url, timeout=self.timeout,
**kwargs **kwargs
) )
if return_response: if response.status_code != 200:
utils.logger.error(f"Request failed, method: {method}, url: {url}, status code: {response.status_code}")
utils.logger.error(f"Request failed, response: {response.text}")
raise Exception(f"Request failed, method: {method}, url: {url}, status code: {response.status_code}")
if response.text == "" or response.text == "blocked":
utils.logger.error(f"request params incrr, response.text: {response.text}")
raise Exception("account blocked")
if return_ori_content:
return response.text return response.text
return response.json() return response.json()
async def get(self, uri: str, params=None) -> Dict: async def get(self, uri: str, params=None, return_ori_content=False, **kwargs) -> Any:
""" """
GET请求对请求头签名 GET请求对请求头签名
Args: Args:
uri: 请求路由 uri: 请求路由
params: 请求参数 params: 请求参数
return_ori_content: 是否返回原始内容
Returns: Returns:
@ -70,9 +80,25 @@ class BaiduTieBaClient(AbstractApiClient):
if isinstance(params, dict): if isinstance(params, dict):
final_uri = (f"{uri}?" final_uri = (f"{uri}?"
f"{urlencode(params)}") f"{urlencode(params)}")
return await self.request(method="GET", url=f"{self._host}{final_uri}", headers=self.headers) try:
res = await self.request(method="GET", url=f"{self._host}{final_uri}",
return_ori_content=return_ori_content,
**kwargs)
return res
except RetryError as e:
if self.ip_pool:
proxie_model = await self.ip_pool.get_proxy()
_, proxies = utils.format_proxy_info(proxie_model)
res = await self.request(method="GET", url=f"{self._host}{final_uri}",
return_ori_content=return_ori_content,
proxies=proxies,
**kwargs)
self.default_ip_proxy = proxies
return res
async def post(self, uri: str, data: dict) -> Dict: utils.logger.error(f"[BaiduTieBaClient.get] 达到了最大重试次数请尝试更换新的IP代理: {e}")
async def post(self, uri: str, data: dict, **kwargs) -> Dict:
""" """
POST请求对请求头签名 POST请求对请求头签名
Args: Args:
@ -84,7 +110,7 @@ class BaiduTieBaClient(AbstractApiClient):
""" """
json_str = json.dumps(data, separators=(',', ':'), ensure_ascii=False) json_str = json.dumps(data, separators=(',', ':'), ensure_ascii=False)
return await self.request(method="POST", url=f"{self._host}{uri}", return await self.request(method="POST", url=f"{self._host}{uri}",
data=json_str, headers=self.headers) data=json_str, **kwargs)
async def pong(self) -> bool: async def pong(self) -> bool:
""" """
@ -96,6 +122,7 @@ class BaiduTieBaClient(AbstractApiClient):
try: try:
uri = "/mo/q/sync" uri = "/mo/q/sync"
res: Dict = await self.get(uri) res: Dict = await self.get(uri)
utils.logger.info(f"[BaiduTieBaClient.pong] res: {res}")
if res and res.get("no") == 0: if res and res.get("no") == 0:
ping_flag = True ping_flag = True
else: else:
@ -115,31 +142,42 @@ class BaiduTieBaClient(AbstractApiClient):
Returns: Returns:
""" """
cookie_str, cookie_dict = utils.convert_cookies(await browser_context.cookies()) pass
self.headers["Cookie"] = cookie_str
self.cookie_dict = cookie_dict
async def get_note_by_keyword( async def get_notes_by_keyword(
self, keyword: str, self, keyword: str,
page: int = 1, page: int = 1,
page_size: int = 10, page_size: int = 10,
sort: SearchSortType = SearchSortType.TIME_DESC, sort: SearchSortType = SearchSortType.TIME_DESC,
note_type: SearchNoteType = SearchNoteType.FIXED_THREAD note_type: SearchNoteType = SearchNoteType.FIXED_THREAD,
) -> Dict: random_sleep: bool = True
) -> List[Dict]:
""" """
根据关键词搜索贴吧帖子 根据关键词搜索贴吧帖子
Args: Args:
keyword: 关键词 keyword: 关键词
page: 分页第几页 page: 分页第几页
page_size: 每页肠病毒 page_size: 每页大小
sort: 结果排序方式 sort: 结果排序方式
note_type: 帖子类型主题贴主题+回复混合模式 note_type: 帖子类型主题贴主题+回复混合模式
random_sleep: 是否随机休眠
Returns: Returns:
""" """
# todo impl it uri = "/f/search/res"
return {} params = {
"isnew": 1,
"qw": keyword,
"rn": page_size,
"pn": page,
"sm": sort.value,
"only_thread": note_type.value
}
page_content = await self.get(uri, params=params, return_ori_content=True)
if random_sleep:
random.randint(1, 5)
return self._page_extractor.extract_search_note_list(page_content)
async def get_note_by_id(self, note_id: str) -> Dict: async def get_note_by_id(self, note_id: str) -> Dict:
""" """

View File

@ -9,9 +9,10 @@ from playwright.async_api import (BrowserContext, BrowserType, Page,
import config import config
from base.base_crawler import AbstractCrawler from base.base_crawler import AbstractCrawler
from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool, ProxyIpPool
from store import tieba as tieba_store from store import tieba as tieba_store
from tools import utils from tools import utils
from tools.crawler_util import format_proxy_info
from var import crawler_type_var from var import crawler_type_var
from .client import BaiduTieBaClient from .client import BaiduTieBaClient
@ -29,39 +30,24 @@ class TieBaCrawler(AbstractCrawler):
self.user_agent = utils.get_user_agent() self.user_agent = utils.get_user_agent()
async def start(self) -> None: async def start(self) -> None:
playwright_proxy_format, httpx_proxy_format = None, None """
Start the crawler
Returns:
"""
ip_proxy_pool, httpx_proxy_format = None, None
if config.ENABLE_IP_PROXY: if config.ENABLE_IP_PROXY:
utils.logger.info("[BaiduTieBaCrawler.start] Begin create ip proxy pool ...")
ip_proxy_pool = await create_ip_pool(config.IP_PROXY_POOL_COUNT, enable_validate_ip=True) ip_proxy_pool = await create_ip_pool(config.IP_PROXY_POOL_COUNT, enable_validate_ip=True)
ip_proxy_info: IpInfoModel = await ip_proxy_pool.get_proxy() ip_proxy_info: IpInfoModel = await ip_proxy_pool.get_proxy()
playwright_proxy_format, httpx_proxy_format = self.format_proxy_info(ip_proxy_info) _, httpx_proxy_format = format_proxy_info(ip_proxy_info)
utils.logger.info(f"[BaiduTieBaCrawler.start] Init default ip proxy, value: {httpx_proxy_format}")
async with async_playwright() as playwright:
# Launch a browser context.
chromium = playwright.chromium
self.browser_context = await self.launch_browser(
chromium,
None,
self.user_agent,
headless=config.HEADLESS
)
# stealth.min.js is a js script to prevent the website from detecting the crawler.
await self.browser_context.add_init_script(path="libs/stealth.min.js")
self.context_page = await self.browser_context.new_page()
await self.context_page.goto(self.index_url)
# Create a client to interact with the baidutieba website. # Create a client to interact with the baidutieba website.
self.tieba_client = await self.create_tieba_client(httpx_proxy_format) self.tieba_client = BaiduTieBaClient(
if not await self.tieba_client.pong(): ip_pool=ip_proxy_pool,
login_obj = BaiduTieBaLogin( default_ip_proxy=httpx_proxy_format,
login_type=config.LOGIN_TYPE,
login_phone="", # input your phone number
browser_context=self.browser_context,
context_page=self.context_page,
cookie_str=config.COOKIES
) )
await login_obj.begin()
await self.tieba_client.update_cookies(browser_context=self.browser_context)
crawler_type_var.set(config.CRAWLER_TYPE) crawler_type_var.set(config.CRAWLER_TYPE)
if config.CRAWLER_TYPE == "search": if config.CRAWLER_TYPE == "search":
# Search for notes and retrieve their comment information. # Search for notes and retrieve their comment information.
@ -72,10 +58,15 @@ class TieBaCrawler(AbstractCrawler):
else: else:
pass pass
utils.logger.info("[BaiduTieBaCrawler.start] Xhs Crawler finished ...") utils.logger.info("[BaiduTieBaCrawler.start] Tieba Crawler finished ...")
async def search(self) -> None: async def search(self) -> None:
"""Search for notes and retrieve their comment information.""" """
Search for notes and retrieve their comment information.
Returns:
"""
utils.logger.info("[BaiduTieBaCrawler.search] Begin search baidutieba keywords") utils.logger.info("[BaiduTieBaCrawler.search] Begin search baidutieba keywords")
tieba_limit_count = 10 # tieba limit page fixed value tieba_limit_count = 10 # tieba limit page fixed value
if config.CRAWLER_MAX_NOTES_COUNT < tieba_limit_count: if config.CRAWLER_MAX_NOTES_COUNT < tieba_limit_count:
@ -92,36 +83,26 @@ class TieBaCrawler(AbstractCrawler):
try: try:
utils.logger.info(f"[BaiduTieBaCrawler.search] search tieba keyword: {keyword}, page: {page}") utils.logger.info(f"[BaiduTieBaCrawler.search] search tieba keyword: {keyword}, page: {page}")
note_id_list: List[str] = [] note_id_list: List[str] = []
notes_res = await self.tieba_client.get_note_by_keyword( notes_list_res = await self.tieba_client.get_notes_by_keyword(
keyword=keyword, keyword=keyword,
page=page, page=page,
page_size=tieba_limit_count, page_size=tieba_limit_count,
sort=SearchSortType.TIME_DESC, sort=SearchSortType.TIME_DESC,
note_type=SearchNoteType.FIXED_THREAD note_type=SearchNoteType.FIXED_THREAD
) )
utils.logger.info(f"[BaiduTieBaCrawler.search] Search notes res:{notes_res}") utils.logger.info(f"[BaiduTieBaCrawler.search] Search notes res:{notes_list_res}")
if not notes_res or not notes_res.get('has_more', False): if not notes_list_res:
utils.logger.info("No more content!")
break break
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
task_list = [ for note_detail in notes_list_res:
self.get_note_detail(
note_id=post_item.get("id"),
semaphore=semaphore
)
for post_item in notes_res.get("items", {})
if post_item.get('model_type') not in ('rec_query', 'hot_query')
]
note_details = await asyncio.gather(*task_list)
for note_detail in note_details:
if note_detail: if note_detail:
await tieba_store.update_tieba_note(note_detail) await tieba_store.update_tieba_note(note_detail)
note_id_list.append(note_detail.get("note_id")) note_id_list.append(note_detail.get("note_id"))
page += 1 page += 1
utils.logger.info(f"[BaiduTieBaCrawler.search] Note details: {note_details}") utils.logger.info(f"[BaiduTieBaCrawler.search] Note details: {notes_list_res}")
await self.batch_get_note_comments(note_id_list) await self.batch_get_note_comments(note_id_list)
except Exception as ex: except Exception as ex:
utils.logger.error(f"[BaiduTieBaCrawler.search] Get note detail error, err: {ex}") utils.logger.error(f"[BaiduTieBaCrawler.search] Search note list error, err: {ex}")
break break
async def fetch_creator_notes_detail(self, note_list: List[Dict]): async def fetch_creator_notes_detail(self, note_list: List[Dict]):
@ -197,34 +178,20 @@ class TieBaCrawler(AbstractCrawler):
callback=tieba_store.batch_update_tieba_note_comments callback=tieba_store.batch_update_tieba_note_comments
) )
@staticmethod async def create_tieba_client(self, ip_pool: ProxyIpPool) -> BaiduTieBaClient:
def format_proxy_info(ip_proxy_info: IpInfoModel) -> Tuple[Optional[Dict], Optional[Dict]]: """
"""format proxy info for playwright and httpx""" Create tieba client
playwright_proxy = { Args:
"server": f"{ip_proxy_info.protocol}{ip_proxy_info.ip}:{ip_proxy_info.port}", ip_pool:
"username": ip_proxy_info.user,
"password": ip_proxy_info.password,
}
httpx_proxy = {
f"{ip_proxy_info.protocol}": f"http://{ip_proxy_info.user}:{ip_proxy_info.password}@{ip_proxy_info.ip}:{ip_proxy_info.port}"
}
return playwright_proxy, httpx_proxy
async def create_tieba_client(self, httpx_proxy: Optional[str]) -> BaiduTieBaClient: Returns:
"""
"""Create tieba client""" """Create tieba client"""
utils.logger.info("[BaiduTieBaCrawler.create_tieba_client] Begin create baidutieba API client ...") utils.logger.info("[BaiduTieBaCrawler.create_tieba_client] Begin create baidutieba API client ...")
cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies()) cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies())
tieba_client_obj = BaiduTieBaClient( tieba_client_obj = BaiduTieBaClient(
proxies=httpx_proxy, ip_pool=ip_pool,
headers={
"User-Agent": self.user_agent,
"Cookie": cookie_str,
"Origin": "https://www.baidutieba.com",
"Referer": "https://www.baidutieba.com",
"Content-Type": "application/json;charset=UTF-8"
},
playwright_page=self.context_page,
cookie_dict=cookie_dict,
) )
return tieba_client_obj return tieba_client_obj

View File

@ -0,0 +1,69 @@
# -*- coding: utf-8 -*-
from typing import List, Dict
from parsel import Selector
class TieBaExtractor:
    """Turn Baidu Tieba HTML pages into lists of plain dictionaries."""

    def __init__(self):
        """Create an extractor; no instance state is kept."""

    @staticmethod
    def extract_search_note_list(page_content: str) -> List[Dict]:
        """
        Extract the post entries found on a Tieba search-result page.

        Args:
            page_content: HTML string of the page.

        Returns:
            One dict per ``div`` element whose ``class`` attribute is exactly
            ``s_post``, in document order. Every value is a string; a field
            whose node is absent is returned as an empty string.
        """

        def first_match(node, query: str, strip: bool = True) -> str:
            # First XPath hit as text, '' when nothing matches.
            text = node.xpath(query).get(default='')
            return text.strip() if strip else text

        # Anchors that several fields are read from.
        title_anchor = ".//span[@class='p_title']/a"
        forum_anchor = ".//a[@class='p_forum']"
        author_anchor = ".//a[starts-with(@href, '/home/main')]"

        posts = Selector(text=page_content).xpath("//div[@class='s_post']")
        return [
            {
                "note_id": first_match(post, f"{title_anchor}/@data-tid"),
                "title": first_match(post, f"{title_anchor}/text()"),
                "desc": first_match(post, ".//div[@class='p_content']/text()"),
                # Link attributes are kept verbatim (not stripped).
                "note_url": first_match(post, f"{title_anchor}/@href", strip=False),
                "time": first_match(post, ".//font[@class='p_green p_date']/text()"),
                "tieba_name": first_match(post, f"{forum_anchor}/font/text()"),
                "tieba_link": first_match(post, f"{forum_anchor}/@href", strip=False),
                "nickname": first_match(post, f"{author_anchor}/font/text()"),
                "nickname_link": first_match(post, f"{author_anchor}/@href", strip=False),
            }
            for post in posts
        ]

    @staticmethod
    def extract_tieba_note_comments(page_content: str) -> List[Dict]:
        """
        Extract the comments of a Tieba post.

        Not implemented yet: the argument is ignored and ``None`` is returned.

        Args:
            page_content: HTML string of the page.

        Returns:
            Currently always ``None``.
        """
        return None
if __name__ == '__main__':
    # Manual smoke run: parse the saved search-result fixture and print
    # the extracted posts followed by how many were found.
    with open("test_data/search_keyword_notes.html", "r", encoding="utf-8") as html_file:
        sample_html = html_file.read()
    notes = TieBaExtractor().extract_search_note_list(sample_html)
    print(notes)
    print(f"Total: {len(notes)}")

View File

@ -0,0 +1,96 @@
<div class="s_post_list">
<div class="s_post"><span class="p_title"><a data-tid="9117888152" data-fid="26976424" class="bluelink"
href="/p/9117888152?pid=150718967291&amp;cid=0#150718967291"
target="_blank">武汉交互空间科技富士康10亿加码中国大陆印度为何逐渐失宠</a></span>
<div class="p_content">
全球知名的电子制造服务巨头富士康的母公司鸿海精密工业股份有限公司正式对外发布了一则重大投资公告富士康将在郑州投资建设新事业总部大楼承载新事业总部功能这一战略举措不仅彰显了富士康对中国市场持续深化的承诺与信心也预示着该集团业务版图的新一轮扩张与升级
项目一期选址位于郑东新区建筑面积约700公亩总投资约10亿元人民币主要建设总部管理中心研发中心和工程中心战略产业发展中心战略产业金融平台
</div>
贴吧<a data-fid="26976424" class="p_forum" href="/f?kw=%CE%E4%BA%BA%BD%BB%BB%A5%BF%D5%BC%E4"
target="_blank"><font class="p_violet">武汉交互空间</font></a>作者<a
href="/home/main?un=VR%D0%E9%C4%E2%B4%EF%C8%CB" target="_blank"><font class="p_violet">VR虚拟达人</font></a>
<font class="p_green p_date">2024-08-05 16:45</font></div>
<div class="s_post"><span class="p_title"><a data-tid="9114743782" data-fid="90367" class="bluelink"
href="/p/9114743782?pid=150705176739&amp;cid=0#150705176739"
target="_blank">请各位急用玛尼的小心骗子最多</a></span>
<div class="p_content">
这里面到处是骗子大家小心特别那些叫出村背货的基本是卖园区天下没有那么好的事就是有这好事我们在边境上的人比你们最清楚轮不到你们边境上比你们胆子大的人大把你一不熟悉小路为什么叫你带货东南亚带货的集结地一般在南宁防城港昆明西双版纳临沧然后师机接了走小路出去南宁防城港坐船出去好多都是二十几手的中介之前卖园区一个三十万现在不知道行情但好多园区不收
</div>
贴吧<a data-fid="90367" class="p_forum" href="/f?kw=%B1%B3%B0%FC%BF%CD" target="_blank"><font class="p_violet">背包客</font></a>作者<a
href="/home/main?un=%CC%F9%B0%C9%D3%C3%BB%A7_GC64AUS" target="_blank"><font class="p_violet">贴吧用户_GC64AUS</font></a>
<font class="p_green p_date">2024-08-03 07:35</font></div>
<div class="s_post"><span class="p_title"><a data-tid="9095684158" data-fid="1388265" class="bluelink"
href="/p/9095684158?pid=150616716870&amp;cid=0#150616716870"
target="_blank">*2025泰国冷链制冷运输展*东南亚外贸出口</a></span>
<div class="p_content">**2025泰国曼谷国际冷库空调制冷仓储暨冷链运输展 *2025泰国冷链制冷运输展*东南亚外贸出口-观展游览考察
展出时间2025-7具体时间待定 展出地点泰国曼谷会展中心 展会周期一年一届 组展单位北京励航国际商务会展有限公司
人员跟团观展补贴为您节省成本寻找适合您的市场
本公司为您提供观展考察机会让您在大型展会上获得世界同行**科技的资料同时感受异域文化气息展会现场走展考察当地游览当地相关市
</div>
贴吧<a data-fid="1388265" class="p_forum" href="/f?kw=%B9%FA%BC%CA%D5%B9%BB%E1" target="_blank"><font
class="p_violet">国际展会</font></a>作者<a href="/home/main?un=zhaot_188" target="_blank"><font
class="p_violet">zhaot_188</font></a> <font class="p_green p_date">2024-07-19 15:44</font></div>
<div class="s_post"><span class="p_title"><a data-tid="9093564752" data-fid="27984246" class="bluelink"
href="/p/9093564752?pid=150606964195&amp;cid=0#150606964195"
target="_blank">京湘楼创始人肖鑫创立于北京植根长沙百年美食传承</a></span>
<div class="p_content">来源标题京湘楼创始人肖鑫创立于北京植根长沙百年美食传承 京湘楼KING HERO品牌创始人肖鑫
京湘楼KING
HERO集酱板鸭肥肠鸭头鸭脖鸭肠小龙虾牛蛙捆鸡鸡爪鱼嘴巴鱼尾鱿鱼牛肉猪头肉等特色食品卤制加工包装与生产经营2022年3月在北京朝阳区双井开设了第一家京湘楼·鲜卤集市卤味熟食快餐店2023年5月在湖南省长沙市开福区注册成立了长沙京湘楼品牌管理有限公司京湘楼作为品
</div>
贴吧<a data-fid="27984246" class="p_forum" href="/f?kw=%BE%A9%CF%E6%C2%A5" target="_blank"><font
class="p_violet">京湘楼</font></a>作者<a href="/home/main?un=%CC%EC%C9%F1%B6%C9%B3%BE" target="_blank"><font
class="p_violet">天神渡尘</font></a> <font class="p_green p_date">2024-07-17 23:43</font></div>
<div class="s_post"><span class="p_title"><a data-tid="9088419293" data-fid="310" class="bluelink"
href="/p/9088419293?pid=150582471307&amp;cid=0#150582471307"
target="_blank">广州能争取到迪士尼与环球落户吗</a></span>
<div class="p_content">
不是二选一而是全都要上一组数据上海迪士尼2016年开业就接待游客超过1.2亿人次香港迪士尼2023全年游客人数才640万人次约等于无这么低的入园人次已经引来迪士尼方面的不悦
美国有两个迪士尼说实话迪士尼的门票并不高普通人都去的起中国完全有能力建两到三个迪士尼欧洲只有第一个迪士尼因为它的人口只有中国的一半假设中国人一年吃一包盐一年就是14包那么欧洲就是七亿包盐盐再便宜欧洲人也不可能一人吃
</div>
贴吧<a data-fid="310" class="p_forum" href="/f?kw=%B5%D8%C0%ED" target="_blank"><font
class="p_violet">地理</font></a>作者<a href="/home/main?un=SeaRoutes" target="_blank"><font
class="p_violet">SeaRoutes</font></a> <font class="p_green p_date">2024-07-13 20:17</font></div>
<div class="s_post"><span class="p_title"><a data-tid="9088416365" data-fid="7561034" class="bluelink"
href="/p/9088416365?pid=150582456551&amp;cid=0#150582456551"
target="_blank">#城市GDP#广州应该全力去争取迪士尼和环球影城</a></span>
<div class="p_content">
不是二选一而是全都要上一组数据上海迪士尼2016年开业就接待游客超过1.2亿人次香港迪士尼2023全年游客人数才640万人次约等于无这么低的入园人次已经引来迪士尼方面的不悦
美国有两个迪士尼说实话迪士尼的门票并不高普通人都去的起中国完全有能力建两到三个迪士尼欧洲只有第一个迪士尼因为它的人口只有中国的一半假设中国人一年吃一包盐一年就是14包那么欧洲就是七亿包盐盐再便宜欧洲人也不可能一人吃
</div>
贴吧<a data-fid="7561034" class="p_forum" href="/f?kw=%B3%C7%CA%D0gdp" target="_blank"><font class="p_violet">城市gdp</font></a>作者<a
href="/home/main?un=SeaRoutes" target="_blank"><font class="p_violet">SeaRoutes</font></a> <font
class="p_green p_date">2024-07-13 20:14</font></div>
<div class="s_post"><span class="p_title"><a data-tid="9087419039" data-fid="46374" class="bluelink"
href="/p/9087419039?pid=150577861626&amp;cid=0#150577861626"
target="_blank">云南省首批云南日报昆明新闻头条聚焦阳宗海省级物流枢纽建设</a></span>
<div class="p_content">
7月11日云南日报昆明新闻头条刊发文章阳宗海风景名胜区立足衔接西部陆海新通道与中老铁路优势加速28个物流枢纽设施建设聚焦昆明阳宗海风景名胜区系统推进省级物流枢纽建设和功能提升深挖比较优势壮大物流产业据云南省发展和改革委员会在昆明召开的新闻发布会上公布今年全省共有5地纳入云南省第一批省级物流枢纽和省级骨干冷链物流基地建设名单其中昆明市有两家获批阳宗海物流枢纽上榜一起来看近日云南省
</div>
贴吧<a data-fid="46374" class="p_forum" href="/f?kw=%C0%A5%C3%F7" target="_blank"><font
class="p_violet">昆明</font></a>作者<a href="/home/main?un=%8F%EC" target="_blank"><font
class="p_violet"></font></a> <font class="p_green p_date">2024-07-12 23:04</font></div>
<div class="s_post"><span class="p_title"><a data-tid="9085102046" data-fid="348713" class="bluelink"
href="/p/9085102046?pid=150567555367&amp;cid=0#150567555367"
target="_blank">寻找弟弟很久没跟家里联系</a></span>
<div class="p_content">Kk四期世纪园区寻找弟弟外号大佐F3 2公司cj集团</div>
贴吧<a data-fid="348713" class="p_forum" href="/f?kw=%B6%AB%C4%CF%D1%C7" target="_blank"><font
class="p_violet">东南亚</font></a>作者<a href="/home/main?un=%CC%F9%B0%C9%D3%C3%BB%A7_GC2CtRa"
target="_blank"><font class="p_violet">贴吧用户_GC2CtRa</font></a>
<font class="p_green p_date">2024-07-11 07:53</font></div>
<div class="s_post"><span class="p_title"><a data-tid="9083888071" data-fid="30" class="bluelink"
href="/p/9083888071?pid=150562129935&amp;cid=0#150562129935"
target="_blank">拉美 非洲 东南亚 南亚等发展中国家不太可能普及八小时双休吧</a></span>
<div class="p_content">拉美 东南亚的泰国 之类的连毒枭和黑色产业都管不好感觉普及八小时双休不太可能 缅甸和非洲军阀林立
跟军阀谈八小时双休那么不开玩笑缅北诈骗园区就能看出来
</div>
贴吧<a data-fid="30" class="p_forum" href="/f?kw=%C0%FA%CA%B7" target="_blank"><font
class="p_violet">历史</font></a>作者<a href="/home/main?un=yoursagain" target="_blank"><font
class="p_violet">yoursagain</font></a> <font class="p_green p_date">2024-07-10 09:00</font></div>
<div class="s_post"><span class="p_title"><a data-tid="9071937582" data-fid="8103241" class="bluelink"
href="/p/9071937582?pid=150510120873&amp;cid=0#150510120873"
target="_blank">东南亚园区 </a></span>
<div class="p_content"></div>
贴吧<a data-fid="8103241" class="p_forum" href="/f?kw=%D4%B0%C7%F8%D5%D0%C9%CC" target="_blank"><font
class="p_violet">园区招商</font></a>作者<a href="/home/main?un=QQ59052966" target="_blank"><font
class="p_violet">QQ59052966</font></a> <font class="p_green p_date">2024-06-30 12:09</font></div>
</div>

View File

@ -2,7 +2,8 @@
-- Table structure for bilibili_video -- Table structure for bilibili_video
-- ---------------------------- -- ----------------------------
DROP TABLE IF EXISTS `bilibili_video`; DROP TABLE IF EXISTS `bilibili_video`;
CREATE TABLE `bilibili_video` ( CREATE TABLE `bilibili_video`
(
`id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID', `id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID',
`user_id` varchar(64) DEFAULT NULL COMMENT '用户ID', `user_id` varchar(64) DEFAULT NULL COMMENT '用户ID',
`nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称', `nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称',
@ -29,7 +30,8 @@ CREATE TABLE `bilibili_video` (
-- Table structure for bilibili_video_comment -- Table structure for bilibili_video_comment
-- ---------------------------- -- ----------------------------
DROP TABLE IF EXISTS `bilibili_video_comment`; DROP TABLE IF EXISTS `bilibili_video_comment`;
CREATE TABLE `bilibili_video_comment` ( CREATE TABLE `bilibili_video_comment`
(
`id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID', `id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID',
`user_id` varchar(64) DEFAULT NULL COMMENT '用户ID', `user_id` varchar(64) DEFAULT NULL COMMENT '用户ID',
`nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称', `nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称',
@ -50,7 +52,8 @@ CREATE TABLE `bilibili_video_comment` (
-- Table structure for bilibili_up_info -- Table structure for bilibili_up_info
-- ---------------------------- -- ----------------------------
DROP TABLE IF EXISTS `bilibili_up_info`; DROP TABLE IF EXISTS `bilibili_up_info`;
CREATE TABLE `bilibili_up_info` ( CREATE TABLE `bilibili_up_info`
(
`id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID', `id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID',
`user_id` varchar(64) DEFAULT NULL COMMENT '用户ID', `user_id` varchar(64) DEFAULT NULL COMMENT '用户ID',
`nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称', `nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称',
@ -69,7 +72,8 @@ CREATE TABLE `bilibili_up_info` (
-- Table structure for douyin_aweme -- Table structure for douyin_aweme
-- ---------------------------- -- ----------------------------
DROP TABLE IF EXISTS `douyin_aweme`; DROP TABLE IF EXISTS `douyin_aweme`;
CREATE TABLE `douyin_aweme` ( CREATE TABLE `douyin_aweme`
(
`id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID', `id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID',
`user_id` varchar(64) DEFAULT NULL COMMENT '用户ID', `user_id` varchar(64) DEFAULT NULL COMMENT '用户ID',
`sec_uid` varchar(128) DEFAULT NULL COMMENT '用户sec_uid', `sec_uid` varchar(128) DEFAULT NULL COMMENT '用户sec_uid',
@ -100,7 +104,8 @@ CREATE TABLE `douyin_aweme` (
-- Table structure for douyin_aweme_comment -- Table structure for douyin_aweme_comment
-- ---------------------------- -- ----------------------------
DROP TABLE IF EXISTS `douyin_aweme_comment`; DROP TABLE IF EXISTS `douyin_aweme_comment`;
CREATE TABLE `douyin_aweme_comment` ( CREATE TABLE `douyin_aweme_comment`
(
`id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID', `id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID',
`user_id` varchar(64) DEFAULT NULL COMMENT '用户ID', `user_id` varchar(64) DEFAULT NULL COMMENT '用户ID',
`sec_uid` varchar(128) DEFAULT NULL COMMENT '用户sec_uid', `sec_uid` varchar(128) DEFAULT NULL COMMENT '用户sec_uid',
@ -126,7 +131,8 @@ CREATE TABLE `douyin_aweme_comment` (
-- Table structure for dy_creator -- Table structure for dy_creator
-- ---------------------------- -- ----------------------------
DROP TABLE IF EXISTS `dy_creator`; DROP TABLE IF EXISTS `dy_creator`;
CREATE TABLE `dy_creator` ( CREATE TABLE `dy_creator`
(
`id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID', `id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID',
`user_id` varchar(128) NOT NULL COMMENT '用户ID', `user_id` varchar(128) NOT NULL COMMENT '用户ID',
`nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称', `nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称',
@ -147,7 +153,8 @@ CREATE TABLE `dy_creator` (
-- Table structure for kuaishou_video -- Table structure for kuaishou_video
-- ---------------------------- -- ----------------------------
DROP TABLE IF EXISTS `kuaishou_video`; DROP TABLE IF EXISTS `kuaishou_video`;
CREATE TABLE `kuaishou_video` ( CREATE TABLE `kuaishou_video`
(
`id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID', `id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID',
`user_id` varchar(64) DEFAULT NULL COMMENT '用户ID', `user_id` varchar(64) DEFAULT NULL COMMENT '用户ID',
`nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称', `nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称',
@ -173,7 +180,8 @@ CREATE TABLE `kuaishou_video` (
-- Table structure for kuaishou_video_comment -- Table structure for kuaishou_video_comment
-- ---------------------------- -- ----------------------------
DROP TABLE IF EXISTS `kuaishou_video_comment`; DROP TABLE IF EXISTS `kuaishou_video_comment`;
CREATE TABLE `kuaishou_video_comment` ( CREATE TABLE `kuaishou_video_comment`
(
`id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID', `id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID',
`user_id` varchar(64) DEFAULT NULL COMMENT '用户ID', `user_id` varchar(64) DEFAULT NULL COMMENT '用户ID',
`nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称', `nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称',
@ -195,7 +203,8 @@ CREATE TABLE `kuaishou_video_comment` (
-- Table structure for weibo_note -- Table structure for weibo_note
-- ---------------------------- -- ----------------------------
DROP TABLE IF EXISTS `weibo_note`; DROP TABLE IF EXISTS `weibo_note`;
CREATE TABLE `weibo_note` ( CREATE TABLE `weibo_note`
(
`id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID', `id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID',
`user_id` varchar(64) DEFAULT NULL COMMENT '用户ID', `user_id` varchar(64) DEFAULT NULL COMMENT '用户ID',
`nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称', `nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称',
@ -223,7 +232,8 @@ CREATE TABLE `weibo_note` (
-- Table structure for weibo_note_comment -- Table structure for weibo_note_comment
-- ---------------------------- -- ----------------------------
DROP TABLE IF EXISTS `weibo_note_comment`; DROP TABLE IF EXISTS `weibo_note_comment`;
CREATE TABLE `weibo_note_comment` ( CREATE TABLE `weibo_note_comment`
(
`id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID', `id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID',
`user_id` varchar(64) DEFAULT NULL COMMENT '用户ID', `user_id` varchar(64) DEFAULT NULL COMMENT '用户ID',
`nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称', `nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称',
@ -250,7 +260,8 @@ CREATE TABLE `weibo_note_comment` (
-- Table structure for xhs_creator -- Table structure for xhs_creator
-- ---------------------------- -- ----------------------------
DROP TABLE IF EXISTS `xhs_creator`; DROP TABLE IF EXISTS `xhs_creator`;
CREATE TABLE `xhs_creator` ( CREATE TABLE `xhs_creator`
(
`id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID', `id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID',
`user_id` varchar(64) NOT NULL COMMENT '用户ID', `user_id` varchar(64) NOT NULL COMMENT '用户ID',
`nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称', `nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称',
@ -271,7 +282,8 @@ CREATE TABLE `xhs_creator` (
-- Table structure for xhs_note -- Table structure for xhs_note
-- ---------------------------- -- ----------------------------
DROP TABLE IF EXISTS `xhs_note`; DROP TABLE IF EXISTS `xhs_note`;
CREATE TABLE `xhs_note` ( CREATE TABLE `xhs_note`
(
`id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID', `id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID',
`user_id` varchar(64) NOT NULL COMMENT '用户ID', `user_id` varchar(64) NOT NULL COMMENT '用户ID',
`nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称', `nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称',
@ -302,7 +314,8 @@ CREATE TABLE `xhs_note` (
-- Table structure for xhs_note_comment -- Table structure for xhs_note_comment
-- ---------------------------- -- ----------------------------
DROP TABLE IF EXISTS `xhs_note_comment`; DROP TABLE IF EXISTS `xhs_note_comment`;
CREATE TABLE `xhs_note_comment` ( CREATE TABLE `xhs_note_comment`
(
`id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID', `id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID',
`user_id` varchar(64) NOT NULL COMMENT '用户ID', `user_id` varchar(64) NOT NULL COMMENT '用户ID',
`nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称', `nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称',
@ -325,15 +338,40 @@ CREATE TABLE `xhs_note_comment` (
-- alter table xhs_note_comment to support parent_comment_id -- alter table xhs_note_comment to support parent_comment_id
-- ---------------------------- -- ----------------------------
ALTER TABLE `xhs_note_comment` ALTER TABLE `xhs_note_comment`
ADD COLUMN `parent_comment_id` VARCHAR(64) DEFAULT NULL COMMENT '父评论ID'; ADD COLUMN `parent_comment_id` VARCHAR(64) DEFAULT NULL COMMENT '父评论ID';
ALTER TABLE `douyin_aweme_comment` ALTER TABLE `douyin_aweme_comment`
ADD COLUMN `parent_comment_id` VARCHAR(64) DEFAULT NULL COMMENT '父评论ID'; ADD COLUMN `parent_comment_id` VARCHAR(64) DEFAULT NULL COMMENT '父评论ID';
ALTER TABLE `bilibili_video_comment` ALTER TABLE `bilibili_video_comment`
ADD COLUMN `parent_comment_id` VARCHAR(64) DEFAULT NULL COMMENT '父评论ID'; ADD COLUMN `parent_comment_id` VARCHAR(64) DEFAULT NULL COMMENT '父评论ID';
ALTER TABLE `weibo_note_comment` ALTER TABLE `weibo_note_comment`
ADD COLUMN `parent_comment_id` VARCHAR(64) DEFAULT NULL COMMENT '父评论ID'; ADD COLUMN `parent_comment_id` VARCHAR(64) DEFAULT NULL COMMENT '父评论ID';
SET FOREIGN_KEY_CHECKS = 1; SET
FOREIGN_KEY_CHECKS = 1;
-- ----------------------------
-- Table structure for tieba_note
-- ----------------------------
DROP TABLE IF EXISTS `tieba_note`;
CREATE TABLE `tieba_note`
(
    `id`             int          NOT NULL AUTO_INCREMENT COMMENT '自增ID',
    `note_id`        varchar(64)  NOT NULL COMMENT '帖子ID',
    `title`          varchar(255) DEFAULT NULL COMMENT '笔记标题',
    `desc`           longtext COMMENT '笔记描述',
    -- MySQL rejects VARCHAR without an explicit length; the publish time is
    -- stored as text, so give it a bounded, indexable size.
    `time`           varchar(255) NOT NULL COMMENT '笔记发布时间',
    `note_url`       varchar(255) DEFAULT NULL COMMENT '笔记详情页的URL',
    `nickname`       varchar(64)  DEFAULT NULL COMMENT '用户昵称',
    `nickname_link`  varchar(255) DEFAULT NULL COMMENT '用户主页地址',
    `tieba_name`     varchar(255) DEFAULT NULL COMMENT '贴吧名称',
    `tieba_link`     varchar(255) DEFAULT NULL COMMENT '贴吧链接地址',
    `avatar`         varchar(255) DEFAULT NULL COMMENT '用户头像地址',
    `ip_location`    varchar(255) DEFAULT NULL COMMENT '评论时的IP地址',
    `add_ts`         bigint       NOT NULL COMMENT '记录添加时间戳',
    `last_modify_ts` bigint       NOT NULL COMMENT '记录最后修改时间戳',
    `comment_count`  varchar(16)  DEFAULT NULL COMMENT '笔记评论数',
    PRIMARY KEY (`id`),
    -- Secondary indexes for lookups by post id and by publish time.
    KEY `idx_tieba_note_id` (`note_id`),
    KEY `idx_tieba_note_time` (`time`)
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='贴吧帖子表';

View File

@ -22,31 +22,20 @@ class TieBaStoreFactory:
async def update_tieba_note(note_item: Dict):
    """Flatten a crawled Tieba post into a storage row and persist it.

    Args:
        note_item: Parsed post data. The keys read here are ``note_id``,
            ``title``, ``desc``, ``note_url``, ``time``, ``tieba_name``,
            ``tieba_link``, ``nickname``, ``nickname_link`` and
            ``ip_location``.
    """
    # The link fields are prefixed with the site host, so they are presumably
    # site-relative paths in the parsed data -- confirm against the extractor.
    tieba_url = "https://tieba.baidu.com"
    note_id = note_item.get("note_id")
    local_db_item = {
        "note_id": note_id,
        # Fall back to the leading part of the description when no title exists.
        "title": note_item.get("title") or note_item.get("desc", "")[:255],
        "desc": note_item.get("desc", ""),
        # Default to "" like the other link fields so that a missing key does
        # not raise TypeError on the string concatenation.
        "note_url": tieba_url + note_item.get("note_url", ""),
        "time": note_item.get("time"),
        "tieba_name": note_item.get("tieba_name"),
        "tieba_link": tieba_url + note_item.get("tieba_link", ""),
        "nickname": note_item.get("nickname"),
        "nickname_link": tieba_url + note_item.get("nickname_link", ""),
        "ip_location": note_item.get("ip_location", ""),
        "last_modify_ts": utils.get_current_timestamp(),
    }
    utils.logger.info(f"[store.tieba.update_tieba_note] tieba note: {local_db_item}")
    await TieBaStoreFactory.create_store().store_content(local_db_item)

View File

@ -15,7 +15,7 @@ async def query_content_by_content_id(content_id: str) -> Dict:
""" """
async_db_conn: AsyncMysqlDB = media_crawler_db_var.get() async_db_conn: AsyncMysqlDB = media_crawler_db_var.get()
sql: str = f"select * from baidu_tieba where note_id = '{content_id}'" sql: str = f"select * from tieba_note where note_id = '{content_id}'"
rows: List[Dict] = await async_db_conn.query(sql) rows: List[Dict] = await async_db_conn.query(sql)
if len(rows) > 0: if len(rows) > 0:
return rows[0] return rows[0]
@ -32,7 +32,7 @@ async def add_new_content(content_item: Dict) -> int:
""" """
async_db_conn: AsyncMysqlDB = media_crawler_db_var.get() async_db_conn: AsyncMysqlDB = media_crawler_db_var.get()
last_row_id: int = await async_db_conn.item_to_table("baidu_tieba", content_item) last_row_id: int = await async_db_conn.item_to_table("tieba_note", content_item)
return last_row_id return last_row_id
@ -47,7 +47,7 @@ async def update_content_by_content_id(content_id: str, content_item: Dict) -> i
""" """
async_db_conn: AsyncMysqlDB = media_crawler_db_var.get() async_db_conn: AsyncMysqlDB = media_crawler_db_var.get()
effect_row: int = await async_db_conn.update_table("baidu_tieba", content_item, "note_id", content_id) effect_row: int = await async_db_conn.update_table("tieba_note", content_item, "note_id", content_id)
return effect_row return effect_row
@ -62,7 +62,7 @@ async def query_comment_by_comment_id(comment_id: str) -> Dict:
""" """
async_db_conn: AsyncMysqlDB = media_crawler_db_var.get() async_db_conn: AsyncMysqlDB = media_crawler_db_var.get()
sql: str = f"select * from baidu_tieba_comment where comment_id = '{comment_id}'" sql: str = f"select * from tieba_comment where comment_id = '{comment_id}'"
rows: List[Dict] = await async_db_conn.query(sql) rows: List[Dict] = await async_db_conn.query(sql)
if len(rows) > 0: if len(rows) > 0:
return rows[0] return rows[0]
@ -79,7 +79,7 @@ async def add_new_comment(comment_item: Dict) -> int:
""" """
async_db_conn: AsyncMysqlDB = media_crawler_db_var.get() async_db_conn: AsyncMysqlDB = media_crawler_db_var.get()
last_row_id: int = await async_db_conn.item_to_table("baidu_tieba_comment", comment_item) last_row_id: int = await async_db_conn.item_to_table("tieba_comment", comment_item)
return last_row_id return last_row_id
@ -94,7 +94,7 @@ async def update_comment_by_comment_id(comment_id: str, comment_item: Dict) -> i
""" """
async_db_conn: AsyncMysqlDB = media_crawler_db_var.get() async_db_conn: AsyncMysqlDB = media_crawler_db_var.get()
effect_row: int = await async_db_conn.update_table("baidu_tieba_comment", comment_item, "comment_id", comment_id) effect_row: int = await async_db_conn.update_table("tieba_comment", comment_item, "comment_id", comment_id)
return effect_row return effect_row
@ -108,7 +108,7 @@ async def query_creator_by_user_id(user_id: str) -> Dict:
""" """
async_db_conn: AsyncMysqlDB = media_crawler_db_var.get() async_db_conn: AsyncMysqlDB = media_crawler_db_var.get()
sql: str = f"select * from baidu_tieba_creator where user_id = '{user_id}'" sql: str = f"select * from tieba_creator where user_id = '{user_id}'"
rows: List[Dict] = await async_db_conn.query(sql) rows: List[Dict] = await async_db_conn.query(sql)
if len(rows) > 0: if len(rows) > 0:
return rows[0] return rows[0]
@ -125,7 +125,7 @@ async def add_new_creator(creator_item: Dict) -> int:
""" """
async_db_conn: AsyncMysqlDB = media_crawler_db_var.get() async_db_conn: AsyncMysqlDB = media_crawler_db_var.get()
last_row_id: int = await async_db_conn.item_to_table("baidu_tieba_creator", creator_item) last_row_id: int = await async_db_conn.item_to_table("tieba_creator", creator_item)
return last_row_id return last_row_id
@ -140,5 +140,5 @@ async def update_creator_by_user_id(user_id: str, creator_item: Dict) -> int:
""" """
async_db_conn: AsyncMysqlDB = media_crawler_db_var.get() async_db_conn: AsyncMysqlDB = media_crawler_db_var.get()
effect_row: int = await async_db_conn.update_table("baidu_tieba_creator", creator_item, "user_id", user_id) effect_row: int = await async_db_conn.update_table("tieba_creator", creator_item, "user_id", user_id)
return effect_row return effect_row

View File

@ -13,6 +13,7 @@ import httpx
from PIL import Image, ImageDraw from PIL import Image, ImageDraw
from playwright.async_api import Cookie, Page from playwright.async_api import Cookie, Page
from proxy import IpInfoModel
from . import utils from . import utils
@ -133,3 +134,16 @@ def match_interact_info_count(count_str: str) -> int:
return int(number) return int(number)
else: else:
return 0 return 0
def format_proxy_info(ip_proxy_info: IpInfoModel) -> Tuple[Optional[Dict], Optional[Dict]]:
    """Build proxy settings for Playwright and httpx from one proxy record.

    Args:
        ip_proxy_info: Proxy record; the attributes read are ``protocol``,
            ``ip``, ``port``, ``user`` and ``password``.

    Returns:
        A ``(playwright_proxy, httpx_proxy)`` tuple. The first is a dict with
        ``server``, ``username`` and ``password`` keys; the second maps the
        protocol prefix to an ``http://user:password@ip:port`` proxy URL.
    """
    from urllib.parse import quote

    # ``protocol`` is concatenated directly in front of the host, so it
    # presumably already ends with "://" -- confirm against IpInfoModel.
    host_port = f"{ip_proxy_info.ip}:{ip_proxy_info.port}"
    playwright_proxy = {
        "server": f"{ip_proxy_info.protocol}{host_port}",
        "username": ip_proxy_info.user,
        "password": ip_proxy_info.password,
    }
    # Credentials embedded in a URL must be percent-encoded; otherwise a
    # password containing "@", ":" or "/" produces a malformed proxy URL.
    # Playwright takes the credentials as separate fields, so it gets them raw.
    user = quote(str(ip_proxy_info.user), safe="")
    password = quote(str(ip_proxy_info.password), safe="")
    httpx_proxy = {
        f"{ip_proxy_info.protocol}": f"http://{user}:{password}@{host_port}"
    }
    return playwright_proxy, httpx_proxy