e82dcae02f
docs: update docs Create .gitattributes Update README.md
62 lines
2.0 KiB
Python
62 lines
2.0 KiB
Python
import sys
|
|
import asyncio
|
|
from typing import Optional, List, Dict
|
|
|
|
from playwright.async_api import async_playwright
|
|
from playwright.async_api import Page
|
|
from playwright.async_api import Cookie
|
|
from playwright.async_api import BrowserContext
|
|
|
|
import utils
|
|
from .client import DOUYINClient
|
|
from base_crawler import Crawler
|
|
|
|
|
|
class DouYinCrawler(Crawler):
|
|
def __init__(self):
|
|
self.keywords: Optional[str] = None
|
|
self.scan_qrcode_time: Optional[int] = None
|
|
self.cookies: Optional[List[Cookie]] = None
|
|
self.browser_context: Optional[BrowserContext] = None
|
|
self.context_page: Optional[Page] = None
|
|
self.proxy: Optional[Dict] = None
|
|
self.user_agent = utils.get_user_agent()
|
|
self.dy_client: Optional[DOUYINClient] = None
|
|
|
|
def init_config(self, **kwargs):
|
|
self.keywords = kwargs.get("keywords")
|
|
self.scan_qrcode_time = kwargs.get("scan_qrcode_time")
|
|
|
|
async def start(self):
|
|
async with async_playwright() as playwright:
|
|
chromium = playwright.chromium
|
|
browser = await chromium.launch(headless=False)
|
|
self.browser_context = await browser.new_context(
|
|
viewport={"width": 1920, "height": 1080},
|
|
user_agent=self.user_agent,
|
|
proxy=self.proxy
|
|
)
|
|
# execute JS to bypass anti automation/crawler detection
|
|
await self.browser_context.add_init_script(path="libs/stealth.min.js")
|
|
self.context_page = await self.browser_context.new_page()
|
|
await self.context_page.goto("https://www.douyin.com")
|
|
|
|
# scan qrcode login
|
|
await self.login()
|
|
await self.update_cookies()
|
|
|
|
# block main crawler coroutine
|
|
await asyncio.Event().wait()
|
|
|
|
async def update_cookies(self):
|
|
self.cookies = await self.browser_context.cookies()
|
|
|
|
async def login(self):
|
|
pass
|
|
|
|
def search_posts(self):
|
|
pass
|
|
|
|
def get_comments(self, item_id: str):
|
|
pass
|