187 lines
8.8 KiB
Python
187 lines
8.8 KiB
Python
|
import asyncio
|
||
|
import functools
|
||
|
import sys
|
||
|
from typing import Optional
|
||
|
|
||
|
from playwright.async_api import BrowserContext, Page
|
||
|
from tenacity import (RetryError, retry, retry_if_result, stop_after_attempt,
|
||
|
wait_fixed)
|
||
|
|
||
|
import config
|
||
|
from base.base_crawler import AbstractLogin
|
||
|
from cache.cache_factory import CacheFactory
|
||
|
from tools import utils
|
||
|
|
||
|
|
||
|
class XiaoHongShuLogin(AbstractLogin):
|
||
|
|
||
|
def __init__(self,
|
||
|
login_type: str,
|
||
|
browser_context: BrowserContext,
|
||
|
context_page: Page,
|
||
|
login_phone: Optional[str] = "",
|
||
|
cookie_str: str = ""
|
||
|
):
|
||
|
config.LOGIN_TYPE = login_type
|
||
|
self.browser_context = browser_context
|
||
|
self.context_page = context_page
|
||
|
self.login_phone = login_phone
|
||
|
self.cookie_str = cookie_str
|
||
|
|
||
|
@retry(stop=stop_after_attempt(600), wait=wait_fixed(1), retry=retry_if_result(lambda value: value is False))
|
||
|
async def check_login_state(self, no_logged_in_session: str) -> bool:
|
||
|
"""
|
||
|
Check if the current login status is successful and return True otherwise return False
|
||
|
retry decorator will retry 20 times if the return value is False, and the retry interval is 1 second
|
||
|
if max retry times reached, raise RetryError
|
||
|
"""
|
||
|
|
||
|
if "请通过验证" in await self.context_page.content():
|
||
|
utils.logger.info("[XiaoHongShuLogin.check_login_state] 登录过程中出现验证码,请手动验证")
|
||
|
|
||
|
current_cookie = await self.browser_context.cookies()
|
||
|
_, cookie_dict = utils.convert_cookies(current_cookie)
|
||
|
current_web_session = cookie_dict.get("web_session")
|
||
|
if current_web_session != no_logged_in_session:
|
||
|
return True
|
||
|
return False
|
||
|
|
||
|
async def begin(self):
|
||
|
"""Start login xiaohongshu"""
|
||
|
utils.logger.info("[XiaoHongShuLogin.begin] Begin login xiaohongshu ...")
|
||
|
if config.LOGIN_TYPE == "qrcode":
|
||
|
await self.login_by_qrcode()
|
||
|
elif config.LOGIN_TYPE == "phone":
|
||
|
await self.login_by_mobile()
|
||
|
elif config.LOGIN_TYPE == "cookie":
|
||
|
await self.login_by_cookies()
|
||
|
else:
|
||
|
raise ValueError("[XiaoHongShuLogin.begin]I nvalid Login Type Currently only supported qrcode or phone or cookies ...")
|
||
|
|
||
|
async def login_by_mobile(self):
|
||
|
"""Login xiaohongshu by mobile"""
|
||
|
utils.logger.info("[XiaoHongShuLogin.login_by_mobile] Begin login xiaohongshu by mobile ...")
|
||
|
await asyncio.sleep(1)
|
||
|
try:
|
||
|
# 小红书进入首页后,有可能不会自动弹出登录框,需要手动点击登录按钮
|
||
|
login_button_ele = await self.context_page.wait_for_selector(
|
||
|
selector="xpath=//*[@id='app']/div[1]/div[2]/div[1]/ul/div[1]/button",
|
||
|
timeout=5000
|
||
|
)
|
||
|
await login_button_ele.click()
|
||
|
# 弹窗的登录对话框也有两种形态,一种是直接可以看到手机号和验证码的
|
||
|
# 另一种是需要点击切换到手机登录的
|
||
|
element = await self.context_page.wait_for_selector(
|
||
|
selector='xpath=//div[@class="login-container"]//div[@class="other-method"]/div[1]',
|
||
|
timeout=5000
|
||
|
)
|
||
|
await element.click()
|
||
|
except Exception as e:
|
||
|
utils.logger.info("[XiaoHongShuLogin.login_by_mobile] have not found mobile button icon and keep going ...")
|
||
|
|
||
|
await asyncio.sleep(1)
|
||
|
login_container_ele = await self.context_page.wait_for_selector("div.login-container")
|
||
|
input_ele = await login_container_ele.query_selector("label.phone > input")
|
||
|
await input_ele.fill(self.login_phone)
|
||
|
await asyncio.sleep(0.5)
|
||
|
|
||
|
send_btn_ele = await login_container_ele.query_selector("label.auth-code > span")
|
||
|
await send_btn_ele.click() # 点击发送验证码
|
||
|
sms_code_input_ele = await login_container_ele.query_selector("label.auth-code > input")
|
||
|
submit_btn_ele = await login_container_ele.query_selector("div.input-container > button")
|
||
|
cache_client = CacheFactory.create_cache(config.CACHE_TYPE_MEMORY)
|
||
|
max_get_sms_code_time = 60 * 2 # 最长获取验证码的时间为2分钟
|
||
|
no_logged_in_session = ""
|
||
|
while max_get_sms_code_time > 0:
|
||
|
utils.logger.info(f"[XiaoHongShuLogin.login_by_mobile] get sms code from redis remaining time {max_get_sms_code_time}s ...")
|
||
|
await asyncio.sleep(1)
|
||
|
sms_code_key = f"xhs_{self.login_phone}"
|
||
|
sms_code_value = cache_client.get(sms_code_key)
|
||
|
if not sms_code_value:
|
||
|
max_get_sms_code_time -= 1
|
||
|
continue
|
||
|
|
||
|
current_cookie = await self.browser_context.cookies()
|
||
|
_, cookie_dict = utils.convert_cookies(current_cookie)
|
||
|
no_logged_in_session = cookie_dict.get("web_session")
|
||
|
|
||
|
await sms_code_input_ele.fill(value=sms_code_value.decode()) # 输入短信验证码
|
||
|
await asyncio.sleep(0.5)
|
||
|
agree_privacy_ele = self.context_page.locator("xpath=//div[@class='agreements']//*[local-name()='svg']")
|
||
|
await agree_privacy_ele.click() # 点击同意隐私协议
|
||
|
await asyncio.sleep(0.5)
|
||
|
|
||
|
await submit_btn_ele.click() # 点击登录
|
||
|
|
||
|
# todo ... 应该还需要检查验证码的正确性有可能输入的验证码不正确
|
||
|
break
|
||
|
|
||
|
try:
|
||
|
await self.check_login_state(no_logged_in_session)
|
||
|
except RetryError:
|
||
|
utils.logger.info("[XiaoHongShuLogin.login_by_mobile] Login xiaohongshu failed by mobile login method ...")
|
||
|
sys.exit()
|
||
|
|
||
|
wait_redirect_seconds = 5
|
||
|
utils.logger.info(f"[XiaoHongShuLogin.login_by_mobile] Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
|
||
|
await asyncio.sleep(wait_redirect_seconds)
|
||
|
|
||
|
async def login_by_qrcode(self):
|
||
|
"""login xiaohongshu website and keep webdriver login state"""
|
||
|
utils.logger.info("[XiaoHongShuLogin.login_by_qrcode] Begin login xiaohongshu by qrcode ...")
|
||
|
# login_selector = "div.login-container > div.left > div.qrcode > img"
|
||
|
qrcode_img_selector = "xpath=//img[@class='qrcode-img']"
|
||
|
# find login qrcode
|
||
|
base64_qrcode_img = await utils.find_login_qrcode(
|
||
|
self.context_page,
|
||
|
selector=qrcode_img_selector
|
||
|
)
|
||
|
if not base64_qrcode_img:
|
||
|
utils.logger.info("[XiaoHongShuLogin.login_by_qrcode] login failed , have not found qrcode please check ....")
|
||
|
# if this website does not automatically popup login dialog box, we will manual click login button
|
||
|
await asyncio.sleep(0.5)
|
||
|
login_button_ele = self.context_page.locator("xpath=//*[@id='app']/div[1]/div[2]/div[1]/ul/div[1]/button")
|
||
|
await login_button_ele.click()
|
||
|
base64_qrcode_img = await utils.find_login_qrcode(
|
||
|
self.context_page,
|
||
|
selector=qrcode_img_selector
|
||
|
)
|
||
|
if not base64_qrcode_img:
|
||
|
sys.exit()
|
||
|
|
||
|
# get not logged session
|
||
|
current_cookie = await self.browser_context.cookies()
|
||
|
_, cookie_dict = utils.convert_cookies(current_cookie)
|
||
|
no_logged_in_session = cookie_dict.get("web_session")
|
||
|
|
||
|
# show login qrcode
|
||
|
# fix issue #12
|
||
|
# we need to use partial function to call show_qrcode function and run in executor
|
||
|
# then current asyncio event loop will not be blocked
|
||
|
partial_show_qrcode = functools.partial(utils.show_qrcode, base64_qrcode_img)
|
||
|
asyncio.get_running_loop().run_in_executor(executor=None, func=partial_show_qrcode)
|
||
|
|
||
|
utils.logger.info(f"[XiaoHongShuLogin.login_by_qrcode] waiting for scan code login, remaining time is 120s")
|
||
|
try:
|
||
|
await self.check_login_state(no_logged_in_session)
|
||
|
except RetryError:
|
||
|
utils.logger.info("[XiaoHongShuLogin.login_by_qrcode] Login xiaohongshu failed by qrcode login method ...")
|
||
|
sys.exit()
|
||
|
|
||
|
wait_redirect_seconds = 5
|
||
|
utils.logger.info(f"[XiaoHongShuLogin.login_by_qrcode] Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
|
||
|
await asyncio.sleep(wait_redirect_seconds)
|
||
|
|
||
|
async def login_by_cookies(self):
|
||
|
"""login xiaohongshu website by cookies"""
|
||
|
utils.logger.info("[XiaoHongShuLogin.login_by_cookies] Begin login xiaohongshu by cookie ...")
|
||
|
for key, value in utils.convert_str_cookie_to_dict(self.cookie_str).items():
|
||
|
if key != "web_session": # only set web_session cookie attr
|
||
|
continue
|
||
|
await self.browser_context.add_cookies([{
|
||
|
'name': key,
|
||
|
'value': value,
|
||
|
'domain': ".xiaohongshu.com",
|
||
|
'path': "/"
|
||
|
}])
|