自动化之跳过人机验证

由于时效问题，该文某些代码、技术可能已经过期，请注意！！！本文最后更新于：8 个月前

playwright

人机验证问题

在 header 里去掉 User-Agent 后可以正常访问，加上之后就会触发网站的人机验证。

不触发人机验证的写法

# Excerpt: `p` is the Playwright handle created outside this snippet
# (e.g. by `with sync_playwright() as p:`).
browser = p.chromium.launch(
    channel="chrome",  # use the locally installed Chrome browser
    headless=False  # run with a visible GUI window
)

# Default context: no custom User-Agent is set.
context = browser.new_context()

会触发人机验证的写法

# Excerpt: `p` is the Playwright handle created outside this snippet
# (e.g. by `with sync_playwright() as p:`).
browser = p.chromium.launch(
    channel="chrome",  # use the locally installed Chrome browser
    headless=False  # run with a visible GUI window
)

# Same launch as before, but the context overrides the User-Agent string.
context = browser.new_context(
    user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    )

感觉这个人机验证也是个玄学啊，也可能跟网站有关系。

链接：https://mp.weixin.qq.com/s/LpsxegSY59zOoaPreyEnwQ

完整代码

import time, os
import random
import json
import math
from playwright.sync_api import sync_playwright, expect
import pandas as pd

def human_like_mouse_move(page, start_x, start_y, end_x, end_y):
    """Move the mouse from a start point to an end point along a jittered curve.

    Both endpoints are first shifted by a random offset of up to 10 pixels
    on each axis.  The pointer is then stepped through 100-200 intermediate
    positions, sleeping briefly between steps, and finally placed exactly
    on the (shifted) end point.

    Args:
        page: Object exposing ``page.mouse.move(x, y)``.
        start_x: Nominal starting x coordinate.
        start_y: Nominal starting y coordinate.
        end_x: Nominal target x coordinate.
        end_y: Nominal target y coordinate.
    """
    # Randomise the four endpoint coordinates (start x/y, then end x/y).
    start_x, start_y, end_x, end_y = (
        coord + random.randint(-10, 10)
        for coord in (start_x, start_y, end_x, end_y)
    )

    # Random step count and a random exponent that bends the y progression.
    total_steps = random.randint(100, 200)
    easing_exponent = random.uniform(0.1, 0.4)
    delta_x = end_x - start_x
    delta_y = end_y - start_y

    for step in range(total_steps + 1):
        progress = step / total_steps

        # x advances linearly; y follows progress ** exponent plus a sine bump.
        x = start_x + delta_x * progress + random.uniform(-1, 1)
        arc = math.sin(progress * math.pi) * random.randint(3, 7)
        y = start_y + delta_y * (progress ** easing_exponent) + arc + random.uniform(-1, 1)

        # Roughly one step in ten gets a longer pause than the rest.
        if random.random() < 0.1:
            pause = random.uniform(0.02, 0.05)
        else:
            pause = random.uniform(0.005, 0.01)
        time.sleep(pause)
        page.mouse.move(x, y)

    # Finish exactly on the (shifted) end point, without per-step noise.
    page.mouse.move(end_x, end_y)

# def monitor_progress(page, progress_locator, hold_time):
#     start_time = time.time()
#     while time.time() - start_time < hold_time:
#         # 获取进度条的当前值
#         progress_value = page.locator(progress_locator).get_attribute('aria-valuenow')
        
#         if progress_value:
#             print(f"Current progress: {progress_value}%")
        
#         # 如果进度达到一定值，提前松开鼠标
#         if progress_value and int(progress_value) >= 100:
#             print("Progress complete, releasing mouse early")
#             break

#         # 等待片刻，继续检查进度
#         time.sleep(0.1)

def solve_captcha(page):
    """Perform the press-and-hold gesture on the ``#px-captcha`` element.

    Waits for ``.px-captcha-container`` (up to 15 s) and ``#px-captcha`` to
    be visible, moves the mouse from a random viewport position to the
    centre of the button, holds the button down for 15-20 s while nudging
    the pointer slightly, releases it, and then sleeps for 30-80 s.

    Args:
        page: Playwright page that currently shows the challenge.
    """
    expect(page.locator('.px-captcha-container')).to_be_visible(timeout=15000)

    hold_button = page.locator('#px-captcha')
    expect(hold_button).to_be_visible()
    box = hold_button.bounding_box()

    # Start anywhere inside the viewport; aim for the centre of the button.
    origin_x = random.randint(0, page.viewport_size['width'])
    origin_y = random.randint(0, page.viewport_size['height'])
    target_x = box['x'] + box['width'] / 2
    target_y = box['y'] + box['height'] / 2

    human_like_mouse_move(page, origin_x, origin_y, target_x, target_y)
    time.sleep(random.uniform(0.3, 0.6))

    page.mouse.down()
    hold_time = random.uniform(15, 20)
    deadline = time.time() + hold_time
    while time.time() < deadline:
        # Small pointer jitter around the button centre while holding.
        jitter_x = target_x + random.uniform(-1.5, 1.5)
        jitter_y = target_y + random.uniform(-1.5, 1.5)
        page.mouse.move(jitter_x, jitter_y)
        time.sleep(random.uniform(0.05, 0.15))
    # A progress-bar based early release (the commented-out monitor_progress
    # helper, given a locator such as 'div.progress-bar') is not used here.
    page.mouse.up()

    # Long pause after releasing before the caller inspects the page again.
    time.sleep(random.uniform(30, 80))

def parse_url(uname, max_retries=3):
    """Query the niche.com search API for one name and save the JSON reply.

    Opens a visible local Chrome through Playwright and loads the search
    URL.  If a ``.px-captcha-container`` element is present, ``solve_captcha``
    is called; when the container is still visible afterwards the URL is
    loaded again, up to ``max_retries`` loads in total.  The body of the last
    navigation response is parsed as JSON and written to
    ``data/<uname>.json``; a non-JSON body is printed instead.

    Args:
        uname: Search term (a university name); also used verbatim as the
            output file name.
        max_retries: Maximum number of page loads to attempt.

    Returns:
        None.
    """
    from urllib.parse import quote  # local import keeps the block self-contained

    # Percent-encode the term so spaces, '&', '#', '+', ... stay inside `q`
    # instead of being parsed as separate query parameters or a fragment.
    query = quote(str(uname), safe='')
    # Previously used endpoint:
    # https://www.niche.com/api/custom-site-search/?query=<name>&page=1&category=all
    url = f'https://www.niche.com/api/sherlock-search/?c=30&q={query}&t=u&s=&a=0'
    with sync_playwright() as p:
        browser = p.chromium.launch(
            channel="chrome",  # use the locally installed Chrome browser
            headless=False  # run with a visible GUI window
        )
        # try/finally so the browser is closed even if a step below raises.
        try:
            context = browser.new_context()

            page = context.new_page()
            # Make `navigator.webdriver` read as undefined on every new document.
            page.add_init_script("""
            Object.defineProperty(navigator, 'webdriver', {
                get: () => undefined
            });
            """)

            # Stays None when max_retries <= 0 or goto() yields no response.
            response = None
            for attempt in range(max_retries):
                response = page.goto(url, wait_until='networkidle')
                captcha = page.locator('.px-captcha-container')
                if captcha.count() > 0:
                    print(f"CAPTCHA detected, attempting to solve... (Attempt {attempt + 1})")
                    solve_captcha(page)
                    if not page.locator('.px-captcha-container').is_visible():
                        print("CAPTCHA solved successfully")
                        break
                    else:
                        print("CAPTCHA solution failed, retrying...")
                        time.sleep(random.uniform(5, 10))
                else:
                    print("No CAPTCHA detected")
                    break

            if response is None:
                print("No response received")
                return

            # NOTE(review): `response` belongs to the navigation issued before
            # any CAPTCHA was solved, so after a successful solve its body may
            # still be the challenge page - confirm whether a fresh request
            # is needed here.
            content = response.text()
            try:
                json_data = json.loads(content)
            except json.JSONDecodeError:
                print("Response is not valid JSON")
                print("Response content:", content)
            else:
                # Create the output directory on first use.
                os.makedirs('data', exist_ok=True)
                with open(f'data/{uname}.json', 'w', encoding='utf-8') as js:
                    json.dump(json_data, js, indent=2)
        finally:
            browser.close()

# Take the first 300 rows of the ranking sheet and fetch every name that
# does not yet have a saved JSON file under data/.
df_usnews = pd.read_excel('../usnews大学排名.xlsx').head(300)
for uni_name in df_usnews['name']:
    if not os.path.exists(f'data/{uni_name}.json'):
        parse_url(uni_name)

工具

爬虫

本博客所有文章除特别声明外，均采用 CC BY-SA 4.0 协议，转载请注明出处！

上一篇：大模型系统提示词破解

下一篇：Playwright模拟登陆