自动化之跳过人机验证

由于时效问题,该文某些代码、技术可能已经过期,请注意!!!本文最后更新于:2 个月前

playwright

人机验证问题

在 header 里去掉自定义的 User-Agent 后可以正常访问,加上之后就会触发网站的人机验证。

不会触发人机验证的写法

1
2
3
4
5
6
# Setup reported by the article as NOT triggering the site's human verification:
# the browser context is created with defaults (no custom User-Agent).
# NOTE(review): `p` is presumably the object yielded by sync_playwright() - confirm.
browser = p.chromium.launch(
channel="chrome", # use the locally installed Chrome browser
headless=False # run with a visible GUI window
)

context = browser.new_context()

会触发人机验证的写法

1
2
3
4
5
6
7
8
# Setup reported by the article as triggering the site's human verification:
# same launch options, but the context overrides the User-Agent string.
# NOTE(review): `p` is presumably the object yielded by sync_playwright() - confirm.
browser = p.chromium.launch(
channel="chrome", # use the locally installed Chrome browser
headless=False # run with a visible GUI window
)

context = browser.new_context(
user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
)

感觉这个人机验证也是个玄学啊,也可能跟网站有关系。

  • 链接:https://mp.weixin.qq.com/s/LpsxegSY59zOoaPreyEnwQ
  • 完整代码
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    50
    51
    52
    53
    54
    55
    56
    57
    58
    59
    60
    61
    62
    63
    64
    65
    66
    67
    68
    69
    70
    71
    72
    73
    74
    75
    76
    77
    78
    79
    80
    81
    82
    83
    84
    85
    86
    87
    88
    89
    90
    91
    92
    93
    94
    95
    96
    97
    98
    99
    100
    101
    102
    103
    104
    105
    106
    107
    108
    109
    110
    111
    112
    113
    114
    115
    116
    117
    import time, os
    import random
    import json
    import math
    from playwright.sync_api import sync_playwright, expect
    import pandas as pd

    def human_like_mouse_move(page, start_x, start_y, end_x, end_y):
        """Move the mouse from a start point to an end point along a jittered path.

        Both endpoints are first shifted by a random offset of up to 10 px per
        axis. The pointer then travels in 100-200 small steps: x advances
        linearly, y follows a power curve plus a sine-shaped bump, and every
        step gets up to 1 px of noise. A short random pause precedes each
        step. The last call places the pointer exactly on the shifted end
        point.

        Args:
            page: Object exposing ``page.mouse.move(x, y)``.
            start_x: Nominal x coordinate where the path begins.
            start_y: Nominal y coordinate where the path begins.
            end_x: Nominal x coordinate where the path ends.
            end_y: Nominal y coordinate where the path ends.
        """
        # Randomise both endpoints so repeated calls never trace the same line.
        start_x = start_x + random.randint(-10, 10)
        start_y = start_y + random.randint(-10, 10)
        end_x = end_x + random.randint(-10, 10)
        end_y = end_y + random.randint(-10, 10)

        # Random step count and curve exponent give each path its own shape.
        total_steps = random.randint(100, 200)
        exponent = random.uniform(0.1, 0.4)

        span_x = end_x - start_x
        span_y = end_y - start_y

        for step in range(total_steps + 1):
            progress = step / total_steps
            pos_x = start_x + span_x * progress + random.uniform(-1, 1)
            bump = math.sin(progress * math.pi) * random.randint(3, 7)
            pos_y = start_y + span_y * (progress ** exponent) + bump + random.uniform(-1, 1)

            # About one step in ten uses a noticeably longer pause.
            if random.random() < 0.1:
                pause = random.uniform(0.02, 0.05)
            else:
                pause = random.uniform(0.005, 0.01)
            time.sleep(pause)
            page.mouse.move(pos_x, pos_y)

        # Finish exactly on the (shifted) target, without per-step noise.
        page.mouse.move(end_x, end_y)

    # def monitor_progress(page, progress_locator, hold_time):
    # start_time = time.time()
    # while time.time() - start_time < hold_time:
    # # 获取进度条的当前值
    # progress_value = page.locator(progress_locator).get_attribute('aria-valuenow')

    # if progress_value:
    # print(f"Current progress: {progress_value}%")

    # # 如果进度达到一定值,提前松开鼠标
    # if progress_value and int(progress_value) >= 100:
    # print("Progress complete, releasing mouse early")
    # break

    # # 等待片刻,继续检查进度
    # time.sleep(0.1)

    def solve_captcha(page):
        """Press and hold the ``#px-captcha`` button on the given page.

        Waits (up to 15 s) for ``.px-captcha-container`` to become visible,
        moves the mouse from a random viewport position to the centre of the
        ``#px-captcha`` element via ``human_like_mouse_move``, then holds the
        mouse button down for 15-20 s while wiggling the pointer slightly.
        After releasing, it sleeps another 30-80 s before returning.

        Args:
            page: Playwright page currently showing the challenge.
        """
        container = page.locator('.px-captcha-container')
        expect(container).to_be_visible(timeout=15000)
        hold_button = page.locator('#px-captcha')
        expect(hold_button).to_be_visible()
        box = hold_button.bounding_box()

        # Start somewhere random inside the viewport, aim at the button centre.
        origin_x = random.randint(0, page.viewport_size['width'])
        origin_y = random.randint(0, page.viewport_size['height'])
        target_x = box['x'] + box['width'] / 2
        target_y = box['y'] + box['height'] / 2

        human_like_mouse_move(page, origin_x, origin_y, target_x, target_y)
        time.sleep(random.uniform(0.3, 0.6))

        page.mouse.down()
        hold_time = random.uniform(15, 20)
        pressed_at = time.time()
        while time.time() - pressed_at < hold_time:
            # Small jitter around the centre while the button is held.
            jitter_x = random.uniform(-1.5, 1.5)
            jitter_y = random.uniform(-1.5, 1.5)
            page.mouse.move(target_x + jitter_x, target_y + jitter_y)
            time.sleep(random.uniform(0.05, 0.15))
        # Progress-bar driven early release (monitor_progress) is left disabled.
        page.mouse.up()

        # Long pause after releasing, before the caller inspects the page.
        time.sleep(random.uniform(30, 80))

    def parse_url(uname, max_retries=3):
        """Fetch the niche.com search result for one school and save it as JSON.

        Opens the sherlock-search API URL for ``uname`` in a local, non-headless
        Chrome driven by Playwright. If a ``.px-captcha-container`` element is
        present, ``solve_captcha`` is called and the check is repeated, up to
        ``max_retries`` navigations. A body that parses as JSON is written to
        ``data/{uname}.json``; otherwise the raw content is printed.

        Changes compared with the earlier version:
          * ``uname`` is percent-encoded, so names containing ``&``, ``#`` or
            spaces no longer corrupt the query string.
          * after a solved CAPTCHA the URL is requested again, because the
            response captured before solving belongs to the challenge page.
          * a missing response (``max_retries <= 0`` or ``goto`` returning
            ``None``) is reported instead of raising.
          * ``data/`` is created if needed and the browser is always closed.

        Args:
            uname: School name used as the search query and output file name.
            max_retries: Maximum number of navigation/CAPTCHA attempts.

        Returns:
            None. Results are written to disk or reported on stdout.
        """
        # Local import keeps this block self-contained.
        from urllib.parse import quote

        # Earlier endpoint, kept for reference:
        #   https://www.niche.com/api/custom-site-search/?query={uname}&page=1&category=all
        query = quote(uname, safe='')
        url = f'https://www.niche.com/api/sherlock-search/?c=30&q={query}&t=u&s=&a=0'

        with sync_playwright() as p:
            browser = p.chromium.launch(
                channel="chrome",  # use the locally installed Chrome browser
                headless=False,  # run with a visible GUI window
            )
            try:
                context = browser.new_context()
                page = context.new_page()
                # Make navigator.webdriver report undefined before page scripts run.
                page.add_init_script("""
                    Object.defineProperty(navigator, 'webdriver', {
                        get: () => undefined
                    });
                """)

                response = None
                for attempt in range(max_retries):
                    response = page.goto(url, wait_until='networkidle')
                    captcha = page.locator('.px-captcha-container')
                    if captcha.count() == 0:
                        print("No CAPTCHA detected")
                        break

                    print(f"CAPTCHA detected, attempting to solve... (Attempt {attempt + 1})")
                    solve_captcha(page)
                    if not captcha.is_visible():
                        print("CAPTCHA solved successfully")
                        # `response` still refers to the challenge page; ask
                        # for the API payload again now that the check passed.
                        response = page.goto(url, wait_until='networkidle')
                        break

                    print("CAPTCHA solution failed, retrying...")
                    time.sleep(random.uniform(5, 10))

                if response is None:
                    print("No response received")
                    return

                content = response.text()
                try:
                    json_data = json.loads(content)
                except json.JSONDecodeError:
                    print("Response is not valid JSON")
                    print("Response content:", content)
                else:
                    os.makedirs('data', exist_ok=True)
                    with open(f'data/{uname}.json', 'w', encoding='utf-8') as js:
                        json.dump(json_data, js, indent=2)
            finally:
                # Close the browser even when navigation or parsing raised.
                browser.close()

    # Driver: take the first 300 rows of the US News ranking sheet and fetch
    # every school that does not yet have a saved result under data/.
    df_usnews = pd.read_excel('../usnews大学排名.xlsx').iloc[:300,:]
    for u in df_usnews['name']:
        already_saved = os.path.exists(f'data/{u}.json')
        if not already_saved:
            parse_url(u)



本博客所有文章除特别声明外,均采用 CC BY-SA 4.0 协议 ,转载请注明出处!