自动化之跳过人机验证
由于时效问题,该文某些代码、技术可能已经过期,请注意!!!本文最后更新于:2 个月前
playwright
人机验证问题
在header里去掉了 User-Agent 后可以正常访问,加上就会引起网站的人机验证。
未触发人机验证(请求头不带 User-Agent)
1 |
|
触发人机验证(请求头带 User-Agent)
1 |
|
感觉这个人机验证也是个玄学啊,也可能跟网站有关系。
- 链接:https://mp.weixin.qq.com/s/LpsxegSY59zOoaPreyEnwQ
- 完整代码
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import json
import math
import os
import random
import time
from urllib.parse import quote

import pandas as pd
from playwright.sync_api import sync_playwright, expect
def human_like_mouse_move(page, start_x, start_y, end_x, end_y, *, jitter=10):
    """Move the mouse from a start point to an end point along a noisy curve.

    The path is walked in a random number of small steps with random pauses,
    so the pointer does not travel in a perfectly straight, evenly timed line.

    Args:
        page: Object exposing ``page.mouse.move(x, y)``.
        start_x: Nominal x coordinate where the movement begins.
        start_y: Nominal y coordinate where the movement begins.
        end_x: Nominal x coordinate where the movement ends.
        end_y: Nominal y coordinate where the movement ends.
        jitter: Maximum random offset, in whole pixels, applied independently
            to each of the four coordinates. ``0`` disables the offset so the
            pointer finishes exactly on ``(end_x, end_y)``.

    Returns:
        The ``(x, y)`` position of the final ``page.mouse.move`` call, i.e.
        the end point after the random offset has been applied.

    Raises:
        ValueError: If ``jitter`` is negative.
    """
    if jitter < 0:
        raise ValueError("jitter must be non-negative")

    # Randomly offset the start and end points so repeated runs differ.
    start_x += random.randint(-jitter, jitter)
    start_y += random.randint(-jitter, jitter)
    end_x += random.randint(-jitter, jitter)
    end_y += random.randint(-jitter, jitter)

    # Randomize the number of steps and the non-linearity of the vertical path.
    steps = random.randint(100, 200)
    curve_factor = random.uniform(0.1, 0.4)

    for i in range(steps + 1):
        t = i / steps
        # x advances linearly with +/-1 px of noise.
        x = start_x + (end_x - start_x) * t + random.uniform(-1, 1)
        # y follows t ** curve_factor plus a sine-shaped bump and noise.
        y = (
            start_y
            + (end_y - start_y) * (t ** curve_factor)
            + math.sin(t * math.pi) * random.randint(3, 7)
            + random.uniform(-1, 1)
        )
        # Mostly very short pauses; roughly one step in ten pauses longer.
        if random.random() < 0.1:
            pause = random.uniform(0.02, 0.05)
        else:
            pause = random.uniform(0.005, 0.01)
        time.sleep(pause)
        page.mouse.move(x, y)

    # Finish exactly on the (offset) end point, without per-step noise.
    page.mouse.move(end_x, end_y)
    return end_x, end_y
# def monitor_progress(page, progress_locator, hold_time):
#     start_time = time.time()
#     while time.time() - start_time < hold_time:
#         # 获取进度条的当前值
#         progress_value = page.locator(progress_locator).get_attribute('aria-valuenow')
#         if progress_value:
#             print(f"Current progress: {progress_value}%")
#         # 如果进度达到一定值,提前松开鼠标
#         if progress_value and int(progress_value) >= 100:
#             print("Progress complete, releasing mouse early")
#             break
#         # 等待片刻,继续检查进度
#         time.sleep(0.1)
def solve_captcha(page):
    """Press and hold the ``#px-captcha`` button on ``page``.

    Waits for ``.px-captcha-container`` and ``#px-captcha`` to become visible,
    moves the mouse from a random viewport position to the button centre,
    holds the left button down for 15-20 seconds while jiggling the pointer
    slightly, releases it, and then sleeps for 30-80 seconds.

    Args:
        page: Playwright page currently showing the CAPTCHA.

    Raises:
        AssertionError: Raised by ``expect`` if the container or the button
            does not become visible in time.
        RuntimeError: If the button has no bounding box.
    """
    captcha_container = page.locator('.px-captcha-container')
    expect(captcha_container).to_be_visible(timeout=15000)
    button = page.locator('#px-captcha')
    expect(button).to_be_visible()

    button_box = button.bounding_box()
    if button_box is None:
        # Fail with a clear message instead of "'NoneType' is not subscriptable".
        raise RuntimeError("CAPTCHA button '#px-captcha' has no bounding box")

    # viewport_size may be None (e.g. a context created without a fixed
    # viewport); fall back to 1280x720 -- assumed to be Playwright's default
    # viewport, TODO confirm.
    viewport = page.viewport_size or {'width': 1280, 'height': 720}
    start_x = random.randint(0, viewport['width'])
    start_y = random.randint(0, viewport['height'])
    end_x = button_box['x'] + button_box['width'] / 2
    end_y = button_box['y'] + button_box['height'] / 2

    human_like_mouse_move(page, start_x, start_y, end_x, end_y)
    time.sleep(random.uniform(0.3, 0.6))

    page.mouse.down()
    hold_time = random.uniform(15, 20)
    # Use the monotonic clock so the hold duration is unaffected by
    # wall-clock adjustments.
    start_time = time.monotonic()
    while time.monotonic() - start_time < hold_time:
        # Small jiggle around the button centre while the button is held.
        page.mouse.move(end_x + random.uniform(-1.5, 1.5), end_y + random.uniform(-1.5, 1.5))
        time.sleep(random.uniform(0.05, 0.15))
    # Disabled alternative: watch a progress bar (e.g. 'div.progress-bar')
    # and release early instead of holding for a fixed time:
    # monitor_progress(page, 'div.progress-bar', hold_time=random.uniform(15, 25))
    page.mouse.up()

    # Long randomized pause after releasing; presumably gives the page time
    # to validate the hold and reload -- confirm against the target site.
    time.sleep(random.uniform(30, 80))
def parse_url(uname, max_retries=3):
    """Fetch niche.com search results for ``uname`` and save them as JSON.

    Opens the search endpoint in a visible local Chrome window, tries to get
    past the ``.px-captcha-container`` challenge if one appears, and writes
    the decoded JSON response to ``data/{uname}.json``. If the response is
    not valid JSON, the raw content is printed and nothing is written.

    Args:
        uname: Search term (university name). It is percent-encoded before
            being placed in the query string.
        max_retries: Maximum number of navigation / CAPTCHA-solving attempts.
    """
    # Alternative endpoint:
    # https://www.niche.com/api/custom-site-search/?query=<name>&page=1&category=all
    # Percent-encode the name so characters such as '&', '#' or '+' cannot
    # break or truncate the query string.
    query = quote(str(uname), safe='')
    url = f'https://www.niche.com/api/sherlock-search/?c=30&q={query}&t=u&s=&a=0'
    with sync_playwright() as p:
        browser = p.chromium.launch(
            channel="chrome",  # use the locally installed Chrome
            headless=False,  # run with a visible window
        )
        try:
            context = browser.new_context()
            page = context.new_page()
            # Make navigator.webdriver read as undefined on every page load.
            page.add_init_script("""
                Object.defineProperty(navigator, 'webdriver', {
                    get: () => undefined
                });
            """)

            response = None
            for attempt in range(max_retries):
                response = page.goto(url, wait_until='networkidle')
                if page.locator('.px-captcha-container').count() == 0:
                    print("No CAPTCHA detected")
                    break

                print(f"CAPTCHA detected, attempting to solve... (Attempt {attempt + 1})")
                solve_captcha(page)
                if page.locator('.px-captcha-container').is_visible():
                    print("CAPTCHA solution failed, retrying...")
                    time.sleep(random.uniform(5, 10))
                    continue

                print("CAPTCHA solved successfully")
                # The response captured above belongs to the challenge page,
                # so request the URL again to obtain the real payload.
                response = page.goto(url, wait_until='networkidle')
                if page.locator('.px-captcha-container').count() == 0:
                    break

            if response is None:
                # goto() can return None, and max_retries <= 0 skips the loop.
                print("No response received")
                return

            content = response.text()
            try:
                json_data = json.loads(content)
            except json.JSONDecodeError:
                print("Response is not valid JSON")
                print("Response content:", content)
            else:
                # Create the output directory on first use.
                os.makedirs('data', exist_ok=True)
                with open(f'data/{uname}.json', 'w', encoding='utf-8') as js:
                    json.dump(json_data, js, indent=2)
        finally:
            # Close the browser even when navigation or solving raises.
            browser.close()
# Read the first 300 rows of the ranking spreadsheet and fetch results for
# every name that does not already have a saved JSON file under data/.
df_usnews = pd.read_excel('../usnews大学排名.xlsx').iloc[:300, :]
for u in df_usnews['name']:
    if not os.path.exists(f'data/{u}.json'):
        parse_url(u)
本博客所有文章除特别声明外,均采用 CC BY-SA 4.0 协议 ,转载请注明出处!