Asynchronous Web Scraping with Playwright
            
              
                
Using Playwright in place of Selenium for asynchronous scraping.
Installation
```bash
pip install playwright
playwright install chromium
```
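If the install succeeded, a minimal sanity check like the one below should print the page title. The target URL here is just a placeholder:

```python
import asyncio
from playwright.async_api import async_playwright

async def check_install():
    async with async_playwright() as p:
        # A headless launch is enough to confirm the Chromium download works
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()
        await page.goto("https://example.com")  # placeholder target
        print(await page.title())
        await browser.close()

asyncio.run(check_install())
```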
Usage
Basic usage
```python
import asyncio
from playwright.async_api import async_playwright
from bs4 import BeautifulSoup

async def fetch_data(url):
    async with async_playwright() as p:
        # Launch Chromium; headless=False opens a visible browser window
        browser = await p.chromium.launch(headless=False)
        page = await browser.new_page()

        # Navigate to the target page (generous timeout for slow pages)
        await page.goto(url, timeout=600000)

        # Give dynamically rendered content a moment to load
        await asyncio.sleep(3)

        # Grab the fully rendered HTML and parse it as needed
        content = await page.content()
        soup = BeautifulSoup(content, 'html.parser')

        await browser.close()

asyncio.run(fetch_data('https://example.com'))  # replace with your target URL
```
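The main reason to prefer the async API over Selenium is concurrency: several pages can share one browser and load at the same time via `asyncio.gather`. A minimal sketch, with a made-up URL list for illustration:

```python
import asyncio
from playwright.async_api import async_playwright

async def fetch_one(browser, url):
    # Each task gets its own page, but all tasks share the one browser
    page = await browser.new_page()
    await page.goto(url, timeout=60000)
    content = await page.content()
    await page.close()
    return url, len(content)

async def main():
    urls = ["https://example.com", "https://example.org"]  # placeholder URLs
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        # gather() drives all the page loads concurrently
        results = await asyncio.gather(*(fetch_one(browser, u) for u in urls))
        await browser.close()
    for url, size in results:
        print(url, size)

asyncio.run(main())
```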
A fuller example: scraping the U.S. News national universities ranking through its JSON API, one browser launch per results page:

```python
import asyncio
import json
from playwright.async_api import async_playwright
from bs4 import BeautifulSoup

usnews_dict = {}

async def fetch_data(page_number):
    url = f"https://www.usnews.com/best-colleges/api/search?format=json&schoolType=national-universities&_sort=rank&_sortDirection=asc&_page={page_number}"

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        page = await browser.new_page()

        # Open the API URL in the browser so the fetch below runs same-origin
        await page.goto(url)

        # Run fetch() inside the page context and return the parsed JSON
        json_data = await page.evaluate('''
            (url) => {
                return fetch(url)
                    .then(response => response.json())
                    .then(data => data)
                    .catch(error => { throw new Error(error); });
            }
        ''', url)

        # Each item carries the school name and an HTML snippet with its link
        items = json_data.get('data', {}).get('items', [])
        for item in items:
            name = item['institution']['displayName']
            linkTxt = item['institution']['linkedDisplayName']
            linkSoup = BeautifulSoup(linkTxt, 'html.parser')
            link = 'https://www.usnews.com' + linkSoup.find('a').get('href')
            usnews_dict[name] = link

        print(f"Page {page_number}:")
        for name, link in usnews_dict.items():
            print(f"{name}: {link}")

        await browser.close()

async def main():
    for page_number in range(1, 11):
        # await asyncio.sleep, not time.sleep, which would block the event loop
        await asyncio.sleep(10)
        await fetch_data(page_number)

asyncio.run(main())

with open('usnews-100.json', 'w') as json_file:
    json.dump(usnews_dict, json_file, indent=2)
```
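One possible refinement: the script above launches and closes a full browser for every results page. A single browser and page can be reused across all ten API pages instead. Here is a sketch of that variant, assuming the same endpoint and response shape as above (not re-verified against the live site):

```python
import asyncio
import json
from playwright.async_api import async_playwright

async def main():
    usnews = {}
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        page = await browser.new_page()
        # Visit the site once so every later fetch() runs same-origin
        await page.goto("https://www.usnews.com/best-colleges")
        for page_number in range(1, 11):
            url = f"https://www.usnews.com/best-colleges/api/search?format=json&schoolType=national-universities&_sort=rank&_sortDirection=asc&_page={page_number}"
            data = await page.evaluate("url => fetch(url).then(r => r.json())", url)
            for item in data.get('data', {}).get('items', []):
                # Store the raw institution record; extract links as needed
                usnews[item['institution']['displayName']] = item['institution']
            await asyncio.sleep(10)  # same politeness delay as the original loop
        await browser.close()
    with open('usnews-100.json', 'w') as f:
        json.dump(usnews, f, indent=2)

asyncio.run(main())
```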
Notes
- This is an async scraper: run the main coroutine with `asyncio.run(main())`.
- Start the browser inside `async with async_playwright() as p:`.
- Navigate to a page with `await page.goto(url)`.
- Execute JavaScript on the page with `await page.evaluate('''...''')`.
- Close the browser with `await browser.close()`.
- `await page.evaluate('''...''')` also returns the page's data back to Python (see the sketch below).
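On that last point: `page.evaluate` is a two-way bridge, since whatever the JavaScript expression returns is serialized back into Python types. A minimal sketch against a placeholder page:

```python
import asyncio
from playwright.async_api import async_playwright

async def main():
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()
        await page.goto("https://example.com")  # placeholder target
        # A JS object returned from evaluate() arrives as a Python dict
        info = await page.evaluate('''() => ({
            title: document.title,
            linkCount: document.querySelectorAll("a").length,
        })''')
        print(info["title"], info["linkCount"])
        await browser.close()

asyncio.run(main())
```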
References
Playwright official documentation