玫瑰花变蚊子血,自动化无痕浏览器对比测试( 三 )


数据返回:
[{'name': 'Andorra', 'capital': 'Andorra la Vella', 'population': '84000', 'area (km sq)': '468.0'},{'name': 'United Arab Emirates', 'capital': 'Abu Dhabi', 'population': '4975593', 'area (km sq)': '82880.0'},{'name': 'Afghanistan', 'capital': 'Kabul', 'population': '29121286', 'area (km sq)': '647500.0'},{'name': 'Antigua and Barbuda', 'capital': "St. John's", 'population': '86754', 'area (km sq)': '443.0'},{'name': 'Anguilla', 'capital': 'The Valley', 'population': '13254', 'area (km sq)': '102.0'},...]
性能测试
在数据抓取量一样的前提下,我们当然需要知道到底谁的性能更好,是Playwright,还是Selenium?
这里我们使用Python 3.10内置的time模块来统计爬虫脚本的执行速度。
Playwright脚本:
"""Playwright version: scrape the country table and time the whole run."""
import time

from playwright.sync_api import sync_playwright


def extract_data(entry):
    """Return one country's fields from a `div.country` locator as a dict.

    Keys: "name", "capital", "population", "area (km sq)" (all strings,
    taken verbatim from the page text).
    """
    name = entry.locator("h3").inner_text().strip("\n").strip()
    capital = entry.locator("span.country-capital").inner_text()
    population = entry.locator("span.country-population").inner_text()
    area = entry.locator("span.country-area").inner_text()
    return {"name": name, "capital": capital, "population": population, "area (km sq)": area}


# start the timer
start = time.time()

with sync_playwright() as p:
    # launch the browser instance and define a new context
    browser = p.chromium.launch()
    context = browser.new_context()
    # open a new tab and go to the website
    page = context.new_page()
    page.goto("https://www.scrapethissite.com/pages/")
    # click through to the first page and wait while it loads
    # NOTE: the original text had a scrape-injected URL corrupting this
    # selector; the correct attribute selector is a[href='/pages/simple/'].
    page.locator("a[href='/pages/simple/']").click()
    page.wait_for_load_state("load")
    # get the countries
    countries = page.locator("div.country")
    n_countries = countries.count()
    data = []
    for i in range(n_countries):
        entry = countries.nth(i)
        sample = extract_data(entry)
        data.append(sample)
    browser.close()

end = time.time()
print(f"The whole script took: {end-start:.4f}")
Selenium脚本:
"""Selenium version: scrape the country table and time the whole run."""
import time

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
# web driver manager: https://github.com/SergeyPirogov/webdriver_manager
# will help us automatically download the web driver binaries
# then we can use `Service` to manage the web driver's state.
from webdriver_manager.chrome import ChromeDriverManager


def extract_data(row):
    """Return one country's fields from a `div.country` WebElement as a dict.

    Keys: "name", "capital", "population", "area (km sq)" (all strings,
    taken verbatim from the page text).
    """
    name = row.find_element(By.TAG_NAME, "h3").text.strip("\n").strip()
    capital = row.find_element(By.CSS_SELECTOR, "span.country-capital").text
    population = row.find_element(By.CSS_SELECTOR, "span.country-population").text
    area = row.find_element(By.CSS_SELECTOR, "span.country-area").text
    return {"name": name, "capital": capital, "population": population, "area (km sq)": area}


# start the timer
start = time.time()

options = webdriver.ChromeOptions()
# `options.headless = True` was deprecated and removed in Selenium >= 4.13;
# the supported way to run headless is the command-line switch.
options.add_argument("--headless")

# this returns the path the web driver was downloaded to
chrome_path = ChromeDriverManager().install()
# define the chrome service and pass it to the driver instance
chrome_service = Service(chrome_path)
driver = webdriver.Chrome(service=chrome_service, options=options)

url = "https://www.scrapethissite.com/pages/"
driver.get(url)

# get the first page and click the link
first_page = driver.find_element(By.CSS_SELECTOR, "h3.page-title a")
first_page.click()

# collect every country row on the page
countries = driver.find_elements(By.CSS_SELECTOR, "div.country")

# scrape the data using the extract_data function
# NOTE: the original text had a scrape-injected URL corrupting this
# assignment; the correct code is a plain list(map(...)).
data = list(map(extract_data, countries))

end = time.time()
print(f"The whole script took: {end-start:.4f}")

driver.quit()
测试结果:
Y轴是执行时间,一望而知,Selenium比Playwright慢了大概五倍左右。