在selenium中使用Chrome DevTools Protocol库实现采集
根据官方Chrome DevTools Protocol文档(https://chromedevtools.github.io/devtools-protocol/)的介绍,Chrome DevTools Protocol(简称cdp)允许第三方程序通过WebSocket对Chrome浏览器进行调试、分析等操作。有了这个协议,就可以自己开发工具来操作Chrome并获取Chrome中的数据。
github上基于cdp不同语言(nodejs,python,java...)的库
https://github.com/ChromeDevTools/awesome-chrome-devtools#chrome-devtools-protocol
selenium是通过webdriver操作浏览器的,与基于cdp实现的库相比,性能要慢好几倍。
研究了下org.openqa.selenium.chrome.ChromeDriver的实现,发现在保留旧采集逻辑的前提下,可以基于selenium的接口类,实现一个同时兼容cdp和selenium的CdpDriver。
本文的代码基于cdp4j和jsoup两个库,实现了与selenium接口兼容的驱动(CdpDriver)和元素(CdpWebElement)。
/**
*CdpDriver.java
**/
import io.webfolder.cdp.Launcher;
import io.webfolder.cdp.command.Network;
import io.webfolder.cdp.session.Session;
import io.webfolder.cdp.session.SessionFactory;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.openqa.selenium.*;
import org.openqa.selenium.logging.Logs;
import java.util.*;
/**
* 基于cdp4j实现的WebDriver,目前只实现了部分接口
*/
public class CdpDriver implements WebDriver {
public final Logger log = LogManager.getLogger(CdpDriver.class);
private Launcher launcher = null;
private Session session = null;
private io.webfolder.cdp.Options options = null;
private io.webfolder.cdp.Options.Builder builder = null;
private CdpTargetLocator targetLocator = null;
private boolean headless = false;
public CdpDriver(io.webfolder.cdp.Options options){
launcher = new Launcher(options);
SessionFactory factory = launcher.launch();
session = factory.create();
}
public void get(String url) {
try{
session.navigate(url);
session.waitDocumentReady(30000);
}
catch(Exception exception){
log.error("CdpDriver get error====url:" + url + ",msg=" + exception.getMessage());
}
targetLocator = null;
}
public List<WebElement> findElements(By by) {
String pageSource = this.getPageSource();
int index = by.toString().indexOf(":");
String selector = by.toString().substring(index + 1);
Elements elements = Jsoup.parse(pageSource).select(selector);
List<WebElement> list = new ArrayList<>();
for(Element e:elements){
list.add(new CdpWebElement(e, this));
}
return list;
}
public WebElement findElement(By by) {
String pageSource = this.getPageSource();
int index = by.toString().indexOf(":");
String selector = by.toString().substring(index + 1);
Element e = Jsoup.parse(pageSource).selectFirst(selector);
return new CdpWebElement(e, this);
}
public void close() {
session.close();
}
public void quit() {
if (this.launcher != null) {
launcher.kill();
}
}
public String getCurrentUrl(){
if(targetLocator != null){
String frameId = targetLocator.getFrameId();
return (String) session.evaluate("document.getElementById('"+frameId+"').contentWindow.window.location.href");
}
else{
return (String)session.evaluate("window.location.href");
}
}
public String getTitle(){
return session.getTitle();
}
public String getPageSource(){
if(targetLocator == null){
return session.getContent();
}
else{
String frameId = targetLocator.getFrameId();
return (String) session.evaluate("document.getElementById('"+frameId+"').contentWindow.document.body.innerHTML");
}
}
@Deprecated
public Set<String> getWindowHandles(){
return null;
}
@Deprecated
public String getWindowHandle(){
return "";
}
public WebDriver.TargetLocator switchTo(){
targetLocator = new CdpTargetLocator();
return targetLocator;
}
@Deprecated
public WebDriver.Navigation navigate(){
return null;
}
public WebDriver.Options manage(){
return new CdpDriverOptions();
}
protected class CdpTargetLocator implements TargetLocator {
private String frameId = null;
protected CdpTargetLocator() {
}
public WebDriver frame(int var1){
return null;
}
public WebDriver frame(String id){
this.frameId = id;
return null;
}
public WebDriver frame(WebElement var1){
return null;
}
public WebDriver parentFrame(){
return null;
}
public WebDriver window(String var1){
return null;
}
public WebDriver defaultContent(){
return null;
}
public WebElement activeElement(){
return null;
}
public Alert alert(){
return null;
}
public String getFrameId(){
return frameId;
}
}
/**
* 只实现getCookies方法
*/
protected class CdpDriverOptions implements WebDriver.Options {
protected CdpDriverOptions() {
}
public void addCookie(Cookie var1){
}
public void deleteCookieNamed(String var1){
}
public void deleteCookie(Cookie var1){
}
public void deleteAllCookies(){
}
public Set<Cookie> getCookies(){
Network network = session.getCommand().getNetwork();
List<io.webfolder.cdp.type.network.Cookie> cookies = network.getCookies();
Set<Cookie> set = new LinkedHashSet<Cookie>();
for(io.webfolder.cdp.type.network.Cookie cookie:cookies){
Cookie seleniumCoooike = new Cookie(cookie.getName(), cookie.getValue(),
cookie.getDomain(), cookie.getPath(),new Date(cookie.getExpires().longValue()));
set.add(seleniumCoooike);
}
return set;
}
public Cookie getCookieNamed(String var1){
return null;
}
public WebDriver.Timeouts timeouts(){
return null;
}
public WebDriver.ImeHandler ime(){
return null;
}
public WebDriver.Window window(){
return null;
}
@Beta
public Logs logs(){
return null;
}
}
}
/**
* CdpWebElement.java
**/
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.openqa.selenium.*;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
/**
* 基于jsoup 实现
*/
public class CdpWebElement implements WebElement {
private Element element = null;
private WebDriver driver = null;
public CdpWebElement(Element element, WebDriver driver){
this.element = element;
this.driver = driver;
}
public void click(){
}
public void submit(){
}
public void sendKeys(CharSequence... var1){
}
public void clear(){
}
public String getTagName(){
return "";
}
public String getAttribute(String attributeKey){
if(attributeKey.toLowerCase().equals("href") || attributeKey.toLowerCase().equals("src")){
String href = element.attr(attributeKey);
String url = driver.getCurrentUrl();
if(href != null && href.startsWith("http") == false){
href = getAbsUrl(url, href);
}
return href;
}
else if(attributeKey.toLowerCase().equals("innerhtml")){ //
return element.html();
}
else if(attributeKey.toLowerCase().equals("outerhtml")){ //
return element.outerHtml();
}
else{
return element.attr(attributeKey);
}
}
public boolean isSelected(){
return true;
}
public boolean isEnabled(){
return true;
}
public String getText(){
return element.text();
}
public List<WebElement> findElements(By by){
String selector = by.toString().split(":")[1].trim();
Elements elements = element.select(selector);
List<WebElement> list = new ArrayList<>();
for(Element e:elements){
list.add(new CdpWebElement(e, driver));
}
return list;
}
public WebElement findElement(By by){
String selector = by.toString().split(":")[1].trim();
Element e = element.selectFirst(selector);
return new CdpWebElement(e, driver);
}
public boolean isDisplayed(){
return true;
}
public Point getLocation(){
return null;
}
public Dimension getSize(){
return null;
}
public Rectangle getRect(){
return null;
}
public String getCssValue(String var1){
return "";
}
public <X> X getScreenshotAs(OutputType<X> outputType) throws WebDriverException {
return outputType.convertFromBase64Png("");
}
public String getAbsUrl(String absolutePath, String relativePath){
try {
URL absoluteUrl = new URL(absolutePath);
URL parseUrl = new URL(absoluteUrl ,relativePath );
return parseUrl.toString();
}
catch (MalformedURLException e) {
e.printStackTrace();
}
return "";
}
}
使用例子
// Keep the Chrome user-data directory in a "cdp" folder under the working directory.
// The two-argument File constructor inserts the path separator; plain string
// concatenation ("user.dir" + "cdp") would produce a sibling path instead.
String path = new File(System.getProperty("user.dir"), "cdp").getPath();
// `headless` is a boolean supplied by the surrounding code.
io.webfolder.cdp.Options.Builder builder = io.webfolder.cdp.Options.builder()
        .headless(headless)
        .readTimeout(30000)
        .userDataDir(new File(path).toPath());
WebDriver driver = new CdpDriver(builder.build());
把原来的ChromeDriver更换为CdpDriver,性能提高了5倍。