在selenium中使用Chrome DevTools Protocol库实现采集

原创
02/24 08:54
阅读数 26

在selenium中使用Chrome DevTools Protocol库实现采集

根据官方Chrome DevTools Protocol(https://chromedevtools.github.io/devtools-protocol/)的介绍,Chrome DevTools Protocol(简称cdp)允许第三方程序通过WebSocket对chrome浏览器进行调试、分析等。有了这个协议,就可以自己开发工具来操作chrome并获取chrome的数据。

github上基于cdp不同语言(nodejs,python,java...)的库
https://github.com/ChromeDevTools/awesome-chrome-devtools#chrome-devtools-protocol

selenium是通过webdriver操作浏览器的,与基于cdp实现的库相比,性能要慢好几倍。
研究了下org.openqa.selenium.chrome.ChromeDriver的实现后发现,在保留旧的采集逻辑的前提下,可以基于selenium的接口类,实现一个同时兼容cdp和selenium的CdpDriver。
本文代码基于cdp4j和jsoup两个库实现与selenium的兼容。

/**
*CdpDriver.java
**/
import io.webfolder.cdp.Launcher;
import io.webfolder.cdp.command.Network;
import io.webfolder.cdp.session.Session;
import io.webfolder.cdp.session.SessionFactory;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import org.openqa.selenium.*;
import org.openqa.selenium.logging.Logs;
import java.util.*;

/**
 * 基于cdp4j实现的WebDriver,目前只实现了部分接口
 */
public class CdpDriver implements WebDriver {
    public final Logger log = LogManager.getLogger(CdpDriver.class);
    private Launcher launcher = null;
    private Session session = null;
    private io.webfolder.cdp.Options options = null;
    private io.webfolder.cdp.Options.Builder builder = null;
    private CdpTargetLocator targetLocator = null;
    private boolean headless = false;
    public CdpDriver(io.webfolder.cdp.Options options){
        launcher = new Launcher(options);
        SessionFactory factory = launcher.launch();
        session = factory.create();
    }


    public void get(String url) {
        try{
            session.navigate(url);
            session.waitDocumentReady(30000);
        }
        catch(Exception exception){
            log.error("CdpDriver get error====url:" + url + ",msg=" + exception.getMessage());
        }
        targetLocator = null;
    }
    public List<WebElement> findElements(By by) {
        String pageSource = this.getPageSource();
        int index = by.toString().indexOf(":");
        String selector = by.toString().substring(index + 1);
        Elements elements = Jsoup.parse(pageSource).select(selector);
        List<WebElement> list = new ArrayList<>();
        for(Element e:elements){
            list.add(new CdpWebElement(e, this));
        }
        return list;
    }

    public WebElement findElement(By by) {
        String pageSource = this.getPageSource();
        int index = by.toString().indexOf(":");
        String selector = by.toString().substring(index + 1);
        Element e = Jsoup.parse(pageSource).selectFirst(selector);
        return new CdpWebElement(e, this);
    }

    public void close() {
        session.close();
    }

    public void quit() {
        if (this.launcher != null) {
            launcher.kill();
        }
    }


    public String getCurrentUrl(){
        if(targetLocator != null){
            String frameId = targetLocator.getFrameId();
            return (String) session.evaluate("document.getElementById('"+frameId+"').contentWindow.window.location.href");
        }
        else{
            return (String)session.evaluate("window.location.href");
        }
    }

    public String getTitle(){
        return session.getTitle();
    }



    public String getPageSource(){
        if(targetLocator == null){
            return session.getContent();
        }
        else{
            String frameId = targetLocator.getFrameId();
            return (String) session.evaluate("document.getElementById('"+frameId+"').contentWindow.document.body.innerHTML");
        }
    }


    @Deprecated
    public Set<String> getWindowHandles(){
        return null;
    }
    @Deprecated
    public String getWindowHandle(){
        return "";
    }

    public WebDriver.TargetLocator switchTo(){
        targetLocator = new CdpTargetLocator();
        return targetLocator;
    }
    @Deprecated
    public WebDriver.Navigation navigate(){
        return null;
    }

    public WebDriver.Options manage(){
        return new CdpDriverOptions();
    }

    protected class CdpTargetLocator implements TargetLocator {
        private String frameId = null;
        protected CdpTargetLocator() {

        }
        public WebDriver frame(int var1){
            return null;
        }

        public WebDriver frame(String id){
            this.frameId = id;
            return null;
        }

        public WebDriver frame(WebElement var1){
            return null;
        }

        public WebDriver parentFrame(){
            return null;
        }

        public WebDriver window(String var1){
            return null;
        }

        public WebDriver defaultContent(){
            return null;
        }

        public WebElement activeElement(){
            return null;
        }

        public Alert alert(){
            return null;
        }
        public String getFrameId(){
            return frameId;
        }
    }

    /**
     * 只实现getCookies方法
     */
    protected class CdpDriverOptions implements WebDriver.Options {
        protected CdpDriverOptions() {

        }

        public void addCookie(Cookie var1){

        }

        public void deleteCookieNamed(String var1){

        }

        public void deleteCookie(Cookie var1){

        }

        public void deleteAllCookies(){

        }

        public Set<Cookie> getCookies(){
            Network network = session.getCommand().getNetwork();
            List<io.webfolder.cdp.type.network.Cookie> cookies = network.getCookies();
            Set<Cookie> set = new LinkedHashSet<Cookie>();
            for(io.webfolder.cdp.type.network.Cookie cookie:cookies){
                Cookie seleniumCoooike = new Cookie(cookie.getName(), cookie.getValue(),
                        cookie.getDomain(), cookie.getPath(),new Date(cookie.getExpires().longValue()));
                set.add(seleniumCoooike);
            }
            return set;
        }

        public Cookie getCookieNamed(String var1){
            return null;
        }

        public WebDriver.Timeouts timeouts(){
            return null;
        }

        public WebDriver.ImeHandler ime(){
            return null;
        }

        public WebDriver.Window window(){
            return null;
        }

        @Beta
        public Logs logs(){
            return null;
        }
    }
}

/**
* CdpWebElement.java
**/
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.openqa.selenium.*;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;

/**
 * 基于jsoup 实现
 */
public class CdpWebElement implements WebElement {
    private Element element = null;
    private WebDriver driver = null;
    public CdpWebElement(Element element, WebDriver driver){
        this.element = element;
        this.driver = driver;
    }

    public void click(){

    }

    public void submit(){

    }

    public void sendKeys(CharSequence... var1){

    }

    public void clear(){

    }

    public String getTagName(){
        return "";
    }

    public String getAttribute(String attributeKey){
        if(attributeKey.toLowerCase().equals("href") || attributeKey.toLowerCase().equals("src")){
            String href = element.attr(attributeKey);
            String url = driver.getCurrentUrl();
            if(href != null && href.startsWith("http") == false){
                href = getAbsUrl(url, href);
            }

            return href;
        }
        else if(attributeKey.toLowerCase().equals("innerhtml")){ //
            return element.html();
        }
        else if(attributeKey.toLowerCase().equals("outerhtml")){ //
            return element.outerHtml();
        }
        else{
            return element.attr(attributeKey);
        }
    }

    public boolean isSelected(){
        return true;
    }

    public boolean isEnabled(){
        return true;
    }

    public String getText(){
        return element.text();
    }

    public List<WebElement> findElements(By by){
        String selector = by.toString().split(":")[1].trim();
        Elements elements = element.select(selector);
        List<WebElement> list = new ArrayList<>();
        for(Element e:elements){
            list.add(new CdpWebElement(e, driver));
        }
        return list;
    }

    public WebElement findElement(By by){
        String selector = by.toString().split(":")[1].trim();
        Element e = element.selectFirst(selector);
        return new CdpWebElement(e, driver);
    }

    public boolean isDisplayed(){
        return true;
    }

    public Point getLocation(){
        return null;
    }

    public Dimension getSize(){
        return null;
    }

    public Rectangle getRect(){
        return null;
    }

    public String getCssValue(String var1){
        return "";
    }

    public <X> X getScreenshotAs(OutputType<X> outputType) throws WebDriverException {
        return outputType.convertFromBase64Png("");
    }
    public String getAbsUrl(String absolutePath, String relativePath){
        try {
            URL absoluteUrl = new URL(absolutePath);
            URL parseUrl = new URL(absoluteUrl ,relativePath );
            return parseUrl.toString();
        }
        catch (MalformedURLException e) {
            e.printStackTrace();
        }
        return "";
    }
}

使用例子

// Keep the browser profile in a "cdp" directory inside the working directory.
// Building the path with File(parent, child) inserts the separator that plain
// string concatenation would omit.
String path = new File(System.getProperty("user.dir"), "cdp").getPath();
io.webfolder.cdp.Options.Builder builder = io.webfolder.cdp.Options.builder()
                .headless(headless)
                .readTimeout(30000)
                .userDataDir(new File(path).toPath());
WebDriver driver = new CdpDriver(builder.build());

把原来的ChromeDriver更换为CdpDriver,性能提高了5倍。

展开阅读全文
打赏
1
0 收藏
分享
加载中
更多评论
打赏
0 评论
0 收藏
1
分享
返回顶部
顶部