WebDriver 登陆 Jsoup抓取内容
WebDriver 登陆 Jsoup抓取内容
KongFanhao 发表于2年前
WebDriver 登陆 Jsoup抓取内容
  • 发表于 2年前
  • 阅读 66
  • 收藏 0
  • 点赞 1
  • 评论 0

标题:腾讯云 新注册用户域名抢购1元起>>>   

摘要: 使用WebDriver登陆西祠胡同并抓取内容的demo

 

1. 环境

pom:

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
	xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
	<modelVersion>4.0.0</modelVersion>

	<groupId>mybatis</groupId>
	<artifactId>test</artifactId>
	<version>0.0.1-SNAPSHOT</version>
	<packaging>jar</packaging>

	<name>test</name>
	<url>http://maven.apache.org</url>

	<properties>
		<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
	</properties>

	<!-- Adds the mybatis-generator plugin. -->
	<!-- To run it, enter "mybatis-generator:generate" in the Goals field. -->
	<build>
		<plugins>
			<plugin>
				<groupId>org.mybatis.generator</groupId>
				<artifactId>mybatis-generator-maven-plugin</artifactId>
				<version>1.3.2</version>
				<configuration>
					<verbose>true</verbose>
					<overwrite>true</overwrite>
				</configuration>
			</plugin>
		</plugins>
	</build>


	<dependencies>
		<dependency>
			<groupId>mysql</groupId>
			<artifactId>mysql-connector-java</artifactId>
			<version>5.1.38</version>
		</dependency>

		<dependency>
			<groupId>org.mybatis</groupId>
			<artifactId>mybatis</artifactId>
			<version>3.3.1</version>
		</dependency>
		<dependency>
			<groupId>org.apache.poi</groupId>
			<artifactId>poi</artifactId>
			<version>3.12</version>
		</dependency>

		<dependency>
			<groupId>commons-logging</groupId>
			<artifactId>commons-logging</artifactId>
			<version>1.2</version>
		</dependency>
		<dependency>
			<groupId>net.sourceforge.jexcelapi</groupId>
			<artifactId>jxl</artifactId>
			<version>2.6.12</version>
		</dependency>

		<dependency>
			<groupId>org.apache.httpcomponents</groupId>
			<artifactId>httpclient</artifactId>
			<version>4.5.2</version>
		</dependency>

		<!-- JUnit is declared exactly once. A second declaration of the same
			artifact makes Maven emit a warning and silently keep only the last
			one. Version 4.x is required because the crawler imports
			org.junit.Test, org.junit.BeforeClass and org.junit.AfterClass. -->
		<dependency>
			<groupId>junit</groupId>
			<artifactId>junit</artifactId>
			<version>4.12</version>
		</dependency>
		<dependency>
			<groupId>org.jsoup</groupId>
			<artifactId>jsoup</artifactId>
			<version>1.8.3</version>
		</dependency>
		<dependency>
			<groupId>org.seleniumhq.selenium</groupId>
			<artifactId>selenium-server</artifactId>
			<version>2.53.0</version>
		</dependency>
		<dependency>
			<groupId>log4j</groupId>
			<artifactId>log4j</artifactId>
			<version>1.2.17</version>
		</dependency>


	</dependencies>

</project>

2. 初始化WebDriver的类 DriverFactory.java

 

 

package test;

import java.util.Arrays;

import org.openqa.selenium.WebDriver;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import org.openqa.selenium.remote.DesiredCapabilities;

/**
 * Factory for the Chrome browser session used by the crawler.
 */
public class DriverFactory {

	/** Location of the chromedriver executable handed to Selenium. */
	private static final String CHROME_DRIVER_PATH = "C:\\Users\\admin\\AppData\\Local\\Google\\Chrome\\Application\\chromedriver.exe";

	/**
	 * Starts a new Chrome session.
	 *
	 * <p>
	 * Sets the {@code webdriver.chrome.driver} system property to the
	 * hard-coded chromedriver path and launches Chrome with the
	 * {@code --test-type} and {@code --start-maximized} command-line switches.
	 *
	 * @return a freshly started {@link ChromeDriver}; the caller owns it and
	 *         is responsible for calling {@code quit()} when done
	 */
	public static ChromeDriver create() {
		System.setProperty("webdriver.chrome.driver", CHROME_DRIVER_PATH);

		// All browser switches go through ChromeOptions. The previous version
		// also filled a DesiredCapabilities object that was never passed to the
		// driver, so it had no effect and has been dropped.
		ChromeOptions options = new ChromeOptions();
		options.addArguments("--test-type", "--start-maximized");
		return new ChromeDriver(options);
	}

}

 

 

3. 西祠胡同的登陆抓取类 XiciLogin2.java

package test;

import java.io.File;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Properties;
import java.util.Set;
import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Test;
import org.openqa.jetty.http.SSORealm;
import org.openqa.selenium.By;
import org.openqa.selenium.Cookie;
import org.openqa.selenium.Platform;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.ie.InternetExplorerDriver;
import org.openqa.selenium.remote.DesiredCapabilities;
import mx4j.log.Log;

/**
 * Logs in to xici.net through a Selenium-driven Chrome session and crawls a
 * board: every listing page is loaded in the browser, its HTML is parsed with
 * Jsoup to collect detail-page links, and each detail page is opened to print
 * its title.
 */
public class XiciLogin2 {
	/** How many times {@link #crawlSource(String)} reloads a page that yielded no links. */
	private static final int MAX_CRAWL_RETRIES = 5;
	/** Upper bound on the number of listing pages visited by {@link #main(String[])}. */
	private static final int MAX_PAGES = 15;

	public Logger log = Logger.getLogger(XiciLogin2.class);
	/** Cookies captured from the browser session right after the login form is submitted. */
	public static Set<Cookie> cookies = new HashSet<Cookie>();
	/** Single browser session shared by every method of this class. */
	public static ChromeDriver driver = DriverFactory.create();

	/**
	 * Collects the detail-page links found on one listing page.
	 *
	 * <p>
	 * If the page yields no links it is reloaded, at most
	 * {@value #MAX_CRAWL_RETRIES} more times, and the result of the last
	 * attempt is returned.
	 *
	 * @param url address of the listing page
	 * @return absolute detail-page URLs; empty (never {@code null}) when none
	 *         were found after all retries
	 */
	public List<String> crawlSource(String url) {
		System.out.println("开始抓: " + url);
		log.info("开始抓: " + url);
		List<String> sourceUrls = extractDetailLinks(url);
		// Bounded loop instead of recursion: the retry count survives between
		// attempts and the links found by a retry are actually returned.
		for (int retry = 1; sourceUrls.isEmpty() && retry <= MAX_CRAWL_RETRIES; retry++) {
			System.out.println("抓不到啦~ 重新抓一下");
			sourceUrls = extractDetailLinks(url);
		}
		return sourceUrls;
	}

	/** Loads {@code url} in the browser once and extracts the detail links from the board table. */
	private List<String> extractDetailLinks(String url) {
		String baseUrl = "http://www.xici.net";
		List<String> sourceUrls = new ArrayList<String>();
		driver.get(url);
		Document document = Jsoup.parse(driver.getPageSource());
		// No driver.findElement(...) probe here: it throws when the link is
		// missing, which would abort the crawl before a retry could happen.
		Elements elements = document.select("table#board_t tbody tr");
		System.out.println(elements);
		if (elements.isEmpty()) {
			// select() never returns null, so emptiness is the real "nothing found" signal.
			System.out.println(url + "中没有详细页链接~~");
		}
		for (Element element : elements) {
			if (element.select("td").isEmpty()) {
				continue;
			}
			String targets = element.select("td a[onclick=this.parentNode.className ='visited';]").attr("href");
			// attr() yields "" when nothing matches; compare by content, not by reference.
			if (targets.isEmpty()) {
				continue;
			}
			sourceUrls.add(baseUrl + targets);
		}
		System.out.println(sourceUrls.size());
		return sourceUrls;
	}

	/**
	 * Opens a detail page in the browser and prints its title.
	 *
	 * <p>
	 * The title is the text of the first {@code div#doc_tit h1} element; an
	 * empty line is printed when the page has no such element.
	 *
	 * @param url address of the detail page
	 */
	public void crawlTarget(String url) {
		driver.get(url);
		Document document = Jsoup.parse(driver.getPageSource());
		System.out.println("抓" + url + "的标题");
		// Extract the title element.
		Element element = document.select("div#doc_tit h1").first();
		if (element != null) {
			System.out.println("标题:" + element.text());
		} else {
			System.out.println("");
		}
	}

	/**
	 * Entry point: logs in, prints the title of one fixed detail page, then
	 * walks the listing pages of the configured board and prints the title of
	 * every detail page found.
	 *
	 * <p>
	 * The number of pages visited is the board's reported page count capped at
	 * {@value #MAX_PAGES}; when the page count cannot be determined,
	 * {@value #MAX_PAGES} pages are attempted. The browser is always closed on
	 * exit.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		PropertyConfigurator.configure("log4j.properties");
		XiciLogin2 xc = new XiciLogin2();
		String site = "http://www.xici.net/b1513005/";
		try {
			xc.xiciLogin();
			xc.crawlTarget("http://www.xici.net/d191739198.htm");
			int maxPageNum = xc.getMaxPageNum(site);
			int lastPage = maxPageNum > 0 ? Math.min(maxPageNum, MAX_PAGES) : MAX_PAGES;
			for (int page = 1; page <= lastPage; page++) {
				String sourceUrl = site + page;
				System.out.println("分页: " + sourceUrl);
				List<String> targetsList = xc.crawlSource(sourceUrl);
				if (targetsList.isEmpty()) {
					System.out.println("没抓到详细页!!");
				} else {
					for (String target : targetsList) {
						try {
							xc.crawlTarget(target);
						} catch (RuntimeException e) {
							// Best effort: one broken detail page must not stop the crawl.
							e.printStackTrace();
						}
						// Pause between requests; kept outside the try so an
						// interrupt ends the crawl instead of being swallowed.
						Thread.sleep(3000);
					}
				}
				Thread.sleep(3500);
			}
		} catch (InterruptedException e) {
			// Restore the interrupt flag so the caller can see the interruption.
			Thread.currentThread().interrupt();
		} catch (Exception e) {
			e.printStackTrace();
		} finally {
			// Close the browser and end the driver session.
			driver.quit();
		}
	}

	/**
	 * Reads the total page count of a board from its pager.
	 *
	 * <p>
	 * The page is fetched directly with Jsoup (not through the browser
	 * session). The count is taken from the text of the first {@code div#page}
	 * element, between "共" and "页" when those markers are present.
	 *
	 * @param site address of the board
	 * @return the page count, or {@code 0} when the page cannot be fetched,
	 *         has no pager, or the pager text is not a number
	 */
	public int getMaxPageNum(String site) {
		int maxPageNum = 0;
		try {
			Document document = Jsoup.connect(site).get();
			Element element = document.select("div#page").first();
			if (element == null) {
				log.warn("No div#page element found on " + site);
				return maxPageNum;
			}
			String s = element.text();
			// Index-based slicing instead of split(...)[1]: split drops
			// trailing empty strings and would throw on text such as "共".
			int start = s.indexOf("共");
			if (start >= 0) {
				s = s.substring(start + 1);
				int end = s.indexOf("页");
				if (end >= 0) {
					s = s.substring(0, end);
				}
			}
			s = s.trim();
			System.out.println(s);
			maxPageNum = Integer.parseInt(s);
		} catch (IOException e) {
			e.printStackTrace();
		} catch (NumberFormatException e) {
			log.warn("Pager text on " + site + " is not a page count", e);
		}
		return maxPageNum;
	}

	/**
	 * Logs in through the xici.net login form and stores the resulting
	 * session cookies in {@link #cookies}.
	 *
	 * <p>
	 * The user name and password literals below are placeholders and must be
	 * replaced with real credentials.
	 *
	 * @throws Exception declared for callers; Selenium lookups throw unchecked
	 *                   exceptions when a form element is missing
	 */
	public void xiciLogin() throws Exception {
		driver.get("http://account.xici.net/login");
		WebElement user = driver.findElement(By.name("username"));
		WebElement pwa = driver.findElement(By.name("password"));
		// Clear both text boxes before typing.
		user.clear();
		pwa.clear();
		// Enter user name and password.
		user.sendKeys("*******");
		pwa.sendKeys("*********");
		// Submit the form. Only the button located by XPath is clicked; the
		// earlier lookup of "TANGRAM__PSP_3__submit" was dropped.
		// NOTE(review): that name looks like it belongs to another site's
		// login form - confirm it is not present on this page.
		driver.findElement(By.xpath("html/body/div[3]/div[2]/div[2]/form/div[4]/button")).click();
		// Keep the session cookies so they are available outside the browser.
		cookies = driver.manage().getCookies();
		System.out.println(cookies);
		for (Cookie cookie2 : cookies) {
			// Re-adds each captured cookie to the same session, as before.
			driver.manage().addCookie(cookie2);
			System.out.println(cookie2);
		}
		// Being able to open page 15 indicates the login succeeded.
		driver.get("http://www.xici.net/b1402132/15");
	}
}
 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

共有 人打赏支持
粉丝 4
博文 75
码字总数 42265
×
KongFanhao
如果觉得我的文章对您有用,请随意打赏。您的支持将鼓励我继续创作!
* 金额(元)
¥1 ¥5 ¥10 ¥20 其他金额
打赏人
留言
* 支付类型
微信扫码支付
打赏金额:
已支付成功
打赏金额: