上次为了车载记录仪听个响,写了个爬取酷狗热门歌曲的爬虫。听了这么久听腻了,感觉抖音的热门歌曲不错,于是去网上找了下,发现 9ku 上有一个抖音热门歌曲榜,决定直接爬下来。但是用普通的 Java HTTP 请求不太好分析页面,正好前段时间研究了下 Selenium,于是决定用它来爬取。
下面是爬取过程的效果:
功能不复杂,主要是用 Selenium 找到 audio 标签的 src,再借助 HttpClient 等常用 jar 包通过 HTTP 直接下载即可。废话不多说,下面放一下主要代码。
package com.ath365.utils;
import java.io.File;
import java.io.IOException;
import java.util.List;
import java.util.Set;
import org.openqa.selenium.By;
import org.openqa.selenium.JavascriptExecutor;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import com.bing.download.FileDownload;
/**
 * Crawls the Douyin hot-song chart page on 9ku.com with Selenium and downloads the audio
 * file of every listed song.
 *
 * <p>Each chart entry is clicked, the player page that opens is inspected for the song
 * title and the {@code <audio>} element's {@code src}, and that URL is handed to
 * {@link FileDownload#download(String, String)}.
 *
 * <p>All state is kept in static fields, so only one crawl should run per JVM at a time.
 */
public class Get9KuVideo {

    /** Directory (with trailing separator) that downloaded files are written to. */
    private static final String savePath = "F:\\movie\\9ku\\";

    /** Root URL of the site being crawled. */
    private static final String baseUrl = "http://www.9ku.com";

    /** Browser session shared by all methods; created in {@link #main(String[])}. */
    private static WebDriver driver = null;

    /** Window handle of the chart (list) page, recorded by {@link #getList(String)}. */
    private static String search_handle = null;

    /**
     * Starts Chrome, opens the chart page and downloads every song on it.
     *
     * @param args unused
     * @throws Exception if the browser cannot be started or the chart page cannot be processed
     */
    public static void main(String[] args) throws Exception {
        String url = baseUrl + "/douyin/bang.htm";
        // Location of the chromedriver executable.
        System.setProperty("webdriver.chrome.driver", "./source/chromedriver.exe");
        driver = new ChromeDriver();
        try {
            driver.get(url);
            getList(url);
        } finally {
            // Always shut the browser and the chromedriver process down, even on failure.
            driver.quit();
        }
    }

    /**
     * Iterates over every {@code songName} element of the page currently loaded in the
     * driver, opens each song and downloads it. A failure on one song is reported on
     * {@code System.err} and does not stop the remaining songs.
     *
     * @param url address of the chart page; not used here because the caller has already
     *            navigated the driver to it
     * @throws Exception if the driver fails while listing songs or switching windows
     */
    public static void getList(String url) throws Exception {
        search_handle = driver.getWindowHandle();
        List<WebElement> list = driver.findElements(By.className("songName"));
        System.out.println(list.size());
        for (WebElement ele : list) {
            driver.switchTo().window(search_handle);
            // A JavaScript click is used instead of WebElement.click().
            ((JavascriptExecutor) driver).executeScript("arguments[0].click()", ele);
            try {
                getDetail(savePath);
            } catch (Exception e) {
                System.err.println("Skipping song, could not download it: " + e);
            }
            // Close the player window so the next iteration cannot pick up a stale one.
            closeDetailWindows();
        }
    }

    /**
     * Switches to the player window opened by the last click, reads the song title and
     * audio URL from it and downloads the audio into {@code path}.
     *
     * @param path directory (with trailing separator) to save the song into
     * @throws IOException if the player page exposes no audio source
     */
    public static void getDetail(String path) throws IOException {
        // Switch to the first window that is not the chart page.
        for (String handle : driver.getWindowHandles()) {
            if (!handle.equals(search_handle)) {
                driver.switchTo().window(handle);
                break;
            }
        }
        try {
            // Fixed pause before reading the page; presumably gives the player time to
            // load and set the audio source.
            Thread.sleep(2000);
        } catch (InterruptedException e) {
            // Preserve the interrupt status for callers instead of discarding it.
            Thread.currentThread().interrupt();
        }
        String title = driver.findElement(By.className("playingTit")).findElement(By.tagName("h1")).getText();
        String src = driver.findElement(By.tagName("audio")).getAttribute("src");
        System.out.println(title);
        System.out.println(src);
        if (src == null || src.isEmpty()) {
            throw new IOException("No audio source found for: " + title);
        }
        gosave(title, src, path);
    }

    /**
     * Closes every window except the chart page and returns focus to the chart page.
     */
    private static void closeDetailWindows() {
        for (String handle : driver.getWindowHandles()) {
            if (!handle.equals(search_handle)) {
                driver.switchTo().window(handle);
                driver.close();
            }
        }
        driver.switchTo().window(search_handle);
    }

    /**
     * Downloads {@code link} to {@code path + title + ".mp4"} unless that file already exists.
     *
     * @param title song title, used as the file name after replacing characters that are
     *              not allowed in Windows file names
     * @param link  URL of the audio file
     * @param path  target directory (with trailing separator)
     * @return the full path of the target file, or {@code null} if it already existed
     */
    private static String gosave(String title, String link, String path) {
        // Titles may contain characters such as '/', ':' or '?' that are illegal in file names.
        String safeTitle = title.replaceAll("[\\\\/:*?\"<>|]", "_");
        // NOTE(review): the audio is stored with an .mp4 extension; kept as-is so files
        // downloaded by earlier runs are still recognised - confirm this is intended.
        path = path + safeTitle + ".mp4";
        if (new File(path).exists()) {
            return null;
        }
        save(path, link);
        return path;
    }

    /**
     * Delegates the actual transfer to {@link FileDownload#download(String, String)}.
     *
     * @param savePath full path of the file to write
     * @param link     URL to download
     */
    private static void save(String savePath, String link) {
        FileDownload.download(link, savePath);
    }
}
下面是我自己写的一个工具类 FileDownload.java:
package com.bing.download;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.http.client.config.CookieSpecs;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
/**
 * Small HTTP helper that downloads a URL to a local file using Apache HttpClient.
 *
 * @author: gaoll
 * @CreateTime:2014-11-20
 * @ModifyTime:2014-11-20
 */
public class FileDownload {

    private static final Log log = LogFactory.getLog(FileDownload.class);

    /** Maximum number of requests made for one download. */
    private static final int MAX_ATTEMPTS = 3;

    /** Connect and socket timeout, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 2000;

    /** Size of the copy buffer, in bytes. */
    private static final int BUFFER_SIZE = 4096;

    /**
     * Manual smoke test: downloads one hard-coded sample file.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        download("http://media.redianduanzi.com/convert/2019/03/22/5c94061e942d9-sd.mp4?t=1569652014&key=660566b478c5cbd92becf65e7e975286","E://20只狮子猎捕一头大象, 看大象能否狮口逃生?.mp4");
    }

    /**
     * Downloads a file.
     *
     * <p>The request is attempted up to {@value #MAX_ATTEMPTS} times: an HTTP 500 response
     * or an exception triggers another attempt, any other non-200 status stops immediately.
     * If the download does not succeed, a file that was created or partially written by
     * this call is deleted, so a failed download never leaves a truncated file behind.
     *
     * @param url  address to download from
     * @param path path and file name to save to; missing parent directories are created
     * @return {@code true} if the server answered 200 and the body was written completely,
     *         {@code false} otherwise
     */
    public static boolean download(String url, String path) {
        RequestConfig requestConfig = RequestConfig.custom()
                .setSocketTimeout(TIMEOUT_MILLIS)
                .setCookieSpec(CookieSpecs.STANDARD)
                .setConnectTimeout(TIMEOUT_MILLIS)
                .build();
        HttpGet get = new HttpGet(url);
        get.setConfig(requestConfig);
        File file = new File(path);

        boolean success = false;
        // True once the target file has been opened for writing (and therefore truncated).
        boolean wroteToFile = false;

        try (CloseableHttpClient httpclient = HttpClients.createDefault()) {
            for (int attempt = 1; attempt <= MAX_ATTEMPTS && !success; attempt++) {
                // Closing the response releases its connection before the next attempt.
                try (CloseableHttpResponse result = httpclient.execute(get)) {
                    System.out.println(result.getStatusLine());
                    int status = result.getStatusLine().getStatusCode();
                    if (status == 500) {
                        continue; // server error: try again
                    }
                    if (status != 200) {
                        break; // any other status will not get better by retrying
                    }
                    ensureParentDirectory(file);
                    try (BufferedInputStream in = new BufferedInputStream(result.getEntity().getContent());
                         FileOutputStream raw = new FileOutputStream(file);
                         BufferedOutputStream out = new BufferedOutputStream(raw)) {
                        wroteToFile = true;
                        byte[] buffer = new byte[BUFFER_SIZE];
                        int len;
                        while ((len = in.read(buffer, 0, BUFFER_SIZE)) > -1) {
                            out.write(buffer, 0, len);
                        }
                    }
                    success = true;
                } catch (Exception e) {
                    log.warn("Download attempt " + attempt + " of " + MAX_ATTEMPTS + " failed for " + url, e);
                }
            }
        } catch (Exception e) {
            // Only reachable when closing the client fails; the download result stands.
            log.warn("Failed to close HTTP client after downloading " + url, e);
        } finally {
            get.releaseConnection();
        }

        if (!success && wroteToFile && file.exists() && !file.delete()) {
            log.warn("Could not delete incomplete download " + file);
        }
        return success;
    }

    /**
     * Creates the parent directory of {@code file} when it does not exist yet.
     *
     * @param file the file about to be written
     */
    private static void ensureParentDirectory(File file) {
        File parent = file.getAbsoluteFile().getParentFile();
        if (parent != null && !parent.exists() && !parent.mkdirs()) {
            // Opening the output stream will fail right after this and report the cause.
            log.warn("Could not create directory " + parent);
        }
    }
}