java爬虫之爬取抖音热门歌曲

原创
2019/10/13 23:06
阅读数 2.5K

上次为了让车载记录仪能听个响,写了个爬取酷狗热门歌曲的爬虫。听了这么久听腻了,感觉抖音的热门歌曲不错,于是去网上找了下,发现 9ku 上有一个抖音热门歌曲榜,就决定直接爬下来。但是用普通的 Java HTTP 请求不太好分析页面,正好前段时间研究了下 Selenium,于是决定用它来爬取。

下面是爬取过程的效果:

功能不复杂,主要就是用 Selenium 找到页面中 audio 标签的 src,再配合 HttpClient 等常用 jar 包,通过 HTTP 直接下载就可以了。废话不多说,放一下主要代码。

package com.ath365.utils;

import java.io.File;
import java.io.IOException;
import java.util.List;
import java.util.Set;

import org.openqa.selenium.By;
import org.openqa.selenium.JavascriptExecutor;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;

import com.bing.download.FileDownload;

/**
 * Crawls the 9ku "douyin" chart page with Selenium and downloads every listed song.
 *
 * <p>Each entry on the chart is clicked, the window that opens is inspected for the
 * song title and the {@code <audio>} element's {@code src}, and that URL is handed to
 * {@link FileDownload#download(String, String)}.
 */
public class Get9KuVideo {

	private static String savePath = "F:\\movie\\9ku\\";
	private static String baseUrl = "http://www.9ku.com";
	private static WebDriver driver = null;
	private static String search_handle = null;

	/** Maximum time to wait for the detail window to show up after a click. */
	private static final long WINDOW_WAIT_MILLIS = 5000L;
	/** Interval between two checks for the detail window. */
	private static final long WINDOW_POLL_MILLIS = 200L;
	/** Fixed pause that gives the detail page time to render its audio element. */
	private static final long PAGE_SETTLE_MILLIS = 2000L;
	/** Characters that are not allowed in Windows file names. */
	private static final String ILLEGAL_FILE_CHARS = "[\\\\/:*?\"<>|]";

	/**
	 * Starts Chrome, opens the chart page and downloads all songs on it.
	 *
	 * @param args unused
	 * @throws Exception if the browser cannot be started or the chart page cannot be processed
	 */
	public static void main(String[] args) throws Exception {

		String url = baseUrl + "/douyin/bang.htm";
		System.setProperty("webdriver.chrome.driver", "./source/chromedriver.exe");// location of the chromedriver binary
		driver = new ChromeDriver(); // WebDriver backed by Chrome

		try {
			driver.get(url);
			getList(url);
		} finally {
			// Always shut the browser and the chromedriver process down, even on failure.
			driver.quit();
		}
	}

	/**
	 * Walks over every {@code songName} element of the page currently loaded in the
	 * driver, opens it and downloads the song it points to. A failure on one song is
	 * reported and does not stop the remaining songs.
	 *
	 * @param url address of the list page; not used here because the page is expected
	 *            to be loaded in the driver already
	 * @throws Exception if the list page itself cannot be read
	 */
	public static void getList(String url) throws Exception {

		search_handle = driver.getWindowHandle();
		List<WebElement> list = driver.findElements(By.className("songName"));
		System.out.println(list.size());
		for (int i = 0; i < list.size(); i++) {
			if (Thread.currentThread().isInterrupted()) {
				// A pause was interrupted earlier; stop instead of failing every remaining song.
				break;
			}
			driver.switchTo().window(search_handle);
			WebElement ele = list.get(i);
			// Click through JavaScript rather than ele.click().
			((JavascriptExecutor) driver).executeScript("arguments[0].click()", ele);

			try {
				getDetail(savePath);
			} catch (Exception e) {
				// Keep going with the next song, but do not hide the failure.
				System.err.println("Skipping song #" + (i + 1) + ": " + e);
			} finally {
				// Close what the click opened so the next iteration cannot pick a stale window.
				closeDetailWindows();
			}
		}
	}

	/**
	 * Switches to the window opened for a song, reads its title and audio URL and
	 * saves the audio below {@code path}. If no additional window appears within the
	 * wait time, the current window is inspected instead.
	 *
	 * @param path directory prefix (including trailing separator) to save into
	 * @throws IOException if waiting is interrupted or the page exposes no audio URL
	 */
	public static void getDetail(String path) throws IOException {

		// The new window may not be registered the instant the click returns, so poll for it.
		String detailHandle = waitForDetailWindow();
		if (detailHandle != null) {
			driver.switchTo().window(detailHandle);
		}

		pause(PAGE_SETTLE_MILLIS);

		String title = driver.findElement(By.className("playingTit")).findElement(By.tagName("h1")).getText();
		String src = driver.findElement(By.tagName("audio")).getAttribute("src");
		System.out.println(title);
		System.out.println(src);

		if (src == null || src.trim().isEmpty()) {
			throw new IOException("No audio src found for: " + title);
		}

		gosave(title, src, path);
	}

	/**
	 * Polls the driver until a window other than the list window exists.
	 *
	 * @return the handle of the first such window, or {@code null} if none appeared in time
	 * @throws IOException if the wait is interrupted
	 */
	private static String waitForDetailWindow() throws IOException {
		long deadline = System.currentTimeMillis() + WINDOW_WAIT_MILLIS;
		while (true) {
			Set<String> handles = driver.getWindowHandles();
			for (String handle : handles) {
				if (!handle.equals(search_handle)) {
					return handle;
				}
			}
			if (System.currentTimeMillis() >= deadline) {
				return null;
			}
			pause(WINDOW_POLL_MILLIS);
		}
	}

	/** Closes every window except the list window and returns focus to the list window. */
	private static void closeDetailWindows() {
		if (search_handle == null) {
			return;
		}
		try {
			for (String handle : driver.getWindowHandles()) {
				if (!handle.equals(search_handle)) {
					driver.switchTo().window(handle);
					driver.close();
				}
			}
			driver.switchTo().window(search_handle);
		} catch (RuntimeException e) {
			// Cleanup is best effort; the loop switches back to the list window on its own.
			System.err.println("Could not close detail window: " + e);
		}
	}

	/** Sleeps for the given time; an interrupt restores the flag and surfaces as IOException. */
	private static void pause(long millis) throws IOException {
		try {
			Thread.sleep(millis);
		} catch (InterruptedException e) {
			Thread.currentThread().interrupt();
			throw new IOException("Interrupted while waiting for the page", e);
		}
	}

	/**
	 * Downloads {@code link} into {@code path + title + ".mp4"} unless that file exists.
	 *
	 * @return the path of the saved file, or {@code null} if nothing was saved
	 *         (file already present, unusable title, or failed download)
	 */
	private static String gosave(String title, String link, String path) {

		// Titles come straight from the page and may contain characters Windows rejects.
		String safeTitle = title == null ? "" : title.replaceAll(ILLEGAL_FILE_CHARS, "_").trim();
		if (safeTitle.isEmpty()) {
			System.err.println("Skipping download without a usable title: " + link);
			return null;
		}

		// NOTE(review): the source is an <audio> element, so ".mp4" may not match the real format.
		path = path + safeTitle + ".mp4";
		File target = new File(path);
		if (target.exists())
			return null;

		File dir = target.getParentFile();
		if (dir != null && !dir.exists()) {
			dir.mkdirs();
		}

		if (!save(path, link)) {
			// A leftover file would make the exists() check above skip this song forever.
			if (target.exists() && !target.delete()) {
				System.err.println("Could not remove incomplete file: " + path);
			}
			System.err.println("Download failed: " + link);
			return null;
		}

		return path;
	}

	/** Delegates to {@link FileDownload#download(String, String)} and reports its result. */
	private static boolean save(String savePath, String link) {
		return FileDownload.download(link, savePath);
	}
}

下面是自己写的一个工具类 FileDownload.java:

package com.bing.download;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.http.client.config.CookieSpecs;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;

/**
 * Helper that downloads the content of a URL into a local file with Apache HttpClient.
 *
 * @author: gaoll
 * @CreateTime:2014-11-20
 * @ModifyTime:2014-11-20
 */
public class FileDownload {

	private static Log log = LogFactory.getLog(FileDownload.class);

	/** Total number of requests sent before giving up. */
	private static final int MAX_ATTEMPTS = 3;

	public static void main(String[] args) {
		download("http://media.redianduanzi.com/convert/2019/03/22/5c94061e942d9-sd.mp4?t=1569652014&key=660566b478c5cbd92becf65e7e975286","E://20只狮子猎捕一头大象, 看大象能否狮口逃生?.mp4");
	}

	/**
	 * Downloads a file.
	 *
	 * <p>The request is attempted up to three times. An attempt is repeated after an
	 * exception or an HTTP 500 answer; any other non-200 status ends the download at
	 * once. The target file is only written for a 200 answer, and a partially written
	 * file is removed when the download does not succeed.
	 *
	 * <p>The {@code HttpGet} is constructed outside the try block, as before, so an
	 * exception thrown while constructing it for {@code url} is not caught here.
	 *
	 * @param url  address to download from
	 * @param path target path including the file name
	 * @return {@code true} if the complete body was written to {@code path}
	 */
	public static boolean download(String url, String path) {

		RequestConfig requestConfig = RequestConfig.custom()
				.setSocketTimeout(2000)
				.setCookieSpec(CookieSpecs.STANDARD)
				.setConnectTimeout(2000)
				.build();

		// Built before the client so a rejected url cannot leave an open client behind.
		HttpGet get = new HttpGet(url);
		get.setConfig(requestConfig);

		File file = new File(path);
		boolean flag = false;
		boolean touched = false; // true once this call has started writing to the file

		try (CloseableHttpClient httpclient = HttpClients.createDefault()) {

			File parent = file.getAbsoluteFile().getParentFile();
			if (parent != null && !parent.exists() && !parent.mkdirs()) {
				log.warn("Could not create directory " + parent);
			}

			for (int i = 0; i < MAX_ATTEMPTS && !flag; i++) {
				// Closing the response each time returns its connection before the next attempt.
				try (CloseableHttpResponse result = httpclient.execute(get)) {
					System.out.println(result.getStatusLine());
					int status = result.getStatusLine().getStatusCode();
					if (status == 200) {
						touched = true;
						writeBody(result, file);
						flag = true;
					} else if (status != 500) {
						// Only a 500 answer is retried.
						break;
					}
				} catch (Exception e) {
					log.warn("Attempt " + (i + 1) + " of " + MAX_ATTEMPTS + " failed for " + url, e);
				}
			}

		} catch (Exception e) {
			// Reached for failures outside a single attempt, e.g. while closing the client.
			log.error("Download of " + url + " ended with an error", e);
		} finally {
			get.releaseConnection();
		}

		// Never leave a truncated file behind; a file this call did not touch stays as it was.
		if (!flag && touched && file.exists() && !file.delete()) {
			log.warn("Could not delete incomplete file " + file);
		}
		return flag;
	}

	/** Copies the response body into {@code file}, closing both streams afterwards. */
	private static void writeBody(CloseableHttpResponse response, File file) throws java.io.IOException {
		if (response.getEntity() == null) {
			throw new java.io.IOException("Response carries no body");
		}
		try (BufferedInputStream in = new BufferedInputStream(response.getEntity().getContent());
				BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(file))) {
			byte[] buffer = new byte[4096];
			int len;
			while ((len = in.read(buffer, 0, buffer.length)) > -1) {
				out.write(buffer, 0, len);
			}
		}
	}
}

 

展开阅读全文
加载中
点击引领话题📣 发布并加入讨论🔥
0 评论
0 收藏
0
分享
返回顶部
顶部