java抓取豆瓣电影数据,分析电影评分,生成统计图表 ---servlet
博客专区 > YK_IT 的博客 > 博客详情
java抓取豆瓣电影数据,分析电影评分,生成统计图表 ---servlet
YK_IT 发表于4个月前
java抓取豆瓣电影数据,分析电影评分,生成统计图表 ---servlet
  • 发表于 4个月前
  • 阅读 16
  • 收藏 0
  • 点赞 0
  • 评论 0

    最近花时间学习了使用Java获取网站数据的方法,并亲自动手实践了一下:共获取3000+条数据,去除重复数据后剩余2000+条,再使用JFreeChart根据电影评分做出几张简单的统计图。

电影评分统计图:

    JFreeChart生成图片

  

    使用jsoup获取该网站的电影数据信息。此网站的数据是动态加载的,直接查看网页源代码看不到数据;可以通过页面js所请求的JSON接口(例如 /j/chart/top_list_count 和 /j/chart/top_list)获取相应的数据:

部分代码如下:

movieServlet.java

    主要的功能为:获取网站的电影数据

    首先获取每一个电影分类的链接:

        // Collects the link of every movie category from the Douban chart page, then fetches
        // the movies of each category through getMovieInfo and gathers them in allMovies.
        HashMap<String, String> urlandnames = new HashMap<String, String>();
		MovieService movieService = new MovieService();
		// Chart (ranking) page
		String url = "http://movie.douban.com/chart";
		// Read each category's link (href) and display name from the page
		try {
			Document kinds = Jsoup.connect(url)
							  .userAgent("Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.78 Safari/537.36")
							  .timeout(10000)
							  .get();
			Elements elements = kinds.select("#content .types a");
			for(Element element : elements){
				String kindurl = element.attr("href");	// link address
				String name = element.text();			// category name
				urlandnames.put(kindurl,name);
			}
		} catch (IOException e) {
			// On failure the map stays empty (or partially filled) and the loop below runs over whatever was collected
			e.printStackTrace();
			System.out.println("获取urlandname出现错误!!");
		}
		// All keys of the map, i.e. the category links
		Set<String> keySet = urlandnames.keySet();	
		// Iterate over the keys
		Iterator<String> iterator = keySet.iterator();
		List<Movie> allMovies = new ArrayList<Movie>();
		while(iterator.hasNext()){
			// The key is the category url
			String next = iterator.next();
			// Fetch the movies that belong to this category link
			List<Movie> listMovie = getMovieInfo(next);
			allMovies.addAll(listMovie);
		}

 根据对应的链接获取相应的数据(该方法只返回电影列表,之后由 MovieDao 保存至数据库):


	/**
	 * Downloads the movies of one Douban chart category and returns those not seen before.
	 *
	 * <p>The second and third {@code &}-separated parts of {@code url} (for example
	 * {@code type=18} and {@code interval_id=100:90}) are reused as the query string of two JSON
	 * endpoints: {@code top_list_count} supplies the number of movies in the category, and
	 * {@code top_list} is then asked for that many entries in a single request.
	 *
	 * <p>Entries are de-duplicated through the {@code cache} field: an entry whose id segment is
	 * already cached under the same title is skipped; every other entry is (re)cached and
	 * returned. This method only builds the list; it does not write to the database itself.
	 *
	 * @param url link of one category, as taken from the chart page
	 * @return the newly seen movies of the category; an empty list (never {@code null}) when the
	 *         link is malformed or either response cannot be fetched or parsed
	 */
	private List<Movie> getMovieInfo(String url){
		List<Movie> listMovie = new ArrayList<Movie>();
		if (url == null) {
			return listMovie;
		}
		String[] tempurl = url.split("&");
		if (tempurl.length < 3) {
			// Without the type and interval parts neither endpoint URL can be built.
			System.out.println("getMovieInfo: unexpected category url: " + url);
			return listMovie;
		}
		String query = tempurl[1] + "&" + tempurl[2];

		// e.g. http://movie.douban.com/j/chart/top_list_count?type=18&interval_id=100:90
		// which answers {"playable_count":18,"total":32,"unwatched_count":32}
		JsonElement countElement = fetchJson("http://movie.douban.com/j/chart/top_list_count?" + query);
		if (countElement == null || !countElement.isJsonObject()) {
			return listMovie;
		}
		JsonElement total = countElement.getAsJsonObject().get("total");
		if (total == null || total.isJsonNull()) {
			return listMovie;
		}
		int movienum = total.getAsInt();
		System.out.println(movienum);// number of movies in this category
		if (movienum <= 0) {
			// Nothing to download for an empty category.
			return listMovie;
		}

		// e.g. http://movie.douban.com/j/chart/top_list?type=18&interval_id=100:90&action=&start=0&limit=32
		JsonElement element = fetchJson("http://movie.douban.com/j/chart/top_list?" + query
				+ "&action=&start=0&limit=" + movienum);
		if (element == null || !element.isJsonArray()) {
			return listMovie;
		}

		for (JsonElement item : element.getAsJsonArray()) {
			JsonObject e = item.getAsJsonObject();
			String name = e.get("title").getAsString();
			float score = e.get("score").getAsFloat();
			String release_date = e.get("release_date").getAsString();
			// The list of genres is stored as its JSON text
			String types = e.get("types").getAsJsonArray().toString();
			String movieUrl = e.get("url").getAsString();
			String is_playable = e.get("is_playable").getAsString();

			// Cache key: the last path segment before the trailing slash, leading "/" included
			String substring = movieUrl.substring(0, movieUrl.lastIndexOf("/"));
			String keyID = substring.substring(substring.lastIndexOf("/"));

			net.sf.ehcache.Element cached = cache.get(keyID);
			if (cached != null && name.equals(cached.getObjectValue())) {
				// Same id and same title: already collected through another category
				continue;
			}
			cache.put(new net.sf.ehcache.Element(keyID, name));

			Movie movie = new Movie();
			movie.setName(name);
			movie.setTypes(types);
			movie.setRelease_date(release_date);
			movie.setScore(score);
			movie.setMovieUrl(movieUrl);
			movie.setIs_playable(is_playable);
			listMovie.add(movie);
		}
		return listMovie;
	}

	/**
	 * Requests {@code address} and parses the response body as JSON.
	 *
	 * @param address absolute URL of a JSON endpoint
	 * @return the parsed document, or {@code null} if the request or the parsing failed
	 */
	private JsonElement fetchJson(String address){
		try {
			String body = Jsoup.connect(address).timeout(10000).ignoreContentType(true).execute().body();
			if (body == null) {
				return null;
			}
			return new JsonParser().parse(body);
		} catch (IOException e) {
			e.printStackTrace();
		} catch (com.google.gson.JsonParseException e) {
			e.printStackTrace();
		}
		return null;
	}

ScoreServlet.java    主要是生成图表

    生成柱状图:



	protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
		String method = request.getParameter("method");
		System.out.println(method+"===================method");
		MovieService movieService = new MovieService();
		
		Map<String, Integer> map = movieService.Count();
		Integer one = map.get("one");
		Integer two = map.get("two");
		Integer three = map.get("three");
		Integer four = map.get("four");
		Integer five = map.get("five");
		
		// Bar chart branch: draws the five score-band counts as a 3D bar chart, writes it to
		// barChart.jpg under the web application's root path and exposes the file name to the view.
		// The constant-first comparison keeps a request without a "method" parameter from
		// throwing NullPointerException.
		if("barChart".equals(method)){
			// One series per score band, a single category column.
			// NOTE: auto-unboxing here throws NullPointerException if a count is missing from the map.
			double [][]data = new double[][]{{one},{two},{three},{four},{five}};
			String []rowKeys = {">=9",">=8.5",">=8",">=7.5","<7.5"};
			String []columnKeys = {"评分"};

			CategoryDataset dataset = DatasetUtilities.createCategoryDataset(rowKeys, columnKeys, data);

			JFreeChart chart = ChartFactory.createBarChart3D(
					"电影评分柱状图", // chart title
					"电影", // category axis label
					"数量", // value axis label
					dataset, // data set
					PlotOrientation.VERTICAL, // chart orientation
					true,  // show the legend
					false, // no tooltips
					false  // no URLs
					);

			CategoryPlot plot = chart.getCategoryPlot();
			// Plot background colour
			plot.setBackgroundPaint(Color.white);
			// Vertical grid line colour
			plot.setDomainGridlinePaint(Color.pink);
			// Horizontal grid line colour
			plot.setRangeGridlinePaint(Color.pink);

			// Show the value above every bar
			BarRenderer3D renderer=new BarRenderer3D();
			renderer.setBaseItemLabelGenerator(new StandardCategoryItemLabelGenerator());
			renderer.setBaseItemLabelsVisible(true);

			renderer.setBasePositiveItemLabelPosition(new ItemLabelPosition(ItemLabelAnchor.OUTSIDE12, TextAnchor.BASELINE_LEFT));
			renderer.setItemLabelAnchorOffset(10D);

			// Gap between bars of the same category
			renderer.setItemMargin(0.4);
			plot.setRenderer(renderer);

			FileOutputStream fos_jpg = null;
			try {
				// The image goes directly into the directory returned by getRealPath("/")
				fos_jpg = new FileOutputStream(request.getSession().getServletContext().getRealPath("/")+"barChart.jpg");
				ChartUtilities.writeChartAsJPEG(fos_jpg,1,chart,700,500,null);
			} catch (Exception e) {
				System.out.println("error");
				// Keep the cause visible; the bare message alone cannot be diagnosed
				e.printStackTrace();
			} finally {
				// The stream is still null when opening the file failed
				if (fos_jpg != null) {
					try {
						fos_jpg.close();
					} catch (Exception e) {
						System.out.println("error2");
						e.printStackTrace();
					}
				}
			}
			request.setAttribute("barChart", "barChart.jpg");

		}

生成饼状图:

        MovieService movieService = new MovieService();
		
		Map<String, Integer> map = movieService.Count();
		Integer one = map.get("one");
		Integer two = map.get("two");
		Integer three = map.get("three");
		Integer four = map.get("four");
		Integer five = map.get("five");
		
        // Pie chart branch: draws the five score-band counts as a 3D pie chart, writes it to
        // pieChart.jpg under the web application's root path and exposes the file name to the view.
        // The constant-first comparison keeps a request without a "method" parameter from
        // throwing NullPointerException.
        if ("pieChart".equals(method)) {

			DefaultPieDataset data = new DefaultPieDataset();
			data.setValue(">=9",one);
			data.setValue(">=8.5",two);
			data.setValue(">=8",three);
			data.setValue(">=7.5",four);
			data.setValue("<7.5",five);

			JFreeChart chart = ChartFactory.createPieChart3D(
					"评分饼状图",  		// chart title
					data,
					true, 			// show the legend
					false, 			// no tooltips
					false  			// no URLs
					);

			// The 3D factory method is expected to yield a PiePlot3D, as the original cast assumed;
			// one cast serves both the label and the 3D settings
			PiePlot3D pieplot3d = (PiePlot3D)chart.getPlot();
			pieplot3d.setLabelFont(new Font("宋体", 0, 12));
			pieplot3d.setNoDataMessage("无数据");
			pieplot3d.setCircular(true);
			pieplot3d.setLabelGap(0.02D);
			// Section labels: key followed by its share as a percentage
			pieplot3d.setLabelGenerator(new StandardPieSectionLabelGenerator("{0} {2}",NumberFormat.getNumberInstance(),new DecimalFormat("0.00%")));

			// Start angle
			pieplot3d.setStartAngle(120D);
			// Sections are laid out clockwise
			pieplot3d.setDirection(Rotation.CLOCKWISE);
			// Foreground transparency: 0 is fully transparent, 1 is opaque
			pieplot3d.setForegroundAlpha(0.7F);

			FileOutputStream fos_jpg = null;
			try {
				// The image goes directly into the directory returned by getRealPath("/")
				fos_jpg = new FileOutputStream(request.getSession().getServletContext().getRealPath("/")+"pieChart.jpg");
				ChartUtilities.writeChartAsJPEG(fos_jpg,1,chart,700,500,null);
			} catch (Exception e) {
				System.out.println("error");
				// Keep the cause visible; the bare message alone cannot be diagnosed
				e.printStackTrace();
			} finally {
				// The stream is still null when opening the file failed
				if (fos_jpg != null) {
					try {
						fos_jpg.close();
					} catch (Exception e) {
						System.out.println("error2");
						e.printStackTrace();
					}
				}
			}
			request.setAttribute("pieChart", "pieChart.jpg");

		}

生成折线图

       // Line chart branch: plots the number of movies per individual score, writes the chart to
       // lineChart.jpg under the web application's root path and exposes the file name to the view.
       // The constant-first comparison keeps a request without a "method" parameter from
       // throwing NullPointerException.
       if ("lineChart".equals(method)) {
			XYSeriesCollection collection = new XYSeriesCollection();
			XYSeries series = new XYSeries("折线");

			// Plot exactly the scores the service returned instead of re-deriving the keys from
			// the map size, which pointed at the wrong scores whenever an entry was missing.
			// Insertion order is irrelevant: this XYSeries constructor sorts items by x value.
			Map<String, Integer> map2 = movieService.lineChart();
			for (Map.Entry<String, Integer> entry : map2.entrySet()) {
				series.add(Double.parseDouble(entry.getKey()), entry.getValue());
			}
			collection.addSeries(series);

			JFreeChart chart = ChartFactory.createXYLineChart(
					"评分折线图",
					"评分",
					"数量",
					collection,
					PlotOrientation.VERTICAL,
					true,
					true,
					false);

			XYPlot plot = (XYPlot) chart.getPlot();
			// Draw a shape at every data point; the line-chart factory's default renderer is
			// expected to be an XYLineAndShapeRenderer, as the original cast assumed
			XYLineAndShapeRenderer renderer = (XYLineAndShapeRenderer)plot.getRenderer();
			renderer.setBaseShapesVisible(true);

			// Print the value next to every data point
			renderer.setBaseItemLabelsVisible(true);
			renderer.setBasePositiveItemLabelPosition(new ItemLabelPosition(ItemLabelAnchor.OUTSIDE12, TextAnchor.BASELINE_CENTER));
			renderer.setBaseItemLabelGenerator(new StandardXYItemLabelGenerator());
			renderer.setBaseItemLabelFont(new Font("Dialog", Font.BOLD, 10));

			FileOutputStream fos_jpg = null;
			try {
				// The image goes directly into the directory returned by getRealPath("/")
				fos_jpg = new FileOutputStream(request.getSession().getServletContext().getRealPath("/")+"lineChart.jpg");
				ChartUtilities.writeChartAsJPEG(fos_jpg,1,chart,700,500,null);
			} catch (Exception e) {
				System.out.println("error");
				// Keep the cause visible; the bare message alone cannot be diagnosed
				e.printStackTrace();
			} finally {
				// The stream is still null when opening the file failed
				if (fos_jpg != null) {
					try {
						fos_jpg.close();
					} catch (Exception e) {
						System.out.println("error2");
						e.printStackTrace();
					}
				}
			}
			request.setAttribute("lineChart", "lineChart.jpg");
		}

MovieDao.java

把数据插入到数据库

public class MovieDao {
	
	/**
	 * Inserts every movie of the list into the {@code movie} table, one row per movie.
	 *
	 * <p>One prepared statement is created up front and reused for all rows, so no statement
	 * is left unclosed and an empty list no longer fails while cleaning up.
	 *
	 * @param listMovie movies to insert; an empty list inserts nothing
	 * @throws RuntimeException wrapping the {@link SQLException} raised while preparing,
	 *         executing or closing
	 */
	public void save(List<Movie> listMovie){
		Connection connection = JdbcUtils.getConnection();
		PreparedStatement statement = null;

		try {
			String sql = " INSERT INTO movie(NAME,TYPES,release_date,score,movieUrl,is_playable) VALUE( ?,?,?,?,?,? ) ";
			statement = connection.prepareStatement(sql);

			int i = 1;
			for(Movie movie : listMovie){
				System.out.println("正在插入第"+(i++)+"条数据到数据库ing...");

				statement.setString(1, movie.getName());
				statement.setString(2, movie.getTypes());
				statement.setString(3, movie.getRelease_date());
				statement.setFloat(4, movie.getScore());
				statement.setString(5, movie.getMovieUrl());
				statement.setString(6, movie.getIs_playable());

				statement.execute();
			}
			System.out.println("保存数据完成");
		} catch (SQLException e) {
			System.out.println("保存数据出现错误 MovieDao error");
			e.printStackTrace();
			throw new RuntimeException(e);
		} finally {
			// Close the statement before its connection; the nested finally makes sure the
			// connection is still closed when closing the statement fails.
			try {
				if (statement != null) {
					statement.close();
				}
			} catch (SQLException e) {
				e.printStackTrace();
				throw new RuntimeException(e);
			} finally {
				try {
					if (connection != null) {
						connection.close();
					}
				} catch (SQLException e) {
					e.printStackTrace();
					throw new RuntimeException(e);
				}
			}
		}
	}
	

 查询所有数据

   /**
	 * Loads every row of the {@code movie} table.
	 *
	 * @return one {@code Movie} per row, in the order the query returns them; an empty list
	 *         when the table has no rows
	 * @throws RuntimeException wrapping the {@link SQLException} raised while querying or closing
	 */
	public List<Movie> findAll(){

		Connection connection = null;
		PreparedStatement statement = null;
		ResultSet resultSet = null;

		try {
			connection = JdbcUtils.getConnection();
			statement = connection.prepareStatement(" select * from movie ");
			resultSet = statement.executeQuery();

			List<Movie> list = new ArrayList<Movie>();
			while (resultSet.next()) {
				Movie movie = new Movie();

				movie.setId(resultSet.getInt("id"));
				movie.setName(resultSet.getString("name"));
				movie.setTypes(resultSet.getString("types"));
				movie.setRelease_date(resultSet.getString("release_date"));
				movie.setScore(resultSet.getFloat("score"));
				movie.setMovieUrl(resultSet.getString("movieUrl"));
				movie.setIs_playable(resultSet.getString("is_playable"));

				list.add(movie);
			}

			return list;
		} catch (SQLException e) {
			e.printStackTrace();
			throw new RuntimeException(e);
		} finally {
			// Release in reverse order of acquisition. Each resource may still be null when an
			// earlier step failed, and the nested finally blocks make sure a failing close does
			// not keep the remaining resources open.
			try {
				if (resultSet != null) {
					resultSet.close();
				}
			} catch (SQLException e) {
				e.printStackTrace();
				throw new RuntimeException(e);
			} finally {
				try {
					if (statement != null) {
						statement.close();
					}
				} catch (SQLException e) {
					e.printStackTrace();
					throw new RuntimeException(e);
				} finally {
					try {
						if (connection != null) {
							connection.close();
						}
					} catch (SQLException e) {
						e.printStackTrace();
						throw new RuntimeException(e);
					}
				}
			}
		}
	}

获取不同分数等级的电影数量

   /**
	 * Counts the movies in each of five score bands.
	 *
	 * <p>The result maps {@code "one"} to the count for score &gt;= 9, {@code "two"} to
	 * [8.5, 9), {@code "three"} to [8, 8.5), {@code "four"} to [7.5, 8) and {@code "five"} to
	 * score &lt; 7.5. A band whose query fails is logged and left out of the map, so callers
	 * must be prepared for missing keys.
	 *
	 * @return the per-band counts keyed by {@code "one"} .. {@code "five"}
	 */
	public Map<String,Integer> Count(){

		// Result keys and their queries, index-aligned. The standard AND replaces the
		// MySQL-only "&&" operator.
		String[] keys = {"one", "two", "three", "four", "five"};
		String[] queries = {
				"SELECT COUNT(1) FROM movie WHERE score>=9 ",
				"SELECT COUNT(1) FROM movie WHERE score>=8.5 AND score<9 ",
				"SELECT COUNT(1) FROM movie WHERE score>=8 AND score<8.5 ",
				"SELECT COUNT(1) FROM movie WHERE score>=7.5 AND score<8 ",
				"SELECT COUNT(1) FROM movie WHERE score<7.5 "
		};

		Map<String,Integer> mapCount = new HashMap<String, Integer>();

		Connection conn = JdbcUtils.getConnection();
		try {
			for (int i = 0; i < keys.length; i++) {
				PreparedStatement stmt = null;
				ResultSet resultSet = null;
				try {
					stmt = conn.prepareStatement(queries[i]);
					resultSet = stmt.executeQuery();
					while (resultSet.next()) {
						mapCount.put(keys[i], resultSet.getInt(1));
					}
				} catch (SQLException e) {
					// Best effort: a failing band is skipped, the remaining bands are still counted
					e.printStackTrace();
				} finally {
					// Free the per-query resources before the next band is counted
					try {
						if (resultSet != null) {
							resultSet.close();
						}
					} catch (SQLException e) {
						e.printStackTrace();
					}
					try {
						if (stmt != null) {
							stmt.close();
						}
					} catch (SQLException e) {
						e.printStackTrace();
					}
				}
			}
		} finally {
			// The connection was never released before
			try {
				if (conn != null) {
					conn.close();
				}
			} catch (SQLException e) {
				e.printStackTrace();
			}
		}
		return mapCount;
	}
	

获取每个电影评分的电影数量

   /**
	 * Counts the movies for every individual score from 9.9 down to 7.0 in steps of 0.1.
	 *
	 * <p>Keys are the scores formatted with one decimal ({@code "9.9"}, {@code "9.8"}, ...,
	 * {@code "7.0"}). A score whose query fails is logged and left out of the map.
	 *
	 * @return the number of movies per score, keyed by the score text
	 */
	public Map<String,Integer> lineChart(){

		Map<String,Integer> mapCount = new HashMap<String, Integer>();

		Connection conn = JdbcUtils.getConnection();
		try {
			for (int number = 99; number >= 70; number--) {
				// 99 -> "9.9", 70 -> "7.0"
				String score = (number / 10) + "." + (number % 10);
				// The value is generated right above, never user input, so concatenation is safe here.
				// NOTE(review): equality against a decimal literal can miss rows if the score
				// column is an approximate FLOAT - confirm the column type.
				String sql = "SELECT COUNT(1) FROM movie WHERE score=" + score ;

				PreparedStatement stmt = null;
				ResultSet resultSet = null;
				try {
					stmt = conn.prepareStatement(sql);
					resultSet = stmt.executeQuery();
					while (resultSet.next()) {
						mapCount.put(score, resultSet.getInt(1));
					}
				} catch (SQLException e) {
					// Best effort: a failing score is skipped, the remaining scores are still counted
					e.printStackTrace();
				} finally {
					// Free the per-query resources before the next score is counted
					try {
						if (resultSet != null) {
							resultSet.close();
						}
					} catch (SQLException e) {
						e.printStackTrace();
					}
					try {
						if (stmt != null) {
							stmt.close();
						}
					} catch (SQLException e) {
						e.printStackTrace();
					}
				}
			}
		} finally {
			// The connection was never released before
			try {
				if (conn != null) {
					conn.close();
				}
			} catch (SQLException e) {
				e.printStackTrace();
			}
		}
		return mapCount;
	}
}

     两分钟抓取数据2000+并保存至数据库中,感觉还是挺慢的,有待优化代码

代码源码:

GitHub:https://github.com/YanKuan-IT/DouBanMoviesInfo_DB.git

注:如有什么做的不对的,请指教

共有 人打赏支持
粉丝 8
博文 19
码字总数 76523
×
YK_IT
如果觉得我的文章对您有用,请随意打赏。您的支持将鼓励我继续创作!
* 金额(元)
¥1 ¥5 ¥10 ¥20 其他金额
打赏人
留言
* 支付类型
微信扫码支付
打赏金额:
已支付成功
打赏金额: