基于 WebMagic 爬取网页数据

原创
2017/04/14 17:24
阅读数 194

概述:

       WebMagic 是一个开源的 Java 爬虫框架,详见官网 http://webmagic.io/ 。

       本篇博客介绍如何使用 WebMagic 爬取码云"最新推荐"页面中的项目信息,网址:https://git.oschina.net/explore/recommend

代码

package leap.crawler;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.JsonFilePipeline;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.Selectable;

import java.util.List;

/**
 * Crawls the "recommended projects" listing at
 * {@code https://git.oschina.net/explore/recommend}.
 *
 * <p>For every project card found on a page, the title, language, watch count,
 * star (collection) count, fork count and description are extracted and stored
 * on the page under the keys {@code data0}, {@code data1}, ... Links inside the
 * pagination menu are queued so that the following pages are crawled as well.
 *
 * <p>Created by FromX on 2017/4/14.
 */
public class OschinaCrawler implements PageProcessor {

    /** Value stored for a field whose XPath matched nothing on the card. */
    private static final String BLANK = " ";

    /** Output directory used when none is supplied on the command line. */
    private static final String DEFAULT_OUTPUT_DIR = "D:\\webmagic\\";

    /** Selects the content container of every project card on the listing page. */
    private static final String ITEM_XPATH = "//div[@class=\"item\"]//div[@class='content']";

    /** Selects the pagination menu whose links lead to the other listing pages. */
    private static final String PAGINATION_XPATH = "//div[@class=\"ui tiny pagination menu\"]";

    // XPath expressions evaluated against a single project card.
    private static final String TITLE_XPATH = "//div[@class='project-title']/a/text()";
    private static final String LANGUAGE_XPATH =
            "//div[@class='project-title']//div[@class='ui small label lang-label']/a/text()";
    private static final String ATTENTION_XPATH =
            "//div[@class='project-title']//div[@class='pull-right']//a[@title='关注数']//span/text()";
    private static final String COLLECTION_XPATH =
            "//div[@class='project-title']//div[@class='pull-right']//a[@title='收藏数']//span/text()";
    private static final String FORK_XPATH =
            "//div[@class='project-title']//div[@class='pull-right']//a[@title='fork数']//span/text()";
    private static final String DESC_XPATH = "//div[@class='project-desc']/text()";

    /** Crawl settings: retry count, sleep time between requests and request timeout. */
    private final Site site = Site.me().setRetryTimes(3).setSleepTime(1000).setTimeOut(10000);

    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Extracts one record per project card and queues the pagination links.
     *
     * @param page the downloaded listing page
     */
    @Override
    public void process(Page page) {
        // Each entry is the HTML of one project card (title, author, description, counters).
        List<String> contents = page.getHtml().xpath(ITEM_XPATH).all();

        int index = 0;
        for (String content : contents) {
            // Re-parse the card on its own so the XPaths below only see this card.
            Html html = new Html(content);

            String title = textOrBlank(html, TITLE_XPATH);
            String language = textOrBlank(html, LANGUAGE_XPATH);
            String attention = textOrBlank(html, ATTENTION_XPATH);
            String collection = textOrBlank(html, COLLECTION_XPATH);
            String fork = textOrBlank(html, FORK_XPATH);
            String desc = textOrBlank(html, DESC_XPATH);

            // NOTE: the record keeps the original hand-built, JSON-like layout
            // (values are not quoted or escaped), so existing consumers see the same text.
            page.putField("data" + index, "{'title':" + title + ",'language':" + language
                    + ",'attention':" + attention + ",'collection':" + collection
                    + ",'fork':" + fork + ",'desc':" + desc + "}");
            index++;
        }

        // Discover follow-up URLs from the pagination menu.
        page.addTargetRequests(page.getHtml().xpath(PAGINATION_XPATH).links().all());
    }

    /**
     * Returns the first text matched by {@code xpath} inside {@code html}, or
     * {@link #BLANK} when nothing matched.
     *
     * <p>The previous implementation tested {@code selectable.equals(null)}, which is
     * always {@code false}; a missing element therefore ended up as the text "null"
     * in the output instead of the intended blank.
     */
    private static String textOrBlank(Html html, String xpath) {
        String value = html.xpath(xpath).toString();
        return value == null ? BLANK : value;
    }

    /**
     * Starts the crawler.
     *
     * @param args optional; {@code args[0]} is the directory the JSON files are
     *             written to (defaults to {@code D:\webmagic\})
     */
    public static void main(String[] args) {
        String outputDir = args.length > 0 ? args[0] : DEFAULT_OUTPUT_DIR;

        Spider.create(new OschinaCrawler())
                // Start crawling from the recommended-projects listing.
                .addUrl("https://git.oschina.net/explore/recommend")
                .addPipeline(new JsonFilePipeline(outputDir))
                // Crawl with 10 threads.
                .thread(10)
                // Run the spider (blocks until the crawl finishes).
                .run();
    }

}

maven 依赖

<!-- WebMagic crawler dependencies: the core module plus the extension module, both at 0.6.1 -->
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-core</artifactId>
            <version>0.6.1</version>
        </dependency>
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-extension</artifactId>
            <version>0.6.1</version>
        </dependency>

 

展开阅读全文
打赏
0
1 收藏
分享
加载中
更多评论
打赏
0 评论
1 收藏
0
分享
返回顶部
顶部