webmagic0.6.0抓取aliyun博客
博客专区 > Mr_Damen 的博客 > 博客详情
webmagic0.6.0抓取aliyun博客
Mr_Damen 发表于11个月前
webmagic0.6.0抓取aliyun博客
  • 发表于 11个月前
  • 阅读 94
  • 收藏 1
  • 点赞 0
  • 评论 0

标题:腾讯云 新注册用户域名抢购1元起>>>   

抓取网站:https://yq.aliyun.com/articles

lib包:https://github.com/code4craft/webmagic/releases/download/WebMagic-0.6.0/webmagic-0.6.0-all.tar.gz

代码

import java.util.List;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.JsonFilePipeline;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Selectable;

/**
 * WebMagic {@link PageProcessor} that crawls the Aliyun Yunqi community blog listing.
 *
 * <p>Starting from {@link #URL_START}, list pages are scanned for pagination links and
 * article links; each article page is scanned for tag links and has its metadata and
 * body extracted into the page's result fields.
 */
public class CrawlAliyun implements PageProcessor {

    /** Entry (seed) URL of the crawl. */
    public static final String URL_START = "https://yq.aliyun.com/articles/type_all";
    /** Pattern of paginated list pages. */
    public static final String URL_PAGE = "https://yq.aliyun.com/articles/type_all-order_createtime-page_[0-9]+";
    /** Pattern of tag pages. */
    public static final String URL_TAGS = "https://yq.aliyun.com/tags/type_blog-tagid_[0-9]+";
    /** Pattern of article (blog post) pages. */
    public static final String URL_CONTENT = "https://yq.aliyun.com/articles/[0-9]+";

    /** Output directory used by {@link #main(String[])} when no argument is supplied. */
    private static final String DEFAULT_OUTPUT_DIR = "F:\\webmagic\\";

    // Part 1: crawl configuration (retry count, delay between requests, charset, user agent).
    // The user agent is set once here rather than on every getSite() call.
    private final Site site = Site.me()
            .setRetryTimes(3)
            .setSleepTime(1000)
            .setCharset("utf-8")
            .setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");

    /**
     * Core extraction hook: decides from the page URL whether this is a list page
     * and/or an article page and delegates to the matching handler.
     *
     * @param page the downloaded page to process
     */
    @Override
    public void process(Page page) {
        // Part 2: define how links and fields are extracted from each kind of page.
        if (page.getUrl().regex(URL_PAGE).match() || page.getUrl().regex(URL_START).match()) {
            processListPage(page);
        }
        if (page.getUrl().regex(URL_CONTENT).match()) {
            processArticlePage(page);
        }
    }

    /** Queues pagination links and article links found in the blog list section. */
    private void processListPage(Page page) {
        System.out.println("=====pageurl=====" + page.getUrl());
        Selectable listSection = page.getHtml().xpath("//section[@class='yq-new-list yq-n-l-blog']");

        List<String> pageUrls = listSection.links().regex(URL_PAGE).all();
        page.addTargetRequests(pageUrls);

        List<String> articleUrls = listSection.links().regex(URL_CONTENT).all();
        page.addTargetRequests(articleUrls);
    }

    /** Queues tag links of an article page and stores the article's fields. */
    private void processArticlePage(Page page) {
        System.out.println("=====arturl=====" + page.getUrl());
        Selectable html = page.getHtml();

        Selectable tagSection = html.xpath("//p[@class='blog-tags']");
        List<String> tagUrls = tagSection.links().regex(URL_TAGS).all();
        page.addTargetRequests(tagUrls);

        page.putField("url", page.getUrl().toString());
        page.putField("title", html.xpath("//h2[@class='blog-title']/text()").toString());
        page.putField("author", html.xpath("//a[@class='b-author']/text()").toString());
        page.putField("authorUrl", html.xpath("//a[@class='b-author']").$("a", "href").toString());
        page.putField("createtime", html.xpath("//span[@class='b-time']/text()").toString());

        // The selector yields null when the view-count span is absent; guard before
        // stripping the label so a missing span does not abort the whole page.
        String watched = html.xpath("//span[@class='b-watch']/text()").toString();
        page.putField("watched", watched == null ? null : watched.replace("浏览", ""));

        page.putField("tags", html.xpath("//p[@class='blog-tags']/a/text()").all());
        page.putField("summary", html.xpath("//p[@class='blog-summary']/text()").toString());
        page.putField("content", html.xpath("//div[@class='content-detail']/html()").toString());
    }

    /**
     * Returns the crawl configuration used by the spider.
     *
     * @return the site settings for this processor
     */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Runs the crawler.
     *
     * @param args optional; {@code args[0]} is the directory the JSON results are
     *             written to, defaulting to {@code F:\webmagic\} when omitted
     */
    public static void main(String[] args) {
        String outputDir = (args != null && args.length > 0) ? args[0] : DEFAULT_OUTPUT_DIR;
        Spider.create(new CrawlAliyun())
                // Start crawling from the blog list entry page.
                .addUrl(URL_START)
                .addPipeline(new JsonFilePipeline(outputDir))
                // Crawl with 5 threads.
                .thread(5)
                // Start the spider (blocks until the crawl finishes).
                .run();
    }
}

结果

标签: webmagic
共有 人打赏支持
粉丝 4
博文 35
码字总数 6809
×
Mr_Damen
如果觉得我的文章对您有用,请随意打赏。您的支持将鼓励我继续创作!
* 金额(元)
¥1 ¥5 ¥10 ¥20 其他金额
打赏人
留言
* 支付类型
微信扫码支付
打赏金额:
已支付成功
打赏金额: