nodejs抓取https://themeforest.net网页

原创
2016/11/24 02:34
阅读数 564

使用 Node.js 抓取 https://themeforest.net 的网页模板预览信息（标题、链接、预览图），并通过 mongoose 存入 MongoDB。

以后找模版就不用那么费劲了,我是太懒了。。。

执行图片:

代码:

var request = require('request');
var cheerio = require('cheerio');
var mongoose = require('mongoose');

// Connect to the local MongoDB database named "theme".
mongoose.connect('mongodb://localhost/theme');

// One document per scraped listing entry: link text, link URL and
// preview-image URL (see analysisPage for where each value comes from).
const Schema = mongoose.Schema;
const Theme = new Schema({
    text: String,
    url: String,
    img: String
});
const ThemeModel = mongoose.model('Themes', Theme);

// Prefix prepended to every scraped href before it is stored.
const urlPrefix = 'https://themeforest.net/';
// Listing URL; the page number is appended directly after "page=".
const baseUrl = 'https://themeforest.net/category/site-templates/creative?page=';
// Total number of listing pages to crawl.
const countPage = 60;
// Page currently being crawled (1-based).
let currentPage = 1;

// Per-page progress counters: items found on the current page, and how many
// of them have finished saving so far.
let itemCount = 0;
let itemLoad = 0;

/**
 * Download one listing page and pass its HTML to analysisPage().
 *
 * When `page` is past `countPage` the crawl is finished and the process
 * exits with code 0. If the request fails (transport error or a non-200
 * status) the page is logged and skipped so that the crawl keeps moving
 * instead of hanging forever with the database connection still open.
 *
 * @param {number} page - 1-based page number appended to `baseUrl`.
 */
function getPageList(page) {
    if (page > countPage) {
        console.log('所有数据加载完毕!');
        process.exit(0);
        // Never reached in a real process; keeps the function well-behaved
        // if process.exit is stubbed.
        return;
    }
    console.log(`当前加载第 ${page} 页数据`);
    request(baseUrl + page, function(err, response, body) {
        if (!err && response.statusCode === 200) {
            analysisPage(body);
            return;
        }
        // `response` may be undefined when `err` is set, so only read the
        // status code when there was no transport error.
        var reason = err || ('HTTP ' + response.statusCode);
        console.log('get page error url => ' + baseUrl + page, reason);
        // Nothing else would request the next page after a failure, so
        // advance the shared page cursor here and continue with the crawl.
        currentPage = page + 1;
        getPageList(currentPage);
    });
}
// Kick off the crawl from the first page.
getPageList(currentPage);

/**
 * Parse one listing page and queue every entry on it for saving.
 *
 * Sets `itemCount` to the number of entries found so that saveFile() can
 * tell when the whole page has been stored. If the page contains no
 * entries, no save callback would ever fire, so the next page is requested
 * directly from here.
 *
 * @param {string} body - Raw HTML of a listing page.
 */
function analysisPage(body) {
    const $ = cheerio.load(body);
    // Each list entry is wrapped in this container element.
    const items = $('.js-google-analytics__list-event-container');
    itemCount = items.length;

    if (itemCount === 0) {
        // Without this the crawl would stall: advancing to the next page is
        // otherwise only triggered from a save callback.
        console.log('no items found on page ' + currentPage + ', skipping');
        getPageList(++currentPage);
        return;
    }

    // .each (not .map): the callback is run purely for its side effect.
    items.each(function(i, element) {
        const item = $(element);
        const linkA = item.find('.js-google-analytics__list-event-trigger.t-link');

        const text = linkA.text();
        const url = linkA.attr('href');
        const img = item.find('.landscape-image-magnifier').attr('data-preview-url');
        saveFile(
            urlPrefix + url,
            text,
            img
        );
    });
}

/**
 * Persist one scraped entry and, once every entry of the current page has
 * been handled, request the next page.
 *
 * A failed save is logged and still counted as handled. Otherwise a single
 * failure would keep `itemLoad` from ever reaching `itemCount` and the crawl
 * would stall on that page.
 *
 * @param {string} url - Link to the template's page.
 * @param {string} text - Link text of the entry.
 * @param {string} img - Preview image URL.
 */
function saveFile(url, text, img) {
    const theme = new ThemeModel({
        url: url,
        text: text,
        img: img
    });
    theme.save(function(err) {
        if (err) {
            console.log('save mongoose err! ', err);
        }
        // Count the item whether or not the save succeeded.
        itemLoad++;
        if (itemLoad === itemCount) {
            // Whole page handled: reset the counter and move to the next page.
            itemLoad = 0;
            getPageList(++currentPage);
        }
    });
}

package.json

{
  "name": "theme",
  "version": "1.0.0",
  "description": "",
  "main": "index.js",
  "scripts": {
    "test": "echo \"Error: no test specified\" && exit 1"
  },
  "keywords": [],
  "author": "",
  "license": "ISC",
  "dependencies": {
    "cheerio": "^0.22.0",
    "mongoose": "^4.7.0",
    "request": "^2.79.0"
  }
}

更新一个前端查看页面:

const express = require('express');
const mongoose = require('mongoose');

const app = express();
const Schema = mongoose.Schema;

// Same document shape as the one written by the crawler script.
const Theme = new Schema({
    text: String,
    url: String,
    img: String
});
const ThemeModel = mongoose.model('Themes', Theme);

// Page skeleton; the "%{body}" placeholder is replaced with the generated
// markup by the route handler.
const html = `
<!DOCTYPE html>
<html lang="en">
<head>
	<meta charset="UTF-8">
	<title>themes</title>
</head>
<body>
%{body}
</body>
</html>
`;

mongoose.connect('mongodb://localhost/theme');

/**
 * Escape a value for safe use in HTML text and double-quoted attributes.
 * The stored values were scraped from a third-party site and must be
 * treated as untrusted.
 *
 * @param {*} value - Value to escape; null/undefined become an empty string.
 * @returns {string} The escaped string.
 */
function escapeHtml(value) {
    var replacements = {
        '&': '&amp;',
        '<': '&lt;',
        '>': '&gt;',
        '"': '&quot;',
        "'": '&#39;'
    };
    return String(value == null ? '' : value).replace(/[&<>"']/g, function(ch) {
        return replacements[ch];
    });
}

/**
 * GET / - render every stored theme as a preview image followed by a link
 * that opens the theme's page in a new tab.
 */
app.get('/', function(req, res) {
    ThemeModel.find({}, function(err, items) {
        if (err) {
            // Without this guard `items` is undefined and the handler would
            // throw instead of answering the request.
            console.log('find themes err! ', err);
            res.status(500).send('Internal Server Error');
            return;
        }

        var rows = items.map(function(item) {
            return [
                `<img src="${escapeHtml(item.img)}" />`,
                `<p><a href="${escapeHtml(item.url)}" target="_blank">${escapeHtml(item.text)}</a></p>`
            ].join('');
        });
        var body = '<div>' + rows.join('') + '</div>';

        res.set('Content-Type', 'text/html');
        // A function replacer inserts `body` verbatim; a plain string would
        // have sequences such as "$&" or "$'" interpreted as patterns.
        res.send(html.replace('%{body}', function() {
            return body;
        }));
    });
});

// Start the HTTP server on port 3000 and report the bound address once it
// is listening.
const server = app.listen(3000, () => {
    const bound = server.address();

    console.log('Example app listening at http://%s:%s', bound.address, bound.port);
});
展开阅读全文
打赏
0
1 收藏
分享
加载中
更多评论
打赏
0 评论
1 收藏
0
分享
返回顶部
顶部