文档章节

21天打造分布式爬虫-简书整站爬取(十)

o
 osc_1ee7cxmx
发布于 2018/08/06 17:19
字数 620
阅读 0
收藏 0

精选30+云产品,助力企业轻松上云!>>>

10.1.简书整站爬虫

创建项目

scrapy startproject jianshu

scrapy genspider -t crawl jianshu_spider "jianshu.com"

jianshu_spider.py

# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from jianshu.items import JianshuItem

class JianshuSpiderSpider(CrawlSpider):
    """Crawl jianshu.com and yield one ``JianshuItem`` per article page.

    Starting from the home page, every link matching the article URL
    pattern in ``rules`` is followed and handed to ``parse_detail``.
    """

    name = 'jianshu_spider'
    allowed_domains = ['jianshu.com']
    start_urls = ['http://jianshu.com/']

    rules = (
        # Article links have the form /p/<slug>. The slug is matched as
        # exactly 12 characters of [0-9a-z]; the previous pattern used
        # ``[12]`` (a character class matching "1" or "2") where the
        # ``{12}`` quantifier was intended.
        Rule(LinkExtractor(allow=r'.*/p/[0-9a-z]{12}.*'), callback='parse_detail', follow=True),
    )

    def parse_detail(self, response):
        """Extract the article fields from a detail page.

        Args:
            response: the response for an article URL matched by ``rules``.

        Yields:
            A single ``JianshuItem``. Fields whose node is missing from the
            page are set to ``None`` (``subjects`` becomes an empty string).
        """
        title = response.xpath("//h1[@class='title']/text()").get()
        avatar = response.xpath("//a[@class='avatar']/img/@src").get()
        author = response.xpath("//span[@class='name']/a/text()").get()

        # ``get()`` returns None when the node is absent, so only strip the
        # "*" marker when a publish time was actually found.
        pub_time = response.xpath("//span[@class='publish-time']/text()").get()
        if pub_time is not None:
            pub_time = pub_time.replace("*", "")

        # Article id: last path segment of the URL, query string removed.
        url = response.url
        article_id = url.split("?")[0].split("/")[-1]

        # Article body is stored as HTML (tags included), not as plain text.
        content = response.xpath("//div[@class='show-content']").get()
        word_count = response.xpath("//span[@class='wordage']/text()").get()
        comment_count = response.xpath("//span[@class='comments-count']/text()").get()
        read_count = response.xpath("//span[@class='views-count']/text()").get()
        like_count = response.xpath("//span[@class='likes-count']/text()").get()
        # All collection names on the page, joined into one comma-separated string.
        subjects = ",".join(response.xpath("//div[@class='include-collection']/a/div/text()").getall())

        item = JianshuItem(
            title=title,
            avatar=avatar,
            pub_time=pub_time,
            author=author,
            origin_url=response.url,
            content=content,
            article_id=article_id,
            subjects=subjects,
            word_count=word_count,
            comment_count=comment_count,
            like_count=like_count,
            read_count=read_count
        )
        yield item

items.py

import scrapy

class JianshuItem(scrapy.Item):
    """Container for the fields scraped from one Jianshu article page.

    The field descriptions below reflect how the spider's ``parse_detail``
    fills each field.
    """

    # Text of the <h1 class="title"> element.
    title = scrapy.Field()
    # HTML of the "show-content" div (markup included, not plain text).
    content = scrapy.Field()
    # Last path segment of the article URL, without the query string.
    article_id = scrapy.Field()
    # URL of the response the item was scraped from.
    origin_url = scrapy.Field()
    # Text of the link inside the "name" span.
    author = scrapy.Field()
    # ``src`` of the image inside the "avatar" link.
    avatar = scrapy.Field()
    # Text of the "publish-time" span with "*" characters removed.
    pub_time = scrapy.Field()
    # Text of the "views-count" span.
    read_count = scrapy.Field()
    # Text of the "likes-count" span.
    like_count = scrapy.Field()
    # Text of the "wordage" span.
    word_count = scrapy.Field()
    # Comma-separated texts found under the "include-collection" div.
    subjects = scrapy.Field()
    # Text of the "comments-count" span.
    comment_count = scrapy.Field()

pipelines.py

# -*- coding: utf-8 -*-
# import pymysql
#
# class JianshuPipeline(object):
#     def __init__(self):
#         dbparams = {
#             'host': '127.0.0.1',
#             'port': 3306,
#             'user': 'root',
#             'password': '123456',
#             'database': 'jianshu',
#             'charset': 'utf8'
#         }
#         self.conn = pymysql.connect(**dbparams)
#         self.cursor = self.conn.cursor()
#         self._sql = None
#
#     def process_item(self, item, spider):
#         self.cursor.execute(self.sql, (item['title'], item['content'],
#                                        item['author'], item['avatar'], item['pub_time'], item['article_id'],
#                                        item['origin_url'],item['like_count'],item['word_count'],item['subjects'],item['comment_count'],item['read_count']))
#         self.conn.commit()
#         return item
#
#     @property
#     def sql(self):
#         if not self._sql:
#             self._sql = """
#                 insert into article(id,title,content,author,avatar,pub_time,
#                 article_id,origin_url,like_count,word_count,subjects,comment_count,read_count) values(null,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
#             """
#             return self._sql
#         return self._sql


# 采用twisted异步保存到mysql

import pymysql
from twisted.enterprise import adbapi
from pymysql import cursors


class JianshuTwistedPipeline(object):
    """Item pipeline that inserts items into MySQL via a Twisted adbapi pool.

    Each item is handed to ``dbpool.runInteraction`` so the insert is
    scheduled through the connection pool instead of being executed inline
    in ``process_item``.
    """

    def __init__(self):
        # NOTE(review): connection settings (including credentials) are
        # hard-coded here; consider reading them from the project settings.
        dbparams = {
            'host': '127.0.0.1',
            'port': 3306,
            'user': 'root',
            'password': '123456',
            'database': 'jianshu',
            'charset': 'utf8',
            'cursorclass': cursors.DictCursor
        }
        self.dbpool = adbapi.ConnectionPool("pymysql", **dbparams)
        self._sql = None

    @property
    def sql(self):
        """Return the INSERT statement, building it on first access."""
        if not self._sql:
            self._sql = """
                            insert into article(id,title,content,author,avatar,pub_time,
                            article_id,origin_url,like_count,word_count,subjects,comment_count,read_count) values(null,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
                        """
        return self._sql

    def process_item(self, item, spider):
        """Schedule the insert for ``item`` and pass the item on.

        Args:
            item: mapping holding the article fields read by ``insert_item``.
            spider: the spider that produced the item; forwarded to the
                error handler for logging.

        Returns:
            The same ``item``, so that later pipeline stages still receive it.
        """
        defer = self.dbpool.runInteraction(self.insert_item, item)
        defer.addErrback(self.handle_error, item, spider)
        # A pipeline stage must hand the item back; returning None would
        # pass None to every stage that runs after this one.
        return item

    def insert_item(self, cursor, item):
        """Execute the INSERT for ``item`` on the cursor supplied by the pool."""
        cursor.execute(self.sql, (item['title'], item['content'],
                                       item['author'], item['avatar'], item['pub_time'], item['article_id'],
                                       item['origin_url'],item['like_count'],item['word_count'],item['subjects'],item['comment_count'],item['read_count']))

    def handle_error(self, error, item, spider):
        """Log a failed insert instead of silently discarding it.

        Args:
            error: the failure passed to the errback.
            item: the item whose insert failed.
            spider: the spider whose ``logger`` is used for reporting.
        """
        spider.logger.error("MySQL insert failed for %s: %s", item.get('origin_url'), error)

middlewares.py

# -*- coding: utf-8 -*-

from selenium import webdriver
import time
from scrapy.http.response.html import HtmlResponse


class SeleniumDownloadMiddleware(object):
    """Downloader middleware that loads every request in a Selenium Chrome.

    The page is opened in the browser, every "show-more" element is clicked
    until none can be clicked any more, and the resulting page source is
    returned as the response.
    """

    def __init__(self):
        # NOTE(review): nothing in this class quits the browser; confirm it
        # is closed elsewhere when the crawl ends.
        self.driver = webdriver.Chrome()

    def process_request(self, request, spider):
        """Render ``request.url`` in the browser and return its HTML.

        Args:
            request: the request whose URL is loaded.
            spider: the spider issuing the request (unused).

        Returns:
            An ``HtmlResponse`` built from the browser's current URL and
            page source.
        """
        # Local import: selenium is already a dependency of this module.
        from selenium.common.exceptions import WebDriverException

        self.driver.get(request.url)
        time.sleep(1)
        try:
            # Keep clicking "show-more" until the lookup or the click fails;
            # that failure is the only way out of the loop.
            while True:
                showmore = self.driver.find_element_by_class_name('show-more')
                showmore.click()
                time.sleep(0.5)
        except WebDriverException:
            # Only driver errors (missing or unclickable element) end the
            # expansion; KeyboardInterrupt/SystemExit are no longer swallowed.
            pass
        source = self.driver.page_source
        response = HtmlResponse(url=self.driver.current_url,body=source,request=request,encoding='utf-8')
        return response

settings.py

# Do not consult robots.txt before crawling.
ROBOTSTXT_OBEY = False

# Delay (in seconds) between consecutive downloads.
DOWNLOAD_DELAY = 1

# Headers sent with every request unless a request overrides them.
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/64.0.3282.140 Safari/537.36'
    ),
}

# Route all downloads through the Selenium-based middleware.
DOWNLOADER_MIDDLEWARES = {
    'jianshu.middlewares.SeleniumDownloadMiddleware': 543,
}

# Only the Twisted (asynchronous) MySQL pipeline is enabled.
ITEM_PIPELINES = {
    # 'jianshu.pipelines.JianshuPipeline': 300,
    'jianshu.pipelines.JianshuTwistedPipeline': 1,
}

start.py

from scrapy import cmdline

# Run the spider as if `scrapy crawl jianshu_spider` had been typed in a shell.
cmdline.execute(["scrapy", "crawl", "jianshu_spider"])
o
粉丝 0
博文 500
码字总数 0
作品 0
私信 提问
加载中
请先登录后再评论。

暂无文章

访问者模式Visitor

一 概述 场景:通常来说,用于封装数据所用到的pojo类,其只包含get、set,对应的业务逻辑是在Service上完成的;但如果出现多个pojo类都共用一套逻辑时,则应该考虑将逻辑进行抽象,不同类型...

小明不觉小
7分钟前
0
0
jQuery Ajax错误处理,显示自定义异常消息 - jQuery Ajax error handling, show custom exception messages

问题: Is there some way I can show custom exception messages as an alert in my jQuery AJAX error message? 有没有什么方法可以在我的jQuery AJAX错误消息中显示自定义异常消息作为警报...

法国红酒甜
13分钟前
18
0
告别传统机房:3D 机房数据可视化实现智能化与VR技术的新碰撞

前言 随着各行业对计算机依赖性的日益提高,计算机信息系统的发展使得作为其网络设备、主机服务器、数据存储设备、网络安全设备等核心设备存放地的计算机机房日益显现出它的重要地位,而机房...

xhload3d
昨天
19
0
如何使用.css()应用!important? - How to apply !important using .css()?

问题: I am having trouble applying a style that is !important . 我在应用!important样式时遇到麻烦。 I've tried: 我试过了: $("#elem").css("width", "100px !important"); This doe......

富含淀粉
昨天
5
0
spring源码解析-xml配置文件读取

整个 XML配置文件读取的大致流程如下: 通过继承自AbstractBeanDefinitionReader中的方法,来使用ResourLoader将资源文件路径转换为对应的Resource文件(读取资源文件并将其转为Resource) ...

wc_飞豆
昨天
16
0

没有更多内容

加载失败,请刷新页面

加载更多

返回顶部
顶部