
【Scrapy】 Scraping OSChina Job Listings

Environment:

python 2.7
scrapy 1.3.0
sqlalchemy
Navicat
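
The four files below live in a Scrapy project named miao (that is what the import paths suggest). Assuming that layout and a reachable MySQL instance, the crawl is started from the project root with:

scrapy crawl job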

JobSpider.py

# -*- coding:utf-8 -*-
from scrapy.spiders import Spider
from scrapy.selector import Selector
from scrapy.http import Request
from miao.items import JobDetail, CompanyDetail
import sys
reload(sys)
sys.setdefaultencoding('utf-8')  # Python 2 workaround so UTF-8 item fields don't raise UnicodeDecodeError

class JobSpider(Spider):
	name = 'job'
	# start_urls = ['https://job.oschina.net/search?type=%E8%81%8C%E4%BD%8D%E6%90%9C%E7%B4%A2&key=web%E5%89%8D%E7%AB%AF&exp=0&edu=0&nat=1&city=%E5%85%A8%E5%9B%BD&p=1']
	# search keywords; the Chinese entries are the site's own category names
	keys = ['Java', 'Python', 'Hadoop', '自然语言处理', '搜索算法', '全栈工程师', '数据挖掘', '后端其他', 'web前端',
			'HTML5', 'JavaScript', '前端开发工程师', '前端其他'
	]

	def start_requests(self):
		# one search request per keyword for result pages 1..11
		for k in self.keys:
			for i in range(1, 12):
				url = 'https://job.oschina.net/search?type=职位搜索&key=%s&exp=0&edu=0&nat=1&city=全国&p=%s' % (k, i)
				yield Request(url=url, callback=self.parse)
	def parse(self, response):
		sel = Selector(response)
		# each result block carries one job link and the matching company link
		list_jobdivs = sel.xpath('//div[@class="box clear layout"]')
		for div in list_jobdivs:
			job_url = div.xpath('./div[@class="flex-item-6 "]/div[@class="layout"]/div[@class="layout-left title"]/a/@href').extract()[0]
			company_url = div.xpath('./div[@class="flex-item-6 com-info"]/div[@class="layout clear"]/div[@class="layout-column"]/div[@class="layout"]/a/@href').extract()[0]
			yield Request(url=job_url, callback=self.parse_job)
			yield Request(url=company_url, callback=self.parse_company)

	def parse_company(self, response):
		sel = Selector(response)
		item = CompanyDetail()
		item['companyurl'] = response.url  # company URL
		title = sel.xpath("//small/text()").extract()[0]
		item['companyname'] = title  # company name
		companydescs = sel.xpath("//div[@class='panel-body']")[0].re(r'<p>(.*?)</p>')
		companydesc = ''.join(companydescs).replace("<br>", "")
		item['companydesc'] = companydesc  # company description
		# the fields below (company size, official site, funding stage) are left disabled
		# companyguimo = sel.xpath("//div[@class='col-xs-7']/ul[@class='lists text']/li")[1].xpath("./span[@class='size']/text()").extract()[0]
		# item['companyguimo'] = companyguimo  # company size
		# companyguanwang = sel.xpath("//div[@class='col-xs-7']/ul[@class='lists text']/li")[2].xpath("./span[@class='page']/a/@href").extract()[0]
		# item['companyguanwang'] = companyguanwang  # official website
		# companyjieduan = sel.xpath("//div[@class='col-xs-7']/ul[@class='lists text']/li")[3].xpath("./span[@class='stage']").extract()[0]
		# item['companyjieduan'] = companyjieduan  # funding stage
		yield item
	def parse_job(self, response):
		sel = Selector(response)
		item = JobDetail()
		item['joburl'] = response.url  # job URL
		company = sel.xpath("//div[@class='col-xs-12']/h3[@class='text-left']/strong/a/@title").extract()[0]
		item['jobcompany'] = company  # company name
		title = sel.xpath("//h1/@title").extract()[0]
		item['jobcontent'] = title  # job title
		jobmoney = sel.xpath("//div[@class='left']/div[@class='basic']/div[@class='clearfix row lh-md']/div[@class='col-xs-9']/div/b/text()").extract()[0]
		item['jobmoney'] = jobmoney  # salary
		locations = sel.xpath("//div[@class='left']/div[@class='basic']/div[@class='clearfix row lh-md']/div[@class='col-xs-9']/div/a")
		jobneed = ''
		for l in locations:
			jobneed = jobneed + l.xpath("./text()").extract()[0].strip() + '/'
		item['jobneed'] = jobneed  # job requirements: location / education / experience
		skillneed = ''
		skills = sel.xpath("//div[@class='left']/div[@class='basic']/div[@class='clearfix row lh-md']/div[@class='col-xs-9']/div/span[@id='ex-position-skills']/a")
		for s in skills:
			skillneed = skillneed + s.xpath("./text()").extract()[0].strip() + '/'
		skillneed = skillneed[:-1]  # drop the trailing '/'
		item['skillneed'] = skillneed  # required skills
		pubtime = sel.xpath("//div[@class='left']/div[@class='basic']/p/text()").extract()[0]
		item['pubtime'] = pubtime  # publish time
		jobdescs = sel.xpath("//div[@class='panel']/div[@class='panel-body']/div[@class='position-description']")[0].re(r'<p>(.*?)</p>')
		jobdesc = ''.join(jobdescs).replace('<br>', "")
		item['jobdesc'] = jobdesc  # job description
		yield item
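
One detail worth a note: start_requests embeds the Chinese query parameters (职位搜索, 全国) in the URL as raw UTF-8 and lets Scrapy percent-encode them when the Request is built. Under Python 2 it can be clearer to encode them up front; a minimal sketch using the standard-library urllib.quote (the variable names here are only illustrative):

# -*- coding:utf-8 -*-
import urllib

key = urllib.quote('web前端')  # -> 'web%E5%89%8D%E7%AB%AF'
url = 'https://job.oschina.net/search?type=%s&key=%s&exp=0&edu=0&nat=1&city=%s&p=1' % (
	urllib.quote('职位搜索'), key, urllib.quote('全国'))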

pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
from sqlalchemy import create_engine, Column, Integer, String
from sqlalchemy.orm import sessionmaker
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.dialects.mysql import LONGTEXT
from items import JobDetail, CompanyDetail

engine = create_engine('mysql+mysqldb://root:1234@127.0.0.1:3306/iproxypool?charset=utf8')
DBSession = sessionmaker(bind=engine)
Base = declarative_base()

class Job(Base):
	__tablename__ = 'job'

	id = Column(Integer, primary_key=True, autoincrement=True)
	joburl = Column(String(255))
	jobcompany = Column(String(255))
	jobcontent = Column(String(255))
	jobmoney = Column(String(255))
	jobneed = Column(String(255))
	skillneed = Column(String(255))
	pubtime = Column(String(255))
	jobdesc = Column(LONGTEXT)

class Company(Base):
	__tablename__ = "company"

	id = Column(Integer, primary_key=True, autoincrement=True)
	companyurl = Column(String(255))
	companyname = Column(String(255))
	companydesc = Column(LONGTEXT)

# create both tables if they do not exist yet (a no-op once they do)
Base.metadata.create_all(engine)

class JobPipeline(object):

	def open_spider(self, spider):
		self.session = DBSession()

	def close_spider(self, spider):
		self.session.close()

	def process_item(self, item, spider):
		if isinstance(item, JobDetail):
			# deduplicate on the job URL: update the existing row, otherwise insert
			isexists = self.session.query(Job).filter(Job.joburl == item['joburl']).all()
			if isexists:
				self.session.query(Job).filter(Job.joburl == item['joburl']).update({
					Job.jobcompany: item['jobcompany'],
					Job.jobcontent: item['jobcontent'],
					Job.jobmoney: item['jobmoney'],
					Job.jobneed: item['jobneed'],
					Job.skillneed: item['skillneed'],
					Job.pubtime: item['pubtime'],
					Job.jobdesc: item['jobdesc']})
			else:
				self.session.add(Job(joburl=item['joburl'],
					jobcompany=item['jobcompany'],
					jobcontent=item['jobcontent'],
					jobmoney=item['jobmoney'],
					jobneed=item['jobneed'],
					skillneed=item['skillneed'],
					pubtime=item['pubtime'],
					jobdesc=item['jobdesc']))
			self.session.commit()
		else:
			# deduplicate companies on the company name
			# (the disabled fields -- size, official site, funding stage -- would slot in here)
			isexists = self.session.query(Company).filter(Company.companyname == item['companyname']).all()
			if isexists:
				self.session.query(Company).filter(Company.companyname == item['companyname']).update({
					Company.companyurl: item['companyurl'],
					Company.companydesc: item['companydesc']})
			else:
				self.session.add(Company(companyurl=item['companyurl'],
					companyname=item['companyname'],
					companydesc=item['companydesc']))
			self.session.commit()
		return item
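
After a crawl, the stored rows can be sanity-checked with the same ORM models. A small hypothetical script (reusing DBSession, Job and Company from pipelines.py; run it from the package directory so the implicit Python 2 import resolves):

# check_db.py -- hypothetical verification script
from pipelines import DBSession, Job, Company

session = DBSession()
print session.query(Job).count(), 'job rows'
print session.query(Company).count(), 'company rows'
session.close()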

items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

from scrapy import Field,Item

class JobDetail(Item):
	joburl = Field()  # job URL
	jobcompany = Field()  # company name
	jobcontent = Field()  # job title
	jobmoney = Field()  # salary
	jobneed = Field()  # job requirements (location / education / experience)
	skillneed = Field()  # required skills
	pubtime = Field()  # publish time
	jobdesc = Field()  # job description

class CompanyDetail(Item):
	companyurl = Field()  # company URL
	companyname = Field()  # company name
	companydesc = Field()  # company description

settings.py

ITEM_PIPELINES = {
   # 'miao.pipelines.BaiduPipeline': 300,
   # 'miao.pipelines.XiciPipeline': 300,
   # 'miao.pipelines.ChinaPipeline': 300,
   'miao.pipelines.JobPipeline': 300,
}
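
Only the pipeline registration appears in the original settings. When crawling a public site like this, a small download delay is a sensible addition; these are standard Scrapy settings, not part of the original post:

DOWNLOAD_DELAY = 0.5      # throttle requests a little
# ROBOTSTXT_OBEY = True   # honor robots.txt if appropriate for your use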
