[Scrapy] Scraping OSChina Job Listings

ok嘣
Published on 2017/06/27 19:50

Environment:
python 2.7
scrapy 1.3.0
sqlalchemy (plus the MySQL-python/MySQLdb driver implied by the connection string below)
Navicat

JobSpider.py

# -*- coding:utf-8 -*-
from scrapy.spiders import Spider
from scrapy.selector import Selector
from scrapy.http import Request
from miao.items import JobDetail, CompanyDetail
import sys

# Python 2 hack so Chinese strings can be formatted without explicit encoding.
reload(sys)
sys.setdefaultencoding('utf-8')

class JobSpider(Spider):
	name = 'job'
	keys = ['Java', 'Python', 'Hadoop', '自然语言处理', '搜索算法', '全栈工程师', '数据挖掘', '后端其他', 'web前端',
			'HTML5', 'JavaScript', '前端开发工程师', '前端其他']

	def start_requests(self):
		# Walk the first 11 result pages for every search keyword.
		for k in self.keys:
			for i in range(1, 12):
				url = 'https://job.oschina.net/search?type=职位搜索&key=%s&exp=0&edu=0&nat=1&city=全国&p=%s' % (k, i)
				yield Request(url=url, callback=self.parse)

	def parse(self, response):
		sel = Selector(response)
		list_jobdivs = sel.xpath('//div[@class="box clear layout"]')
		for div in list_jobdivs:
			# Each search-result row links to a job detail page and a company page.
			job_url = div.xpath('./div[@class="flex-item-6 "]/div[@class="layout"]/div[@class="layout-left title"]/a/@href').extract()[0]
			company_url = div.xpath('./div[@class="flex-item-6 com-info"]/div[@class="layout clear"]/div[@class="layout-column"]/div[@class="layout"]/a/@href').extract()[0]
			yield Request(url=job_url, callback=self.parse_job)
			yield Request(url=company_url, callback=self.parse_company)


	def parse_company(self, response):
		sel = Selector(response)
		item = CompanyDetail()
		item['companyurl'] = response.url  # company page URL
		title = sel.xpath("//small/text()").extract()[0]
		item['companyname'] = title  # company name
		# The description is spread over <p> tags; join them and drop <br>.
		companydescs = sel.xpath("//div[@class='panel-body']")[0].re(r'<p>(.*?)</p>')
		companydesc = ''.join(companydescs).replace("<br>", "")
		item['companydesc'] = companydesc  # company description
		yield item

	def parse_job(self, response):
		sel = Selector(response)
		item = JobDetail()
		item['joburl'] = response.url  # job page URL
		company = sel.xpath("//div[@class='col-xs-12']/h3[@class='text-left']/strong/a/@title").extract()[0]
		item['jobcompany'] = company  # company name
		title = sel.xpath("//h1/@title").extract()[0]
		item['jobcontent'] = title  # job title
		jobmoney = sel.xpath("//div[@class='left']/div[@class='basic']/div[@class='clearfix row lh-md']/div[@class='col-xs-9']/div/b/text()").extract()[0]
		item['jobmoney'] = jobmoney  # salary
		# Location, education and experience requirements, joined with '/'.
		locations = sel.xpath("//div[@class='left']/div[@class='basic']/div[@class='clearfix row lh-md']/div[@class='col-xs-9']/div/a")
		item['jobneed'] = '/'.join(l.xpath("./text()").extract()[0].strip() for l in locations)
		# Required skills, joined with '/'.
		skills = sel.xpath("//div[@class='left']/div[@class='basic']/div[@class='clearfix row lh-md']/div[@class='col-xs-9']/div/span[@id='ex-position-skills']/a")
		item['skillneed'] = '/'.join(s.xpath("./text()").extract()[0].strip() for s in skills)
		pubtime = sel.xpath("//div[@class='left']/div[@class='basic']/p/text()").extract()[0]
		item['pubtime'] = pubtime  # publish time
		# The description is spread over <p> tags; join them and drop <br>.
		jobdescs = sel.xpath("//div[@class='panel']/div[@class='panel-body']/div[@class='position-description']")[0].re(r'<p>(.*?)</p>')
		item['jobdesc'] = ''.join(jobdescs).replace('<br>', '')
		yield item
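
To run the spider you would normally use "scrapy crawl job" from the project root. For quick experiments it can also be driven from a short script. The sketch below assumes the project package is called miao, as the imports suggest; run_job.py is a hypothetical file name.

run_job.py

# -*- coding:utf-8 -*-
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Load the project settings (so ITEM_PIPELINES applies) and start the 'job' spider.
process = CrawlerProcess(get_project_settings())
process.crawl('job')
process.start()  # blocks until the crawl finishes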

pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
from sqlalchemy import create_engine, Column, Integer, String
from sqlalchemy.orm import sessionmaker
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.dialects.mysql import LONGTEXT

from items import JobDetail, CompanyDetail

engine = create_engine('mysql+mysqldb://root:1234@127.0.0.1:3306/iproxypool?charset=utf8')
DBSession = sessionmaker(bind=engine)
Base = declarative_base()

class Job(Base):
	__tablename__ = 'job'

	id = Column(Integer,primary_key = True,autoincrement = True)
	joburl = Column(String(255))
	jobcompany = Column(String(255))
	jobcontent = Column(String(255))
	jobmoney = Column(String(255))
	jobneed = Column(String(255))
	skillneed = Column(String(255))
	pubtime = Column(String(255))
	jobdesc = Column(LONGTEXT)
class Company(Base):
	__tablename__ = "company"

	id = Column(Integer,primary_key = True,autoincrement = True)
	companyurl = Column(String(255))
	companyname = Column(String(255))
	companydesc = Column(LONGTEXT)

# Create both tables if they do not exist yet.
Base.metadata.create_all(engine)

class JobPipeline(object):

	def open_spider(self, spider):
		self.session = DBSession()

	def close_spider(self, spider):
		self.session.close()

	def process_item(self, item, spider):
		if isinstance(item, JobDetail):
			# Deduplicate on the job URL: update the existing row, otherwise insert.
			exists = self.session.query(Job).filter(Job.joburl == item['joburl']).all()
			if exists:
				self.session.query(Job).filter(Job.joburl == item['joburl']).update({
					Job.jobcompany: item['jobcompany'],
					Job.jobcontent: item['jobcontent'],
					Job.jobmoney: item['jobmoney'],
					Job.jobneed: item['jobneed'],
					Job.skillneed: item['skillneed'],
					Job.pubtime: item['pubtime'],
					Job.jobdesc: item['jobdesc']})
			else:
				self.session.add(Job(joburl=item['joburl'],
					jobcompany=item['jobcompany'],
					jobcontent=item['jobcontent'],
					jobmoney=item['jobmoney'],
					jobneed=item['jobneed'],
					skillneed=item['skillneed'],
					pubtime=item['pubtime'],
					jobdesc=item['jobdesc']))
			self.session.commit()
		else:
			# Deduplicate companies on the company name.
			exists = self.session.query(Company).filter(Company.companyname == item['companyname']).all()
			if exists:
				self.session.query(Company).filter(Company.companyname == item['companyname']).update({
					Company.companyurl: item['companyurl'],
					Company.companydesc: item['companydesc']})
			else:
				self.session.add(Company(companyurl=item['companyurl'],
					companyname=item['companyname'],
					companydesc=item['companydesc']))
			self.session.commit()
		return item
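
Because the tables are plain SQLAlchemy models, the scraped data can be queried back with the same declarative classes. A minimal sketch, assuming it is run from the project root (query.py is a hypothetical helper and the filter values are just examples):

query.py

# -*- coding: utf-8 -*-
from miao.pipelines import DBSession, Job

session = DBSession()
# How many jobs were stored, and which Python jobs mention Beijing?
print session.query(Job).count()
for job in session.query(Job).filter(Job.jobcontent.like(u'%Python%'),
		Job.jobneed.like(u'%北京%')):
	print job.jobcontent, job.jobmoney, job.joburl
session.close()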

items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

from scrapy import Field, Item

class JobDetail(Item):
	joburl = Field()  # job page URL
	jobcompany = Field()  # company name
	jobcontent = Field()  # job title
	jobmoney = Field()  # salary
	jobneed = Field()  # requirements (location/education/experience)
	skillneed = Field()  # required skills
	pubtime = Field()  # publish time
	jobdesc = Field()  # job description

class CompanyDetail(Item):
	companyurl = Field()  # company page URL
	companyname = Field()  # company name
	companydesc = Field()  # company description
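
Scrapy items behave like dicts with a fixed set of allowed keys, which is what lets the pipeline branch on isinstance(item, JobDetail) and index fields safely. A quick interactive illustration (the values are made up):

>>> from miao.items import JobDetail
>>> item = JobDetail(joburl='https://job.oschina.net/position/123')
>>> item['jobcompany'] = u'某公司'
>>> item['joburl']
'https://job.oschina.net/position/123'
>>> item['typo'] = 'x'   # misspelled fields fail loudly instead of being created
Traceback (most recent call last):
  ...
KeyError: 'JobDetail does not support field: typo'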

settings.py

ITEM_PIPELINES = {
   # 'miao.pipelines.BaiduPipeline': 300,
   # 'miao.pipelines.XiciPipeline': 300,
   # 'miao.pipelines.ChinaPipeline': 300,
   'miao.pipelines.JobPipeline': 300,
}
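
Registering JobPipeline is the only setting this post depends on, but for a crawl of this size some throttling settings are worth adding too. A sketch of possible additions (the values are illustrative, not from the original project):

# Be polite to job.oschina.net: slow down and cap concurrency.
DOWNLOAD_DELAY = 1
CONCURRENT_REQUESTS_PER_DOMAIN = 4
# Let Scrapy adapt the delay to the server's response times.
AUTOTHROTTLE_ENABLED = True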
