文档章节

【Scrapy】 抓取开源中国 招聘信息

ok绷了绷
 ok绷了绷
发布于 2017/06/27 19:50
字数 1124
阅读 102
收藏 0

#程序员薪资揭榜#你做程序员几年了?月薪多少?发量还在么?>>>

python 2.7 
scrapy 1.3.0
sqlalchemy
Navicat

JobSpider.py

# -*- coding:utf-8 -*-
import logging
from scrapy.spiders import Spider
from scrapy.selector import Selector
from scrapy.http import Request
from miao.items import JobDetail,CompanyDetail
from hashlib import md5
import os
import re
import sys
# Python 2 hack: reload() re-exposes sys.setdefaultencoding (hidden by site.py)
# so non-ASCII text is implicitly coerced to UTF-8. Generally discouraged and
# unnecessary on Python 3 -- kept here for the Python 2.7 runtime this targets.
reload(sys)
sys.setdefaultencoding('utf-8')

class JobSpider(Spider):
	"""Crawl job.oschina.net search results and scrape job/company detail pages.

	For every keyword in ``keys`` it requests 11 result pages, then follows
	each job link (-> :class:`JobDetail`) and company link
	(-> :class:`CompanyDetail`).
	"""
	name = 'job'
	# Search keywords; one search (11 pages) is issued per keyword.
	keys = ['Java','Python','Hadoop','自然语言处理','搜索算法','全栈工程师','数据挖掘','后端其他','web前端',
			'HTML5','JavaScript','前端开发工程师','前端其他'
	]

	def start_requests(self):
		"""Yield one search-result request per (keyword, page) pair."""
		for key in self.keys:
			for page in range(1, 12):
				url = 'https://job.oschina.net/search?type=职位搜索&key=%s&exp=0&edu=0&nat=1&city=全国&p=%s' % (key, page)
				yield Request(url=url, callback=self.parse)

	def parse(self, response):
		"""Parse one search-result page and follow job + company links."""
		sel = Selector(response)
		for div in sel.xpath('//div[@class="box clear layout"]'):
			# extract_first() returns None instead of raising IndexError when
			# the page layout changes -- the original extract()[0] crashed.
			job_url = div.xpath('./div[@class="flex-item-6 "]/div[@class="layout"]/div[@class="layout-left title"]/a/@href').extract_first()
			company_url = div.xpath('./div[@class="flex-item-6 com-info"]/div[@class="layout clear"]/div[@class="layout-column"]/div[@class="layout"]/a/@href').extract_first()
			if job_url:
				yield Request(url=job_url, callback=self.parse_job)
			if company_url:
				yield Request(url=company_url, callback=self.parse_company)

	def parse_company(self, response):
		"""Parse a company detail page into a CompanyDetail item."""
		sel = Selector(response)
		item = CompanyDetail()
		item['companyurl'] = response.url  # company page URL
		item['companyname'] = sel.xpath("//small/text()").extract_first('')  # company name
		# Company description: text of every <p> in the first panel body,
		# concatenated with <br> tags stripped.
		panels = sel.xpath("//div[@class='panel-body']")
		desc_parts = panels[0].re(r'<p>(.*?)</p>') if panels else []
		item['companydesc'] = ''.join(desc_parts).replace("<br>", "")
		yield item

	def parse_job(self, response):
		"""Parse a job detail page into a JobDetail item."""
		sel = Selector(response)
		item = JobDetail()
		item['joburl'] = response.url  # job page URL
		item['jobcompany'] = sel.xpath("//div[@class='col-xs-12']/h3[@class='text-left']/strong/a/@title").extract_first('')  # company name
		item['jobcontent'] = sel.xpath("//h1/@title").extract_first('')  # job title
		# Shared XPath prefixes hoisted out of the repeated selectors below.
		basic = "//div[@class='left']/div[@class='basic']"
		row = basic + "/div[@class='clearfix row lh-md']/div[@class='col-xs-9']/div"
		item['jobmoney'] = sel.xpath(row + "/b/text()").extract_first('')  # salary
		# Requirements (location / education / experience). The trailing '/'
		# after each part matches the original stored format.
		needs = [a.xpath("./text()").extract_first('').strip()
				 for a in sel.xpath(row + "/a")]
		item['jobneed'] = ''.join(p + '/' for p in needs)
		# Skill tags, '/'-separated without a trailing slash (join replaces the
		# original concatenate-then-trim loop).
		skills = [s.xpath("./text()").extract_first('').strip()
				  for s in sel.xpath(row + "/span[@id='ex-position-skills']/a")]
		item['skillneed'] = '/'.join(skills)
		item['pubtime'] = sel.xpath(basic + "/p/text()").extract_first('')  # publish time
		# Job description: all <p> contents from the description panel.
		desc_divs = sel.xpath("//div[@class='panel']/div[@class='panel-body']/div[@class='position-description']")
		desc_parts = desc_divs[0].re(r'<p>(.*?)</p>') if desc_divs else []
		item['jobdesc'] = ''.join(desc_parts).replace('<br>', "")
		yield item

		# print company,title[0],response.url

pipeline.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
import codecs
from hashlib import md5
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from sqlalchemy import Column,String,create_engine, DateTime, Integer, Text, INT
from sqlalchemy.orm import sessionmaker
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import ForeignKey,Table
from sqlalchemy.orm import relationship,backref
from datetime import datetime
from items import JobDetail,CompanyDetail
from sqlalchemy.dialects.mysql import LONGTEXT
# NOTE(review): DB credentials are hard-coded in source -- move them to
# settings.py or environment variables before deploying/sharing this code.
engine = create_engine('mysql+mysqldb://root:1234@127.0.0.1:3306/iproxypool?charset=utf8')
# Session factory bound to the MySQL engine; the pipeline opens one session per crawl.
DBSession = sessionmaker(bind=engine)
# Declarative base class for the ORM models below.
Base = declarative_base()

class Job(Base):
	"""ORM model for the ``job`` table -- one row per scraped job posting."""
	__tablename__ = 'job'

	# Surrogate primary key.
	id = Column(Integer, primary_key=True, autoincrement=True)
	joburl = Column(String(255))     # job page URL (natural key for upserts)
	jobcompany = Column(String(255)) # company name
	jobcontent = Column(String(255)) # job title / position
	jobmoney = Column(String(255))   # salary range, as displayed
	jobneed = Column(String(255))    # location / education / experience, '/'-separated
	skillneed = Column(String(255))  # required skills, '/'-separated
	pubtime = Column(String(255))    # publish time, as displayed
	jobdesc = Column(LONGTEXT)       # full job description text
class Company(Base):
	"""ORM model for the ``company`` table -- one row per scraped company."""
	__tablename__ = "company"

	# Surrogate primary key.
	id = Column(Integer, primary_key=True, autoincrement=True)
	companyurl = Column(String(255))  # company page URL
	companyname = Column(String(255)) # company name (natural key for upserts)
	companydesc = Column(LONGTEXT)    # full company description text

class JobPipeline(object):
	"""Upsert crawled JobDetail / CompanyDetail items into MySQL.

	Jobs are keyed on ``joburl`` and companies on ``companyname``: an
	existing row is updated in place, otherwise a new row is inserted.
	"""

	def open_spider(self, spider):
		# One SQLAlchemy session for the whole crawl.
		self.session = DBSession()

	def close_spider(self, spider):
		# Release the DB connection when the spider finishes.
		self.session.close()

	def process_item(self, item, spider):
		try:
			if isinstance(item, JobDetail):
				self._upsert_job(item)
			else:
				self._upsert_company(item)
			self.session.commit()
		except Exception:
			# Keep the session usable for subsequent items.
			self.session.rollback()
			raise
		# Scrapy convention: return the item so later pipelines can see it.
		return item

	def _upsert_job(self, item):
		# Insert-or-update a Job row keyed by joburl.
		values = {
			'joburl': item['joburl'],
			'jobcompany': item['jobcompany'],
			'jobcontent': item['jobcontent'],
			'jobmoney': item['jobmoney'],
			'jobneed': item['jobneed'],
			'skillneed': item['skillneed'],
			'pubtime': item['pubtime'],
			'jobdesc': item['jobdesc'],
		}
		query = self.session.query(Job).filter(Job.joburl == item['joburl'])
		if query.first() is not None:
			# BUG FIX: the original filtered each column by its own NEW value
			# (e.g. jobcompany == item['jobcompany']) before setting that same
			# value, which touched unrelated rows or none at all. Update the
			# single row keyed by joburl with all columns at once instead.
			query.update(values)
		else:
			self.session.add(Job(**values))

	def _upsert_company(self, item):
		# Insert-or-update a Company row keyed by companyname.
		values = {
			'companyurl': item['companyurl'],
			'companyname': item['companyname'],
			'companydesc': item['companydesc'],
		}
		query = self.session.query(Company).filter(
			Company.companyname == item['companyname'])
		if query.first() is not None:
			# Same fix as _upsert_job: key on companyname, update all columns.
			query.update(values)
		else:
			self.session.add(Company(**values))

item.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

from scrapy import Field,Item

class JobDetail(Item):
	"""Scraped fields for a single job posting."""
	joburl = Field() # job page URL
	jobcompany = Field() # company name
	jobcontent = Field() # job title / position
	jobmoney = Field() # salary range
	jobneed = Field() # requirements: location / education / experience
	skillneed = Field() # required skills, '/'-separated
	pubtime = Field() # publish time
	jobdesc = Field() # full job description

class CompanyDetail(Item):
	"""Scraped fields for a single company."""
	companyurl = Field() # company page URL
	companyname = Field() # company name
	companydesc = Field() # full company description

setting.py

# Scrapy item pipelines: only JobPipeline is enabled (300 = run order;
# lower numbers run first). Earlier project pipelines stay disabled.
ITEM_PIPELINES = {
   # 'miao.pipelines.BaiduPipeline': 300,
   # 'miao.pipelines.XiciPipeline': 300,
   # 'miao.pipelines.ChinaPipeline': 300,
   'miao.pipelines.JobPipeline':300,
}

© 著作权归作者所有

ok绷了绷
粉丝 4
博文 124
码字总数 57867
作品 0
海淀
私信 提问
加载中

评论(0)

Go 开发者平均年薪 46 万?爬数据展示国内 Go 的市场行情到底如何

随着云原生时代的到来,拥有高并发性、语法易学等特点的 Golang 地位逐渐凸显,在云原生编程中占据了主导地位。在近期出炉的 TIOBE 10 月编程语言排行榜中,Golang 从前一个月的 16 位一跃来...

编辑部的故事
2018/10/29
1.3W
29
建议开源中国统计下招聘需求需要的最热200项开发技术,然后

建议开源中国到人才网抓取统计下招聘需求需要的最热200项开发技术,分语言汇总,例如 java, memcache ,avgmoney 8K, company: 2645 hibernate, avgmoney: 5K ,company: 15000 技术,薪资,,...

快速开发师
2016/11/02
314
1
【开源访谈】 Spiderman作者赖伟威访谈实录

【作者简介】 赖伟威 毕业刚满一年的Java Coder,立志做可靠的系统架构师。大学期间与几位志同道合的同学创办CFuture工作室。现在深圳打拼中。 【软件简介】 Spiderman 是一个基于微内核+插件...

丫头潘潘
2013/06/26
3.2K
12
【Scrapy】 Feed exports 学习记录四

为了对抓取数据进行序列化,采用 item exporters, json json lines csv xml 可以通过 FEED_EXPORTERS 设置对其进行支持。 settting 设置: FEED_URI (mandatory) FEED_FORMAT FEED_STORAGES...

ok绷forever
2017/01/01
64
0
PHP模拟登录并获取数据

cURL 是一个功能强大的PHP库,使用PHP的cURL库可以简单和有效地抓取网页并采集内容,设置cookie完成模拟登录网页,curl提供了丰富的函数, 开发者可以从PHP手册中获取更多关于cURL信息。本文...

Louis_88
2015/09/19
181
0

没有更多内容

加载失败,请刷新页面

加载更多

面试常考:Java中synchronized和volatile有什么区别?

在我的博客和公众号中,发表过很多篇关于并发编程的文章,之前的文章中我们介绍过了两个在Java并发编程中比较重要的两个关键字:synchronized和volatile 我们简单回顾一下相关内容: 1、Jav...

码农突围
16分钟前
7
0
C++ 支持宏重载效果

C++宏默认是不支持重载的,但可以通过特殊方法让宏支持重载。 可能你要问,支持宏重载有什么用处,那么,就以实例来尝试宏的重载。 为了要实现类的动态创建,我还想构建类的继承关系图,为了...

零落年华
24分钟前
7
0
jenkins添加git源失败

Failed to connect to repository : Command "git ls-remote -h git@192.168.91.11:test/dzp.git HEAD" returned status code 128: stdout: stderr: Permission denied, please try again. P......

muoushi
27分钟前
20
0
如何在ActionScript 3中将“ Null”(真实的姓氏!)传递给SOAP Web服务

问题: We have an employee whose surname is Null. 我们有一个姓为Null的员工。 Our employee lookup application is killed when that last name is used as the search term (which happ......

fyin1314
27分钟前
17
0
事务控制

TxConfig.class /** * 声明式事务 * * 环境搭建 * 1. 引入相关依赖,数据源、数据库驱动,spring-jdbc模块 * 2. 配置数据源、JdbcTemplate(Spring提供的简化数据库操作的工具)操作数据...

与你同行7
38分钟前
16
0

没有更多内容

加载失败,请刷新页面

加载更多

返回顶部
顶部