抓取搜狐汽车的汽车信息
博客专区 > tavenpy 的博客 > 博客详情
抓取搜狐汽车的汽车信息
tavenpy 发表于3年前
抓取搜狐汽车的汽车信息
  • 发表于 3年前
  • 阅读 5
  • 收藏 0
  • 点赞 0
  • 评论 0

【腾讯云】买域名送云解析+SSL证书+建站!>>>   

# coding=utf-8

__author__ = 'taven'

from blog.models import CarBrand, CarCategory, CarType
from pyquery import PyQuery as pyq
import chardet
import requests
import sys
reload(sys)
import urllib
sys.setdefaultencoding("utf-8")

# pyq plays the role of jQuery's `$`: wrap markup, then query it with
# CSS selectors.
base_url = 'http://db.auto.sohu.com/'

# Download the landing page once and parse the raw response body.
res = requests.get(base_url).content
html = pyq(res)

# Elements matched by this selector are what the scraping loop iterates over.
car_category_list = html('.con .category_main')

# NOTE(review): nothing visible in this file reads or writes this dict.
category_url_json = dict()


def isblank(ch):
    """Tell whether *ch* is empty, a blank, or a control character.

    True is returned for the ASCII space (U+0020), DEL (U+007F), the
    no-break space (U+00A0) and for anything that sorts at or below U+001F.
    The empty string falls in that last group, so it also counts as blank.
    """
    named_blank = ch in (u'\x20', u'\x7f', u'\xA0')
    return named_blank or ch <= u'\x1f'


def func_model(model_string):
    """Extract the engine/trim token from a model title.

    The returned text starts right after the first character for which
    ``isblank`` is true (or at the beginning when there is none) and stops
    before the next ``(`` or the next character at or above U+4E00 (or at
    the end of the string when there is none).  Surrounding whitespace is
    stripped so that titles differing only in spacing yield the same token.

    For example ``'2013款 40 TFSI 进取型'`` gives ``'40 TFSI'`` and
    ``'2011款 1.4T Ego plus版'`` gives ``'1.4T Ego plus'``.

    :param model_string: the title, as text or as an encoded byte string.
    :returns: the extracted token, of the same string type that was scanned.
    """
    # Byte strings are decoded with whatever encoding chardet guesses.  This
    # is best effort: when no encoding is detected, or decoding fails, the
    # input is scanned as-is.  Only decode-related errors are absorbed here;
    # the previous ``return`` inside ``finally`` discarded every exception.
    if isinstance(model_string, (bytes, bytearray)):
        try:
            encoding = chardet.detect(model_string)['encoding']
            if encoding:
                model_string = model_string.decode(encoding)
        except (LookupError, TypeError, ValueError):
            # Unknown codec name or undecodable bytes: keep the raw input.
            pass

    start_with = 0
    end_with = len(model_string)

    # Skip everything up to and including the first blank character.
    for idx, ch in enumerate(model_string):
        if isblank(ch):
            start_with = idx + 1
            break

    # Cut at the first "(" or the first character at/above U+4E00.
    for idx, ch in enumerate(model_string[start_with:end_with]):
        if ch >= u'\u4e00' or ch == u'(':
            end_with = start_with + idx
            break

    # Without the strip, "2.4 " and "2.4" would count as different names.
    return model_string[start_with:end_with].strip()


# Walk every element of car_category_list.  For each one: download the logo
# image, print the brand text, then for every series link print the series
# name and the distinct names found on the series page and on its per-year
# pages.  Printing uses the single-argument parenthesised form, which emits
# the same output as the Python 2 print statement.
for index, car_category in enumerate(car_category_list):
    car_category_html = pyq(car_category)
    car_brand_html = car_category_html.find('.meta_left p')
    car_logo_html = car_category_html.find('.meta_left a img')
    # The image URL is read from the `data-original` attribute.
    urllib.urlretrieve(pyq(car_logo_html).attr('data-original'),
                       '/home/taven/tblog/car_logo/' + 'logo%s' % index)
    # NOTE: database persistence is disabled; kept for reference.
    # car_brand = CarBrand(name=str(pyq(car_brand_html).text()).decode('utf8'),)
    # car_brand.save()
    print('\n')
    print('\n')
    print('车品牌:::::::::::::::' + pyq(car_brand_html).text())

    car_category_html_dt_list = car_category_html.find(
        'dt span:first-child span a')
    for car_category_html_dt in car_category_html_dt_list:
        series_link = pyq(car_category_html_dt)
        print("车系:::::::::::::::" + series_link.text())

        # NOTE: database persistence is disabled; kept for reference.
        # car_brand = CarBrand.objects.get(name=pyq(car_brand_html).text())
        # car_type = CarType(name=pyq(car_category_html_dt).text(),
        #                    car_brand_id=car_brand.id)
        # car_type.save()

        # Build the series URL once; the per-year URLs extend it.
        category_url = 'http://db.auto.sohu.com' + series_link.attr('href')
        category_url_list = [category_url]

        # Probe one page per year.  A year page whose final URL is the site
        # index is skipped (presumably a redirect for a year that does not
        # exist -- confirm against the site).
        for year in range(2001, 2016):
            year_url = category_url + 'year' + str(year) + '.shtml'
            f = urllib.urlopen(year_url)
            try:
                final_url = f.geturl()
            finally:
                # Close the handle: only the final URL is needed.
                f.close()
            if final_url != 'http://db.auto.sohu.com/index.shtml':
                category_url_list.append(year_url)

        # Names already printed for this series, in first-seen order.
        car_category_name_list = []
        for category_url in category_url_list:
            print('\n')
            print(category_url)
            res_car_category = requests.get(category_url).content
            # Bodies of 10000 bytes or fewer are ignored (presumably stub or
            # error pages -- the threshold is the original author's).
            if res_car_category is None or len(res_car_category) <= 10000:
                continue

            # A separate name keeps the brand element bound above intact.
            listing_html = pyq(res_car_category)
            for car_category_a in listing_html('.b .ftdleft'):
                car_category_name = pyq(car_category_a).find("a:first").text()
                handled_category_name = func_model(car_category_name)
                if handled_category_name in car_category_name_list:
                    continue
                car_category_name_list.append(handled_category_name)
                print(handled_category_name)
                # NOTE: database persistence is disabled; kept for reference.
                # car_category = CarCategory(name=handled_category_name,
                #                            car_type_id=car_type.id)
                # car_category.save()

print('ok')

# Sample data: print what func_model extracts from a set of model titles.
if __name__ == '__main__':
    sample_titles = (
        '2011款 1.4T Urban版',
        '2013款 40 TFSI 进取型',
        '2011款 1.4T Ego plus版',  # author's note: this form is not handled
        '2002款 1.8i手动5速',  # author's note: this form is not handled
        '2009款 2.0TFSI 豪华型',
        '2012款 1.8TFSI MT舒适型',
        '2010款 2.0TFSI 标准型',
        '2014款 35 TFSI 进取型',
        '2012款 40TFSI 越野型',
        '2014款 35 TFSI 进取型',
        '2013款 40 TFSI quattro运动型',
        '2001款 1.8T手动5速基本型',
        '2003款 2.4i手动5速',
        '2003款 2.8i 无级手动一体技术领先型',
        '2005款 2.0T FSI®手动标准型',
        '2006款 L 07款 3.1 FSI 无级/手动一体',
        '2007款 2.8 FSI 尊享型',
        '2008款 2.4 技术型',
        '2008款 3.2FSI quattro 豪华型',
        '2010款 2.4L 豪华型',
        '2011款 2.7TDI 舒适型',
        '2012款 35 FSI quattro 豪华型(2.8L)',
        '2015款 40 TFSI quattro 豪华型',
    )
    # One line of output per title, in the order listed above.
    for sample_title in sample_titles:
        print(func_model(sample_title))


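# Related Django models (the CarBrand / CarType / CarCategory classes that the
# script imports from blog.models)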
class CarBrand(models.Model):
    """A car brand; rows are stored in the ``car_brand`` table."""

    name = models.CharField(max_length=30)  # brand name
    # Plain integer columns, not foreign keys.
    # NOTE(review): presumably ids of the owning company / acting user.
    company_id = models.IntegerField(default=0)
    create_by = models.IntegerField(default=0)
    # auto_now_add stamps the row once, on insert.  The previous
    # auto_now=True rewrote the creation time on every save().
    create_date = models.DateTimeField(auto_now_add=True)
    update_by = models.IntegerField(default=0)
    # Refreshed automatically on every save().
    update_date = models.DateTimeField(auto_now=True)
    # NOTE(review): presumably a soft-delete marker -- confirm with callers.
    del_flag = models.BooleanField(default=False)

    class Meta:
        db_table = 'car_brand'


class CarType(models.Model):
    """A car series belonging to one brand; stored in ``car_type``."""

    name = models.CharField(max_length=30)  # series name
    # Plain integer column, not a foreign key.
    # NOTE(review): presumably the id of the owning company.
    company_id = models.IntegerField(default=0)
    # on_delete is spelled out: CASCADE was the implicit default before
    # Django 2.0, and the argument is mandatory from 2.0 on.
    car_brand = models.ForeignKey(CarBrand, on_delete=models.CASCADE)
    create_by = models.IntegerField(default=0)
    # auto_now_add stamps the row once, on insert.  The previous
    # auto_now=True rewrote the creation time on every save().
    create_date = models.DateTimeField(auto_now_add=True)
    update_by = models.IntegerField(default=0)
    # Refreshed automatically on every save().
    update_date = models.DateTimeField(auto_now=True)
    # NOTE(review): presumably a soft-delete marker -- confirm with callers.
    del_flag = models.BooleanField(default=False)

    class Meta:
        db_table = 'car_type'


class CarCategory(models.Model):
    """A car category belonging to one series; stored in ``car_category``."""

    name = models.CharField(max_length=30)  # category name
    # Plain integer column, not a foreign key.
    # NOTE(review): presumably the id of the owning company.
    company_id = models.IntegerField(default=0)
    # on_delete is spelled out: CASCADE was the implicit default before
    # Django 2.0, and the argument is mandatory from 2.0 on.
    car_type = models.ForeignKey(CarType, on_delete=models.CASCADE)
    create_by = models.IntegerField(default=0)
    # auto_now_add stamps the row once, on insert.  The previous
    # auto_now=True rewrote the creation time on every save().
    create_date = models.DateTimeField(auto_now_add=True)
    update_by = models.IntegerField(default=0)
    # Refreshed automatically on every save().
    update_date = models.DateTimeField(auto_now=True)
    # NOTE(review): presumably a soft-delete marker -- confirm with callers.
    del_flag = models.BooleanField(default=False)

    class Meta:
        db_table = 'car_category'

  • 打赏
  • 点赞
  • 收藏
  • 分享
共有 人打赏支持
粉丝 6
博文 88
码字总数 11490
×
tavenpy
如果觉得我的文章对您有用,请随意打赏。您的支持将鼓励我继续创作!
* 金额(元)
¥1 ¥5 ¥10 ¥20 其他金额
打赏人
留言
* 支付类型
微信扫码支付
打赏金额:
已支付成功
打赏金额: