bs4 crawler

Original
2016/08/30 18:10

Database table

CREATE TABLE `mm` (
	`id` INT(11) NOT NULL AUTO_INCREMENT,
	`img_type` VARCHAR(10) NOT NULL DEFAULT '0',
	`img_name` VARCHAR(8) NOT NULL DEFAULT '0',
	`img_url` VARCHAR(50) NOT NULL,
	`status` INT(1) NOT NULL DEFAULT '0',
	PRIMARY KEY (`id`),
	UNIQUE INDEX `img_name` (`img_name`)
)
COLLATE='utf8_general_ci'
ENGINE=MyISAM
AUTO_INCREMENT=52208
;

Main part of the BS4 image-scraping code:

#coding=utf-8
#jk409@qq.com, created 2016-08-25
#python2.7 bs4
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
import requests,re,time,os
import pymysql
import sqlite3  # imported but not actually used in this script
from bs4 import BeautifulSoup
#-------------------------------------
# Thin pymysql wrapper: each call opens a fresh connection, runs one statement, then closes it.
class Mysql:
    def __init__(self,host,user,password,db,port):
        self.host = host
        self.user = user
        self.passwd = password
        self.db = db
        self.port = int(port)
        self.charset = 'utf8'

    def read(self,sql,args=()):
        try:
            cnn = pymysql.connect(host=self.host,port=self.port,user=self.user, passwd=self.passwd, db=self.db,charset=self.charset)
            #self.cnn.autocommit(True)
            cur= cnn.cursor()
        except:
            print('Database connection failed...')
            return False

        try:
            cur.execute(sql,args)
            data = cur.fetchall()
            # alternative: return the rows as a list of dicts (JSON-style)
            #data_dict = [dict((self.cur.description[i][0], value) for i, value in enumerate(row)) for row in self.cur.fetchall()]
            #data_dict = [dict((cur.description[i][0], value) for i, value in enumerate(row)) for row in data]
            #print('\n',data_dict)
        except:
            print('Query failed!')
            return False
        finally:
            cur.close()
            cnn.close()
        return data


    def write(self,sql,args=()):
        try:
            cnn = pymysql.connect(host=self.host,port=self.port,user=self.user, passwd=self.passwd, db=self.db, charset=self.charset)
            #self.cnn.autocommit(True)
            cur= cnn.cursor()
        except:
            print('Database connection failed...')
            return False

        try:
            cur.execute(sql,args)
            cnn.commit()
        except:
            cnn.rollback()
            print('Data write failed!')
            return False
        finally:
            cur.close()
            cnn.close()
        return True

mysql=Mysql('127.0.0.1','root','123456','test',3306)
#--------------------------------------
# Minimal sqlite3 helper (defined here but not actually exercised by the crawler below).
class Sqlite3():
    def __init__(self,db):
        self.db=db

    def read(self,sql):
        try:
            self.cnn = sqlite3.connect(database=self.db)
            self.cur = self.cnn.cursor()
        except:
            print('Failed to connect to [%s]' % self.db)
            return -1

        try:
            self.cur.execute(sql)
            res = self.cur.fetchall()
        except:
            self.cnn.close()
            print('Read failed!')
            return -1
        self.cnn.close()
        return res

    def write(self, sql):
        try:
            self.cnn = sqlite3.connect(database=self.db)
            self.cur = self.cnn.cursor()
        except:
            print('Failed to connect to [%s]' % self.db)
            return -1

        try:
            self.cur.execute(sql)
            self.cnn.commit()
        except:
            self.cnn.close()
            print('Write failed!')
            return -1
        self.cnn.close()
        return 0

    def INIT(self):
        sql = '''create table blog(
            id int,
            name varchar(32),
            email varchar(32),
            title varchar(32),
            fl varchar(32),
            tag varchar(32),
            date varchar(32),
            content varchar
        ) '''
        #sql2='alter table blog add column content  varchar;'
        self.write(sql)

sqlite=Sqlite3('mm.db')
#--------------------------------------
def get(url):
    # fetch a page with a browser-like User-Agent; return '' for any non-200 response
    header = {"Content-Type": "application/text","User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0"}
    res=requests.get(url,headers=header)
    #print res.status_code
    return res.content if res.status_code==200 else ''
#--------------------------------------

def get_list_url(url):
    base_url=url+'/'
    htmlline = get(url).decode('gbk').encode('utf-8')
    soup = BeautifulSoup(htmlline, "html.parser")
    # the last "page-en" link carries the list id and the last page number, e.g. list_6_30.html
    urls = str(soup.find_all('a',attrs={"class":"page-en"})[-1])
    urls=re.split("[=,.]",urls)[2].split('_')
    urls.pop(0)
    #print urls,type(urls[1])
    get_page_url(base_url)
    # walk every paginated list page: list_<id>_2.html ... list_<id>_<last>.html
    for i in range(2,int(urls[1])+1):
        url_2=base_url+'list_'+urls[0]+'_'+str(i)+'.html'
        print(url_2)
        get_page_url(url_2)
#
def get_page_url(url):
    # the category name sits in the URL path, e.g. /<category>/
    dd=re.split("[/]",url)[3]
    htmlline = get(url).decode('gbk').encode('utf-8')
    soup = BeautifulSoup(htmlline, "html.parser")
    # collect all outbound links, then keep only the ones that point back into this category
    ids = soup.find_all('a',attrs={"target":"_blank"})
    cp = str(BeautifulSoup(str(ids), "html.parser"))
    # city=cp.text.decode('unicode-escape').encode('utf-8')
    res=re.split("[ ]", cp)
    n=1
    for i in res:
        if re.findall(dd,i) and n<=20:
            page_url=re.split('[="]',i)[-2]
            print(page_url)
            get_img_url(page_url)
            n+=1
def get_img_url(url):
    htmlline = get(url).decode('gbk').encode('utf-8')
    soup = BeautifulSoup(htmlline, "html.parser")
    # the first <span> holds the picture count; strip its first and last characters to keep the digits
    total=list(soup.span.get_text())
    total.pop(0)
    total.pop()
    total= ''.join((total))
    # gallery id comes from the page file name, the category from the URL path
    ids=re.split("[/,.]",url)[-2]
    img_type=url.split('/')[3]
    # image URLs follow a fixed pattern, so queue one row per picture
    for i in range(1,int(total)+1):
        url='http://img1.mm131.com/pic/%s/%s.jpg'%(ids,i)
        sqls='INSERT INTO `test`.`mm` (`img_type`,`img_name`,`img_url`) VALUES (%s,%s,%s);'
        name='%s_%s'%(ids,i)
        mysql.write(sqls,(img_type,name,url))
        #print url
        #print get(url)

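The post only defines the crawler functions, so a minimal sketch of how they might be kicked off is below. The category URL is a hypothetical example (only img1.mm131.com appears in the original code), not something taken from the post.

# Hypothetical entry point: crawl one category listing and queue every image URL in MySQL.
# 'http://www.mm131.com/xinggan' is an assumed example category page; substitute the real one.
if __name__ == '__main__':
    get_list_url('http://www.mm131.com/xinggan')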

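The `status` column in the table above hints at a second pass that actually downloads the queued images. A rough sketch of such a pass, reusing `mysql` and `get()`, is below; the column semantics (0 = URL collected, 1 = saved to disk) are an assumption, not something the original post spells out.

# Sketch only: fetch queued images and flag them as done.
# Assumes status 0 = pending and 1 = downloaded (an assumption, not from the original post).
def download_pending(save_dir='imgs'):
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    rows = mysql.read('SELECT `id`,`img_name`,`img_url` FROM `test`.`mm` WHERE `status`=0 LIMIT 100;')
    if not rows:
        return
    for row_id, name, img_url in rows:
        data = get(img_url)
        if not data:
            continue
        with open(os.path.join(save_dir, name + '.jpg'), 'wb') as f:
            f.write(data)
        mysql.write('UPDATE `test`.`mm` SET `status`=1 WHERE `id`=%s;', (row_id,))
        time.sleep(1)  # go easy on the image host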