Multi-threaded crawler for Doutula (趣图网)
Posted by 阿豪boy, 9 months ago
 

# coding=utf-8

import urllib
import urllib2
import urlparse
import re
import bs4
import requests
import cookielib
from lxml import etree
from bs4 import BeautifulSoup
import os
import sys
import ssl
import threading
import time

reload(sys)
sys.setdefaultencoding('utf-8')

url = 'https://www.doutula.com/article/list/?page=2'
hdr = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Connection': 'keep-alive',
    'Referer': 'http://www.baidu.com',
    'Upgrade-Insecure-Requests': '1',
    # 'Host': 'www.doutula.com',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36'
}

c = cookielib.LWPCookieJar()
cookie = urllib2.HTTPCookieProcessor(c)
# build an opener that stores cookies and re-sends them with every request
opener = urllib2.build_opener(cookie)


# fetch the raw response body for url (used for both HTML pages and image bytes)
def getHtml(url):
    request = urllib2.Request(url, headers=hdr)
    response = opener.open(request)
    text = response.read()
    return text
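
# Side note (my addition, not in the original script): requests is imported above but never
# used. A roughly equivalent cookie-aware fetch with a requests.Session is sketched below;
# get_html_requests is an illustrative name only.
_session = requests.Session()  # a Session keeps cookies between requests, like the cookielib opener


def get_html_requests(url):
    resp = _session.get(url, headers=hdr, timeout=10)
    resp.raise_for_status()
    return resp.content  # raw bytes, same as getHtml()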


'''
Collect the links to every image group on a list page.
Input:  https://www.doutula.com/article/list/?page=2
Returns links like:
https://www.doutula.com/article/detail/1923030
'''


def get_img_url(url):
    html = getHtml(url)
    soup = BeautifulSoup(html, 'lxml')
    # find every <a> tag that wraps an image group
    all_a = soup.find_all('a', class_='list-group-item random_list')
    '''
             <a href="https://www.doutula.com/article/detail/6424222" class="list-group-item random_list"
            <div class="random_title">我们是谁?(程序员版本)<div class="date">2017-08-19</div></div>
    '''
    # each entry in all_a is an <a> tag whose href points to a detail page

    all_img_url = []
    for a in all_a:
        # a['href'] : https://www.doutula.com/article/detail/2904682
        all_img_url.append(a['href'])
    return all_img_url
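
# Usage sketch (illustrative):
#   get_img_url('https://www.doutula.com/article/list/?page=2')
#   -> ['https://www.doutula.com/article/detail/2904682', ...]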


# Given a detail-page url such as https://www.doutula.com/article/detail/2904682,
# download its images into a folder named after the group title
def save_img(url):
    html = getHtml(url)
    '''
    <div class="artile_des">
    <table>
    <tbody>
    <tr>
    <td>
    <a href="https://www.doutula.com/photo/8759971">
    <img src="//ws2.sinaimg.cn/large/9150e4e5ly1fio901ka7jj20c80c8glz.jpg"
    alt="我敬往事一杯酒,当初眼瞎爱过狗"
    '''
    soup = BeautifulSoup(html, 'html.parser')
    ''' Extract the group title, which also becomes the folder name
    <li class="list-group-item">     
    <div class="pic-title">
    <h1><a href="https://www.doutula.com/article/detail/2904682">敬你一杯</a></h1>
    '''
    title = soup.select('li.list-group-item > div.pic-title > h1 > a')[0].get_text().strip()

    # collect the urls of every image in this group
    all_img_urls = []
    img_srcs = soup.select("div.artile_des > table > tbody > tr > td > a > img")
    for i in img_srcs:
        all_img_urls.append('https:' + i['src'])

    if not os.path.exists('imgs/' + title):
        os.makedirs('imgs/' + title)

    # prefix an index so the original image order is preserved
    img_num = 0
    for i in all_img_urls:
        # file name: index prefix + original file name taken from the url
        name = '%d_' % img_num + i.split('/')[-1]
        path = 'imgs/' + title + '/' + name
        print path
        with open(path, 'wb') as f:
            f.write(getHtml(i))
        img_num += 1
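
# Caveat (my addition, not in the original): the group title is used directly as a folder
# name, and some titles contain characters that Windows does not allow in paths (e.g. ? " :).
# A minimal sanitizing helper, assuming it is acceptable to simply drop those characters:
def safe_name(text):
    return re.sub(r'[\\/:*?"<>|]', '', text).strip()
# e.g. build the folder as 'imgs/' + safe_name(title) before calling os.makedirs()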


'''
Given a list-page url such as 'https://www.doutula.com/article/list/?page=2',
save every image group on that page into its own folder.
'''


def save_page_imgs(url):
    img_urls = get_img_url(url)
    for i in img_urls:
        print i
        save_img(i)


url = 'https://www.doutula.com/article/detail/6424222'


# save_img(url)

# cnt = 0
# for i in range(2, 100):
#     try:
#         url = 'https://www.doutula.com/article/list/?page=%d' % i
#         save_page_imgs(url)
#     except Exception, e:
#         print(e)
#
# print cnt

class Craw(threading.Thread):
    def __init__(self, id, urls):
        threading.Thread.__init__(self)
        self.urls = urls
        self.id = id

    def run(self):
        for i in self.urls:
            try:
                print 'Crawler %d is fetching %s' % (self.id, i)
                save_page_imgs(i)
            except Exception, e:
                print e

url_group = []

# pages 2 to 181, split into groups of ten (one group per crawler thread)
for i in range(2, 20):
    urls = []
    for j in range(10):
        url = 'https://www.doutula.com/article/list/?page=%s' % str((i - 2) * 10 + j + 2)
        urls.append(url)
    url_group.append(urls)

cnt = 0
for urls in url_group:
    craw = Craw(cnt, urls)
    craw.start()
    cnt += 1
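
Each Craw thread above takes one group of ten list pages, so the loop launches 18 threads covering pages 2 to 181. For reference only (my own variation, not part of the original post), the same fan-out could be written with concurrent.futures on Python 3, assuming save_page_imgs has been ported:

from concurrent.futures import ThreadPoolExecutor

page_urls = ['https://www.doutula.com/article/list/?page=%d' % p for p in range(2, 182)]

def crawl_page(page_url):
    try:
        save_page_imgs(page_url)
    except Exception as e:
        print(e)

with ThreadPoolExecutor(max_workers=18) as pool:
    pool.map(crawl_page, page_urls)  # the with-block waits until every page has been processed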

 

 

Using Python 3 to remove broken download folders

import os

path = r'D:\code\py\py27\爬取斗图网\imgs'

lists = os.listdir(path)

print(len(lists))


def remove_dir(path):
    # recursively delete a file or an entire directory tree
    if os.path.isfile(path):
        os.remove(path)
    else:
        for name in os.listdir(path):
            remove_dir(os.path.join(path, name))
        os.removedirs(path)


# a group folder with two or fewer files is treated as a failed download and deleted
for i in lists:
    print(i)
    p = os.path.join(path, i)
    print(p)
    t = os.listdir(p)
    print(len(t))
    if len(t) <= 2:
        remove_dir(p)
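
The hand-rolled remove_dir above works, but the standard library already ships the same recursive delete. A sketch of the same cleanup rule using shutil.rmtree (my suggestion, not from the original post):

import shutil

for name in os.listdir(path):
    group_dir = os.path.join(path, name)
    # same rule as above: a folder with two or fewer files is a failed download
    if os.path.isdir(group_dir) and len(os.listdir(group_dir)) <= 2:
        shutil.rmtree(group_dir)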

 

 

Multi-process version

# coding=utf-8

import urllib
import urllib2
import urlparse
import re
import bs4
import requests
import cookielib
from lxml import etree
from bs4 import BeautifulSoup
import os
import sys
import ssl
import threading
import time

reload(sys)
sys.setdefaultencoding('utf-8')

url = 'https://www.doutula.com/article/list/?page=2'
hdr = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Connection': 'keep-alive',
    'Referer': 'http://www.baidu.com',
    'Upgrade-Insecure-Requests': '1',
    # 'Host': 'www.doutula.com',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36'
}

c = cookielib.LWPCookieJar()
cookie = urllib2.HTTPCookieProcessor(c)
# build an opener that stores cookies and re-sends them with every request
opener = urllib2.build_opener(cookie)


# fetch the raw response body for url (used for both HTML pages and image bytes)
def getHtml(url):
    request = urllib2.Request(url, headers=hdr)
    response = opener.open(request)
    text = response.read()
    return text


'''
Collect the links to every image group on a list page.
Input:  https://www.doutula.com/article/list/?page=2
Returns links like:
https://www.doutula.com/article/detail/1923030
'''


def get_img_url(url):
    html = getHtml(url)
    soup = BeautifulSoup(html, 'lxml')
    # find every <a> tag that wraps an image group
    all_a = soup.find_all('a', class_='list-group-item random_list')
    '''
             <a href="https://www.doutula.com/article/detail/6424222" class="list-group-item random_list"
            <div class="random_title">我们是谁?(程序员版本)<div class="date">2017-08-19</div></div>
    '''
    # each entry in all_a is an <a> tag whose href points to a detail page

    all_img_url = []
    for a in all_a:
        # a['href'] : https://www.doutula.com/article/detail/2904682
        all_img_url.append(a['href'])
    return all_img_url


# Given a detail-page url such as https://www.doutula.com/article/detail/2904682,
# download its images into a folder named after the group title
def save_img(url):
    html = getHtml(url)
    '''
    <div class="artile_des">
    <table>
    <tbody>
    <tr>
    <td>
    <a href="https://www.doutula.com/photo/8759971">
    <img src="//ws2.sinaimg.cn/large/9150e4e5ly1fio901ka7jj20c80c8glz.jpg"
    alt="我敬往事一杯酒,当初眼瞎爱过狗"
    '''
    soup = BeautifulSoup(html, 'html.parser')
    ''' Extract the group title, which also becomes the folder name
    <li class="list-group-item">     
    <div class="pic-title">
    <h1><a href="https://www.doutula.com/article/detail/2904682">敬你一杯</a></h1>
    '''
    title = soup.select('li.list-group-item > div.pic-title > h1 > a')[0] \
        .get_text().strip()

    # collect the urls of every image in this group
    all_img_urls = []
    img_srcs = soup.select("div.artile_des > table > tbody > tr > td > a > img")
    for i in img_srcs:
        all_img_urls.append('https:' + i['src'])

    if not os.path.exists('imgs/' + title):
        os.makedirs('imgs/' + title)

    # prefix an index so the original image order is preserved
    img_num = 0
    for i in all_img_urls:
        # file name: index prefix + original file name taken from the url
        name = '%d_' % img_num + i.split('/')[-1]
        name = name.strip()
        path = 'imgs/' + title + '/' + name

        with open(path, 'wb') as f:
            f.write(getHtml(i))
        img_num += 1


'''
Given a list-page url such as 'https://www.doutula.com/article/list/?page=2',
save every image group on that page into its own folder.
'''


def save_page_imgs(url):
    img_urls = get_img_url(url)
    for i in img_urls:
        print i
        save_img(i)


def start(urls):
    for i in urls:
        try:
            print 'Crawler is fetching %s' % i
            save_page_imgs(i)
        except Exception, e:
            print e


url_group = []

# pages 1 to 500, in groups of ten pages
for i in range(0, 50):
    urls = []
    for j in range(10):
        url = 'https://www.doutula.com/article/list/?page=%s' % str(i * 10 + j + 1)
        urls.append(url)
    url_group.append(urls)

import multiprocessing
import time

if __name__ == "__main__":
    try:
        pool = multiprocessing.Pool(processes=50)
        cnt = 0
        for urls in url_group:
            pool.apply_async(start, (urls,))  # the pool keeps at most `processes` workers busy; as one task finishes, the next queued task starts
            cnt += 1

        print "Mark~ Mark~ Mark~~~~~~~~~~~~~~~~~~~~~~"
        pool.close()
        pool.join()  # close() must be called before join(); after close() no new tasks can be added, and join() waits for all workers to finish
        print "finished"
    except Exception, e:
        print e
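
Fifty worker processes is a lot for what is mostly I/O-bound downloading. A smaller pool sized to the machine, driven by pool.map instead of the apply_async loop, does the same work; a sketch (my own variation, not from the original):

if __name__ == "__main__":
    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    pool.map(start, url_group)  # blocks until every group of pages has been crawled
    pool.close()
    pool.join()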

 
