抓取微博
博客专区 > 昏鸦 的博客 > 博客详情
抓取微博
昏鸦 发表于4年前
抓取微博
  • 发表于 4年前
  • 阅读 225
  • 收藏 8
  • 点赞 0
  • 评论 0

import os

from os import path

import sys

working_dir = os.path.abspath(os.path.realpath(__file__)+ '/../../')

sys.path.append(working_dir)

import requests

import common.constants as CT

import common.utils as CU

import json

import simplejson as SJ

from Queue import Queue

import threading

from socialplatform import SocialPlatform

import common.errorcodes as ERROR

from pool.ippool import IPPool

import traceback

import threading

import re

#import grequests


class Sina(SocialPlatform):

    TIMELINE_BASE_URL = 'https://api.weibo.com/2/statuses/user_timeline.json?'

    TAG_BASE_URL = 'https://api.weibo.com/2/tags.json'

    BASEINFO_BASE_URL = 'https://api.weibo.com/2/users/show.json'

    WEIBO_BASE_URL = 'https://api.weibo.com/2/statuses/show.json'

    REPOST_WEIBO_LIST_BASE_URL = 'https://api.weibo.com/2/statuses/repost_timeline.json'

    REPOST_WEIBOID_LIST_BASE_URL = 'https://api.weibo.com/2/statuses/repost_timeline/ids.json' 

    FWDCN_BASE_URL = 'https://api.weibo.com/2/statuses/count.json'

    save_queue = Queue()

    sina = None


    def __init__(self):

        super(Sina, self).__init__()

        #th = threading.Thread(target = self.store)

        #th.start()


    @classmethod

    def get_instance(cls):

        if cls.sina is None:

            cls.sina = Sina()

            print 'new sina instance'

        else:

            print 'sina obj existed'

        return cls.sina


    #get trimmed origin weibo by default 

    def get_timeline(self, access_token, uid, feature='1', trim_user='1', since_id='0', count=20, proxy = None):

        res = None

        params = {

                  #'source':'1917566200',

                  'access_token':access_token, 

                  'uid':uid, 

                  'feature':str(feature), 

                  'trim_user':str(trim_user),

                  'since_id':str(since_id),

                  'count':str(count)}

        if proxy is not None:

            proxies = {'http':"http://%s"%proxy}

        else:

            proxies = None

        try:

            res = requests.get(url = Sina.TIMELINE_BASE_URL, params=params, timeout=CT.TIMEOUT, proxies=proxies)    

            #res = grequests.request('GET', Sina.TIMELINE_BASE_URL, params=params, timeout=CT.TIMEOUT, proxies=proxies)

        except Exception as E:

            print 'sina get timeline exception:', E

            #self.request_timeout(proxy)

        

        print res.text

        #if res is not None:

        #    res = grequests.map([res])[0]

        #    self.check_response(res.text)

        #    #self.save_queue.put({'type':'origin','content':res.text+'\n'})

        #    since_id = self.parse_weibo_json(res.text)

        #    return since_id


    def check_response(self, text):

        data = SJ.loads(text)

        if data is None:

            error_msg = {'msg':'',

                         'error_code':ERROR.NO_RETURN_VALUE,

                         'caller':''}

            raise Exception(str(error_msg))

        elif type(data) == dict:

            if data.get('error_code') is not None:

                error_msg = {'msg':data.get('error'),

                             'error_code':data.get('error_code'),

                             'caller':data.get('request')}

                raise Exception(str(error_msg))


    def request_timeout(self, proxy):

        error_msg = {'msg':'%s request time out'%proxy,

                     'error_code':ERROR.REQUEST_TIMEOUT,

                     'proxy':proxy}

        raise Exception(str(error_msg))

        


    def get_tags(self, access_token, uid, count = 20, proxy=None):

        res = None

        tags = []

        params = {'access_token':access_token, 

                  'uid':uid,

                  'count':count} 

        if proxy is not None:

            proxies = {'http':"http://%s"%proxy}

        else:

            proxies = None

        try:

            #res = grequests.request('GET', url = Sina.TAG_BASE_URL, params=params, timeout=CT.TIMEOUT, proxies=proxies)    

            res = requests.get (url = Sina.TAG_BASE_URL, params=params, timeout=CT.TIMEOUT, proxies=proxies)    

        except Exception as E:

            #self.request_timeout(proxy)

            pass

        print res.text

        #if res is not None:

        #    res = grequests.map([res])[0]

        #    self.check_response(res.text)

        #    tags = self.parse_tag_json(res.text)

        #return tags


    def get_basicinfo(self, access_token, uid, proxy):

        res = None

        params = {'access_token':access_token, 

                  'uid':uid}

        if proxy is not None:

            proxies = {'http':"http://%s"%proxy}

        else:

            proxies = None

        try:

            #res = grequests.request('GET', url = Sina.BASEINFO_BASE_URL, params=params, timeout = CT.TIMEOUT, proxies=proxies)    

            res = requests.get (url = Sina.BASEINFO_BASE_URL, params=params, timeout = CT.TIMEOUT, proxies=proxies)    

        except Exception as E:

            print 'sina get basicinfo exception:', E

            #self.request_timeout(proxy)

        print res.text

        #if res is not None:

        #    res = grequests.map([res])[0]

        #    self.check_response(res.text)

        #    basicinfo = self.parse_basicinfo_json(res.text)

        #return basicinfo


    def get_weibo_by_id(self, access_token, weiboid, proxy):

        res = None

        params = {'access_token':access_token,

                  'id':weiboid}

        if proxy is not None:

            proxies = {'http':"http://%s"%proxy}

        else:

            proxies = None

        try:

            #res = grequests.request('GET', url = Sina.WEIBO_BASE_URL, params=params, timeout = CT.TIMEOUT, proxies=proxies) 

            res = requests.get (url = Sina.WEIBO_BASE_URL, params=params, timeout = CT.TIMEOUT, proxies=proxies) 

        except Exception as E:

            print 'sina get weibo by id exception:', E

            #self.request_timeout(proxy)

        print res.text

        #if res is not None:

        #    res = grequests.map([res])[0]

        #    self.check_response(res.text)

        #    tmp_data = SJ.loads(res.text)

        #    userdata = SJ.dumps(tmp_data.get('user'))

        #    userdata = self.parse_basicinfo_json(userdata)

        #    weibodata = self.parse_single_weibo(res.text)

        #    return {'userdata':userdata, 'weibodata':weibodata}


    def get_fwdcn_by_ids(self, access_token, weiboids, proxy):

        res = None

        weiboids = ','.join(weiboids)

        params = {'access_token':access_token,

                  'ids':weiboids}

        if proxy is not None:

            proxies = {'http':"http://%s"%proxy}

        else:

            proxies = None

        try:

            res = grequests.request('GET', url = Sina.FWDCN_BASE_URL, params=params, timeout = CT.TIMEOUT, proxies=proxies) 

            res = grequests.map([res])[0]

            print res.text

        except Exception as E:

            print E

            self.request_timeout(proxy)



    def get_repost_weibo_list(self, access_token, weiboid, proxy):

        res = None

        params = {'access_token':access_token,

                  'id':weiboid}

        if proxy is not None:

            proxies = {'http':"http://%s"%proxy}

        else:

            proxies = None

        try:

            res = grequests.request('GET', url = Sina.REPOST_WEIBO_LIST_BASE_URL, params=params, timeout = CT.TIMEOUT, proxies=proxies) 

            res = grequests.map([res])[0]

            print res.text

        except Exception as E:

            print 'sina repost weibo exception:', E

            self.request_timeout(proxy)


    def get_repost_weiboid_list(self, access_token, weiboid, proxy):

        res = None

        params = {'access_token':access_token,

                  'id':weiboid}

        if proxy is not None:

            proxies = {'http':"http://%s"%proxy}

        else:

            proxies = None

        try:

            res = grequests.request('GET', url = Sina.REPOST_WEIBOID_LIST_BASE_URL, params=params, timeout = CT.TIMEOUT, proxies=proxies) 

            res = grequests.map([res])[0]

            print res.text

        except Exception as E:

            print E

            self.request_timeout(proxy)


    def parse_tag_json(self, text):

        data = SJ.loads(text)

        tags = []

        try:

            for item in data:

                for k, v in item.iteritems():

                    if k != 'weight':

                        tags.append(v)

            return tags

        except Exception as E:

            print E

            traceback.print_stack()

            print data


    def parse_weibo_json(self, text):

        save_content = ''

        since_id = '0'

        try:

            data = SJ.loads(text)

            timeline = data.get('statuses',[])

            for l in timeline:

                uid = str(l.get('uid'))

                text = l.get('text')

                weiboid = str(l.get('id'))

                save_content = '%s\t%s\t%s\n'%(uid, text, weiboid)

                #self.save_queue.put({'type':'trimmed', 'content':save_content, 'usid':uid})

            if timeline not in [None,[]]:

                l = timeline[0]

                since_id = str(l.get('id'))

        except Exception as E:

            print E

            traceback.print_stack()

        finally:

            return since_id


    def parse_basicinfo_json(self, text):

        try:

            PROFILE_HEADER = 'http://weibo.com/'

            data = SJ.loads(text)

            location = CU.convert_utf8(data.get('location',''))

            gender = data.get('gender','')

            isverified = data.get('verified','')

            username = CU.convert_utf8(data.get('screen_name',''))

            icon = data.get('avatar_large','')

            usid = data.get('idstr','')

            city_code = data.get('city','')

            province_code = data.get('province','')

            followers_ct = data.get('followers_count', 0)

            if usid not in['',None]:

                profile_url = PROFILE_HEADER + usid

            if isverified:

                isverified = 1

                description = data.get('verified_reason')

            else:

                isverified = 0

                description = data.get('description')

            if description is not None:

                description = CU.convert_utf8(description)

            return {'location':location,

                    'gender':gender,

                    'isverified':isverified,

                    'username':username,

                    'icon':icon,

                    'description':description,

                    'profile_url':profile_url, 

                    'city_code':city_code,

                    'province_code':province_code, 

                    'followers_ct':followers_ct}

        except Exception as E:

            print E

            traceback.print_stack()


    def parse_single_weibo(self, text):

        try:

            data = SJ.loads(text)

            time = data.get('created_at')

            tp_time = ''.join(re.findall(r'(\+)(\d+?)( )',time)[0])

            time = time.replace(tp_time,'')

            time = CU.formatted_str_to_millis(time, '%a %b %d %H:%M:%S %Y')

            text = data.get('text')

            source = data.get('source')

            image = data.get('bmiddle_pic','')

            return {'date':time, 'content':text, 'image':image}    

        except Exception as E:

            print E

             

    def check(self):

        dead_crawl_thread_count = 0 

        crawl_thread_over = False

        dead_save_thread_count = 0 

        save_thread_over = False

        while(True):

            for thread in self.crawl_thread_pool:

                if not thread.isAlive():

                    dead_crawl_thread_count += 1

            if dead_crawl_thread_count == len(self.crawl_thread_pool):

                crawl_thread_over = True


            for thread in self.save_thread_pool:

                if not thread.isAlive():

                    dead_save_thread_count += 1

            if dead_save_thread_count == len(self.save_thread_pool):

                save_thread_over = True


            if save_thread_over and crawl_thread_over:

                break


    def run_crawler(self):

        for i in range(self.crawl_thread_amount):

            crawl_thread = threading.Thread(target=self.get_timeline())



    def test_usage_limit(self, access_token, proxy):

        url = 'https://api.weibo.com/2/account/rate_limit_status.json'

        params = {'access_token':access_token,

                   'source':'1917566200'}

        proxies = {'http':"http://%s"%proxy}

        res = requests.get(url = url, params=params, timeout = CT.TIMEOUT, proxies=proxies)

        print res.text



    def test_unit(self):

        jobs = []

        import gevent

        for i in range(100):

            jobs.append(gevent.spawn(self.test_ip_limit))

        gevent.joinall(jobs)


    def test_loop(self):

        while(True):

            self.test_unit()


    def test_ip_limit(self):

        plat = 'sina'

        #ip_port = '223.4.241.244:3128'

        #ip_port = '80.18.170.245:3128'

        ip_port = '218.57.136.202:80'

        proxy = {'http': "http://%s/" % ip_port}

        self.get_timeline('2.008kKrVCGZulFC85b3b9496f0iMsYD','1816963684', count=1, proxy = proxy)

        IPPool.inc_ip_used_count(key={'proxy':ip_port}, plat=plat, step = 1)

        print IPPool.get_ip_used_count(key={'proxy':ip_port}, plat = plat)


if __name__ == '__main__':
    # Manual smoke test: fetch one page of a fixed account's timeline
    # through the singleton client (no proxy).
    sina = Sina.get_instance()
    acs_token = '2.008kKrVCGZulFC85b3b9496f0iMsYD'
    sina.get_timeline(acs_token, '2958598935', count=100, since_id='3497205966678321')

    


共有 人打赏支持
粉丝 7
博文 147
码字总数 57285
×
昏鸦
如果觉得我的文章对您有用,请随意打赏。您的支持将鼓励我继续创作!
* 金额(元)
¥1 ¥5 ¥10 ¥20 其他金额
打赏人
留言
* 支付类型
微信扫码支付
打赏金额:
已支付成功
打赏金额: