文档章节

抓取微博

昏鸦
 昏鸦
发布于 2014/03/03 15:47
字数 1242
阅读 233
收藏 8

import os

from os import path

import sys

working_dir = os.path.abspath(os.path.realpath(__file__)+ '/../../')

sys.path.append(working_dir)

import requests

import common.constants as CT

import common.utils as CU

import json

import simplejson as SJ

from Queue import Queue

import threading

from socialplatform import SocialPlatform

import common.errorcodes as ERROR

from pool.ippool import IPPool

import traceback

import threading

import re

#import grequests


class Sina(SocialPlatform):

    TIMELINE_BASE_URL = 'https://api.weibo.com/2/statuses/user_timeline.json?'

    TAG_BASE_URL = 'https://api.weibo.com/2/tags.json'

    BASEINFO_BASE_URL = 'https://api.weibo.com/2/users/show.json'

    WEIBO_BASE_URL = 'https://api.weibo.com/2/statuses/show.json'

    REPOST_WEIBO_LIST_BASE_URL = 'https://api.weibo.com/2/statuses/repost_timeline.json'

    REPOST_WEIBOID_LIST_BASE_URL = 'https://api.weibo.com/2/statuses/repost_timeline/ids.json' 

    FWDCN_BASE_URL = 'https://api.weibo.com/2/statuses/count.json'

    save_queue = Queue()

    sina = None


    def __init__(self):

        super(Sina, self).__init__()

        #th = threading.Thread(target = self.store)

        #th.start()


    @classmethod

    def get_instance(cls):

        if cls.sina is None:

            cls.sina = Sina()

            print 'new sina instance'

        else:

            print 'sina obj existed'

        return cls.sina


    #get trimmed origin weibo by default 

    def get_timeline(self, access_token, uid, feature='1', trim_user='1', since_id='0', count=20, proxy = None):

        res = None

        params = {

                  #'source':'1917566200',

                  'access_token':access_token, 

                  'uid':uid, 

                  'feature':str(feature), 

                  'trim_user':str(trim_user),

                  'since_id':str(since_id),

                  'count':str(count)}

        if proxy is not None:

            proxies = {'http':"http://%s"%proxy}

        else:

            proxies = None

        try:

            res = requests.get(url = Sina.TIMELINE_BASE_URL, params=params, timeout=CT.TIMEOUT, proxies=proxies)    

            #res = grequests.request('GET', Sina.TIMELINE_BASE_URL, params=params, timeout=CT.TIMEOUT, proxies=proxies)

        except Exception as E:

            print 'sina get timeline exception:', E

            #self.request_timeout(proxy)

        

        print res.text

        #if res is not None:

        #    res = grequests.map([res])[0]

        #    self.check_response(res.text)

        #    #self.save_queue.put({'type':'origin','content':res.text+'\n'})

        #    since_id = self.parse_weibo_json(res.text)

        #    return since_id


    def check_response(self, text):

        data = SJ.loads(text)

        if data is None:

            error_msg = {'msg':'',

                         'error_code':ERROR.NO_RETURN_VALUE,

                         'caller':''}

            raise Exception(str(error_msg))

        elif type(data) == dict:

            if data.get('error_code') is not None:

                error_msg = {'msg':data.get('error'),

                             'error_code':data.get('error_code'),

                             'caller':data.get('request')}

                raise Exception(str(error_msg))


    def request_timeout(self, proxy):

        error_msg = {'msg':'%s request time out'%proxy,

                     'error_code':ERROR.REQUEST_TIMEOUT,

                     'proxy':proxy}

        raise Exception(str(error_msg))

        


    def get_tags(self, access_token, uid, count = 20, proxy=None):

        res = None

        tags = []

        params = {'access_token':access_token, 

                  'uid':uid,

                  'count':count} 

        if proxy is not None:

            proxies = {'http':"http://%s"%proxy}

        else:

            proxies = None

        try:

            #res = grequests.request('GET', url = Sina.TAG_BASE_URL, params=params, timeout=CT.TIMEOUT, proxies=proxies)    

            res = requests.get (url = Sina.TAG_BASE_URL, params=params, timeout=CT.TIMEOUT, proxies=proxies)    

        except Exception as E:

            #self.request_timeout(proxy)

            pass

        print res.text

        #if res is not None:

        #    res = grequests.map([res])[0]

        #    self.check_response(res.text)

        #    tags = self.parse_tag_json(res.text)

        #return tags


    def get_basicinfo(self, access_token, uid, proxy):

        res = None

        params = {'access_token':access_token, 

                  'uid':uid}

        if proxy is not None:

            proxies = {'http':"http://%s"%proxy}

        else:

            proxies = None

        try:

            #res = grequests.request('GET', url = Sina.BASEINFO_BASE_URL, params=params, timeout = CT.TIMEOUT, proxies=proxies)    

            res = requests.get (url = Sina.BASEINFO_BASE_URL, params=params, timeout = CT.TIMEOUT, proxies=proxies)    

        except Exception as E:

            print 'sina get basicinfo exception:', E

            #self.request_timeout(proxy)

        print res.text

        #if res is not None:

        #    res = grequests.map([res])[0]

        #    self.check_response(res.text)

        #    basicinfo = self.parse_basicinfo_json(res.text)

        #return basicinfo


    def get_weibo_by_id(self, access_token, weiboid, proxy):

        res = None

        params = {'access_token':access_token,

                  'id':weiboid}

        if proxy is not None:

            proxies = {'http':"http://%s"%proxy}

        else:

            proxies = None

        try:

            #res = grequests.request('GET', url = Sina.WEIBO_BASE_URL, params=params, timeout = CT.TIMEOUT, proxies=proxies) 

            res = requests.get (url = Sina.WEIBO_BASE_URL, params=params, timeout = CT.TIMEOUT, proxies=proxies) 

        except Exception as E:

            print 'sina get weibo by id exception:', E

            #self.request_timeout(proxy)

        print res.text

        #if res is not None:

        #    res = grequests.map([res])[0]

        #    self.check_response(res.text)

        #    tmp_data = SJ.loads(res.text)

        #    userdata = SJ.dumps(tmp_data.get('user'))

        #    userdata = self.parse_basicinfo_json(userdata)

        #    weibodata = self.parse_single_weibo(res.text)

        #    return {'userdata':userdata, 'weibodata':weibodata}


    def get_fwdcn_by_ids(self, access_token, weiboids, proxy):

        res = None

        weiboids = ','.join(weiboids)

        params = {'access_token':access_token,

                  'ids':weiboids}

        if proxy is not None:

            proxies = {'http':"http://%s"%proxy}

        else:

            proxies = None

        try:

            res = grequests.request('GET', url = Sina.FWDCN_BASE_URL, params=params, timeout = CT.TIMEOUT, proxies=proxies) 

            res = grequests.map([res])[0]

            print res.text

        except Exception as E:

            print E

            self.request_timeout(proxy)



    def get_repost_weibo_list(self, access_token, weiboid, proxy):

        res = None

        params = {'access_token':access_token,

                  'id':weiboid}

        if proxy is not None:

            proxies = {'http':"http://%s"%proxy}

        else:

            proxies = None

        try:

            res = grequests.request('GET', url = Sina.REPOST_WEIBO_LIST_BASE_URL, params=params, timeout = CT.TIMEOUT, proxies=proxies) 

            res = grequests.map([res])[0]

            print res.text

        except Exception as E:

            print 'sina repost weibo exception:', E

            self.request_timeout(proxy)


    def get_repost_weiboid_list(self, access_token, weiboid, proxy):

        res = None

        params = {'access_token':access_token,

                  'id':weiboid}

        if proxy is not None:

            proxies = {'http':"http://%s"%proxy}

        else:

            proxies = None

        try:

            res = grequests.request('GET', url = Sina.REPOST_WEIBOID_LIST_BASE_URL, params=params, timeout = CT.TIMEOUT, proxies=proxies) 

            res = grequests.map([res])[0]

            print res.text

        except Exception as E:

            print E

            self.request_timeout(proxy)


    def parse_tag_json(self, text):

        data = SJ.loads(text)

        tags = []

        try:

            for item in data:

                for k, v in item.iteritems():

                    if k != 'weight':

                        tags.append(v)

            return tags

        except Exception as E:

            print E

            traceback.print_stack()

            print data


    def parse_weibo_json(self, text):

        save_content = ''

        since_id = '0'

        try:

            data = SJ.loads(text)

            timeline = data.get('statuses',[])

            for l in timeline:

                uid = str(l.get('uid'))

                text = l.get('text')

                weiboid = str(l.get('id'))

                save_content = '%s\t%s\t%s\n'%(uid, text, weiboid)

                #self.save_queue.put({'type':'trimmed', 'content':save_content, 'usid':uid})

            if timeline not in [None,[]]:

                l = timeline[0]

                since_id = str(l.get('id'))

        except Exception as E:

            print E

            traceback.print_stack()

        finally:

            return since_id


    def parse_basicinfo_json(self, text):

        try:

            PROFILE_HEADER = 'http://weibo.com/'

            data = SJ.loads(text)

            location = CU.convert_utf8(data.get('location',''))

            gender = data.get('gender','')

            isverified = data.get('verified','')

            username = CU.convert_utf8(data.get('screen_name',''))

            icon = data.get('avatar_large','')

            usid = data.get('idstr','')

            city_code = data.get('city','')

            province_code = data.get('province','')

            followers_ct = data.get('followers_count', 0)

            if usid not in['',None]:

                profile_url = PROFILE_HEADER + usid

            if isverified:

                isverified = 1

                description = data.get('verified_reason')

            else:

                isverified = 0

                description = data.get('description')

            if description is not None:

                description = CU.convert_utf8(description)

            return {'location':location,

                    'gender':gender,

                    'isverified':isverified,

                    'username':username,

                    'icon':icon,

                    'description':description,

                    'profile_url':profile_url, 

                    'city_code':city_code,

                    'province_code':province_code, 

                    'followers_ct':followers_ct}

        except Exception as E:

            print E

            traceback.print_stack()


    def parse_single_weibo(self, text):

        try:

            data = SJ.loads(text)

            time = data.get('created_at')

            tp_time = ''.join(re.findall(r'(\+)(\d+?)( )',time)[0])

            time = time.replace(tp_time,'')

            time = CU.formatted_str_to_millis(time, '%a %b %d %H:%M:%S %Y')

            text = data.get('text')

            source = data.get('source')

            image = data.get('bmiddle_pic','')

            return {'date':time, 'content':text, 'image':image}    

        except Exception as E:

            print E

             

    def check(self):

        dead_crawl_thread_count = 0 

        crawl_thread_over = False

        dead_save_thread_count = 0 

        save_thread_over = False

        while(True):

            for thread in self.crawl_thread_pool:

                if not thread.isAlive():

                    dead_crawl_thread_count += 1

            if dead_crawl_thread_count == len(self.crawl_thread_pool):

                crawl_thread_over = True


            for thread in self.save_thread_pool:

                if not thread.isAlive():

                    dead_save_thread_count += 1

            if dead_save_thread_count == len(self.save_thread_pool):

                save_thread_over = True


            if save_thread_over and crawl_thread_over:

                break


    def run_crawler(self):

        for i in range(self.crawl_thread_amount):

            crawl_thread = threading.Thread(target=self.get_timeline())



    def test_usage_limit(self, access_token, proxy):

        url = 'https://api.weibo.com/2/account/rate_limit_status.json'

        params = {'access_token':access_token,

                   'source':'1917566200'}

        proxies = {'http':"http://%s"%proxy}

        res = requests.get(url = url, params=params, timeout = CT.TIMEOUT, proxies=proxies)

        print res.text



    def test_unit(self):

        jobs = []

        import gevent

        for i in range(100):

            jobs.append(gevent.spawn(self.test_ip_limit))

        gevent.joinall(jobs)


    def test_loop(self):

        while(True):

            self.test_unit()


    def test_ip_limit(self):

        plat = 'sina'

        #ip_port = '223.4.241.244:3128'

        #ip_port = '80.18.170.245:3128'

        ip_port = '218.57.136.202:80'

        proxy = {'http': "http://%s/" % ip_port}

        self.get_timeline('2.008kKrVCGZulFC85b3b9496f0iMsYD','1816963684', count=1, proxy = proxy)

        IPPool.inc_ip_used_count(key={'proxy':ip_port}, plat=plat, step = 1)

        print IPPool.get_ip_used_count(key={'proxy':ip_port}, plat = plat)


if __name__ == '__main__':

    # Ad-hoc manual test driver: grab the singleton and fire one live
    # timeline request.  The commented lines below are an experiment log of
    # earlier probes (tags, basic info, single status, repost lists, rate
    # limits) kept for reference.
    sina = Sina.get_instance()

    #ip_port = '42.62.5.100:5978'

    #ip_port = '183.60.97.98:80'

    #ip_port = '222.178.37.42:1337'

    ##acs_token = '2.008kKrVCS5lgJB773f46691cvRBfDD'

    # NOTE(review): hard-coded live OAuth2 access token — expires and
    # should not be committed to source control.
    acs_token = '2.008kKrVCGZulFC85b3b9496f0iMsYD'

    #th = threading.Thread(target = sina.test_loop)

    #th.start()

    # Fetch up to 100 statuses for this uid, newer than the given since_id.
    sina.get_timeline(acs_token,'2958598935', count=100, since_id = '3497205966678321')

    #sina.get_tags (acs_token, '2958598935')

    #sina.get_basicinfo (acs_token, '2958598935', None)

    #sina.get_weibo_by_id (acs_token,'3592778635816523', None)

    

    #res = sina.get_basicinfo('2.008kKrVCGZulFC5bab580682RGYueB','1881428463',ip_port)

    #res = sina.get_weibo_by_id(acs_token, '3562960347780645', None)

    #sina.test_ip_limit()

    #wids = ['354464853658119', '354316072507065', '354285872621913']

    #sina.get_fwdcn_by_ids(acs_token, wids, None)

    #sina.get_repost_weibo_list('2.008kKrVCGZulFC5bab580682RGYueB', '3557946866271328', None)

    #sina.get_timeline(acs_token,'1881428463', count=3, proxy = None)

    #sina.get_tags('2.008kKrVCGZulFCdacfff9787oYTfeE','1881428463')

    #sina.get_timeline('2.0','1881428463', count=100, since_id = '3497205966678321', proxy = proxy)

    #sina.test_usage_limit('2.00khtAoBGZulFC3b30a5e1bepMslNB', '223.4.241.244:3128')

    #sina.test_usage_limit('2.00khtAoBGZulFC3b30a5e1bepMslNB', '183.60.97.98:80')

    #proxy = '222.197.214.91:808'

    #requests.get('http://211.151.139.231:8031', proxies = {'http':"http://%s"%proxy})


© 著作权归作者所有

共有 人打赏支持
昏鸦
粉丝 6
博文 182
码字总数 59043
作品 0
程序员
新浪微博抓取问题(怎么抓取含有被评论微博或被转发微博的本条微博)

如上图,抓取新浪微博,用词1或词2可以抓取到被评论的微博(即8月1日发的那条微博),有什么办法可以使本条微博也被抓取到(注:本条微博无词1、无词2,但被评论的微博中有词1、词2)?

遊魂嘢廆
2014/08/06
1K
2
如何用好Xpath以精确定位节点位置

最近使用MetaSeeker抓取新浪微博,可最近新浪微博的格式结构发生了些变化。 对于一条含有转发内容的微博,该微博的发布时间和转发内容的发布时间都位于class名为lf的节点中,我在MetaStudio中...

司徒春运
2011/08/10
1K
0
微博python爬虫,每日百万级数据

新浪微博绝对是一个巨大的,实时的语料库!对微博数据爬取和分析,有重大的意义。 比如,现在要调查工商银行的服务态度,就可以抓取微博内容中包含工商银行的微博语料,然后做情感分析,就可...

技术小能手
07/09
0
0
sina 微博 爬虫

关于sina微博登陆后的数据抓取(sina api除外), 需要抓微博粉丝数,weiboshu, 以及谈论某一个关键字数,但是当模拟登陆成功后,连续的大批量抓取(假如连续抓取10000条数据),新浪微博就...

xutaoding
2015/01/09
1K
3
Heritrix抓取新浪微博

最近想通过爬虫抓取新浪微博的信息,前些日子使用MetaSeeker抓取了很具有规范的微博信息。这两天了解到了一个新的工具,Heritrix。 由于我刚刚配置好heritrix,对heritrix也不熟悉。想请教下...

司徒春运
2011/07/26
6.1K
5

没有更多内容

加载失败,请刷新页面

加载更多

CentOS7防火墙firewalld操作

firewalld Linux上新用的防火墙软件,跟iptables差不多的工具。 firewall-cmd 是 firewalld 的字符界面管理工具,firewalld是CentOS7的一大特性,最大的好处有两个:支持动态更新,不用重启服...

dingdayu
今天
1
0
关于组件化的最初步

一个工程可能会有多个版本,有国际版、国内版、还有针对各种不同的渠道化的打包版本、这个属于我们日常经常见到的打包差异化版本需求。 而对于工程的开发,比如以前的公司,分成了有三大块业...

DannyCoder
今天
2
0
Spring的Resttemplate发送带header的post请求

private HttpHeaders getJsonHeader() { HttpHeaders headers = new HttpHeaders(); MediaType type = MediaType.parseMediaType("application/json; charset=UTF-8"); ......

qiang123
昨天
3
0
Spring Cloud Gateway 之 Only one connection receive subscriber allowed

都说Spring Cloud Gateway好,我也来试试,可是配置了总是报下面这个错误: java.lang.IllegalStateException: Only one connection receive subscriber allowed. 困扰了我几天的问题,原来...

ThinkGem
昨天
27
0
学习设计模式——观察者模式

1. 认识观察者模式 1. 定义:定义对象之间一种一对多的依赖关系,当一个对象状态发生变化时,依赖该对象的其他对象都会得到通知并进行相应的变化。 2. 组织结构: Subject:目标对象类,会被...

江左煤郎
昨天
4
0

没有更多内容

加载失败,请刷新页面

加载更多

返回顶部
顶部