文档章节

抓取微博

昏鸦
 昏鸦
发布于 2014/03/03 15:47
字数 1242
阅读 236
收藏 8

import os

from os import path

import sys

working_dir = os.path.abspath(os.path.realpath(__file__)+ '/../../')

sys.path.append(working_dir)

import requests

import common.constants as CT

import common.utils as CU

import json

import simplejson as SJ

from Queue import Queue

import threading

from socialplatform import SocialPlatform

import common.errorcodes as ERROR

from pool.ippool import IPPool

import traceback

import threading

import re

#import grequests


class Sina(SocialPlatform):

    TIMELINE_BASE_URL = 'https://api.weibo.com/2/statuses/user_timeline.json?'

    TAG_BASE_URL = 'https://api.weibo.com/2/tags.json'

    BASEINFO_BASE_URL = 'https://api.weibo.com/2/users/show.json'

    WEIBO_BASE_URL = 'https://api.weibo.com/2/statuses/show.json'

    REPOST_WEIBO_LIST_BASE_URL = 'https://api.weibo.com/2/statuses/repost_timeline.json'

    REPOST_WEIBOID_LIST_BASE_URL = 'https://api.weibo.com/2/statuses/repost_timeline/ids.json' 

    FWDCN_BASE_URL = 'https://api.weibo.com/2/statuses/count.json'

    save_queue = Queue()

    sina = None


    def __init__(self):

        super(Sina, self).__init__()

        #th = threading.Thread(target = self.store)

        #th.start()


    @classmethod

    def get_instance(cls):

        if cls.sina is None:

            cls.sina = Sina()

            print 'new sina instance'

        else:

            print 'sina obj existed'

        return cls.sina


    #get trimmed origin weibo by default 

    def get_timeline(self, access_token, uid, feature='1', trim_user='1', since_id='0', count=20, proxy = None):

        res = None

        params = {

                  #'source':'1917566200',

                  'access_token':access_token, 

                  'uid':uid, 

                  'feature':str(feature), 

                  'trim_user':str(trim_user),

                  'since_id':str(since_id),

                  'count':str(count)}

        if proxy is not None:

            proxies = {'http':"http://%s"%proxy}

        else:

            proxies = None

        try:

            res = requests.get(url = Sina.TIMELINE_BASE_URL, params=params, timeout=CT.TIMEOUT, proxies=proxies)    

            #res = grequests.request('GET', Sina.TIMELINE_BASE_URL, params=params, timeout=CT.TIMEOUT, proxies=proxies)

        except Exception as E:

            print 'sina get timeline exception:', E

            #self.request_timeout(proxy)

        

        print res.text

        #if res is not None:

        #    res = grequests.map([res])[0]

        #    self.check_response(res.text)

        #    #self.save_queue.put({'type':'origin','content':res.text+'\n'})

        #    since_id = self.parse_weibo_json(res.text)

        #    return since_id


    def check_response(self, text):

        data = SJ.loads(text)

        if data is None:

            error_msg = {'msg':'',

                         'error_code':ERROR.NO_RETURN_VALUE,

                         'caller':''}

            raise Exception(str(error_msg))

        elif type(data) == dict:

            if data.get('error_code') is not None:

                error_msg = {'msg':data.get('error'),

                             'error_code':data.get('error_code'),

                             'caller':data.get('request')}

                raise Exception(str(error_msg))


    def request_timeout(self, proxy):

        error_msg = {'msg':'%s request time out'%proxy,

                     'error_code':ERROR.REQUEST_TIMEOUT,

                     'proxy':proxy}

        raise Exception(str(error_msg))

        


    def get_tags(self, access_token, uid, count = 20, proxy=None):

        res = None

        tags = []

        params = {'access_token':access_token, 

                  'uid':uid,

                  'count':count} 

        if proxy is not None:

            proxies = {'http':"http://%s"%proxy}

        else:

            proxies = None

        try:

            #res = grequests.request('GET', url = Sina.TAG_BASE_URL, params=params, timeout=CT.TIMEOUT, proxies=proxies)    

            res = requests.get (url = Sina.TAG_BASE_URL, params=params, timeout=CT.TIMEOUT, proxies=proxies)    

        except Exception as E:

            #self.request_timeout(proxy)

            pass

        print res.text

        #if res is not None:

        #    res = grequests.map([res])[0]

        #    self.check_response(res.text)

        #    tags = self.parse_tag_json(res.text)

        #return tags


    def get_basicinfo(self, access_token, uid, proxy):

        res = None

        params = {'access_token':access_token, 

                  'uid':uid}

        if proxy is not None:

            proxies = {'http':"http://%s"%proxy}

        else:

            proxies = None

        try:

            #res = grequests.request('GET', url = Sina.BASEINFO_BASE_URL, params=params, timeout = CT.TIMEOUT, proxies=proxies)    

            res = requests.get (url = Sina.BASEINFO_BASE_URL, params=params, timeout = CT.TIMEOUT, proxies=proxies)    

        except Exception as E:

            print 'sina get basicinfo exception:', E

            #self.request_timeout(proxy)

        print res.text

        #if res is not None:

        #    res = grequests.map([res])[0]

        #    self.check_response(res.text)

        #    basicinfo = self.parse_basicinfo_json(res.text)

        #return basicinfo


    def get_weibo_by_id(self, access_token, weiboid, proxy):

        res = None

        params = {'access_token':access_token,

                  'id':weiboid}

        if proxy is not None:

            proxies = {'http':"http://%s"%proxy}

        else:

            proxies = None

        try:

            #res = grequests.request('GET', url = Sina.WEIBO_BASE_URL, params=params, timeout = CT.TIMEOUT, proxies=proxies) 

            res = requests.get (url = Sina.WEIBO_BASE_URL, params=params, timeout = CT.TIMEOUT, proxies=proxies) 

        except Exception as E:

            print 'sina get weibo by id exception:', E

            #self.request_timeout(proxy)

        print res.text

        #if res is not None:

        #    res = grequests.map([res])[0]

        #    self.check_response(res.text)

        #    tmp_data = SJ.loads(res.text)

        #    userdata = SJ.dumps(tmp_data.get('user'))

        #    userdata = self.parse_basicinfo_json(userdata)

        #    weibodata = self.parse_single_weibo(res.text)

        #    return {'userdata':userdata, 'weibodata':weibodata}


    def get_fwdcn_by_ids(self, access_token, weiboids, proxy):

        res = None

        weiboids = ','.join(weiboids)

        params = {'access_token':access_token,

                  'ids':weiboids}

        if proxy is not None:

            proxies = {'http':"http://%s"%proxy}

        else:

            proxies = None

        try:

            res = grequests.request('GET', url = Sina.FWDCN_BASE_URL, params=params, timeout = CT.TIMEOUT, proxies=proxies) 

            res = grequests.map([res])[0]

            print res.text

        except Exception as E:

            print E

            self.request_timeout(proxy)



    def get_repost_weibo_list(self, access_token, weiboid, proxy):

        res = None

        params = {'access_token':access_token,

                  'id':weiboid}

        if proxy is not None:

            proxies = {'http':"http://%s"%proxy}

        else:

            proxies = None

        try:

            res = grequests.request('GET', url = Sina.REPOST_WEIBO_LIST_BASE_URL, params=params, timeout = CT.TIMEOUT, proxies=proxies) 

            res = grequests.map([res])[0]

            print res.text

        except Exception as E:

            print 'sina repost weibo exception:', E

            self.request_timeout(proxy)


    def get_repost_weiboid_list(self, access_token, weiboid, proxy):

        res = None

        params = {'access_token':access_token,

                  'id':weiboid}

        if proxy is not None:

            proxies = {'http':"http://%s"%proxy}

        else:

            proxies = None

        try:

            res = grequests.request('GET', url = Sina.REPOST_WEIBOID_LIST_BASE_URL, params=params, timeout = CT.TIMEOUT, proxies=proxies) 

            res = grequests.map([res])[0]

            print res.text

        except Exception as E:

            print E

            self.request_timeout(proxy)


    def parse_tag_json(self, text):

        data = SJ.loads(text)

        tags = []

        try:

            for item in data:

                for k, v in item.iteritems():

                    if k != 'weight':

                        tags.append(v)

            return tags

        except Exception as E:

            print E

            traceback.print_stack()

            print data


    def parse_weibo_json(self, text):

        save_content = ''

        since_id = '0'

        try:

            data = SJ.loads(text)

            timeline = data.get('statuses',[])

            for l in timeline:

                uid = str(l.get('uid'))

                text = l.get('text')

                weiboid = str(l.get('id'))

                save_content = '%s\t%s\t%s\n'%(uid, text, weiboid)

                #self.save_queue.put({'type':'trimmed', 'content':save_content, 'usid':uid})

            if timeline not in [None,[]]:

                l = timeline[0]

                since_id = str(l.get('id'))

        except Exception as E:

            print E

            traceback.print_stack()

        finally:

            return since_id


    def parse_basicinfo_json(self, text):

        try:

            PROFILE_HEADER = 'http://weibo.com/'

            data = SJ.loads(text)

            location = CU.convert_utf8(data.get('location',''))

            gender = data.get('gender','')

            isverified = data.get('verified','')

            username = CU.convert_utf8(data.get('screen_name',''))

            icon = data.get('avatar_large','')

            usid = data.get('idstr','')

            city_code = data.get('city','')

            province_code = data.get('province','')

            followers_ct = data.get('followers_count', 0)

            if usid not in['',None]:

                profile_url = PROFILE_HEADER + usid

            if isverified:

                isverified = 1

                description = data.get('verified_reason')

            else:

                isverified = 0

                description = data.get('description')

            if description is not None:

                description = CU.convert_utf8(description)

            return {'location':location,

                    'gender':gender,

                    'isverified':isverified,

                    'username':username,

                    'icon':icon,

                    'description':description,

                    'profile_url':profile_url, 

                    'city_code':city_code,

                    'province_code':province_code, 

                    'followers_ct':followers_ct}

        except Exception as E:

            print E

            traceback.print_stack()


    def parse_single_weibo(self, text):

        try:

            data = SJ.loads(text)

            time = data.get('created_at')

            tp_time = ''.join(re.findall(r'(\+)(\d+?)( )',time)[0])

            time = time.replace(tp_time,'')

            time = CU.formatted_str_to_millis(time, '%a %b %d %H:%M:%S %Y')

            text = data.get('text')

            source = data.get('source')

            image = data.get('bmiddle_pic','')

            return {'date':time, 'content':text, 'image':image}    

        except Exception as E:

            print E

             

    def check(self):

        dead_crawl_thread_count = 0 

        crawl_thread_over = False

        dead_save_thread_count = 0 

        save_thread_over = False

        while(True):

            for thread in self.crawl_thread_pool:

                if not thread.isAlive():

                    dead_crawl_thread_count += 1

            if dead_crawl_thread_count == len(self.crawl_thread_pool):

                crawl_thread_over = True


            for thread in self.save_thread_pool:

                if not thread.isAlive():

                    dead_save_thread_count += 1

            if dead_save_thread_count == len(self.save_thread_pool):

                save_thread_over = True


            if save_thread_over and crawl_thread_over:

                break


    def run_crawler(self):

        for i in range(self.crawl_thread_amount):

            crawl_thread = threading.Thread(target=self.get_timeline())



    def test_usage_limit(self, access_token, proxy):

        url = 'https://api.weibo.com/2/account/rate_limit_status.json'

        params = {'access_token':access_token,

                   'source':'1917566200'}

        proxies = {'http':"http://%s"%proxy}

        res = requests.get(url = url, params=params, timeout = CT.TIMEOUT, proxies=proxies)

        print res.text



    def test_unit(self):

        jobs = []

        import gevent

        for i in range(100):

            jobs.append(gevent.spawn(self.test_ip_limit))

        gevent.joinall(jobs)


    def test_loop(self):

        while(True):

            self.test_unit()


    def test_ip_limit(self):

        plat = 'sina'

        #ip_port = '223.4.241.244:3128'

        #ip_port = '80.18.170.245:3128'

        ip_port = '218.57.136.202:80'

        proxy = {'http': "http://%s/" % ip_port}

        self.get_timeline('2.008kKrVCGZulFC85b3b9496f0iMsYD','1816963684', count=1, proxy = proxy)

        IPPool.inc_ip_used_count(key={'proxy':ip_port}, plat=plat, step = 1)

        print IPPool.get_ip_used_count(key={'proxy':ip_port}, plat = plat)


if __name__ == '__main__':
    # Manual smoke test: fetch one page (100 statuses) of a user's
    # timeline through the singleton crawler. The commented experiments
    # that used to live here (proxy probing, tag/profile fetches, repost
    # lists) were pruned; see the Sina test_* methods for those paths.
    crawler = Sina.get_instance()
    access_token = '2.008kKrVCGZulFC85b3b9496f0iMsYD'
    crawler.get_timeline(access_token, '2958598935', count=100,
                         since_id='3497205966678321')


© 著作权归作者所有

共有 人打赏支持
昏鸦
粉丝 7
博文 183
码字总数 59203
作品 0
程序员
私信 提问
微博python爬虫,每日百万级数据

新浪微博绝对是一个巨大的,实时的语料库!对微博数据爬取和分析,有重大的意义。 比如,现在要调查工商银行的服务态度,就可以抓取微博内容中包含工商银行的微博语料,然后做情感分析,就可...

技术小能手
07/09
0
0
新浪微博抓取问题(怎么抓取含有被评论微博或被转发微博的本条微博)

如上图,抓取新浪微博,用词1或词2可以抓取到被评论的微博(即8月1日发的那条微博),有什么办法可以使本条微博也被抓取到(注:本条微博无词1、无词2,但被评论的微博中有词1、词2)?

遊魂嘢廆
2014/08/06
1K
2
如何用好Xpath以精确定位节点位置

最近使用MetaSeeker抓取新浪微博,可最近新浪微博的格式结构发生了些变化。 对于一条含有转发内容的微博,该微博的发布时间和转发内容的发布时间都位于class名为lf的节点中,我在MetaStudio中...

司徒春运
2011/08/10
1K
0
sina 微博 爬虫

关于sina微博登陆后的数据抓取(sina api除外), 需要抓微博粉丝数、微博数, 以及谈论某一个关键字数,但是当模拟登陆成功后,连续的大批量抓取(假如连续抓取10000条数据),新浪微博就...

xutaoding
2015/01/09
1K
3
Heritrix抓取新浪微博

最近想通过爬虫抓取新浪微博的信息,前些日子使用MetaSeeker抓取了很具有规范的微博信息。这两天了解到了一个新的工具,Heritrix。 由于我刚刚配置好heritrix,对heritrix也不熟悉。想请教下...

司徒春运
2011/07/26
6.2K
5

没有更多内容

加载失败,请刷新页面

加载更多

tomcat编译超过64k大小的jsp文件报错原因

  今天遇到一个问题,首先是在tomcat中间件上跑的web项目,一个jsp文件,因为代码行数实在是太多了,更新了几个版本之后编译报错了,页面打开都是报500的错误,500的报错,知道http协议返回...

SEOwhywhy
20分钟前
0
0
flutter http 请求客户端

1、pubspec文件管理Flutter应用程序的assets(资源,如图片、package等)。 在pubspec.yaml中,通过网址“https://pub.dartlang.org/packages/http#-installing-tab-”确认版本号后,将http(0...

渣渣曦
20分钟前
0
0
Django基本命令及moduls举例

一、Django基本命令 1.创建项目 django-admin.py startproject mysite 创建后的项目结构:- mysite - mysite #对整个程序进行配置 - init #导入包专用- settings ...

枫叶云
35分钟前
6
0
zabbix安装

rpm -ivh http://repo.webtatic.com/yum/el6/latest.rpm 安装jdk rpm -ivh (自行在网上下载rpm包) 安装php并修改相应参数 yum -y install php56w php56w-gd php56w-mysqlnd php56w-bcmath......

muoushi
36分钟前
4
0
MySQL自增属性auto_increment_increment和auto_increment_offset

MySQL的系统变量或会话变量auto_increment_increment(自增步长)和auto_increment_offset(自增偏移量)控制着数据表的自增列ID。 mysql> show tables;Empty set (0.00 sec)mysql> CREATE TA......

野雪球
今天
8
0

没有更多内容

加载失败,请刷新页面

加载更多

返回顶部
顶部