class Consumer(threading.Thread):
    """Consumer thread: drains messages from a shared queue until told to stop.

    The producer side puts arbitrary messages on the queue; the literal
    string 'quit' is the agreed shutdown sentinel.
    """

    def __init__(self, queue):
        # Initialize the Thread machinery before storing our own state.
        threading.Thread.__init__(self)
        self._queue = queue

    def run(self):
        """Consume and print messages forever; exit on the 'quit' sentinel."""
        while True:
            msg = self._queue.get()
            # Only a *string* equal to 'quit' stops the loop; any other
            # message (including non-strings) is just reported.
            if isinstance(msg, str) and msg == 'quit':
                break
            # print() call (not Py2 print statement) so this runs on Python 3.
            print("I'm a thread, and I received %s!!" % msg)
        print('Bye byes!')
def producer(duration=5, interval=1):
    """Feed timestamped messages to a Consumer thread, then shut it down.

    Generalized from the original hard-coded 5-second / 1-second schedule;
    the defaults preserve the original behavior.

    Args:
        duration: how long (seconds) to keep producing messages.
        interval: pause (seconds) between successive messages.
    """
    msg_queue = Queue.Queue()
    worker = Consumer(msg_queue)
    worker.start()  # start the consumer thread
    start_time = time.time()
    while time.time() - start_time < duration:
        msg_queue.put('something at %s' % time.time())
        time.sleep(interval)
    # Sentinel understood by Consumer.run; then wait for a clean exit.
    msg_queue.put('quit')
    worker.join()


if __name__ == '__main__':
    producer()
使用多线程做爬虫时,生产者用于产生 URL 链接,消费者用于获取 URL 数据;在队列的帮助下,多线程可以加快爬虫速度。
import time
import threading

try:  # Python 2
    import Queue
except ImportError:  # Python 3: same API under a new name
    import queue as Queue

try:  # Python 2
    import urllib2
except ImportError:  # Python 3: urlopen lives in urllib.request
    import urllib.request as urllib2
class Consumer(threading.Thread):
    """Worker thread: fetches each URL taken from the shared queue.

    The string 'quit' on the queue is the shutdown sentinel; anything else
    is treated as a URL to fetch.
    """

    def __init__(self, queue):
        threading.Thread.__init__(self)
        self._queue = queue

    def run(self):
        """Fetch URLs from the queue until the 'quit' sentinel arrives."""
        while True:
            content = self._queue.get()
            print(content)
            if isinstance(content, str) and content == 'quit':
                break
            # Fix: the original leaked the response object (never read or
            # closed) and an exception here silently killed the worker.
            try:
                response = urllib2.urlopen(content)
                try:
                    response.read()
                finally:
                    response.close()
            except Exception as exc:  # keep the worker alive on bad URLs
                print('Failed to fetch %s: %s' % (content, exc))
        print('Bye byes!')
def Producer(urls=None, pool_size=4):
    """Fan a list of URLs out to a pool of Consumer worker threads.

    Generalized from the original hard-coded URL list and pool size; the
    defaults preserve the original demo behavior.

    Args:
        urls: iterable of URL strings to fetch; defaults to the demo list.
        pool_size: number of Consumer threads to start.
    """
    if urls is None:
        urls = [
            'http://211.103.242.133:8080/Disease/Details.aspx?id=2258',
            'http://211.103.242.133:8080/Disease/Details.aspx?id=2258',
            'http://211.103.242.133:8080/Disease/Details.aspx?id=2258',
            'http://211.103.242.133:8080/Disease/Details.aspx?id=2258'
        ]
    work_queue = Queue.Queue()
    worker_threads = build_worker_pool(work_queue, pool_size)
    start_time = time.time()
    for url in urls:
        work_queue.put(url)
    # One 'quit' sentinel per worker so every thread shuts down.
    for worker in worker_threads:
        work_queue.put('quit')
    for worker in worker_threads:
        worker.join()
    print('Done! Time taken: {}'.format(time.time() - start_time))
def build_worker_pool(queue, size):
    """Create, start, and return *size* Consumer workers reading *queue*."""
    pool = [Consumer(queue) for _ in range(size)]
    for consumer in pool:
        consumer.start()
    return pool
# Entry point: run the multi-threaded crawler demo when executed as a script.
if __name__ == '__main__':
    Producer()