python3爬取页面内容并筛选

2018/04/24 10:41
阅读数 127
from urllib import request
import re
def getResponse(url):
    """Open *url* and return the response object produced by ``urlopen``.

    The URL is wrapped in a ``request.Request`` before it is opened. The
    response is returned unread; reading it is left to the caller.
    """
    return request.urlopen(request.Request(url))
def getData(data):
    """Return every ``alt="..."`` attribute in *data* whose value is Chinese text.

    A value qualifies when it is non-empty and consists only of characters
    in the range U+4E00..U+9FA5 and whitespace.

    Args:
        data: The page source to search, as a string.

    Returns:
        A list of the matched substrings, each including the ``alt="``
        prefix and the closing quote, in order of appearance. The list is
        empty when nothing matches.
    """
    # The character class uses a single-backslash whitespace escape. The
    # previous doubled backslash (inside a raw string) matched a literal
    # backslash and the letter "s" instead of whitespace.
    return re.findall(r'alt="[\u4E00-\u9FA5\s]+"', data)
# Step 1: crawl listing pages 1..122 and append every match returned by
# getData to c.txt. Matches are written back to back with no separator.
# The file is opened once for the whole crawl instead of once per match, and
# its encoding is pinned to UTF-8 so the Chinese text does not depend on the
# platform's default locale encoding.
with open('c.txt', 'a', encoding='utf-8') as f:
    for aid in range(1, 123):
        html = "http://www.zhijiaow.com/ShopMallList_%s_0.html" % aid
        # Use the response as a context manager so the connection is closed
        # as soon as the body has been read.
        with getResponse(html) as http_response:
            data = http_response.read().decode('utf8')
        f.writelines(getData(data))

# Step 2: copy c.txt to a.txt (appending), splitting the matches onto
# separate lines. Because the matches were written back to back, two
# neighbouring matches meet as `"alt="`; replacing that seam with a newline
# puts each value on its own line. The leading `alt="` of the first match
# and the closing quote of the last one are left in place.
with open('c.txt', 'r', encoding='utf-8') as f, \
        open('a.txt', 'a', encoding='utf-8') as w:
    for line in f:
        w.write(line.replace('"alt="', '\n'))

 

展开阅读全文
打赏
0
0 收藏
分享
加载中
更多评论
打赏
0 评论
0 收藏
0
分享
返回顶部
顶部