Golang抓取百度图片搞笑分类并入库(mongodb)

原创
2018/06/28 14:01
阅读数 485

练手之作。先启动本机 MongoDB 服务(程序连接 127.0.0.1),并准备好代码中引用的第三方包和本地 ./eva 包,编译后即可运行;可在 Windows/Linux 下使用。勿作非法用途。

package main

import (
_"net/http"
"log"
"os"
"io/ioutil"
"crypto/md5"
"encoding/hex"
"github.com/davecgh/go-spew/spew"
"github.com/bitly/go-simplejson"
"labix.org/v2/mgo"
_"labix.org/v2/mgo/bson"
_"github.com/PuerkitoBio/gocrawl"
_"github.com/PuerkitoBio/goquery"
"./eva"
)

// Item is one scraped picture record; insert stores it in the BAIDU
// collection as-is.
type Item struct {
	// Full-size image: source URL plus the width/height reported by the listing.
	Imgurl              string
	Imgwidth, Imgheight int

	// Thumbnail: source URL plus the width/height reported by the listing.
	Thumburl                string
	Thumbwidth, Thumbheight int

	// Description and date strings copied from the listing entry.
	Desc, Date string
	// Likes is set to 0 by main when the item is created.
	Likes int

	// Paths (as produced by makeFname) that the image and its thumbnail
	// are downloaded to.
	Localfile, ThumbLocalfile string
}

// BAIDU is the MongoDB collection that insert writes scraped items to.
// main assigns it from connectMongo before the first insert call.
var BAIDU *mgo.Collection

// md5Byte returns the MD5 digest of s as a 32-character lowercase hex string.
func md5Byte(s []byte) string {
	digest := md5.Sum(s)
	return hex.EncodeToString(digest[:])
}

// makeFname returns the path under basedir at which the resource identified
// by s should be stored, creating the containing directory if needed.
//
// The path has the form ./<basedir>/<h[0:3]>/<h[3:6]>/<h[6:9]>/<h>, where h
// is the hex MD5 of s. The string is built with "/" by hand (not
// filepath.Join) so the returned value, which callers persist, keeps its
// exact "./..." form.
//
// A failure to create the directory is logged; the path is still returned.
func makeFname(basedir, s string) string {
	a := md5Byte([]byte(s))
	c := "./" + basedir + "/" + a[:3] + "/" + a[3:6] + "/" + a[6:9] + "/"
	// MkdirAll does nothing and returns nil when the directory already
	// exists, so no Stat pre-check is required.
	if err := os.MkdirAll(c, 0755); err != nil {
		log.Printf("[-] mkdir failed : %s (%v)", c, err)
	}
	return c + a
}

func downloadImage(url string, to string) {
headers := Eva.M{
"Accept-Encoding":"gzip,deflate,sdch",
"Referer":"http://image.baidu.com/channel/funny",
"Host":"image.baidu.com", 
}
cookies := Eva.M{
"user":"baidu", 
}
req := &Eva.Request{}
err, content, resp := req.Do("GET", url, nil, headers, cookies, 30)
if err != nil {
spew.Printf("[-] download failed(%d) : %s
", resp.StatusCode, url)
return
}
//spew.Dump(content, resp)
ioutil.WriteFile(to, content, 0755)
}

// insert dumps item to stdout and stores it in the BAIDU collection.
// A failed insert is logged rather than silently dropped.
func insert(item *Item) {
	spew.Dump(item)
	if err := BAIDU.Insert(item); err != nil {
		log.Printf("[-] insert failed : %s (%v)", item.Imgurl, err)
	}
}

// connectMongo dials MongoDB at 127.0.0.1 and returns the "baidu"
// collection of the "xiaohua" database together with the session backing
// it. It panics if the dial fails. The session is not closed here; the
// caller owns it.
func connectMongo() (*mgo.Collection, *mgo.Session) {
	sess, dialErr := mgo.Dial("127.0.0.1")
	if dialErr != nil {
		panic(dialErr)
	}

	// Optional: switch the session to monotonic behavior.
	sess.SetMode(mgo.Monotonic, true)

	coll := sess.DB("xiaohua").C("baidu")
	return coll, sess
}

func main() {
collection, session := connectMongo()
BAIDU = collection
defer func() {
session.Close()
if err := recover(); err != nil{
log.Println(err)
}
}()
var url ="http://image.baidu.com/channel/listjson?fr=channel&tag1=%E6%90%9E%E7%AC%91&tag2=%E5%85%A8%E9%83%A8&sorttype=0&pn=0&rn=30&ie=utf8&oe=utf-8&app=img.browse.channel.general&1393916968302"
body, err := Eva.HttpGet(url)
Eva.Check(err)
json, err := simplejson.NewJson(body)
if err != nil {
log.Fatal("error:", err)
}
//spew.Dump(json.Get("Data"))
data, err := json.Get("data").Array()
if err != nil {
log.Fatal("error:", err)
}

for _, v := range data {
vv := v.(map[string]interface {})
//spew.Dump(vv["date"])
item := new (Item)
item.Imgurl = vv["download_url"].(string)
item.Imgwidth = int(vv["image_width"].(float64))
item.Imgheight = int(vv["image_height"].(float64))
item.Thumburl = vv["thumbnail_url"].(string)
item.Thumbwidth = int(vv["thumbnail_width"].(float64))
item.Thumbheight = int(vv["thumbnail_height"].(float64))
item.Desc = vv["desc"].(string)
item.Date = vv["date"].(string)
item.Likes = 0
item.Localfile = makeFname("tmp", item.Imgurl);
//item.Localfile ="./tmp/"+ md5Byte([]byte(item.Imgurl))
downloadImage(item.Imgurl, item.Localfile)
item.ThumbLocalfile = makeFname("tmp", item.Thumburl);
//item.ThumbLocalfile ="./tmp/thumb/"+ md5Byte([]byte(item.Thumburl))
downloadImage(item.Thumburl, item.ThumbLocalfile)
//spew.Dump(item)
insert(item)
}
}
展开阅读全文
打赏
0
0 收藏
分享
加载中
万建宁博主
拷贝进编辑器,自动格式化
2018/06/29 13:54
回复
举报
没有格式化,缩进,看起来好难受
2018/06/28 17:46
回复
举报
更多评论
打赏
2 评论
0 收藏
0
分享
返回顶部
顶部