用Go写的爬取网站上的图片
package main
import (
"fmt"
"io"
"log"
"net/http"
"os"
"strings"
"sync"
"golang.org/x/net/html"
)
// wg tracks the in-flight downPic goroutines: main adds one count per picture
// before starting the downloads and waits on it, and downPic calls Done.
var wg sync.WaitGroup
func getHtml(url string) ([]string, error) {
res, err := http.Get(url)
if err != nil {
log.Fatal(err)
return nil, err
}
defer res.Body.Close()
htmlNode, err := html.Parse(res.Body)
if err != nil {
log.Fatal(err)
return nil, err
}
imgUrlSlice := getImage(htmlNode, nil)
return imgUrlSlice, nil
}
// getImage walks the tree rooted at n depth-first and appends attribute values
// to imgSlice, returning the grown slice.
//
// Despite its name it does not restrict itself to <img> tags: the condition
// below holds for every element node, so the values of all attributes of all
// elements are collected, in document order. Callers filter the result by
// substring afterwards.
//
// NOTE(review): the "IMG" comparison only matters for non-element nodes; the
// HTML parser presumably lower-cases tag names, so confirm the intent before
// tightening this condition, since main uses the non-image values as page links.
func getImage(n *html.Node, imgSlice []string) []string {
	collect := n.Type == html.ElementNode || n.Data == "IMG"
	if collect {
		for i := range n.Attr {
			imgSlice = append(imgSlice, n.Attr[i].Val)
		}
	}

	// Recurse into each child, threading the accumulated slice through.
	child := n.FirstChild
	for child != nil {
		imgSlice = getImage(child, imgSlice)
		child = child.NextSibling
	}
	return imgSlice
}
// downPic downloads url and stores it as path + <last URL segment>.
//
// path is used as a plain prefix, so it must end with a path separator when it
// names a directory. The function is meant to run as a goroutine counted in
// the package-level wg: it calls wg.Done exactly once on every exit path.
// Failures are logged and abandon only this download; they never terminate
// the process.
func downPic(path, url string) {
	// Deferred first so that early returns cannot leave wg.Wait blocked forever.
	defer wg.Done()

	// Check the raw index before adding 1; otherwise a missing "/" (-1) would
	// turn into 0 and slip through. A trailing "/" yields no file name either.
	idx := strings.LastIndex(url, "/")
	if idx < 0 || idx == len(url)-1 {
		fmt.Println("获取文件url错误")
		return
	}
	filename := url[idx+1:]

	// Fetch before creating the file so a failed request leaves no empty file.
	res, err := http.Get(url)
	if err != nil {
		log.Printf("downloading %s: %v", url, err)
		return
	}
	defer res.Body.Close()
	if res.StatusCode < 200 || res.StatusCode > 299 {
		log.Printf("downloading %s: unexpected status %s", url, res.Status)
		return
	}

	target := path + filename
	fp, err := os.Create(target)
	if err != nil {
		log.Printf("creating %s: %v", target, err)
		return
	}

	// Close explicitly: a failed Close can mean the data never reached disk.
	_, err = io.Copy(fp, res.Body)
	if cerr := fp.Close(); err == nil {
		err = cerr
	}
	if err != nil {
		log.Printf("saving %s to %s: %v", url, target, err)
	}
}
// main crawls one news page and the pages it links to, then downloads every
// picture found on them into the pic/ directory.
//
// Attribute values containing "mmsource" are treated as picture paths; on the
// start page, the remaining values containing "9964" are treated as paths of
// further pages to scan. Both kinds are resolved against domain.
func main() {
	const (
		domain = "http://big5.cri.cn"
		outDir = "pic/"
	)
	url := domain + "/gate/big5/news.cri.cn/gb/9964/2007/08/27/1326@1734234.htm"

	var picSlice, htmSlice []string

	// The same picture can be referenced more than once; downloading it twice
	// would make two goroutines write the same file concurrently.
	seen := make(map[string]bool)
	addPic := func(v string) {
		if !seen[v] {
			seen[v] = true
			picSlice = append(picSlice, v)
		}
	}

	// Without the start page there is nothing to do, so this failure is fatal.
	slice, err := getHtml(url)
	if err != nil {
		log.Fatal(err)
	}
	for _, v := range slice {
		switch {
		case strings.Contains(v, "mmsource"):
			addPic(v)
		case strings.Contains(v, "9964"):
			htmSlice = append(htmSlice, v)
		}
	}

	// A broken linked page only costs its own pictures; keep going.
	for _, page := range htmSlice {
		slice2, err := getHtml(domain + page)
		if err != nil {
			log.Printf("skipping page %s: %v", page, err)
			continue
		}
		for _, v := range slice2 {
			if strings.Contains(v, "mmsource") {
				addPic(v)
			}
		}
	}

	// os.Create does not create missing directories.
	if err := os.MkdirAll(outDir, 0755); err != nil {
		log.Fatal(err)
	}

	wg.Add(len(picSlice))
	for _, v := range picSlice {
		go downPic(outDir, domain+v)
	}
	wg.Wait()
	fmt.Println("已经完成")
}