I first implemented the crawling with regular expressions, but it picked up a lot of unnecessary links, so I rewrote it with goquery:
// judgeUrl project judgeUrl.go
package judgeUrl

import (
	"strings"
)

// IsUrl reports whether str looks like a crawlable link, filtering out
// in-page anchors, protocol-relative links, executables, template
// placeholders, and javascript pseudo-URLs.
func IsUrl(str string) bool {
	if strings.HasPrefix(str, "#") || strings.HasPrefix(str, "//") ||
		strings.HasSuffix(str, ".exe") || strings.HasSuffix(str, ":void(0);") {
		return false
	}
	if strings.HasPrefix(str, "{") && strings.HasSuffix(str, "}") {
		return false
	}
	if strings.EqualFold(str, "javascript:;") {
		return false
	}
	return true
}
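// Usage examples (my annotations, not part of the original code):
// IsUrl("#top"), IsUrl("javascript:;"), and IsUrl("{placeholder}") all
// return false, while IsUrl("/index.html") returns true.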
// SamePathUrl joins a relative href onto preUrl. mark == 1 means the href
// is relative to the current directory ("/x" or "./x"); mark == 2 means it
// climbs one directory up ("../x").
func SamePathUrl(preUrl string, url string, mark int) (newUrl string) {
	last := strings.LastIndex(preUrl, "/")
	if last == 6 { // preUrl is a bare "http://host" with no path
		newUrl = preUrl + strings.TrimPrefix(url, ".")
	} else if mark == 1 {
		// "./x" -> "/x", then replace the last path segment
		newUrl = preUrl[:last] + strings.TrimPrefix(url, ".")
	} else {
		// drop the last path segment, then strip the ".." from the href
		newPreUrl := preUrl[:last]
		newLast := strings.LastIndex(newPreUrl, "/")
		newUrl = newPreUrl[:newLast] + strings.TrimPrefix(url, "..")
	}
	return newUrl
}
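The hand-rolled path joining above only covers a few cases; Go's standard library can do full relative-reference resolution. A minimal, self-contained sketch using net/url (not part of the original crawler; the example URLs are made up):

// Sketch: resolving a relative href against a base URL with net/url.
package main

import (
	"fmt"
	"net/url"
)

func main() {
	base, err := url.Parse("http://www.hao123.com/a/b.html")
	if err != nil {
		panic(err)
	}
	ref, err := url.Parse("../c.html")
	if err != nil {
		panic(err)
	}
	// ResolveReference applies the standard RFC 3986 resolution rules.
	fmt.Println(base.ResolveReference(ref)) // http://www.hao123.com/c.html
}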
// WebUrls_Spider project main.go
package main

import (
	"fmt"
	"os"
	"strings"

	"github.com/PuerkitoBio/goquery"
	"test/judgeUrl"
)

var urlMap map[string]bool // guards against revisiting a link and looping forever, though with enough links it could still exhaust memory
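// remember sketches one answer to the memory worry above: cap how many
// links the map may hold. This helper is my addition, not part of the
// original crawler, and maxURLs is a hypothetical limit.
const maxURLs = 1000000

func remember(href string) bool {
	if len(urlMap) >= maxURLs {
		return false // cap reached: treat everything as already seen
	}
	if _, ok := urlMap[href]; ok {
		return false
	}
	urlMap[href] = true
	return true
}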
func fetch(url string, count int) {
	if count > 1 { // cap the crawl depth at one level below the start page
		return
	}
	doc, err := goquery.NewDocument(url) // fetches the page and parses it
	if err != nil {
		return
	}
	doc.Find("a").Each(func(i int, a *goquery.Selection) {
		href, exists := a.Attr("href")
		if !exists {
			return
		}
		href = strings.TrimSpace(href)
		if len(href) > 2 && judgeUrl.IsUrl(href) {
			if _, ok := urlMap[href]; !ok {
				fmt.Println("original url:", href)
				if strings.HasPrefix(href, "/") || strings.HasPrefix(href, "./") {
					href = judgeUrl.SamePathUrl(url, href, 1)
				} else if strings.HasPrefix(href, "../") {
					href = judgeUrl.SamePathUrl(url, href, 2)
				}
				fmt.Println("resolved url:", href)
				urlMap[href] = true
				fetch(href, count+1)
			}
		}
	})
}
func writeValues(outfile string) error {
	file, err := os.Create(outfile)
	if err != nil {
		fmt.Printf("failed to create %s!\n", outfile)
		return err
	}
	defer file.Close()
	for k := range urlMap {
		file.WriteString(k + "\n")
	}
	return nil
}
func main() {
	urlMap = make(map[string]bool, 1000000)
	fetch("http://www.hao123.com/", 0)
	writeValues("urls.dat")
}
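One more note on the goquery call: newer goquery releases deprecate NewDocument in favor of fetching the page yourself and parsing the body with NewDocumentFromReader. A minimal, self-contained sketch of that pattern (the helper name fetchDoc is mine, and it assumes goquery v1.5 or later):

package main

import (
	"fmt"
	"net/http"

	"github.com/PuerkitoBio/goquery"
)

// fetchDoc downloads url over HTTP and hands the body to goquery.
func fetchDoc(url string) (*goquery.Document, error) {
	resp, err := http.Get(url)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	return goquery.NewDocumentFromReader(resp.Body)
}

func main() {
	doc, err := fetchDoc("http://www.hao123.com/")
	if err != nil {
		fmt.Println("fetch failed:", err)
		return
	}
	fmt.Println(doc.Find("a").Length(), "links found")
}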