1.安装爬数据的库
$ go get github.com/PuerkitoBio/goquery
2.语法与JQuery读取选择器一致
2.1.参考JQuery语法
https://www.w3school.com.cn/jquery/jquery_selectors.asp
以下为 jQuery 选择器示例:
$("p") 选取所有 <p> 元素。
$("p.intro") 选取所有 class="intro" 的 <p> 元素。
$("p#demo") 选取所有 id="demo" 的 <p> 元素。
2.2.爬数据表达式例子:
div.info-qh > div > p
解释
div.info-qh:class 为 info-qh 的 div 元素;">" 表示直接子元素,因此整个表达式选取该 div 的直接子 div 下的直接子 <p> 元素。
3.例子
-- Table sp_Artical: one row per article, holding its title.
-- Any existing table of the same name is dropped first, so its data is lost.
DROP TABLE IF EXISTS `sp_Artical`;
CREATE TABLE `sp_Artical` (
-- Auto-incrementing primary key.
`id` int(11) NOT NULL AUTO_INCREMENT,
-- Article title, at most 145 characters; nullable.
`Title` varchar(145) DEFAULT NULL,
PRIMARY KEY (`id`)
-- AUTO_INCREMENT=33 sets the next generated id for the new table.
) ENGINE=InnoDB AUTO_INCREMENT=33 DEFAULT CHARSET=utf8;
-- Table sp_Artical_Content: text fragments belonging to an article.
-- Any existing table of the same name is dropped first, so its data is lost.
DROP TABLE IF EXISTS `sp_Artical_Content`;
CREATE TABLE `sp_Artical_Content` (
-- Auto-incrementing primary key.
`ID` int(11) NOT NULL AUTO_INCREMENT,
-- Id of the owning article; no foreign-key constraint is declared here.
-- NOTE(review): presumably refers to sp_Artical.id - confirm.
`Artical_ID` int(11) DEFAULT NULL,
-- Fragment text, at most 2000 characters; nullable.
`Content` varchar(2000) DEFAULT NULL,
PRIMARY KEY (`ID`)
-- AUTO_INCREMENT=1481 sets the next generated ID for the new table.
) ENGINE=InnoDB AUTO_INCREMENT=1481 DEFAULT CHARSET=utf8;
package main
import (
"./parse"
)
// main is the program entry point; it delegates all work to parse.StartToFetch.
func main() {
parse.StartToFetch()
}
package parse
import (
"log"
"../model"
"github.com/PuerkitoBio/goquery"
)
// Artical is the record StartToFetch creates through model.DB for each
// scraped link.
type Artical struct {
// ID is handed to ParseContent once the record has been created.
ID int
// Title holds the value of the link's title attribute.
Title string
}
// ArticalContent is the record ParseContent creates through model.DB for
// each non-empty paragraph it finds.
type ArticalContent struct {
ID int
// ArticalID is the articalID argument that was passed to ParseContent.
ArticalID int
// Content is the paragraph text as returned by Selection.Text.
Content string
}
// StartToFetch scrapes the article index page and stores every linked article.
//
// It downloads the index page and selects every <a> matched by
// "div.ch_content > div.ch_lii > div.ch_lii_left > a". For each link that has
// both a non-empty title and a non-empty href it creates an Artical through
// model.DB, downloads the linked page and passes it to ParseContent together
// with artical.ID.
//
// A failure to load the index page ends the process via log.Fatal. A failure
// for a single article (insert or download) is logged and the remaining links
// are still processed.
func StartToFetch() {
	doc, err := goquery.NewDocument("http://www.xxx.xxx")
	if err != nil {
		log.Fatal(err)
	}

	doc.Find("div.ch_content > div.ch_lii > div.ch_lii_left > a").Each(func(i int, s *goquery.Selection) {
		// Read both attributes first and act exactly once per link. Checking
		// inside an attribute loop would repeat the insert and the download
		// for every attribute that follows title and href on the same <a>.
		title, _ := s.Attr("title")
		href, _ := s.Attr("href")
		if title == "" || href == "" {
			return
		}

		artical := Artical{Title: title}
		if err := model.DB.Create(&artical).Error; err != nil {
			log.Printf("db.Create index: err : %v", err)
			return
		}

		// NOTE(review): href is used verbatim; a relative link would have to
		// be resolved against the index URL first - confirm against the site.
		page, err := goquery.NewDocument(href)
		if err != nil {
			// Report the failed download instead of dropping it silently.
			log.Printf("goquery.NewDocument %q: err : %v", href, err)
			return
		}
		ParseContent(page, artical.ID)
	})
}
// ParseContent stores the text of every paragraph matched by
// "div.info-qh > div > p" in doc as an ArticalContent tied to articalID.
//
// Paragraphs whose text is empty are skipped. A failed insert is logged and
// does not stop the remaining paragraphs from being processed.
func ParseContent(doc *goquery.Document, articalID int) {
	paragraphs := doc.Find("div.info-qh > div > p")
	paragraphs.Each(func(_ int, p *goquery.Selection) {
		text := p.Text()
		if text == "" {
			return
		}

		row := ArticalContent{ArticalID: articalID, Content: text}
		err := model.DB.Create(&row).Error
		if err == nil {
			return
		}
		log.Printf("db.Create content index: , err : %v", err)
	})
}
package model
import (
"fmt"
"log"
"github.com/jinzhu/gorm"
_ "github.com/jinzhu/gorm/dialects/mysql"
)
// Package-level database handle and the settings init uses to open it.
var (
	// DB is the shared gorm connection; init assigns it.
	DB *gorm.DB

	// NOTE(review): the credentials are hard-coded in source; consider
	// loading them from configuration or the environment instead.
	username = "root"
	password = "1qaz2wsx"
	dbName   = "blog"
)
func init() {
var err error
DB, err = gorm.Open("mysql", fmt.Sprintf("%s:%s@/%s?charset=utf8&parseTime=True&loc=Local", username, password, dbName))
if err != nil {
log.Fatalf(" gorm.Open.err: %v", err)
}
DB.SingularTable(true)
gorm.DefaultTableNameHandler = func(db *gorm.DB, defaultTableName string) string {
return "sp_" + defaultTableName
}
}