test 包下面
zonghengtimeupdateTest.go
免责声明:本程序仅供读者学习使用,本人并没有攻击其他网站的意图。请读者在设置对应的源码参数时酌情考虑,切记,切记。
package test
//
import (
"dao"
"fmt"
"mode"
"strconv"
"strings"
"time"
"tools/otheroper"
"tools/reptilesoper/reptilesengine/gogo/dispatch"
"tools/reptilesoper/reptilesengine/gogo/timerupdate"
"tools/reptilesoper/reptilesengine/hand"
)
// TestStopTimeUpdate exercises stopping timers once their work is finished.
//
// It creates one timer per URL returned by dao.GetUrlBySite("zongheng").
// Each timer callback owns a queue holding the page numbers 1..5. Every
// invocation prints and consumes one page number followed by the timer's
// URL; once the queue is empty the callback returns a "StopTicker"
// instruction whose "Data" is "<timer name>_CD". All timers are then handed
// to dispatch.GoRun.
func TestStopTimeUpdate() {
	urls, _ := dao.GetUrlBySite("zongheng")

	const maxPage = 5
	pages := make([]int, 0, maxPage)
	for p := 1; p <= maxPage; p++ {
		pages = append(pages, p)
	}

	ts := make([]*timerupdate.TimerUpdate, 0, len(urls))
	for i, u := range urls {
		tuName := "BookTimerUpdate" + strconv.Itoa(i)
		// The callback only runs after this loop has finished, so it must not
		// capture the loop variable itself: before Go 1.22 every closure would
		// see the last URL.
		pageURL := u
		// Each callback keeps its own queue instead of sharing one map keyed
		// by timer name. The callbacks are handed to a multi-worker
		// dispatcher, so they presumably run on several goroutines, where
		// unsynchronised writes to a shared map are a data race. The queues
		// share a backing array but it is only read and re-sliced.
		remaining := pages
		task := func(args ...interface{}) interface{} {
			// Queue drained: tell the dispatch layer to stop this timer.
			if len(remaining) == 0 {
				fmt.Println("------------------定时器:", tuName, "已经完成抓取-----------")
				return map[string]interface{}{
					"Instruction": "StopTicker",
					"Data":        tuName + "_CD",
				}
			}
			fmt.Println(remaining[0])
			// To check that the other timers stop while one keeps running,
			// skip this dequeue for a single timer, e.g. when
			// tuName == "BookTimerUpdate6".
			remaining = remaining[1:]
			fmt.Println(pageURL)
			return nil
		}
		// Random per-timer interval between the bounds passed to RandInt64.
		cyt := time.Duration(otheroper.RandInt64(5, 22, true)) * time.Second
		ts = append(ts, timerupdate.CreateExcuTu(tuName, 1, cyt, task))
	}
	dispatch.GoRun(dispatch.GoWorkerNum(20), ts...)
}
// GetZX returns a *hand.ZongHengXS, exposed as a hand.IParseProcess, whose PP
// field holds a zero-valued set of request URLs and regular expressions.
func GetZX() hand.IParseProcess {
	// Every field is left empty here; nothing in this function fills them in.
	var rules struct {
		ClassifRequestUrl     string
		ClassifRegexpstr      string
		BookRequestUrl        string
		BookRegexpstr         string
		ChapterRequestUrl     string
		ChapterRegexpstr      string
		BookContentRequestUrl string
		BookContentRegexpstr  string
	}
	return &hand.ZongHengXS{PP: rules}
}
// GetBookListReptiles crawls the book lists of every category returned by
// dao.GetUrlBySite("zongheng").
//
// One timer is created per category URL. Each timer callback owns a queue
// with the page numbers 1..50; on every invocation it takes the next page
// number, asks the parser for the books on that page, and stores them with
// dao.BatchInsertBooks under the category's classify id. When the queue is
// empty the callback returns a "StopTicker" instruction whose "Data" is
// "<timer name>_CD". All timers are run through dispatch.GoRun.
func GetBookListReptiles() {
	urls, classifyids := dao.GetUrlBySite("zongheng")
	zh := GetZX()
	hand.ExcuteProcess(zh)

	// Simulated page indexes shared (read-only) by every category.
	const maxPage = 50
	pages := make([]int, 0, maxPage)
	for p := 1; p <= maxPage; p++ {
		pages = append(pages, p)
	}

	ts := make([]*timerupdate.TimerUpdate, 0, len(urls))
	for i, u := range urls {
		tuName := "BookTimerUpdate" + strconv.Itoa(i)
		// Resolve the category id now. The callback runs only after this loop
		// has ended, and before Go 1.22 a captured loop index would make every
		// timer store its books under the last category.
		classifyID := int(classifyids[i])
		// Turn the page number in the URL into a '#' placeholder so the
		// requested page number can be substituted later.
		pagedURL := strings.Replace(u, "p1", "p#", -1)
		// Each callback keeps its own queue instead of sharing one map keyed
		// by timer name. The callbacks are handed to a multi-worker
		// dispatcher, so they presumably run on several goroutines, where
		// unsynchronised writes to a shared map are a data race.
		remaining := pages
		task := func(args ...interface{}) interface{} {
			// Queue drained: tell the dispatch layer to stop this timer.
			if len(remaining) == 0 {
				fmt.Println("------------------定时器:", tuName, "已经完成抓取-----------")
				return map[string]interface{}{
					"Instruction": "StopTicker",
					"Data":        tuName + "_CD",
				}
			}

			page := remaining[0]
			fmt.Println("当前更新期的分页码:", page)
			// NOTE(review): the second return value of GetBooks is still
			// discarded; its type is not visible here. If it is an error it
			// should be checked before cs is used.
			cs, _ := zh.GetBooks(pagedURL, page)
			// The page counts as used whether or not anything was parsed.
			remaining = remaining[1:]

			var models []interface{}
			for j, item := range cs.Items {
				// Items and ResultUrls are read pairwise; stop instead of
				// panicking if the parser returned fewer URLs than names.
				if j >= len(cs.ResultUrls) {
					fmt.Println("book list: missing URLs from item", j, "on page", page)
					break
				}
				name, ok := item.(string)
				if !ok {
					fmt.Println("book list: skipping non-string item", j, "on page", page)
					continue
				}
				models = append(models, mode.Books{
					ClassifyId:    classifyID,
					Booksid:       0,
					Name:          name,
					SourceSiteUrl: cs.ResultUrls[j],
				})
				fmt.Println(name)
				fmt.Println(cs.ResultUrls[j])
			}
			// Store the books parsed from this page.
			dao.BatchInsertBooks(models)
			return nil
		}
		// Random per-timer interval between the bounds passed to RandInt64.
		cyt := time.Duration(otheroper.RandInt64(40, 80, true)) * time.Second
		ts = append(ts, timerupdate.CreateExcuTu(tuName, 1, cyt, task))
	}
	// Start all timers.
	dispatch.GoRun(dispatch.WorkerNum10, ts...)
}
// GetChatperReptiles crawls the chapter lists of the books returned by
// dao.BookPaging(42, 2, 50).
//
// One timer is created per book row. Its callback fetches the book's chapter
// index page, converts every parsed entry into a mode.Chapter, stores the
// chapters with dao.BatchInsertChapters (in several batches when there are
// more than 50), and then returns a "StopTicker" instruction whose "Data" is
// "<timer name>_CD". Rows and chapter links that do not have the expected
// shape are logged and skipped instead of causing a panic.
func GetChatperReptiles() {
	maps := dao.BookPaging(42, 2, 50)
	zh := GetZX()
	ts := make([]*timerupdate.TimerUpdate, 0, len(maps))
	for i, row := range maps {
		bookURL, ok := row["SourceSiteUrl"].(string)
		if !ok {
			fmt.Println("chapter list: row", i, "has no string SourceSiteUrl, skipped")
			continue
		}
		rawID, ok := row["Booksid"].(int64)
		if !ok {
			fmt.Println("chapter list: row", i, "has no int64 Booksid, skipped")
			continue
		}
		// Declared per iteration so each callback captures its own book id.
		booksID := int(rawID)

		// The book key is the file name (without extension) in the fifth
		// '/'-separated segment of the book URL.
		parts := strings.Split(bookURL, "/")
		if len(parts) < 5 {
			fmt.Println("chapter list: unexpected book URL, skipped:", bookURL)
			continue
		}
		bookKey := strings.Split(parts[4], ".")[0]
		chapterURL := strings.Replace("http://book.zongheng.com/showchapter/#.html", "#", bookKey, 1)

		tuName := "BookTimerUpdate" + strconv.Itoa(i)
		task := func(args ...interface{}) interface{} {
			var models []interface{}
			cs := zh.GetChapters(chapterURL)
			for j, item := range cs.Items {
				// Items and ResultUrls are read pairwise; stop instead of
				// panicking if the parser returned fewer URLs than titles.
				if j >= len(cs.ResultUrls) {
					fmt.Println("chapter list: missing URLs from item", j, "of book", booksID)
					break
				}
				link := cs.ResultUrls[j]
				title, ok := item.(string)
				if !ok {
					fmt.Println("chapter list: skipping non-string title", j, "of book", booksID)
					continue
				}
				// Chapter links look like
				// http://book.zongheng.com/chapter/917253/61403035.html;
				// the chapter number is the file name of the sixth segment.
				segs := strings.Split(link, "/")
				if len(segs) < 6 {
					fmt.Println("chapter list: unexpected chapter URL, skipped:", link)
					continue
				}
				chn, err := strconv.Atoi(strings.Split(segs[5], ".")[0])
				if err != nil {
					// Storing the chapter under number 0 would make it
					// indistinguishable from other unparsable chapters.
					fmt.Println("chapter list: cannot parse chapter number, skipped:", link, err)
					continue
				}
				fmt.Println(booksID)
				models = append(models, mode.Chapter{
					ChapterNum: chn,
					Booksid:    booksID,
					Title:      title,
					ContextStr: "",
					// time.Time.String is a debugging format that also carries
					// the monotonic clock reading; store a fixed layout.
					Createtime:    time.Now().Format("2006-01-02 15:04:05"),
					SourceSiteUrl: link,
				})
			}
			// Split an overly long slice into several smaller ones before
			// inserting.
			if len(models) > 50 {
				for _, batch := range otheroper.SplitArray(models, (len(models)/50)+1) {
					dao.BatchInsertChapters(batch)
				}
			} else {
				dao.BatchInsertChapters(models)
			}
			// One run per book is enough: tell the dispatch layer to stop
			// this timer.
			fmt.Println("------------------定时器:", tuName, "已经完成抓取-----------")
			return map[string]interface{}{
				"Instruction": "StopTicker",
				"Data":        tuName + "_CD",
			}
		}
		// Random per-timer interval between the bounds passed to RandInt64.
		cyt := time.Duration(otheroper.RandInt64(40, 80, true)) * time.Second
		ts = append(ts, timerupdate.CreateExcuTu(tuName, 1, cyt, task))
	}
	dispatch.GoRun(dispatch.WorkerNum50, ts...)
}
// GetContextReptiles fetches the text of one page of chapters.
//
// It loads the rows returned by dao.ChapterPaging(106, 1, 50) and creates one
// timer per row. The timer callback downloads the chapter page, passes every
// parsed item to dao.ModifyChapterContextByNum together with the row's
// chapter number, and then returns a "StopTicker" instruction whose "Data" is
// "<timer name>_CD". All timers are run through dispatch.GoRun.
func GetContextReptiles() {
	// Page index handed to dao.ChapterPaging.
	pageIndex := 1
	rows := dao.ChapterPaging(106, pageIndex, 50)
	parser := GetZX()
	timers := make([]*timerupdate.TimerUpdate, 0, len(rows))
	for idx, row := range rows {
		// Both values are declared per iteration so that each callback
		// captures its own copy.
		srcURL := row["SourceSiteUrl"].(string)
		chapterNum := int(row["ChapterNum"].(int64))
		name := "BookTimerUpdate" + strconv.Itoa(idx)
		job := func(args ...interface{}) interface{} {
			content := parser.GetBookContent(srcURL)
			for _, item := range content.Items {
				dao.ModifyChapterContextByNum(chapterNum, item.(string))
			}
			fmt.Println("------------------定时器:", name, "已经完成抓取-----------")
			// Instruction for the dispatch layer to stop this timer.
			return map[string]interface{}{
				"Instruction": "StopTicker",
				"Data":        name + "_CD",
			}
		}
		// Random per-timer interval between the bounds passed to RandInt64.
		interval := time.Duration(otheroper.RandInt64(40, 80, true)) * time.Second
		timers = append(timers, timerupdate.CreateExcuTu(name, 1, interval, job))
	}
	dispatch.GoRun(dispatch.WorkerNum50, timers...)
}
在上述代码中出现了以下代码:
cyt := time.Duration(otheroper.RandInt64(40, 80, true)) * time.Second
**这是为了针对不同的定时更新器设置随机时间,来绕过网站的某些规则。请酌情设置这两个参数值,不要给网站造成不必要的麻烦。**
main 中调用方法如下
test.GetBookListReptiles()
后面文章会加入具体的工具代码