Go语言爬虫实践-优快云博客

本文链接：https://blog.youkuaiyun.com/golangnumber1/article/details/108139218

本文介绍了使用Go语言进行网络爬虫的实践经验，重点探讨了hand包下的zonghengxs.go文件（用于抓取纵横中文网的分类、书籍、章节和正文），以及其中调用的、位于otheroper包中的字符串拼接函数otheroper.GetBuildStr()。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

hand 包下
zonghengxs.go
免责声明：本文仅供学习使用，不得用于其他用途，否则后果自负。


package hand

import (
	"regexp"
	"strconv"
	"strings"
	"tools/otheroper"
)

// ZongHengXS groups the scraping steps for book.zongheng.com: its methods
// extract categories, books, chapter lists and chapter text from fetched pages.
type ZongHengXS struct {
	// PP is used by every method to fetch page content (GetContent) and to
	// hold the request URL and regular-expression string of each step.
	PP ParseProcess
}

// GetClassifs downloads the page at PP.ClassifRequestUrl and applies
// PP.ClassifRegexpstr to it. For every match, capture group 1 is spliced into
// a store listing URL of the form
// http://book.zongheng.com/store/c<group1>/c0/b0/u0/p1/v9/s9/t0/u0/i1/ALL.html
// and capture group 2 is recorded as the matching item. ResultUrls[i] and
// Items[i] belong to the same match.
//
// The configured expression must compile and must define at least two capture
// groups; otherwise this method panics.
func (zh *ZongHengXS) GetClassifs() HandResult {
	page := zh.PP.GetContent(zh.PP.ClassifRequestUrl)
	classifRe := regexp.MustCompile(zh.PP.ClassifRegexpstr)

	var out HandResult
	for _, groups := range classifRe.FindAllSubmatch(page, -1) {
		id, title := string(groups[1]), string(groups[2])

		storeUrl := otheroper.GetBuildStr(
			"http://book.zongheng.com/store/c",
			id,
			"/c0/b0/u0/p1/v9/s9/t0/u0/i1/ALL.html")

		out.ResultUrls = append(out.ResultUrls, storeUrl)
		out.Items = append(out.Items, title)
	}
	return out
}

// GetBooks lists the books found on one page of a category.
//
// The first "p#" in classifyUrl is replaced by "p" followed by pageIndex and
// the resulting URL is fetched. Every book link on the page yields a book URL
// (http://book.zongheng.com/book/<id>.html) in ResultUrls and the link text at
// the same index in Items.
//
// As side effects, PP.BookRequestUrl is set to the unmodified classifyUrl and
// PP.BookRegexpstr to the link pattern used here.
//
// The boolean result is always true: a check that compared the page's
// count="N" attribute against pageIndex used to report "no more pages" here,
// but it is disabled.
//
// NOTE(review): the URLs built by GetClassifs contain "p1" rather than the
// "p#" placeholder replaced here — confirm that callers insert the placeholder
// before calling, otherwise pageIndex has no effect.
func (zh *ZongHengXS) GetBooks(classifyUrl string, pageIndex int) (HandResult, bool) {
	pageUrl := strings.Replace(classifyUrl, "p#", "p"+strconv.Itoa(pageIndex), 1)
	page := zh.PP.GetContent(pageUrl)

	zh.PP.BookRequestUrl = classifyUrl
	zh.PP.BookRegexpstr = `<a href="http://book.zongheng.com/book/([0-9]*).html" target="_blank">([^"]+)</a>`
	bookRe := regexp.MustCompile(zh.PP.BookRegexpstr)

	var books HandResult
	for _, groups := range bookRe.FindAllSubmatch(page, -1) {
		bookUrl := otheroper.GetBuildStr("http://book.zongheng.com/book/", string(groups[1]), ".html")
		books.ResultUrls = append(books.ResultUrls, bookUrl)
		books.Items = append(books.Items, string(groups[2]))
	}
	return books, true
}

// GetChapters lists the chapters linked from a book's chapter-index page,
// e.g. http://book.zongheng.com/showchapter/917253.html.
//
// Every anchor of the form href="…" target="_blank" title="…">…</a> produces
// one entry: the href goes to ResultUrls and the anchor text to Items at the
// same index. The title attribute is matched but not returned.
//
// As side effects, PP.ChapterRequestUrl is set to ChapterRequestUrl and
// PP.ChapterRegexpstr to the anchor pattern used here.
func (zh *ZongHengXS) GetChapters(ChapterRequestUrl string) HandResult {
	zh.PP.ChapterRequestUrl = ChapterRequestUrl
	page := zh.PP.GetContent(zh.PP.ChapterRequestUrl)

	zh.PP.ChapterRegexpstr = `href="([^"]+)" target="_blank" title="([^"]+)">([^"]+)</a>`
	chapterRe := regexp.MustCompile(zh.PP.ChapterRegexpstr)

	var chapters HandResult
	for _, groups := range chapterRe.FindAllSubmatch(page, -1) {
		link, text := string(groups[1]), string(groups[3])
		chapters.ResultUrls = append(chapters.ResultUrls, link)
		chapters.Items = append(chapters.Items, text)
	}
	return chapters
}

// Patterns used by GetBookContent to turn the captured HTML into plain text.
// They are constant, so they are compiled once instead of on every call.
var (
	// zhHTMLTagRe matches one HTML tag, from "<" to the nearest ">".
	zhHTMLTagRe = regexp.MustCompile("\\<[\\S\\s]+?\\>")
	// zhBlankRunRe matches a run of two or more whitespace characters.
	zhBlankRunRe = regexp.MustCompile("\\s{2,}")
)

// GetBookContent fetches one chapter page, e.g.
// http://book.zongheng.com/chapter/917253/60737080.html, and returns its text.
//
// ResultUrls always holds the requested URL. When the content <div> is found,
// Items holds a single string: the matched HTML with every tag replaced by a
// newline and every run of two or more whitespace characters collapsed into
// one newline. When the page has no content <div>, Items is left empty
// (previously this case panicked with an index out of range), so callers can
// detect it with len(result.Items) == 0.
//
// As side effects, PP.BookContentRequestUrl is set to BookContentRequestUrl
// and PP.BookContentRegexpstr to the content pattern used here.
func (zh *ZongHengXS) GetBookContent(BookContentRequestUrl string) HandResult {
	zh.PP.BookContentRequestUrl = BookContentRequestUrl
	content := zh.PP.GetContent(zh.PP.BookContentRequestUrl)

	zh.PP.BookContentRegexpstr = `<div class="content" itemprop="acticleBody">([^"]+)`
	contentRe := regexp.MustCompile(zh.PP.BookContentRegexpstr)

	result := HandResult{}
	result.ResultUrls = append(result.ResultUrls, zh.PP.BookContentRequestUrl)

	// Only the first (leftmost) whole match is used, opening <div> included.
	raw := contentRe.Find(content)
	if raw == nil {
		return result
	}

	// The match stops right before the next double quote, which normally cuts
	// an HTML tag in half. Appending ">" closes that truncated tag so it is
	// removed together with the complete ones.
	text := zhHTMLTagRe.ReplaceAllString(string(raw)+">", "\n")

	// Collapse consecutive line breaks and other whitespace runs.
	text = zhBlankRunRe.ReplaceAllString(text, "\n")

	result.Items = append(result.Items, text)
	return result
}

上面的代码中用到了函数 otheroper.GetBuildStr()，
下面是 otheroper 包的具体实现代码：

package otheroper

import (
	"math/rand"
	"strings"
	"sync"
	"time"
)
// GetBuildStr concatenates all of its arguments, in order, into one string.
// Called without arguments it returns the empty string.
func GetBuildStr(str ...string) string {
	return strings.Join(str, "")
}
// Shared pseudo-random generator for this package. It is seeded exactly once,
// at package initialization. Reseeding from the clock on every call, as the
// previous implementation did, makes calls that land on the same clock tick
// return the same number.
var (
	// randMu serializes access to randGen: a generator created with rand.New
	// is not safe for concurrent use, unlike the package-level rand functions.
	randMu  sync.Mutex
	randGen = rand.New(rand.NewSource(time.Now().UnixNano()))
)

// GetRandBylevel returns a pseudo-random number in [0, level*10); the larger
// level is, the larger the numbers that can be produced.
//
// It returns 0 when level is not positive, or when level*10 overflows to a
// non-positive value. Previously these inputs caused a panic in rand.Intn.
func GetRandBylevel(level int) int {
	upper := level * 10
	if level <= 0 || upper <= 0 {
		return 0
	}

	randMu.Lock()
	defer randMu.Unlock()
	return randGen.Intn(upper)
}

// RandInt64 returns a pseudo-random number in [min, max).
//
// max is returned unchanged, without drawing a random number, when
// min >= max, when either bound is 0, or when max-min overflows int64. The
// overflow case previously panicked in rand.Int63n.
//
// NOTE(review): rejecting a bound of 0 means ranges such as [0, 10) always
// yield 10. This is kept for backward compatibility — confirm with callers
// whether it is intended.
//
// isLoop is kept for backward compatibility and is ignored. It used to insert
// a short sleep so that consecutive calls in a loop would not reseed with the
// same timestamp; the shared generator makes that unnecessary.
func RandInt64(min, max int64, isLoop bool) int64 {
	if min >= max || min == 0 || max == 0 {
		return max
	}

	span := max - min
	if span <= 0 { // max-min overflowed
		return max
	}

	randMu.Lock()
	defer randMu.Unlock()
	return randGen.Int63n(span) + min
}

go爬虫3