【Golang爬虫】服务器端网页截图

最新推荐文章于 2025-11-10 21:31:07 发布

原创最新推荐文章于 2025-11-10 21:31:07 发布 · 1.1k 阅读

4 ·

CC 4.0 BY-SA版权

文章标签：

#golang #爬虫

前言

什么是爬虫

网络爬虫（又称为网页蜘蛛、网络机器人），是一种按照一定的规则，自动地抓取互联网信息的程序或者脚本。大的应用，比如搜索引擎百度、Google 就是利用爬虫收集互联网网页信息用于用户搜索；小的应用就更多了，比如抓取知乎妹子头像、自动采集妹子图等。

Headless Chrome

从 Chrome 59 开始搭载 Headless Chrome。这是一种在无需显示的环境下运行 Chrome 浏览器的方式。
从本质上来说，就是在没有浏览器界面（即所谓的 chrome）的情况下运行 Chrome！它将 Chromium 和 Blink 渲染引擎提供的所有现代 Web 平台的功能都带入了命令行。Headless 浏览器对于自动化测试和不需要可视化 UI 界面的服务器环境是一个很好的工具。例如，你可能需要对真实的网页运行一些测试，创建一个 PDF，或者只是检查浏览器如何呈现 URL。

GO语言爬虫

想要在golang程序里使用headless chrome，需要借助一些开源库，实现和headless chrome交互的库有很多，这里选择chromedp，它能做什么？

使用chromedp解决反爬虫JS问题
使用chromedp做网站的自动化测试
使用chromedp做服务器端渲染(主要是解决VueJS等SPA应用)
使用chromedp做网页截图程序
使用chromedp做刷点击量/刷赞/搜索引擎SEO训练….(click farming)

环境准备

安装chrome浏览器

Linux参考 CentOS7 安装Chrome
Windows：请自行下载并安装 Chrome
安装chromedp库
创建Golang项目，开启Go Module(在项目目录下使用终端输入 go mod init <模块名>)
在项目目录下使用终端输入：go get github.com/chromedp/chromedp

撸袖开干

上代码

PS：优秀的程序员，都不写注释

package main

import (
	"context"
	"fmt"
	"io/ioutil"
	"log"
	"math"
	"github.com/chromedp/cdproto/emulation"
	"github.com/chromedp/cdproto/page"
	"github.com/chromedp/chromedp"
)
// screenshot captures the page currently loaded in the browser tab bound to
// ctx and writes the image to a file.
//
// It reads the page's content size from the layout metrics, overrides the
// device metrics so the emulated viewport spans the whole content, and then
// captures a screenshot clipped to the content rectangle.
//
// Despite its name, title is used verbatim as the output file path; the file
// is written with mode 0644.
//
// Every failure is wrapped with the step that produced it, so the caller can
// tell whether measuring, resizing, capturing or writing went wrong. The
// underlying error stays reachable through errors.Is / errors.As.
func screenshot(ctx context.Context, title string) error {
	_, _, contentSize, err := page.GetLayoutMetrics().Do(ctx)
	if err != nil {
		return fmt.Errorf("getting layout metrics: %w", err)
	}

	// The override takes whole pixels; round up so fractional content sizes
	// are not truncated.
	width := int64(math.Ceil(contentSize.Width))
	height := int64(math.Ceil(contentSize.Height))

	// Force viewport emulation to the full content size.
	err = emulation.SetDeviceMetricsOverride(width, height, 1, false).
		WithScreenOrientation(&emulation.ScreenOrientation{
			Type:  emulation.OrientationTypePortraitPrimary,
			Angle: 0,
		}).
		Do(ctx)
	if err != nil {
		return fmt.Errorf("overriding device metrics to %dx%d: %w", width, height, err)
	}

	// Capture the screenshot, clipped to the content rectangle.
	// NOTE(review): the DevTools protocol documents quality as applying to
	// lossy formats only; no format is set here — confirm whether it matters.
	buf, err := page.CaptureScreenshot().
		WithQuality(90).
		WithClip(&page.Viewport{
			X:      contentSize.X,
			Y:      contentSize.Y,
			Width:  contentSize.Width,
			Height: contentSize.Height,
			Scale:  1,
		}).Do(ctx)
	if err != nil {
		return fmt.Errorf("capturing screenshot: %w", err)
	}

	if err := ioutil.WriteFile(title, buf, 0644); err != nil {
		return fmt.Errorf("writing screenshot to %q: %w", title, err)
	}
	return nil
}

// runScreenshot starts Chrome through chromedp, opens the Baidu home page,
// reads the page title, and saves a screenshot named "<title>.png" (relative
// to the working directory). It returns the page title.
//
// The work lives in its own function, rather than directly in main, so that
// the deferred cancel functions run before the process exits. log.Fatal calls
// os.Exit, which skips pending defers.
func runScreenshot() (string, error) {
	// Custom options are appended after chromedp's defaults.
	options := append(chromedp.DefaultExecAllocatorOptions[:],
		// Chrome install path. The option returned by ExecPath only takes
		// effect when it is passed to the allocator, so it must be part of
		// this list. Adjust the path for your system, e.g. on Windows:
		// C:\Program Files (x86)\Google\Chrome\Application\chrome.exe
		chromedp.ExecPath("/opt/apps/cn.google.chrome/files/chrome"),
		// Do not show a Chrome window.
		chromedp.Flag("headless", true),
		chromedp.Flag("hide-scrollbars", false),
		chromedp.Flag("mute-audio", false),
		// Browser User-Agent.
		chromedp.UserAgent(`Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36`),
	)

	// Create the allocator and the browser context.
	allocCtx, cancelAlloc := chromedp.NewExecAllocator(context.Background(), options...)
	defer cancelAlloc()
	ctx, cancelCtx := chromedp.NewContext(allocCtx)
	defer cancelCtx()

	var title string
	err := chromedp.Run(ctx,
		// Open the target URL.
		chromedp.Navigate("https://www.baidu.com/"),
		// Wait until the page body is visible.
		chromedp.WaitVisible(`body`, chromedp.ByQuery),
		// Read the page title.
		chromedp.Evaluate("document.title", &title),
		// Take the screenshot. The title is already populated here because
		// the actions run in order; it is used verbatim as the file name.
		chromedp.ActionFunc(func(ctx context.Context) error {
			return screenshot(ctx, title+".png")
		}),
	)
	if err != nil {
		return "", err
	}
	return title, nil
}

// main runs the screenshot job and prints the page title on success. On
// failure it logs the error and exits with a non-zero status.
func main() {
	title, err := runScreenshot()
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(title)
}