起因是前天女朋友说想把一些公众号的文章下载为 PDF,但是一个一个用浏览器打开再打印太麻烦了,网上找的工具又需要付费,那我自然…
由于最近 Golang 用得比较多,所以我选择了 chromedp,GitHub 示例地址:https://github.com/chromedp/examples
Example | Description |
---|---|
click | use a selector to click on an element |
cookie | set a HTTP cookie on requests |
download_file | do headless file downloads |
download_image | do headless image downloads |
emulate | emulate a specific device such as an iPhone |
eval | evaluate javascript and retrieve the result |
fast | extract and render data from a page |
forecast | extract and render data from a page |
geoip | extract and render data from a page |
headers | add extra HTTP headers to browser requests |
keys | send key events to an element |
latlon | retrieve the latitude/longitude from google maps, using the browser’s target events |
logic | more complex logic beyond simple actions |
multi | use headless-shell and a container (Docker, Podman, other) |
pdf | capture a pdf of a page |
proxy | authenticate a proxy server which requires authentication |
remote | connect to an existing Chrome DevTools instance using a remote WebSocket URL |
screenshot | take a screenshot of a specific element and of the entire browser viewport |
submit | fill out and submit a form |
subtree | populate and travel a subtree of the DOM |
text | extract text from a specific element |
upload | upload a file on a form |
visible | wait until an element is visible |
关于pdf的:
// Command pdf is a chromedp example demonstrating how to capture a pdf of a
// page.
package main
import (
"context"
"fmt"
"log"
"os"
"github.com/chromedp/cdproto/page"
"github.com/chromedp/chromedp"
)
// main captures a PDF of a fixed page and writes it to sample.pdf.
//
// The actual work lives in run so that its deferred browser shutdown executes
// before log.Fatal terminates the process; log.Fatal calls os.Exit, which
// skips any deferred calls in the calling function.
func main() {
	if err := run(); err != nil {
		log.Fatal(err)
	}
	fmt.Println("wrote sample.pdf")
}

// run starts a browser context, prints https://www.google.com/ to PDF and
// stores the result in sample.pdf. It returns the first error encountered.
func run() error {
	// create context
	ctx, cancel := chromedp.NewContext(context.Background())
	defer cancel()

	// capture pdf
	var buf []byte
	if err := chromedp.Run(ctx, printToPDF(`https://www.google.com/`, &buf)); err != nil {
		return err
	}
	return os.WriteFile("sample.pdf", buf, 0o644)
}
// printToPDF builds the task list that navigates to urlstr and prints the
// loaded page to PDF without background graphics. On success the PDF bytes
// are stored in *res; on failure *res is left untouched and the error is
// returned from the task.
func printToPDF(urlstr string, res *[]byte) chromedp.Tasks {
	capture := chromedp.ActionFunc(func(ctx context.Context) error {
		data, _, err := page.PrintToPDF().WithPrintBackground(false).Do(ctx)
		if err == nil {
			*res = data
		}
		return err
	})

	return chromedp.Tasks{
		chromedp.Navigate(urlstr),
		capture,
	}
}
更换成需要下载的url后的效果
我一开始以为成了,发给我宝,我宝直呼,你到底干了什么?第二页的图片呢?你这样我怎么用来下载PDF,我淡淡一笑,很简单,我来改bug不就行了,说罢,我的气息不再掩饰,四年编程修为!顷刻炼化…咳咳咳
很简单,直接浏览器打开链接
经过略微观察,发现其实是懒加载的问题:图片不是一次性加载进来的。那怎么办呢?简单,翻翻文档,有个 eval 执行 JS 的方法,而且有返回值,那我只需要执行一段 JS 的滑动动作即可。再通过返回值的特性去获取页面的高度,用高度除以每次滚动的偏移量即可得出滑动次数,再乘以每次滚动的间隔时间,即可得到等待时间(一开始没写等待时间,导致还是没图…)代码:
// jsStr is the script evaluated in the page: it resets the scroll position to
// the top and starts a timer whose callback scrolls the document down by
// 300px per tick, clearing the timer once the bottom is reached.
// NOTE(review): the last comment inside the script mentions 200/300 ms, but
// the interval actually passed to setInterval is 100 ms.
jsStr := `var i = 2
var element = document.documentElement
element.scrollTop = 0; // 不管他在哪里,都让他先回到最上面
// 设置定时器,时间即为滚动速度
function main() {
if (element.scrollTop + element.clientHeight == element.scrollHeight) {
clearInterval(interval)
console.log('已经到底部了')
} else {
// 300 代表每次移动300px
element.scrollTop += 300;
console.log(i);
i += 1;
}
}
// 定义ID 200代表300毫秒滚动一次
interval = setInterval(main, 100)`
// lengthStr resets the scroll position and evaluates to the document's
// scrollHeight, which is read into lengthInt below.
lengthStr := `var element = document.documentElement
element.scrollTop = 0;
element.scrollHeight
`
lengthInt := 0
err := chromedp.Run(ctx,
chromedp.Evaluate(lengthStr, &lengthInt),
)
if err != nil {
log.Fatal("123:", err)
}
// Scrolling: only performed when the page reported a positive height.
if lengthInt > 0 {
fmt.Println("页高度:", lengthInt)
// Start the in-page scroll timer.
// NOTE(review): the error assigned here is never checked.
err = chromedp.Run(ctx,
chromedp.Evaluate(jsStr, nil),
)
// Estimated scroll time: 100 ms for every full 300px of page height
// (integer division, so a partial last step is not counted).
sleepDuration := time.Duration(lengthInt/300*100) * time.Millisecond
fmt.Println("翻页时间:", sleepDuration)
time.Sleep(sleepDuration)
// Debug only: read back the scroll position. topInt is not read again in
// this snippet, and the error assigned here is never checked either.
topInt := 0
err = chromedp.Run(ctx,
chromedp.Evaluate(`element.scrollTop`, &topInt),
)
}
完整代码
package main
import (
"context"
"fmt"
"github.com/chromedp/cdproto/page"
"github.com/chromedp/chromedp"
"log"
"os"
"strings"
"time"
)
// title receives the article title extracted from the page that is currently
// being processed; savePDF resets it before every page.
var title string

// fileNameReplacer rewrites characters that are not allowed in file names
// (notably on Windows) into harmless look-alikes, and flattens line breaks
// and tabs so the title stays on a single line.
var fileNameReplacer = strings.NewReplacer(
	"/", "-",
	`\`, "-",
	":", ":",
	"*", "-",
	"?", "?",
	`"`, "'",
	"<", "《",
	">", "》",
	"|", "-",
	"\r", "",
	"\n", " ",
	"\t", " ",
)

// main reads one URL per line from ./url.txt and saves every page as a PDF
// named after the article title. A page that fails is reported and skipped so
// that one bad URL does not abort the whole batch.
func main() {
	listByte, err := os.ReadFile("./url.txt")
	if err != nil {
		log.Fatal("读取 url.txt 失败: ", err)
	}

	// Trim each line (this also drops the "\r" of Windows line endings) and
	// ignore blank lines such as the one produced by a trailing newline.
	var urls []string
	for _, line := range strings.Split(string(listByte), "\n") {
		if u := strings.TrimSpace(line); u != "" {
			urls = append(urls, u)
		}
	}
	fmt.Println("共", len(urls), "个")

	for i, u := range urls {
		fmt.Println("第", i+1, "个")
		name, err := savePDF(u, i+1)
		if err != nil {
			log.Println("跳过", u, ":", err)
			continue
		}
		fmt.Println("写入 ", name)
	}

	fmt.Println("宝,结束辣")
	// NOTE(review): presumably keeps the console window open after a
	// double-click launch — confirm before removing.
	time.Sleep(50 * time.Second)
}

// savePDF renders rawURL in a fresh browser context, limited to 30 seconds,
// and writes the PDF to the current directory. The file is named after the
// sanitized article title, or "article-<seq>" when no title was extracted.
// It returns the name of the file written.
//
// The deferred cancels run in reverse order: the timeout is released first,
// then the browser context is shut down.
func savePDF(rawURL string, seq int) (string, error) {
	ctx, cancel := chromedp.NewContext(context.Background())
	defer cancel()
	ctx, cancelTimeout := context.WithTimeout(ctx, 30*time.Second)
	defer cancelTimeout()

	// Reset the shared title so a page without one cannot reuse the
	// previous page's title as its file name.
	title = ""
	var buf []byte
	if err := chromedp.Run(ctx, printToPDF(rawURL, &buf, &title)); err != nil {
		return "", fmt.Errorf("rendering %q: %w", rawURL, err)
	}

	name := strings.TrimSpace(fileNameReplacer.Replace(title))
	if name == "" {
		name = fmt.Sprintf("article-%d", seq)
	}
	name += ".pdf"

	if err := os.WriteFile(name, buf, 0o644); err != nil {
		return "", fmt.Errorf("writing %q: %w", name, err)
	}
	return name, nil
}
// slowScrollToBottomctx scrolls the current page from the top towards the
// bottom in small steps so that lazily loaded images are requested before
// the page is printed.
//
// It reads the document's scrollHeight, starts an in-page timer that scrolls
// stepPx pixels every intervalMs milliseconds, and then blocks for the
// estimated scroll time or until ctx is done, whichever comes first.
//
// The function has no return value: a failure to read the page height
// terminates the program via log.Fatal, while a failure to start the scroll
// script is logged and the wait is skipped.
func slowScrollToBottomctx(ctx context.Context) {
	const (
		stepPx     = 300 // pixels scrolled per timer tick
		intervalMs = 100 // milliseconds between timer ticks
	)

	// The bottom test allows a 1px tolerance because scrollTop may be
	// fractional while scrollHeight and clientHeight are rounded; a strict
	// equality test could never match and the timer would run forever.
	scrollJS := fmt.Sprintf(`var i = 2
var element = document.documentElement
element.scrollTop = 0;
function main() {
	if (element.scrollTop + element.clientHeight >= element.scrollHeight - 1) {
		clearInterval(interval)
		console.log('已经到底部了')
	} else {
		element.scrollTop += %d;
		console.log(i);
		i += 1;
	}
}
var interval = setInterval(main, %d)`, stepPx, intervalMs)

	const heightJS = `var element = document.documentElement
element.scrollTop = 0;
element.scrollHeight
`

	var height int
	if err := chromedp.Run(ctx, chromedp.Evaluate(heightJS, &height)); err != nil {
		log.Fatal("123:", err)
	}
	if height <= 0 {
		return
	}
	fmt.Println("页高度:", height)

	if err := chromedp.Run(ctx, chromedp.Evaluate(scrollJS, nil)); err != nil {
		log.Println("start scrolling:", err)
		return
	}

	// One tick per started step of page height, rounded up so that a partial
	// last step is waited for as well.
	ticks := (height + stepPx - 1) / stepPx
	wait := time.Duration(ticks*intervalMs) * time.Millisecond
	fmt.Println("翻页时间:", wait)

	// Wait for the scroll to finish, but give up as soon as ctx is cancelled
	// or its deadline expires instead of sleeping past it.
	timer := time.NewTimer(wait)
	defer timer.Stop()
	select {
	case <-timer.C:
	case <-ctx.Done():
	}
}
// printToPDF builds the task list for one article: navigate to urlstr, scroll
// slowly through the page, read the text of the #activity-name element into
// *title once it is visible, and finally print the page (background graphics
// included) into *res.
func printToPDF(urlstr string, res *[]byte, title *string) chromedp.Tasks {
	// Scroll step: slowScrollToBottomctx reports nothing back, so this
	// action always succeeds.
	scroll := chromedp.ActionFunc(func(ctx context.Context) error {
		slowScrollToBottomctx(ctx)
		return nil
	})

	// Print step: the PDF bytes are stored in *res together with whatever
	// error the print call returned.
	render := chromedp.ActionFunc(func(ctx context.Context) error {
		pdf, _, err := page.PrintToPDF().WithPrintBackground(true).Do(ctx)
		*res = pdf
		return err
	})

	return chromedp.Tasks{
		chromedp.Navigate(urlstr),
		scroll,
		chromedp.Text(`#activity-name`, title, chromedp.NodeVisible),
		render,
	}
}