package main
import (
"context"
"encoding/csv"
"fmt"
"log"
"os"
"strings"
"sync"
"time"
"github.com/PuerkitoBio/goquery"
"github.com/chromedp/chromedp"
)
// Result holds the outcome of scraping a single page.
type Result struct {
Source string // human-readable label of the scrape target (task name)
Title string // page <title> captured by chromedp
URLs []string // deduplicated links extracted from the rendered HTML
Error string // non-empty when the page failed to load; empty on success
}
// getWaitSelector picks a site-specific CSS selector to wait for before
// scraping, falling back to "body" for unknown hosts.
func getWaitSelector(url string) string {
	// Ordered substring → selector rules; first match wins.
	rules := []struct {
		marker   string
		selector string
	}{
		{"bing.com", "li.b_algo"},
		{"zhipin.com", ".info-primary, .job-list, .company-banner"},
		{"golang.google.cn", ".download"},
	}
	for _, r := range rules {
		if strings.Contains(url, r.marker) {
			return r.selector
		}
	}
	return "body"
}
// scrapeBingOrZhipin navigates to url inside the given chromedp context,
// waits for a site-specific selector, and collects the page title plus all
// links found in the rendered HTML. Page-load failures are reported via the
// Result.Error field; the returned error is always nil by design so callers
// can aggregate partial results.
func scrapeBingOrZhipin(ctx context.Context, url, source string) (*Result, error) {
	var (
		pageTitle string
		pageHTML  string
	)
	runErr := chromedp.Run(ctx,
		chromedp.Navigate(url),
		chromedp.WaitVisible(getWaitSelector(url), chromedp.ByQuery),
		chromedp.Title(&pageTitle),
		chromedp.OuterHTML(`html`, &pageHTML, chromedp.ByQuery),
	)
	if runErr != nil {
		return &Result{
			Source: source,
			Error:  fmt.Sprintf("页面加载失败: %v", runErr),
		}, nil
	}
	return &Result{
		Source: source,
		Title:  pageTitle,
		URLs:   extractURLs(pageHTML),
	}, nil
}
// extractURLs parses an HTML document and returns the deduplicated list of
// anchor hrefs, skipping empty links, fragment-only links, javascript:
// pseudo-links, and site-relative paths (protocol-relative "//host" links
// are kept). Returns an empty list if the HTML cannot be parsed.
func extractURLs(html string) []string {
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
	if err != nil {
		log.Printf("goquery 解析失败: %v", err)
		return nil
	}
	var found []string
	doc.Find("a[href]").Each(func(_ int, sel *goquery.Selection) {
		href, _ := sel.Attr("href")
		href = strings.TrimSpace(href)
		switch {
		case href == "", href == "#", strings.HasPrefix(href, "javascript:"):
			return // not a real destination
		case strings.HasPrefix(href, "/") && !strings.HasPrefix(href, "//"):
			return // relative path — skipped (could be resolved against the base URL)
		}
		found = append(found, href)
	})
	return dedup(found)
}
// dedup returns the input strings with duplicates removed, preserving the
// order of first appearance.
func dedup(slice []string) []string {
	seen := make(map[string]struct{}, len(slice))
	var unique []string
	for _, s := range slice {
		if _, dup := seen[s]; dup {
			continue
		}
		seen[s] = struct{}{}
		unique = append(unique, s)
	}
	return unique
}
// main launches a single headless-capable Chrome instance, scrapes three
// sites concurrently (one tab each), then exports all collected links to CSV.
func main() {
	// 1. Browser launch options: chromedp defaults plus anti-detection tweaks.
	opts := append(chromedp.DefaultExecAllocatorOptions[:],
		chromedp.Flag("no-sandbox", true),
		chromedp.Flag("disable-gpu", true),
		chromedp.Flag("headless", false), // visible browser while debugging; set true for production
		// Hide the navigator.webdriver automation hint. The original passed
		// the whole "name=value" string as a bool flag name, which happened
		// to work but is not the intended chromedp.Flag usage.
		chromedp.Flag("disable-blink-features", "AutomationControlled"),
		chromedp.Flag("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"),
	)
	allocCtx, allocCancel := chromedp.NewExecAllocator(context.Background(), opts...)
	defer allocCancel()

	// 2. Parent browser context (no task executed yet).
	ctx, cancel := chromedp.NewContext(allocCtx, chromedp.WithLogf(log.Printf))
	defer cancel()

	// 3. Run once so the browser process actually starts before fanning out.
	if err := chromedp.Run(ctx, chromedp.Navigate("about:blank")); err != nil {
		log.Fatal("❌ 浏览器启动失败:", err)
	}
	log.Println("✅ 浏览器已启动")

	// 4. Concurrent scraping — each task gets its own tab and deadline.
	var (
		wg      sync.WaitGroup
		mu      sync.Mutex // guards results
		results []*Result
	)
	tasks := []struct {
		url    string
		source string
	}{
		{url: "https://cn.bing.com/search?q=boss%E7%9B%B4%E8%81%98", source: "必应搜索 - boss直聘"},
		{url: "https://www.zhipin.com/gongsi/3d0b6629203bb0b61nJy3t-0.html", source: "Boss直聘 - 公司详情"},
		{url: "https://golang.google.cn/dl/", source: "Go 官方下载页"},
	}
	for _, task := range tasks {
		wg.Add(1)
		go func(url, source string) {
			defer wg.Done()
			// BUG FIX: the original reassigned `cancel` from WithTimeout with
			// the one from NewContext, so the timeout's CancelFunc was never
			// called and its timer leaked (go vet "lostcancel"). Keep both
			// cancel funcs and defer each.
			timeoutCtx, timeoutCancel := context.WithTimeout(ctx, 60*time.Second)
			defer timeoutCancel()
			taskCtx, taskCancel := chromedp.NewContext(timeoutCtx) // new tab
			defer taskCancel()

			log.Printf("🔍 开始抓取: %s", source)
			result, err := scrapeBingOrZhipin(taskCtx, url, source)
			if err != nil {
				log.Printf("❌ 任务失败 [%s]: %v", source, err)
				mu.Lock()
				results = append(results, &Result{Source: source, Error: err.Error()})
				mu.Unlock()
				return
			}
			mu.Lock()
			results = append(results, result)
			mu.Unlock()
			log.Printf("✅ 完成: %s (标题: %s)", source, result.Title)
		}(task.url, task.source)
	}
	wg.Wait()

	// 5. Print/export all collected results.
	exportToCSV(results)
}
// exportToCSV writes one CSV row per (result, URL) pair to
// scraped_results.csv in the current directory. Results without any URLs
// (e.g. failed loads) still produce a single row so errors stay visible.
func exportToCSV(results []*Result) {
	file, err := os.Create("scraped_results.csv")
	if err != nil {
		log.Fatal(err)
	}
	defer file.Close()

	// UTF-8 BOM so spreadsheet tools (Excel) detect the encoding correctly.
	if _, err := file.WriteString("\xEF\xBB\xBF"); err != nil {
		log.Fatal(err)
	}

	writer := csv.NewWriter(file)
	writer.Write([]string{"Source", "Title", "URL", "Error"})
	for _, r := range results {
		if len(r.URLs) == 0 {
			// No links collected — still emit the row so the error shows up.
			writer.Write([]string{r.Source, r.Title, "", r.Error})
			continue
		}
		for _, url := range r.URLs {
			writer.Write([]string{r.Source, r.Title, url, r.Error})
		}
	}
	writer.Flush()
	// BUG FIX: csv.Writer buffers output; the original never checked
	// writer.Error() after Flush, so a full disk or closed file would be
	// reported as success. Individual Write errors also surface here.
	if err := writer.Error(); err != nil {
		log.Fatal(err)
	}
	log.Println("📄 结果已导出到: scraped_results.csv")
}
有 Python 或者 Java 基础,学习 Go 语言不会难。我简单研究了一下:Go 编译后直接生成机器码,实测运行时占用的内存也较小,可移植性很高,并且 Go 语言本身的生态也很不错,可以尝试起来
475

被折叠的评论
为什么会被折叠?



