Go语言爬虫:爬取豆瓣电影Top250并存入Excel

本文介绍了一个使用Go语言实现的豆瓣电影Top250数据爬虫项目,通过解析网页源代码获取电影名称、评分及评价人数等信息,并将数据存储到Excel文件中。该项目展示了如何利用HTTP请求、正则表达式和Excel操作等技术进行数据抓取和处理。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

package main

import (
"fmt"
"github.com/tealeg/xlsx"
"io/ioutil"
"net/http"
"regexp"
"strconv"
"time"
)
// Spider describes one HTTP GET request to perform: the target address and
// the request headers to send with it.
type Spider struct {
// url is the address passed to http.NewRequest.
url string
// header maps request-header names to values; every entry is added to the
// outgoing request.
header map[string]string
}


// Films is one scraped film record. All fields hold the raw text captured
// from the page and are written unchanged into the spreadsheet.
type Films struct {
// rows is the 1-based number of the listing page the film was found on
// (exported under the "页码" column).
rows string
// name is the film title.
name string
// scores is the rating text.
scores string
// scores_pepoles is the text preceding "人评价", i.e. the number of people
// who rated the film. NOTE(review): "pepoles" is a misspelling, kept because
// other code refers to the field by this name.
scores_pepoles string

}

// get_html_header performs an HTTP GET on keyword.url, sending every entry of
// keyword.header as a request header, and returns the response body as a
// string.
//
// The method has no error result, so every failure (malformed URL, transport
// error, non-200 status, body read error) is reported on standard output and
// the empty string is returned. Callers can treat "" as "nothing to parse".
func (keyword Spider) get_html_header() string {
	// A bounded timeout keeps a stalled connection from hanging the crawl.
	client := &http.Client{Timeout: 30 * time.Second}

	req, err := http.NewRequest("GET", keyword.url, nil)
	if err != nil {
		fmt.Printf("building request for %q: %v\n", keyword.url, err)
		return ""
	}
	for key, value := range keyword.header {
		// net/http ignores a "Host" entry in Header for client requests;
		// the override has to be set through Request.Host instead.
		if http.CanonicalHeaderKey(key) == "Host" {
			req.Host = value
			continue
		}
		req.Header.Add(key, value)
	}

	resp, err := client.Do(req)
	if err != nil {
		// resp is nil here, so it must not be touched.
		fmt.Printf("requesting %q: %v\n", keyword.url, err)
		return ""
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		fmt.Printf("requesting %q: unexpected status %s\n", keyword.url, resp.Status)
		return ""
	}

	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		fmt.Printf("reading response from %q: %v\n", keyword.url, err)
		return ""
	}
	return string(body)
}
// parse scrapes the film list with getFilms and writes it to the Excel
// workbook C:/file.xlsx: one header row followed by one row per film with the
// columns page number, title, rating and number of raters.
//
// Failures to create the sheet or to save the workbook are printed to
// standard output; the function returns nothing.
func parse() {
	file := xlsx.NewFile()
	sheet, err := file.AddSheet("sheet1")
	if err != nil {
		// Without a sheet there is nowhere to put the data, so skip the crawl.
		fmt.Println("adding sheet1:", err)
		return
	}

	// Header row.
	headRow := sheet.AddRow()
	for _, title := range []string{"页码", "电影名称", "评分", "评价人数"} {
		headRow.AddCell().Value = title
	}

	// One data row per film, in the same column order as the header.
	for _, film := range getFilms() {
		row := sheet.AddRow()
		for _, value := range []string{film.rows, film.name, film.scores, film.scores_pepoles} {
			row.AddCell().Value = value
		}
	}

	if err := file.Save("C:/file.xlsx"); err != nil {
		// Println rather than Printf(err.Error()): the error text must not be
		// interpreted as a format string.
		fmt.Println("saving C:/file.xlsx:", err)
	}
}

// Patterns used to pull film data out of a Top 250 listing page. They are
// compiled once at package scope instead of once per page.
var (
	// ratingCountRe captures the text before "人评价" (number of raters).
	ratingCountRe = regexp.MustCompile(`<span>(.*?)人评价</span>`)
	// scoreRe captures the rating value.
	scoreRe = regexp.MustCompile(`property="v:average">(.*?)</span>`)
	// titleRe captures every non-empty "title" span; spans that start with
	// altTitlePrefix are filtered out afterwards in extractFilms.
	titleRe = regexp.MustCompile(`"title">(.+?)</span>`)
)

// altTitlePrefix is the prefix of "title" spans that must not be treated as a
// film's title. The previous pattern tried to express this with the character
// class [^&nbsp], which instead rejected any title whose first character was
// one of '&', 'n', 'b', 's' or 'p'.
const altTitlePrefix = "&nbsp;"

// getFilms downloads the ten listing pages of the Douban Top 250 (25 films per
// page) and returns the films found, in page order. A page that cannot be
// fetched or parsed contributes no films. The result is never nil.
func getFilms() []Films {
	const (
		pageCount = 10
		pageSize  = 25
	)
	films := make([]Films, 0, pageCount*pageSize)
	header := map[string]string{
		"Host":                      "movie.douban.com",
		"Connection":                "keep-alive",
		"Cache-Control":             "max-age=0",
		"Upgrade-Insecure-Requests": "1",
		"User-Agent":                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36",
		"Accept":                    "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
		"Referer":                   "https://movie.douban.com/top250",
	}
	for i := 0; i < pageCount; i++ {
		// The same 1-based page number is shown in the progress message and
		// stored with each film.
		page := strconv.Itoa(i + 1)
		fmt.Println("正在抓取第" + page + "页......")
		url := "https://movie.douban.com/top250?start=" + strconv.Itoa(i*pageSize) + "&filter="
		spider := Spider{url: url, header: header}
		films = append(films, extractFilms(spider.get_html_header(), page)...)
	}
	return films
}

// extractFilms parses one listing page and returns its films, each tagged
// with the given page label. Titles, ratings and rater counts are matched
// independently and paired by position; if the three lists differ in length
// only the common prefix is used and a warning is printed.
func extractFilms(html, page string) []Films {
	counts := ratingCountRe.FindAllStringSubmatch(html, -1)
	scores := scoreRe.FindAllStringSubmatch(html, -1)

	var names []string
	for _, m := range titleRe.FindAllStringSubmatch(html, -1) {
		title := m[1]
		if len(title) >= len(altTitlePrefix) && title[:len(altTitlePrefix)] == altTitlePrefix {
			continue
		}
		names = append(names, title)
	}

	// Never index past the shortest list.
	n := len(counts)
	if len(scores) < n {
		n = len(scores)
	}
	if len(names) < n {
		n = len(names)
	}
	if n != len(counts) || n != len(scores) || n != len(names) {
		fmt.Printf("warning: page %s: found %d titles, %d scores, %d rating counts; keeping %d\n",
			page, len(names), len(scores), len(counts), n)
	}

	films := make([]Films, 0, n)
	for j := 0; j < n; j++ {
		films = append(films, Films{
			rows:           page,
			name:           names[j],
			scores:         scores[j][1],
			scores_pepoles: counts[j][1],
		})
	}
	return films
}






// main runs the scrape-and-export job once and prints how long it took.
func main() {
	// Record the start time so the total wall-clock duration can be reported.
	start := time.Now()
	parse()
	fmt.Println("爬虫结束,总共耗时: ", time.Since(start))
}

转载于:https://www.cnblogs.com/niulanshandeniu/p/11277380.html

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值