1、ItemSaver的架构:
抽象出Task的概念
FetchTask,PersistTask共用一个Engine,Scheduler
需要创建FetchWorker,PersistWorker
本项目中显得过重
为每个Item创建goroutine,提交给ItemSaver
package engine
// ConcurrentEngine is the crawler engine. Its Run method starts the
// scheduler, creates WorkerCount workers and forwards parsed items to
// ItemChan.
type ConcurrentEngine struct {
	// Scheduler distributes requests to the workers.
	Scheduler Scheduler
	// WorkerCount is the number of workers Run creates.
	WorkerCount int
	// ItemChan receives every item produced by the parsers.
	ItemChan chan interface{}
}
// Run starts the scheduler and WorkerCount workers, submits the seed
// requests, and then loops forever over the workers' output: every item is
// forwarded to ItemChan and every newly discovered request is de-duplicated
// and submitted back to the scheduler. Run never returns.
func (e *ConcurrentEngine) Run(seeds ...Request) {
	out := make(chan ParseResult)
	e.Scheduler.Run()

	for i := 0; i < e.WorkerCount; i++ {
		createWorker(e.Scheduler.WorkerChan(), out, e.Scheduler)
	}

	// Feed the initial requests; without this nothing is ever fetched and the
	// receive loop below blocks forever.
	for _, seed := range seeds {
		e.Scheduler.Submit(seed)
	}

	// Receive results from the workers.
	for {
		result := <-out
		for _, item := range result.Items {
			// Shadow the loop variable so each goroutine sends its own item
			// (before Go 1.22 all closures would share one variable).
			item := item
			// Send from a separate goroutine so a busy saver cannot stall
			// the engine loop.
			go func() { e.ItemChan <- item }()
		}

		// URL dedup.
		for _, request := range result.Requests {
			if isDuplicate(request) {
				log.Printf("Duplicate request: %s", request.Url)
				continue
			}
			e.Scheduler.Submit(request) // hand the request to the scheduler
		}
	}
}
//创建ItemSaver
package persist
// ItemSaver starts a background goroutine that receives items and logs each
// one together with a running sequence number. It returns the unbuffered
// channel on which items are to be sent.
func ItemSaver() chan interface{} {
	items := make(chan interface{})
	go func() {
		for seq := 0; ; seq++ {
			received := <-items
			log.Printf("Item Saver : got item #%d: %v", seq, received)
		}
	}()
	return items
}
2、Docker和ElasticSearch介绍:
ES:
全文搜索引擎
快速地存储、搜索和分析海量数据
ES能做什么:
存储我们爬取的数据
不需要建表,配置字段等
json格式的文档
寻找:男,有房,有车
原生支持,不需要写代码,不需要拼装查询语句
直接使用docker来安装ES
docker:
容器引擎
打包/发布应用程序,包括系统环境,配置,依赖
虚拟化、沙箱机制
3、Docker的安装和使用:
Daocloud国内的镜像
有Client和Server,Server是工具栏上面的docker
Daocloud有加速器
docker run -it 镜像名 sh
docker images
//-d后台运行 -p端口映射
docker run -d -p 80:80 nginx
docker kill containerID(dnaskjndj)
docker run -d -p 9200:9200 elasticsearch
docker logs
4、ElasticSearch 入门:
:9200/index/type/id
index -> database
type ->table
put操作
{
"name":"jacob",
"age":18
}
<server>:9200/index/type/_search
全文搜索
<server>:9200/index/type/_search?q=golang
get获取信息
如果后面不加id也想添加信息,需要使用Post的方式提交
<server>:9200/index/type
不需要预先创建index和type
还可以使用_mapping来配置类型
<server>:9200/index/type/_mapping
使用REST接口
PUT/POST 创建/修改数据,使用POST可省略Id
GET获取数据
GET <index>/<type>/_search?q=来全文搜索
5、向ElasticSearch 存储数据:
使用ES的客户端
go get -v gopkg.in/olivere/elastic.v5
// save indexes item as a JSON document in the "dating_profile" index under
// type "zhenai" and returns the id reported in the index response. A new
// Elasticsearch client is created on every call.
func save(item interface{}) (string, error) {
	// Sniffing must be turned off when Elasticsearch runs in Docker.
	sniffOff := elastic.SetSniff(false)
	client, err := elastic.NewClient(sniffOff)
	if err != nil {
		return "", err
	}

	indexer := client.Index().
		Index("dating_profile").
		Type("zhenai").
		BodyJson(item)
	resp, err := indexer.Do(context.Background()) // store the document
	if err != nil {
		return "", err
	}

	// Debug tip: fmt.Printf("%+v", resp) prints the struct with field names.
	return resp.Id, nil
}
//建立test文件测试
func TestSave(t *testing.T) {
expected := model.Profile{
Age : 18,
Name : "Jacob"
}
id,err := save(expected)
if err != nil {
panic(err)
}
client,err :=elastic.NewClient(
//must turn off in docker
elastic.SetSniff(false))
if err != nil {
return "",err
}
resp,err := client.Get().Index("dating_profile").Type("zhenai").Id(id).Do(context.Background())
if err != nil {
panic(err)
}
//fmt.Printf("%+v",resp.Source)
t.Logf("%s",resp.Source)
var actual model.Profile
err = json.Unmarshal([]byte(resp.Source),&actual)
if err != nil {
panic(err)
}
if actual != expected {
t.Errorf("got %v; expected %v",actual,expected)
}
}
6、完整爬虫的运行与数据存储:
ES原本的查询就支持分页,查询的时候加上参数pretty,可以格式化输出
size 查询的记录数
size=100
q= 男 已购房 已购车
空格查询
q= 男 已购房 已购车 Age:(<30)
需要安装中文分词的插件提升中文的查询率
// ItemSaver starts a background goroutine that receives items, logs each one
// with a running sequence number and stores it via save. A failed save is
// logged and the loop continues. It returns the channel on which items are
// to be sent.
func ItemSaver() chan interface{} {
	out := make(chan interface{})
	go func() {
		itemCount := 0
		for {
			item := <-out
			log.Printf("Item Saver : got item #%d: %v", itemCount, item)
			itemCount++

			_, err := save(item)
			if err != nil {
				// Printf, not Print: the message contains format verbs.
				log.Printf("Item Saver: error saving item %v : %v", item, err)
			}
		}
	}()
	return out
}
7、添加URL和ID:
每一次录入,系统的ID会自动分配,需要指定ID,以及添加用户的URL
URL和ID是通用的,其他属性则不是
// ParseResult is what a parser returns for one fetched page.
type ParseResult struct {
	// Requests are the follow-up requests discovered on the page.
	Requests []Request
	// Items are the data records extracted from the page.
	Items []Item
}
// Item is one extracted record. Url, Type and Id are common to every record;
// the source-specific data lives in Payload.
type Item struct {
	// Url is the page the record was extracted from.
	Url string
	// Type is used as the Elasticsearch type when the item is stored.
	Type string
	// Id is used as the Elasticsearch document id when it is non-empty.
	Id string
	// Payload holds the record-specific data.
	Payload interface{}
}
func save(item interface{}) error{
client,err :=elastic.NewClient(
//must turn off in docker
elastic.SetSniff(false))
if err != nil {
return "",err
}
if item.Type == "" {
return "",errors.New("must supply Type")
}
//提取IndexService
indexService := client.Index().
Index("dating_profile").
Type(item.Type).
Id(item.Id).
BodyJson(item)
if item.Id != "" {
indexService.Id(item.Id)
}
resp,err := indexService.
Do(context.Background()) //存数据
if err != nil {
return err
}
//fmt.Println(resp)
//fmt.Printf("%+v",resp)//打印结构体的时候把字段也打印出来
return nil
}
func ItemSaver() chan engine.Item {
out := make(chan engine.Item})
go func() {
itemCount := 0
for {
item := <- out
log.Printf("Item Saver : got item " + "#%d: %v",itemCount,item)
itemCount++
_,err := save(item)
if err != nil {
log.Print("Item Saver: error " + "saving item %v : %v",item,err)
}
}
}()
return out
}
// FromJsonObj converts a generically decoded JSON value into a Profile by
// marshalling it back to JSON and unmarshalling that into the struct. It is
// needed for data read back from Elasticsearch, because Payload is declared
// as an interface type.
func FromJsonObj(o interface{}) (Profile, error) {
	encoded, err := json.Marshal(o)
	if err != nil {
		return Profile{}, err
	}

	var decoded Profile
	err = json.Unmarshal(encoded, &decoded)
	return decoded, err
}
// idUrlRe extracts the numeric user id from a profile URL. The dots in the
// host name are escaped so they match only a literal '.'.
var idUrlRe = regexp.MustCompile(`http://album\.zhenai\.com/u/(\d+)`)
// ParseProfile extracts a user profile from the page contents and returns it
// as a single engine.Item. The item carries the page url, the type "zhenai"
// and the id extracted from the url with idUrlRe; name is stored as the
// profile's Name.
func ParseProfile(contents []byte, url string, name string) engine.ParseResult {
	profile := model.Profile{}
	profile.Name = name

	// Store the age only when it parsed successfully; otherwise keep the
	// zero value.
	if age, err := strconv.Atoi(extractString(contents, ageRe)); err == nil {
		profile.Age = age
	}
	profile.Marriage = extractString(contents, marriageRe)
	// ------ the remaining attributes are extracted the same way ------

	return engine.ParseResult{
		Items: []engine.Item{
			{
				Url:     url,
				Type:    "zhenai",
				Id:      extractString([]byte(url), idUrlRe),
				Payload: profile,
			},
		},
	}
}
// ParseCity scans a city page for user links matched by cityRe and returns
// one request per match. In each match, submatch 1 is used as the profile
// URL and submatch 2 as the user name; both are bound into the request's
// parser so ParseProfile receives them along with the page contents.
func ParseCity(contents []byte) engine.ParseResult {
	re := regexp.MustCompile(cityRe)
	matches := re.FindAllSubmatch(contents, -1)

	result := engine.ParseResult{}
	for _, m := range matches {
		url := string(m[1])
		name := string(m[2])
		// The former `"User " + name` item is no longer emitted: Items is
		// now []engine.Item, so a bare string cannot be appended.
		result.Requests = append(result.Requests, engine.Request{
			Url: url,
			// The closure carries url and name without changing the
			// ParserFunc signature.
			ParserFunc: func(c []byte) engine.ParseResult {
				return ParseProfile(c, url, name)
			},
		})
	}
	return result
}
8、重构与运行:
之前存入了一批系统自动分配ID的数据,需要进行清洗
之前每次save的时候都需要New一个client,消耗有点大
在ItemSaver中建client,然后传给save
func ItemSaver() chan engine.Item {
client,err :=elastic.NewClient(
//must turn off in docker
elastic.SetSniff(false))
if err != nil {
return "",err
}
out := make(chan engine.Item})
go func() {
itemCount := 0
for {
item := <- out
log.Printf("Item Saver : got item " + "#%d: %v",itemCount,item)
itemCount++
_,err := save(client,item)
if err != nil {
log.Print("Item Saver: error " + "saving item %v : %v",item,err)
}
}
}()
return out
}
// save indexes item in the "dating_profile" index through client, using
// item.Type as the Elasticsearch type. When item.Id is non-empty it is used
// as the document id; otherwise no id is set on the request. It returns an
// error when item.Type is empty or when the index request fails.
func save(client *elastic.Client, item engine.Item) error {
	if item.Type == "" {
		return errors.New("must supply Type")
	}

	// Build the index request first so the id can be set conditionally.
	indexService := client.Index().
		Index("dating_profile").
		Type(item.Type).
		BodyJson(item)
	if item.Id != "" {
		indexService.Id(item.Id)
	}

	_, err := indexService.Do(context.Background()) // store the document
	return err
}
// ProfileParser returns a parser function that calls ParseProfile with the
// given url and name bound in, so only the page contents remain as an
// argument.
func ProfileParser(name string, url string) engine.ParserFunc {
	parse := func(contents []byte) engine.ParseResult {
		return ParseProfile(contents, url, name)
	}
	return parse
}
//后续自己再看,改动细节太多
之前的ES数据不要了,docker kill ES的容器
重新run一遍,起干净的ES
如果不希望重启ES,可以通过docker -v参数将数据存储到指定文件中
q= 男 已购房 已购车 Payload.Age:(<30)
9、标准模板库介绍:
html/template
模板引擎
服务器端生成最终网页
适合做后台或者维护页面
取值、选择、循环、函数调用
{{.Hits}}
显示从{{.Start}}起共{{len .Items}}个
// SearchResult is the data handed to the search result template.
type SearchResult struct {
	// Hits is rendered by the template as {{.Hits}}.
	Hits int64
	// Start is rendered by the template as the offset the listing begins at.
	Start int
	// Items holds the records to render. It is declared loosely as
	// []interface{}; the stricter alternative would be []engine.Item.
	Items []interface{}
}
{{range .Items}}
{{with .Payload}}
{{.Age}} //相当于省略了.Payload,不然每一个属性都要加上.Payload
{{end}}
{{end}}
// TestTemplate parses template.html, renders it with a sample SearchResult
// and writes the output to template.test.html for manual inspection.
func TestTemplate(t *testing.T) {
	// Named tmpl so the html/template package is not shadowed.
	tmpl := template.Must(template.ParseFiles("template.html"))

	out, err := os.Create("template.test.html")
	if err != nil {
		t.Fatal(err)
	}
	defer out.Close()

	page := model.SearchResult{}
	page.Hits = 123
	// TODO: fill page.Items with generated test items.

	// To inspect the output on the console, execute into os.Stdout instead.
	if err := tmpl.Execute(out, page); err != nil {
		t.Fatal(err)
	}
}
10、实现前端展示页面:
package view
// SearchResultView wraps the parsed template used to render search results.
type SearchResultView struct {
	// template is the parsed page template executed when rendering.
	template *template.Template
}
// CreateSearchResultView parses the template file at filename and returns a
// view backed by it. It panics (via template.Must) if the file cannot be
// parsed.
func CreateSearchResultView(filename string) SearchResultView {
	parsed := template.Must(template.ParseFiles(filename))
	return SearchResultView{template: parsed}
}
// Render executes the view's template with data and writes the output to w.
// It returns whatever error the template execution reports.
func (s SearchResultView) Render(w io.Writer, data model.SearchResult) error {
	err := s.template.Execute(w, data)
	return err
}
package controller
// SearchResultHandler serves the search page: it queries Elasticsearch
// through client and renders the result with view.
type SearchResultHandler struct {
	// view renders the result page.
	view view.SearchResultView
	// client is the Elasticsearch client used for searching.
	client *elastic.Client
}
func CreateSearchResultHandler(template string) SearchResultHandler {
client,err := elastic.NewClient(
elastic.SetSniff(false)//docker里一定要设置本项
)
if err != nil {
panic(err)
}
return SearchResultHandler{
view : view.CreateSearchResultView(template),
client: client,
}
}
// ServeHTTP handles a search request such as
// localhost:8888/search?q=男 已购房&from=20. It reads the query string "q"
// and the offset "from" (0 when missing or not a number), runs the search,
// and renders the result page. A failed search or a failed render is
// answered with 400 Bad Request.
//
// The method must be named ServeHTTP, not ServerHTTP, for
// SearchResultHandler to satisfy http.Handler and be accepted by
// http.Handle.
func (h SearchResultHandler) ServeHTTP(w http.ResponseWriter, req *http.Request) {
	q := strings.TrimSpace(req.FormValue("q"))

	from, err := strconv.Atoi(req.FormValue("from"))
	if err != nil {
		from = 0
	}

	page, err := h.getSearchResult(q, from)
	if err != nil {
		http.Error(w, err.Error(), http.StatusBadRequest)
		return
	}

	if err := h.view.Render(w, page); err != nil {
		http.Error(w, err.Error(), http.StatusBadRequest)
	}
}
// getSearchResult runs q as a query-string query against the
// "dating_profile" index starting at offset from. It returns the total hit
// count, the offset, and the hits decoded as engine.Item values.
func (h SearchResultHandler) getSearchResult(q string, from int) (model.SearchResult, error) {
	query := elastic.NewQueryStringQuery(q)
	resp, err := h.client.
		Search("dating_profile").
		Query(query).
		From(from).
		Do(context.Background())
	if err != nil {
		return model.SearchResult{}, err
	}

	// Each yields []interface{}; an element can be asserted back with
	// v.(engine.Item) where the concrete type is needed.
	itemType := reflect.TypeOf(engine.Item{})
	found := model.SearchResult{
		Hits:  resp.TotalHits(),
		Start: from,
		Items: resp.Each(itemType),
	}
	return found, nil
}
func main(){
//配置访问css/js等静态文件的路由
http.Handle("/",http.FileServer(http.Dir(crawler/frontend/view)))
//http.Handle("/search",controller.SearchResultHandler{})
http.Handle("/search",controller. CreateSearchResultHandler("crawer/frontend/view/template.html"))
http.ListenAndServe(":8888",nil)
if err != nil {
panic(err)
}
}
11、完善前端展示:
//fill in query string
//support search button
//support paging
//add start page
// fieldNameRe matches a capitalised field name directly followed by a colon,
// e.g. "Age:" or "Height:". It is compiled once at package scope.
var fieldNameRe = regexp.MustCompile(`([A-Z][a-z]*):`)

// rewriteQueryString prefixes every field name in q with "Payload." so that
// a query such as "Age:(<30)" becomes "Payload.Age:(<30)". Text that is not
// a capitalised word followed by a colon is left untouched.
//
// NOTE(review): a field that is already written as "Payload.Age:" gets
// prefixed a second time.
func rewriteQueryString(q string) string {
	return fieldNameRe.ReplaceAllString(q, "Payload.$1:")
}
{{if ge .PrevFrom 0}}
<a href="search?q={{.Query}}&from={{.PrevFrom}}">上一页</a>
{{end}}
// SearchResult is the data handed to the search result template, including
// the offsets used for the paging links.
type SearchResult struct {
	// Hits is rendered by the template as {{.Hits}}.
	Hits int64
	// Start is the offset the listing begins at.
	Start int
	// Items holds the records to render, declared loosely as []interface{}.
	Items []interface{}
	// PrevFrom is the "from" value of the previous-page link; the template
	// shows that link only when it is >= 0.
	PrevFrom int
	// NextFrom is presumably the "from" value of the next-page link; no
	// template in view uses it, so confirm against the template.
	NextFrom int
}
q = 女 Height:(>165) Weight:([1 TO 50])