#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2018-10-30 21:11:22
# Project: zhihu
from pyspider.libs.base_handler import *
import random
import re
import pymysql
class Handler(BaseHandler):
    """pyspider handler that stores the top answers of a Zhihu topic in MySQL.

    Flow: ``on_start`` seeds the topic's top-answers listing, ``index_page``
    reads per-item metadata and schedules every linked page, and
    ``detail_page`` extracts the body text, writes the record through
    ``insert_db`` and returns it as the task result.
    """

    # Settings applied to every request issued by this handler.
    crawl_config = {
        'headers': {
            'User-Agent': 'GoogledBot'
        },
        'itag': 'v236'
    }

    # CSS selectors. The attribute selectors match the complete ``class``
    # attribute value and use the standard ``[class="..."]`` form.
    _ITEM_SELECTOR = 'div[class="List-item TopicFeedItem"]'
    _LINK_SELECTOR = 'h2.ContentItem-title a'
    _AUTHOR_SELECTOR = 'span[class="UserLink AuthorInfo-name"]'
    _COMMENT_SELECTOR = (
        'button[class="Button ContentItem-action Button--plain '
        'Button--withIcon Button--withLabel"]'
    )
    _ANSWER_SELECTOR = 'div[class="RichContent-inner"]'
    _POST_SELECTOR = 'div[class="RichText ztext Post-RichText"]'

    @every(minutes=24 * 60)
    def on_start(self):
        """Seed the crawl with the topic's top-answers page (every 24 hours)."""
        self.crawl(
            'https://www.zhihu.com/topic/19550517/top-answers',
            callback=self.index_page,
            validate_cert=False
        )

    @config(age=60)
    def index_page(self, response):
        """Schedule a detail-page crawl for every feed item on the listing.

        Title, author and comment count are read from the listing and
        forwarded to ``detail_page`` through the task's ``save`` payload, so
        no per-item state is kept on the handler instance.

        Args:
            response: pyspider response for the listing page.
        """
        for item in response.doc(self._ITEM_SELECTOR).items():
            link = item.find(self._LINK_SELECTOR)
            url = link.attr.href
            if not url:
                # Feed item without a title link: there is nothing to crawl.
                continue
            title = link.text()
            author = item.find(self._AUTHOR_SELECTOR).text()
            # The button label mixes digits with other text; keep digits only.
            comment_count = re.sub(
                r"\D", "", item.find(self._COMMENT_SELECTOR).text()
            )
            self.crawl(
                url,
                callback=self.detail_page,
                validate_cert=False,
                save={
                    "title": title,
                    "author": author,
                    "comment_count": comment_count
                }
            )

    @config(priority=2)
    def detail_page(self, response):
        """Extract the body text of a page, store the record and return it.

        The answer container is tried first; the article container is only
        consulted when the page has no answer container at all.

        Args:
            response: pyspider response whose ``save`` carries ``title``,
                ``author`` and ``comment_count`` from ``index_page``.

        Returns:
            dict with ``title``, ``author``, ``comment_count`` and
            ``content``. ``content`` is an empty string when neither
            container is present on the page.
        """
        saved = response.save
        content = self._last_text(response, self._ANSWER_SELECTOR)
        if content is None:
            content = self._last_text(response, self._POST_SELECTOR)
        if content is None:
            content = ""

        print(saved['title'])
        print(saved['author'])
        print(saved['comment_count'])
        print(content)

        # Insert the record into the database.
        self.insert_db(
            saved['title'], saved['author'], saved['comment_count'], content
        )
        return {
            "title": saved['title'],
            "author": saved['author'],
            "comment_count": saved['comment_count'],
            "content": content
        }

    def _last_text(self, response, selector):
        """Return the text of the last element matching *selector*, or None."""
        text = None
        for node in response.doc(selector).items():
            text = node.text()
        return text

    def insert_db(self, title, author, comment_count, content):
        """Insert one row into the ``article`` table.

        Values are passed as query parameters so quotes in scraped text can
        neither break nor alter the statement. Failures while executing or
        committing are printed and rolled back rather than raised, so a
        database problem does not fail the crawl task. The connection is
        always closed.

        Args:
            title: Article or question title.
            author: Author display name.
            comment_count: Comment count as a digit string.
            content: Extracted body text.
        """
        db = pymysql.connect(host="xxxxx", user="xxxxx", password='xxxxxx',
                             db='zhihu', charset="utf8")
        sql = ('insert into article'
               '(title,author,comment_count,content,created_date) '
               'values(%s,%s,%s,%s,now())')
        try:
            with db.cursor() as cursor:
                cursor.execute(sql, (title, author, comment_count, content))
            db.commit()
        except Exception as e:
            print(e)
            db.rollback()
        finally:
            db.close()
-- Database setup for the crawler (MySQL).
-- utf8mb4 is declared explicitly because the crawler stores Chinese text.
create database if not exists zhihu default character set utf8mb4;
use zhihu;
-- One row per crawled page.
create table if not exists article(
    id int primary key auto_increment,
    title varchar(60),
    author varchar(80),
    -- Digit string produced by the crawler, stored as text.
    comment_count varchar(80),
    content text,
    -- Set with now() on insert; the DATE type keeps only the calendar day.
    created_date date
) default charset = utf8mb4;