Crawling Zhihu with pyspider (source code)

This post presents a Zhihu crawler written in Python with pyspider. It shows how to fetch the top answers under a Zhihu topic — title, author, comment count, and answer body — and store the results in a MySQL database, walking through the crawler configuration, fetching, and parsing steps.


#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2018-10-30 21:11:22
# Project: zhihu

from pyspider.libs.base_handler import *

import re
import pymysql


class Handler(BaseHandler):
    crawl_config = {
        'headers': {
            'User-Agent': 'Googlebot'
        },
        'itag': 'v236'
    }

    @every(minutes=24 * 60)
    def on_start(self):
        self.crawl('https://www.zhihu.com/topic/19550517/top-answers',
                   callback=self.index_page, validate_cert=False)

    @config(age=60)
    def index_page(self, response):
        for each in response.doc('div[class="List-item TopicFeedItem"]').items():
            a = each.find('h2.ContentItem-title a')
            title = a.text()
            author = each.find('span[class="UserLink AuthorInfo-name"]').text()
            comment_count = each.find('button[class="Button ContentItem-action Button--plain Button--withIcon Button--withLabel"]').text()
            # Keep only the digits from strings like "1,024 条评论"
            comment_count = re.sub(r"\D", "", comment_count)
            self.crawl(a.attr.href, callback=self.detail_page, validate_cert=False,
                       save={"title": title, "author": author, "comment_count": comment_count})
    @config(priority=2)
    def detail_page(self, response):
        # Answer pages use RichContent-inner; column (zhuanlan) articles use Post-RichText
        content = ""
        for each in response.doc('div[class="RichContent-inner"]').items():
            content = each.text()
        if not content:
            for each in response.doc('div[class="RichText ztext Post-RichText"]').items():
                content = each.text()
        # Insert the record into the database
        self.insert_db(response.save['title'], response.save['author'],
                       response.save['comment_count'], content)
        return {
            "title": response.save['title'],
            "author": response.save['author'],
            "comment_count": response.save['comment_count'],
            "content": content
        }
    def insert_db(self, title, author, comment_count, content):
        db = pymysql.connect(host="xxxxx", user="xxxxx", password='xxxxxx',
                             db='zhihu', charset="utf8")
        try:
            cursor = db.cursor()
            # Parameterized query: the driver escapes quotes, preventing SQL injection
            sql = ('insert into article(title, author, comment_count, content, created_date) '
                   'values(%s, %s, %s, %s, now())')
            cursor.execute(sql, (title, author, comment_count, content))
            db.commit()
        except Exception as e:
            print(e)
            db.rollback()
        finally:
            db.close()
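The comment count scraped in index_page is free text (e.g. "xxx 条评论"), so the handler strips every non-digit character with re.sub before saving it. A standalone sketch of that cleanup step (the sample strings below are made up for illustration):

```python
import re

def clean_count(raw):
    """Strip everything except digits from a scraped counter string."""
    return re.sub(r"\D", "", raw)

print(clean_count("1,024 条评论"))  # digits only: "1024"
print(clean_count("添加评论"))       # no number present: empty string
```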

# Database setup
create database zhihu;

use zhihu;

create table article (
    id int primary key auto_increment,
    title varchar(60),
    author varchar(80),
    comment_count varchar(80),
    content text,
    created_date date
);
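The insert_db method above should use a parameterized query rather than % string formatting: answer bodies routinely contain quotes that would break, or inject into, a hand-built SQL string. The same pattern can be sandbox-tested with the stdlib sqlite3 module (a stand-in for pymysql; the placeholder is ? instead of %s, and the in-memory table roughly mirrors the schema above):

```python
import sqlite3

# In-memory stand-in for the MySQL `article` table defined above
db = sqlite3.connect(":memory:")
db.execute("""create table article (
    id integer primary key autoincrement,
    title text, author text, comment_count text, content text
)""")

# Content with quotes that would break a %-formatted SQL string
content = 'He said "it\'s fine" and moved on'
db.execute("insert into article(title, author, comment_count, content) values(?, ?, ?, ?)",
           ("a title", "an author", "42", content))
db.commit()

row = db.execute("select content from article").fetchone()
print(row[0])  # the text round-trips intact, quotes and all
```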
