from scrapy import Request, Spider
import json
from zhihuuser.items import ZhihuuserItem
class ZhihuSpider(Spider):
    """Crawl Zhihu user profiles by walking the followee graph.

    The crawl starts at ``start_user``: its profile and its followee list
    are requested from the ``/api/v4/members`` endpoints. Every followee
    found yields a profile request, and every profile yields a followee-list
    request, so the crawl expands outwards from the seed user.
    """

    name = 'zhihu'
    allowed_domains = ['www.zhihu.com']
    start_urls = ['http://www.zhihu.com/']

    # url_token of the account the crawl is seeded with.
    start_user = 'excited-vczh'

    # Profile endpoint and the value sent as its ``include`` query parameter.
    user_url = "https://www.zhihu.com/api/v4/members/{user}?include={include}"
    user_query = "allow_message,is_followed,is_following,is_org,is_blocking,employments,answer_count,follower_count,articles_count,gender,badge[?(type=best_answerer)].topics"

    # Followee-list endpoint (first page) and its ``include`` value.
    # NOTE: ``folllows_query`` is misspelled, but the name is kept unchanged
    # because it is part of the class's public attributes.
    follows_url = "https://www.zhihu.com/api/v4/members/{user}/followees?include={include}&offset=0&limit=20"
    folllows_query = "data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics"

    def _user_request(self, url_token):
        """Build the profile request for the user identified by *url_token*."""
        url = self.user_url.format(user=url_token, include=self.user_query)
        return Request(url, callback=self.parse_user)

    def _follows_request(self, url_token):
        """Build the first-page followee-list request for *url_token*."""
        url = self.follows_url.format(user=url_token, include=self.folllows_query)
        return Request(url, callback=self.parse_follows)

    def start_requests(self):
        """Seed the crawl with the start user's profile and followee list."""
        yield self._user_request(self.start_user)
        yield self._follows_request(self.start_user)

    def parse_user(self, response):
        """Turn a profile response into an item and queue the user's followees.

        Only keys that are declared as fields on ``ZhihuuserItem`` are copied
        from the JSON payload; everything else is ignored.
        """
        result = json.loads(response.text)

        item = ZhihuuserItem()
        for field in item.fields:
            if field in result:
                item[field] = result[field]
        yield item

        # Without a url_token the followee URL would be built for the literal
        # user "None", so skip the follow-up request in that case.
        url_token = result.get('url_token')
        if url_token:
            yield self._follows_request(url_token)

    def parse_follows(self, response):
        """Queue a profile request per followee and follow the pagination."""
        results = json.loads(response.text)

        # The loop variable must not reuse the name of the parsed response:
        # the paging information below lives on the response, not on an entry.
        for followee in results.get('data') or []:
            url_token = followee.get('url_token')
            if url_token:
                yield self._user_request(url_token)

        # The pagination data is stored under 'paging'; request the next page
        # only when the API explicitly reports that this is not the last one.
        paging = results.get('paging') or {}
        next_page = paging.get('next')
        if paging.get('is_end') is False and next_page:
            yield Request(next_page, callback=self.parse_follows)
# Scrapy project settings (settings.py)
# Headers attached to every request through Scrapy's DEFAULT_REQUEST_HEADERS
# setting.
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    # Browser-style User-Agent; a stray " , " that had slipped in between the
    # "(KHTML, like Gecko)" and "Chrome/..." tokens has been removed.
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36',
    # NOTE(review): hard-coded bearer token committed to source. It should be
    # loaded from the environment or a secrets store, and this value rotated.
    'authorization': 'Bearer Mi4xX19MSkF3QUFBQUFBY01Kc2ZsMDlEQmNBQUFCaEFsVk4zN2ZXV2dBbDdDUERobFp1dzNqbmJJcnoyUHkwcWdnRUpn|1508469215|cbf47e5ad94747e09c8b4cf5e4d280a6019fe859',
}