Python Web Crawler (Day02-3)
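The script below crawls the Zhihu explore page (http://www.zhihu.com/explore), collects links to question pages, and caches each page in a Redis hash named 'zhihu'. The SHA-1 digest of a question's full URL serves as the hash field key, so a page that has already been cached is not fetched again; each page's HTML is pickled and zlib-compressed before it is written to Redis.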
from hashlib import sha1
from urllib.parse import urljoin
import pickle
import re
import zlib

import requests
from bs4 import BeautifulSoup
from redis import Redis
def main():
    # Base URL of the target site and the seed page to start crawling from
    base_url = 'http://www.zhihu.com/'
    seed_url = urljoin(base_url, 'explore')
    # Redis server used as the page cache (swap in your own host/port/password)
    client = Redis(host='112.74.172.000', port=7266, password='wangmomo-0000')
    # Masquerade as Baiduspider so the site serves the page to the crawler
    headers = {'user-agent': 'Baiduspider'}
    resp = requests.get(seed_url, headers=headers)
    soup = BeautifulSoup(resp.text, 'lxml')
    # Match only links whose href starts with /question
    href_regex = re.compile(r'^/question')
    for a_tag in soup.find_all('a', {'href': href_regex}):
        href = a_tag.attrs['href']
        full_url = urljoin(base_url, href)
        # Use the SHA-1 digest of the full URL as the hash field key,
        # so each question page is fetched and stored at most once
        hasher = sha1()
        hasher.update(full_url.encode('utf-8'))
        field_key = hasher.hexdigest()
        if not client.hexists('zhihu', field_key):
            html_page = requests.get(full_url, headers=headers).text
            # Pickle and zlib-compress the page before caching it in Redis
            zipped_page = zlib.compress(pickle.dumps(html_page))
            client.hset('zhihu', field_key, zipped_page)
    print('Total %d question pages found.' % client.hlen('zhihu'))


if __name__ == '__main__':
    main()
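Because each cached page was written as zlib.compress(pickle.dumps(html)), reading one back just reverses those two steps. Below is a minimal sketch of the read side; the helper name load_page is an assumption for illustration, and client is the same Redis connection used above.

def load_page(client, field_key):
    """Restore a cached page from the 'zhihu' hash, or return None if absent."""
    zipped_page = client.hget('zhihu', field_key)
    if zipped_page is None:
        return None
    # Reverse the storage steps: decompress first, then unpickle
    return pickle.loads(zlib.decompress(zipped_page))

For example, after the crawl has populated the cache, load_page(client, field_key) returns the original HTML string for any field key listed by client.hkeys('zhihu').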