#-*-coding:utf8-*-
from lxml import etree
from multiprocessing.dummy import Pool as ThreadPool
import requests
import sys
# Python 2 only: reload(sys) makes sys.setdefaultencoding available again
# (the interpreter removes it from the module at startup), and the default
# codec is then switched to UTF-8 so that byte strings and unicode strings
# can be concatenated and written without explicit encode/decode calls.
# NOTE(review): `reload` is not a builtin on Python 3, so this line fails there.
reload(sys)
sys.setdefaultencoding('utf-8')
def getcontent(url):
    """Fetch one collection page and append its entries to the output file.

    Downloads ``url`` with the module-level ``headers`` dict, selects every
    ``h2.zm-item-title`` under ``div#zh-list-answer-wrap``, and writes one
    tab-separated line per entry (page label, absolute link, link text) to
    the module-level file object ``f``.

    Args:
        url: Page URL. The second ``=``-separated field of the URL is used
            as the page number in the output label.

    Returns:
        None. All output goes to ``f``.

    Raises:
        requests.exceptions.RequestException: If the download fails or
            exceeds the timeout.
        IndexError: If entries were found but ``url`` contains no ``=``.
    """
    # A timeout keeps a stalled connection from blocking a worker forever.
    html = requests.get(url, headers=headers, timeout=30).text
    selector = etree.HTML(html)
    if selector is None:
        # Guard against a parser result of None (e.g. an empty response body).
        return

    content = selector.xpath(
        '//div[@id="zh-list-answer-wrap"]/div[@class="zm-item"]/h2[@class="zm-item-title"]'
    )
    if not content:
        return

    # The page label depends only on the URL, so compute it once.
    page_no = url.split('=')[1]

    lines = []
    for each in content:
        hrefs = each.xpath('a/@href')
        texts = each.xpath('a/text()')
        if not hrefs or not texts:
            # No link or no title text: nothing usable to record, so skip
            # the entry instead of failing the whole page with IndexError.
            continue
        re_href = 'https://www.zhihu.com' + hrefs[0]
        lines.append('第' + page_no + '页\t' + re_href + '\t' + texts[0] + '\n')

    if lines:
        # Emit the whole page with a single write call.
        f.write(''.join(lines))
if __name__ == '__main__':
    # Script entry point: scrape pages 1-35 of the collection with four
    # worker threads and append the results to zhihu.txt.
    #
    # `headers` and `f` are deliberately bound at module level here because
    # getcontent() reads both of them as globals.
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;",
        "Accept-Encoding": "gzip",
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Referer": "http://www.example.com/",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36",
    }

    page = [
        'http://www.zhihu.com/collection/27109279?page=' + str(i)
        for i in range(1, 36)
    ]

    # The file is closed by the `with` statement even if a download fails.
    with open('zhihu.txt', 'a') as f:
        pool = ThreadPool(4)
        try:
            pool.map(getcontent, page)
        finally:
            # Shut the workers down and wait for them before the file is
            # closed, so no thread can write to a closed handle.
            pool.close()
            pool.join()
# Reposted from: https://my.oschina.net/u/2411815/blog/625554