"""Scrape user avatars, names and levels from qiushibaike.com text pages via XPath."""

import http.cookiejar
import urllib.parse
import urllib.request

from lxml import etree

# Alt text the page uses for anonymous posters, and the placeholder level
# recorded for them (and for any user whose level node is missing).
_ANONYMOUS_USER = '匿名用户'
_NO_LEVEL = '无等级'


def create_request(page):
    """Build the request for one page of the text listing.

    Args:
        page: Page number; converted with ``str`` and inserted into the URL.

    Returns:
        A ``urllib.request.Request`` carrying a browser-like User-Agent header.
    """
    url = 'https://www.qiushibaike.com/text/page/' + str(page) + '/'
    headers = {
        # The value must not repeat the header name: the original string
        # started with "User-Agent: ", which sent a malformed header value.
        'User-Agent': (
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) '
            'AppleWebKit/535.11 (KHTML, like Gecko) '
            'Chrome/17.0.963.56 Safari/535.11'
        ),
    }
    return urllib.request.Request(url=url, headers=headers)


def save_content(request):
    """Open *request* through a cookie-aware opener and return the decoded body.

    Args:
        request: The ``urllib.request.Request`` to open.

    Returns:
        The response body decoded as UTF-8 text.
    """
    cookie_jar = http.cookiejar.CookieJar()
    handler = urllib.request.HTTPCookieProcessor(cookie_jar)
    opener = urllib.request.build_opener(handler)
    # Close the response (and its connection) as soon as the body is read.
    with opener.open(request) as response:
        return response.read().decode('utf-8')


def get_xpath(content, output_path='user_info.txt'):
    """Extract avatar URL, user name and level per user and append them to a file.

    Each avatar URL is printed as it is processed. One line per user is
    appended to *output_path*. If the number of avatar ``src`` values differs
    from the number of ``alt`` values, nothing is printed or written.

    Args:
        content: HTML text of a listing page.
        output_path: File the records are appended to (UTF-8).
    """
    tree = etree.HTML(content)
    src_list = tree.xpath('//div[@id="content-left"]/div/div//img/@src')
    alt_list = tree.xpath('//div[@id="content-left"]/div/div//img/@alt')
    level_list = tree.xpath("//div[@id='content-left']/div/div/div/text()")

    # src and alt come from the same <img> nodes; a length mismatch means the
    # two lists cannot be paired reliably, so skip the page.
    if len(src_list) != len(alt_list):
        return

    records = []
    for index, (raw_src, alt) in enumerate(zip(src_list, alt_list)):
        src = 'https:' + raw_src
        print(src)
        if alt == _ANONYMOUS_USER:
            # Insert a placeholder so the levels of the following users stay
            # aligned with their position in src_list/alt_list.
            level_list.insert(index, _NO_LEVEL)
        # Fewer level nodes than users would otherwise raise IndexError;
        # record the placeholder instead of aborting the whole page.
        level = level_list[index] if index < len(level_list) else _NO_LEVEL
        records.append(
            '头像:' + src + ' ' + '用户名:' + alt + ' ' + '等级:' + level + '\n'
        )

    # Open the output file once for the whole page, and only when there is
    # something to write (so an empty page does not create the file).
    if records:
        with open(output_path, 'a', encoding='utf-8') as fp:
            fp.writelines(records)


if __name__ == '__main__':
    request = create_request(1)
    content = save_content(request)
    get_xpath(content)
XPath 爬取糗事百科
最新推荐文章于 2021-10-12 13:31:15 发布