"""Scrape one listing page of qiushibaike.com and append its posts to a file.

Every post block found on the page is serialized as one JSON object per line
(keys: ``username``, ``image``, ``content``) and appended to the output file.
"""
import json
import urllib.request

import requests  # NOTE(review): not used by the code below; kept, not removed
from lxml import etree

url = "http://www.qiushibaike.com/8hr/page/2/"
# NOTE(review): the User-Agent value has an unbalanced "(" (it ends with ";"
# instead of ")"). Left byte-identical because it is sent to the server.
headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0;"}

request = urllib.request.Request(url, headers=headers)
# The context manager closes the connection even if reading or decoding
# fails; the timeout keeps the script from hanging on a stalled server.
with urllib.request.urlopen(request, timeout=30) as response:
    html = response.read().decode()
text = etree.HTML(html)

# contains() performs a substring match: the first argument is the attribute
# to test, the second is the fragment its value must contain.
node_list = text.xpath('//div[contains(@id,"qiushi_tag")]')

lines = []
for node in node_list:
    # xpath() returns a (possibly empty) list, so check before indexing:
    # one malformed post must not abort the whole run with an IndexError.
    username_nodes = node.xpath('.//h2')
    content_nodes = node.xpath('.//div[@class="content"]/span')

    username = username_nodes[0].text if username_nodes else None
    image = node.xpath('.//div[@class="thumb"]//@src')
    content = content_nodes[0].text if content_nodes else None

    items = {
        "username": username,
        "image": image,
        "content": content,
    }

    # json.dumps escapes non-ASCII characters by default; ensure_ascii=False
    # keeps the Chinese text readable in the output file.
    f_str = json.dumps(items, ensure_ascii=False)
    lines.append(f_str + "\n")

# Open the output once and append every record in a single batch. Nothing is
# opened when no posts were found, so no empty file is created.
# NOTE(review): "qiushi.josn" looks like a typo for "qiushi.json"; the path is
# kept unchanged in case other tooling reads this exact file name.
if lines:
    with open("qiushi.josn", "a+", encoding="utf-8") as f:
        f.writelines(lines)
etree的应用
最新推荐文章于 2024-11-23 16:19:43 发布