# Demo script: parsing HTML with lxml and querying it with XPath.
#
# Covers:
#   1. Parsing an HTML string (e.g. page data fetched by a crawler).
#   2. Parsing an HTML file from disk with an explicit HTML parser.
#   3. Writing XPath paths (absolute paths, "./" and "//").
#   4. Reading node text via the .text attribute or the text() function.
#   5. Reading node attributes via @name.
from lxml import etree

# Sample markup. The last <li> is deliberately left unclosed: etree.HTML()
# repairs the markup and wraps it in <html>/<body>.
wb_data = """
        <div>
            <ul>
                 <li class="item-0"><a href="link1.html">first item</a></li>
                 <li class="item-1"><a href="link2.html">second item</a></li>
                 <li class="item-inactive"><a href="link3.html">third item</a></li>
                 <li class="item-1"><a href="link4.html">fourth item</a></li>
                 <li class="item-0"><a href="link5.html">fifth item</a>
             </ul>
         </div>
        """

# Parse the string into an HTML element tree (missing html/body are added).
html = etree.HTML(wb_data)

# Extract the text of every <a> tag.
# Approach 1: select the elements, then read their .text attribute.
# The <a> elements are children of <li>, not of <ul>, so the absolute path
# has to include the li step; without it the query matches nothing.
data1 = html.xpath('/html/body/div/ul/li/a')
print(data1)
for link in data1:
    print(link.text)

# Approach 2: let XPath return the text nodes directly via text().
data2 = html.xpath('//a/text()')
print(data2)

# Parse a file. etree.parse() uses the XML parser by default, so an HTML
# parser is passed explicitly for HTML input.
parser = etree.HTMLParser(encoding='utf-8')
html2 = etree.parse('123.htm', parser=parser)

# Optional: serialize the parsed tree back to a string for inspection.
# html_data = etree.tostring(html2, pretty_print=True)
# res = html_data.decode('utf-8')
# print(res)

# Wildcard match: visit every element in the document.
data3 = html2.xpath('//*')
for node in data3:
    # Every lxml element exposes .text (None when the element has no leading
    # text), so check the value rather than the attribute's presence.
    if node.text is not None:
        print(node.text)

# Print movie titles: the [@title] predicate keeps only <div> elements that
# carry a title attribute.
data4 = html2.xpath('//div[@title]')
for div in data4:
    # "." is the current <div>; @title selects its title attribute value.
    title = div.xpath('./@title')
    # Text nodes that are direct children of the current <div>.
    desc = div.xpath('./text()')
    print(title, "\t\t\t\t", desc)