# Data is parsed with XPath (via lxml).
# Results are serialized with json.
from urllib.request import urlopen as ur
from lxml import etree as et
import time
import json
# Parallel per-movie columns filled by pageMsg(); the same index in every
# list is read together when a record is assembled.
last_no = []        # rank
last_img = []       # poster image URL
last_title_cn = []  # Chinese title
last_title_en = []  # English title
last_other = []     # alternative titles
last_body = []      # info body text
last_rate = []      # rating score
last_num = []       # number of ratings
last_tail = []      # quote
# Assembled per-movie dicts, in the order they are written to the JSON file.
last = []
def _joined_text(nodes):
    """Return the ``.text`` of every node joined by commas ('' for missing text)."""
    return ",".join(node.text or "" for node in nodes)


def _collect_movie(movie):
    """Append one value per field for *movie* to the module-level ``last_*`` lists.

    Every XPath is evaluated relative to the movie node itself, so each list
    grows by exactly one entry per movie even when part of the markup is
    missing or repeated. That keeps the lists index-aligned.
    """
    # Rank.
    last_no.append(_joined_text(movie.xpath("./div[1]/em")))
    # Poster image URL(s); an <img> without a src contributes ''.
    last_img.append(
        ",".join(img.get("src") or "" for img in movie.xpath("./div[1]/a/img"))
    )
    # Info header: Chinese title, English title, other titles.
    last_title_cn.append(_joined_text(movie.xpath("./div[2]/div[1]/a/span[1]")))
    last_title_en.append(_joined_text(movie.xpath("./div[2]/div[1]/a/span[2]")))
    last_other.append(_joined_text(movie.xpath("./div[2]/div[1]/a/span[3]")))
    # Info body (leading text of the first paragraph).
    last_body.append(_joined_text(movie.xpath("./div[2]/div[2]/p[1]")))
    # Rating score and number of ratings.
    last_rate.append(_joined_text(movie.xpath("./div[2]/div[2]/div/span[2]")))
    last_num.append(_joined_text(movie.xpath("./div[2]/div[2]/div/span[4]")))
    # Quote.
    last_tail.append(_joined_text(movie.xpath("./div[2]/div[2]/p[2]/span")))


def pageMsg(pages, out_path="e:/a.json"):
    """Scrape Douban Top 250 list pages and dump the movies as JSON lines.

    For each page from 1 to *pages* the listing is downloaded, one row per
    movie is added to the module-level ``last_*`` lists, the new rows are
    converted to dicts appended to the module-level ``last`` list, and
    *out_path* is rewritten with every record collected so far (one JSON
    object per line). The function waits five seconds between page requests.

    Args:
        pages: Number of 25-movie pages to fetch, starting at the first.
            Values below 1 fetch nothing and leave the output file untouched.
        out_path: File that receives the JSON lines; defaults to the
            location the script originally hard-coded.

    Network, decoding and file errors are not caught; they propagate to the
    caller.
    """
    for page in range(1, pages + 1):
        start = (page - 1) * 25  # the listing is requested 25 entries at a time
        url = "https://movie.douban.com/top250?start=" + str(start) + "&filter="

        # The context manager closes the HTTP response even if reading fails.
        with ur(url) as response:
            content = response.read().decode("utf-8")

        tree = et.HTML(content)
        # One node per movie entry on the page.
        for movie in tree.xpath('//*[@id="content"]/div/div[1]/ol/li/div'):
            _collect_movie(movie)

        # Convert only rows that have no record yet; restarting from index 0
        # on every pass would append the earlier movies again.
        for i in range(len(last), len(last_no)):
            last.append({
                "no": last_no[i],
                "img": last_img[i],
                "title_cn": last_title_cn[i],
                "title_en": last_title_en[i],
                "other": last_other[i],
                "body": last_body[i],
                "rate": last_rate[i],
                "rate_num": last_num[i],
                "quotes": last_tail[i],
            })

        # Rewrite the file after every page so partial results survive a
        # failure on a later request.
        with open(out_path, "w", encoding="utf-8") as f:
            for record in last:
                json.dump(record, f, ensure_ascii=False)
                f.write("\n")

        # Pause between requests; nothing follows the final page.
        if page < pages:
            time.sleep(5)
if __name__ == "__main__":
    # Script entry point: fetch ten listing pages when run directly.
    pages = 10
    pageMsg(pages)