Python爬虫学习-----豆瓣top250（Xpath解析）

最新推荐文章于 2024-05-31 21:24:28 发布

原创最新推荐文章于 2024-05-31 21:24:28 发布 · 493 阅读

1 ·

CC 4.0 BY-SA版权

文章标签：

#python

Python 专栏收录该内容

3 篇文章

订阅专栏

此博客介绍了利用Python进行数据处理的方法，通过XPath解析数据，再利用JSON对数据进行处理，涉及信息技术领域的数据处理相关内容。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

解析数据：使用 lxml 的 XPath 从页面 HTML 中提取每部电影的信息。
处理数据：使用 json 模块将每条记录序列化后逐行写入文件。

from urllib.request import urlopen as ur
from lxml import etree as et
import time
import json

# Parallel per-field accumulators filled by pageMsg, plus ``last`` which
# holds the assembled per-movie dicts. The generator yields a fresh list on
# every step, so each name is bound to its own independent list.
(
    last_no,        # ranking number text
    last_img,       # poster image "src" attribute
    last_title_cn,  # first title span
    last_title_en,  # second title span
    last_other,     # third title span
    last_body,      # first paragraph of the info body
    last_rate,      # rating value text
    last_num,       # rating-count text
    last_tail,      # quote text
    last,           # assembled record dicts
) = ([] for _ in range(10))
def _fetch_html(url):
    """Download *url* and return the response body decoded as UTF-8."""
    from urllib.request import Request

    # Send a browser-like User-Agent: the default urllib one is commonly
    # rejected by sites that block scripted clients.
    request = Request(
        url,
        headers={
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/120.0 Safari/537.36"
            )
        },
    )
    # The context manager closes the connection even if read/decode fails.
    with ur(request) as response:
        return response.read().decode("utf-8")


def _joined(node, path, attr=None):
    """Return the comma-joined text (or *attr* values) of nodes matching *path*.

    Missing text or attributes contribute an empty string instead of raising,
    and no match at all yields "".
    """
    matches = node.xpath(path)
    if attr is None:
        values = (match.text for match in matches)
    else:
        values = (match.get(attr) for match in matches)
    return ",".join(value or "" for value in values)


def pageMsg(pages, out_path="e:/a.json", delay=5):
    """Scrape the first *pages* pages of the Douban Top 250 list to a file.

    Each page holds 25 entries. For every movie one value per field is
    appended to the module-level ``last_*`` lists and one dict is appended
    to ``last``. The dicts collected by this call are written to *out_path*
    as one JSON object per line.

    Args:
        pages: Number of list pages to fetch, starting from the first.
        out_path: Destination file, overwritten on each call.
        delay: Seconds to wait between consecutive page requests.

    Returns:
        The list of record dicts collected by this call.
    """
    # (record key, module-level accumulator, XPath relative to the movie
    # node, attribute to read instead of text). Paths are the flattened
    # form of the nested lookups, so every field yields exactly one value
    # per movie and the parallel lists stay aligned.
    fields = (
        ("no", last_no, "./div[1]/em", None),
        ("img", last_img, "./div[1]/a/img", "src"),
        ("title_cn", last_title_cn, "./div[2]/div[1]/a/span[1]", None),
        ("title_en", last_title_en, "./div[2]/div[1]/a/span[2]", None),
        ("other", last_other, "./div[2]/div[1]/a/span[3]", None),
        ("body", last_body, "./div[2]/div[2]/p[1]", None),
        ("rate", last_rate, "./div[2]/div[2]/div/span[2]", None),
        ("rate_num", last_num, "./div[2]/div[2]/div/span[4]", None),
        ("quotes", last_tail, "./div[2]/div[2]/p[2]/span", None),
    )

    records = []
    for page in range(1, pages + 1):
        if page > 1:
            # Throttle between requests rather than after the final write.
            time.sleep(delay)
        start = (page - 1) * 25
        url = "https://movie.douban.com/top250?start=" + str(start) + "&filter="
        tree = et.HTML(_fetch_html(url))
        # One node per movie entry on the page.
        for movie in tree.xpath('//*[@id="content"]/div/div[1]/ol/li/div'):
            record = {}
            for key, column, path, attr in fields:
                value = _joined(movie, path, attr)
                column.append(value)
                record[key] = value
            records.append(record)

    # Only this call's records are added and written, so calling the
    # function again does not duplicate earlier results.
    last.extend(records)
    with open(out_path, "w", encoding="utf-8") as f:
        for record in records:
            json.dump(record, f, ensure_ascii=False)
            f.write("\n")
    return records


if __name__ == "__main__":
    # Script entry point: 10 pages at 25 entries per page covers 250 entries.
    pages = 10
    pageMsg(pages)