Python 爬虫

最新推荐文章于 2024-10-04 10:19:07 发布

人!=机器

最新推荐文章于 2024-10-04 10:19:07 发布

阅读量193

点赞数

分类专栏： python

本文链接：https://blog.youkuaiyun.com/zhl11112222/article/details/84946284

版权

python 专栏收录该内容

16 篇文章

订阅专栏

import re
import urllib.request
import ssl
def gethtml(url):
    """Fetch *url* and return the response body decoded as UTF-8 text.

    The request is sent with a desktop-browser ``User-Agent`` header.

    Args:
        url: Address to request; any URL ``urllib.request.urlopen`` accepts.

    Returns:
        The full response body as a ``str``.

    Raises:
        urllib.error.URLError: If the request cannot be completed.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    header={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134'}
    request = urllib.request.Request(url, headers=header)
    # Use the response as a context manager so the underlying connection is
    # closed deterministically, even if reading or decoding raises.
    with urllib.request.urlopen(request) as response:
        return response.read().decode("utf-8")
def gettitle(hh):
    """Build a mapping from scraped titles to their rating/count text.

    Three kinds of ``<span>`` elements are collected from the markup, in
    document order:

    * ``<span style="font-size:13px;">`` -- used as the dictionary key
      (presumably the alternate movie title; verify against the page).
    * ``<span class="rating_nums">`` -- first part of the value.
    * ``<span class="pl">`` -- second part of the value.

    The n-th match of each kind is combined into one entry. If the page
    yields different numbers of matches per kind, only the complete triples
    are used; surplus matches are ignored instead of raising ``IndexError``.
    When the same key occurs more than once, the last occurrence wins.

    Args:
        hh: HTML text to scan.

    Returns:
        A ``dict`` mapping each key string to ``"   <rating>  <count>"``.
    """
    # re.S lets ``.`` span newlines, since span contents may be multi-line.
    title_re = re.compile(r'<span style="font-size:13px;">(.*?)</span>', re.S)
    rating_re = re.compile(r'<span class="rating_nums">(.*?)</span>', re.S)
    count_re = re.compile(r'<span class="pl">(.*?)</span>', re.S)

    titles = title_re.findall(hh)
    ratings = rating_re.findall(hh)
    counts = count_re.findall(hh)

    # zip() stops at the shortest list, so a missing rating/count span can no
    # longer index past the end of its list.
    return {
        title: "   " + rating + "  " + count
        for title, rating, count in zip(titles, ratings, counts)
    }
# Swap the default HTTPS context factory for the unverified one, so the
# HTTPS request below is made without certificate verification.
ssl._create_default_https_context = ssl._create_unverified_context

# Download the chart page and extract the title -> rating/count mapping.
url = "https://movie.douban.com/chart"
hh = gethtml(url)
html = gettitle(hh)

# Print a header row followed by one line per extracted entry.
print("电影名称", "   评分", "   人数")
for name, detail in html.items():
    print(name, detail)