# Import modules
import requests
from lxml import etree
import json
def getOnePage(url, timeout=10):
    """Download one page and return its body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server, passed straight to
            ``requests.get``. Without a timeout a stalled connection
            would block the whole crawl indefinitely.

    Returns:
        The response body as ``str`` (``Response.text``). The HTTP status
        code is not checked, so error pages are returned as-is.

    Raises:
        requests.exceptions.RequestException: On connection failure or
            when the timeout expires.
    """
    # Every request is sent with a desktop-browser User-Agent string.
    header = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36"}
    html = requests.get(url, headers=header, timeout=timeout)
    return html.text
def parseOnPage(text):
    """Extract one record per listing entry from a page's HTML.

    Args:
        text: HTML source of a listing page.

    Yields:
        dict: ``{"index": i, "name": ..., "star": ...}`` where ``index`` is
        the zero-based position of the entry on the page.

    NOTE(review): the keys are called "name" and "star", but the selectors
    target ``cinema-info`` and ``cinema-address`` elements, so the values
    look like a cinema name and its address. The key names are kept because
    they are part of the emitted output.
    """
    html = etree.HTML(text)
    # Original author's note on this selector: the match is "not unique".
    names = html.xpath("//div[@class='cinema-info']/a/text()")
    stars = html.xpath("//p[@class='cinema-address']/text()")
    # zip() stops at the shorter list, so a page where the two selectors
    # return different counts yields the common prefix instead of raising
    # IndexError part-way through.
    for index, (name, star) in enumerate(zip(names, stars)):
        yield {
            "index": index,
            "name": name,
            "star": star,
        }
def wirte2File(content, path=r"C:\Users\chanx\Desktop\maoyan1.txt"):
    """Append one record to a text file as a single JSON line.

    Args:
        content: JSON-serializable object to store.
        path: Destination file. The default is the original author's
            desktop path and has to be overridden on any other machine.

    The file is opened in append mode with UTF-8 encoding, and non-ASCII
    characters are written literally rather than as ``\\uXXXX`` escapes.
    """
    with open(path, 'a', encoding='utf-8') as fp:
        fp.write(json.dumps(content, ensure_ascii=False) + '\n')
def main(pages=10, page_size=12):
    """Crawl the cinema listing page by page, saving and printing each record.

    Args:
        pages: Number of listing pages to fetch, starting at offset 0.
        page_size: Increment of the ``offset`` query parameter between
            consecutive pages.
    """
    for page in range(pages):
        url = "https://maoyan.com/cinemas?offset={}".format(page * page_size)
        text = getOnePage(url)
        for item in parseOnPage(text):
            # Persist each record first, then echo it to the console.
            wirte2File(item)
            print(item)
# Start the crawl only when this file is executed as a script, so that
# importing the module does not trigger network requests and file writes.
if __name__ == "__main__":
    main()
# Maoyan cinema crawler (working)
# Latest recommended article published on 2024-12-04 16:23:46