A simple web crawler in Python that writes the scraped content to a txt file
import requests
from fake_useragent import UserAgent  # third-party package
from lxml import etree  # third-party package
import json

# Fetch the source of the given page and return it
def get_html(url):
    headers = {
        "User-Agent": UserAgent().chrome
    }
    # Request the page; the headers carry a browser-like User-Agent string
    response = requests.get(url, headers=headers)
    response.encoding = "utf-8"
    if response.status_code == 200:
        return response.text
    else:
        return None
# Parse the index page and return the detail-page URL of every item as a list
def parse_index(html):
    # Parse the HTML document from a string and return the root element
    e = etree.HTML(html)
    # Collect the detail-page URL of every movie
    all_url = e.xpath('//div[@class="channel-detail movie-item-title"]/a/@href')
    return ['http://maoyan.com{}'.format(url) for url in all_url]
# Extract the content of a detail page: name, type, actors
def parse_info(html):
    e = etree.HTML(html)
    name = e.xpath('//h3[@class="name"]/text()')
    type = e.xpath('//li[@class="ellipsis"][1]/text()')
    actors = e.xpath('//li[@class="celebrity actor"]/div[@class="info"]/a/text()')
    actors = format_actors(actors)
    return {
        "name": name,
        "type": type,
        "actors": actors
    }
# Strip surrounding whitespace and deduplicate actor names
def format_actors(actors):
    actor_set = set()
    for actor in actors:
        actor_set.add(actor.strip())
    return actor_set
def main():
    index_url = 'http://maoyan.com/films'
    # Fetch the source of the index page
    html = get_html(index_url)
    # Get the detail-page URL of every item as a list
    movie_urls = parse_index(html)
    # print(movie_urls)
    result = []
    for url in movie_urls:
        movie_html = get_html(url)
        movie = parse_info(movie_html)
        print(movie)
        # Append the movie dict to the end of the result list
        result.append(movie)
    text_save("result.txt", result)
def text_save(filename, data):  # filename is the path of the txt file to write; data is the list of records to save
    file = open(filename, 'a', encoding='utf-8')
    for i in range(len(data)):
        # Strip the brackets; these two lines are optional depending on the data
        s = str(data[i]).replace('[', '').replace(']', '')
        # Strip single quotes and commas, and append a newline to each record
        s = s.replace("'", '').replace(',', '') + '\n'
        file.write(s)
    file.close()
    # with open("result.txt", "w", encoding='utf-8') as file:
    #     file.write(json.dumps(result, indent=2, ensure_ascii=False))

if __name__ == '__main__':
    main()
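
The commented-out lines at the end of text_save hint at a JSON-based alternative. Below is a minimal sketch of such a helper (json_save is a hypothetical name, not part of the original script), assuming data is the result list of dicts built in main(); the actors set is converted to a sorted list because sets are not JSON-serializable.

import json

# Hypothetical helper: save the crawled records as pretty-printed JSON.
# Assumes each record looks like the dict returned by parse_info().
def json_save(filename, data):
    # Convert the actors set to a sorted list so json.dumps can handle it
    serializable = [{**movie, "actors": sorted(movie["actors"])} for movie in data]
    with open(filename, "w", encoding="utf-8") as f:
        f.write(json.dumps(serializable, indent=2, ensure_ascii=False))

# Usage (inside main, instead of text_save): json_save("result.json", result)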