A simple web crawler in Python that writes the scraped content to a txt file
import requests
from fake_useragent import UserAgent  # third-party package
from lxml import etree  # third-party package
import json

# Fetch the source of the given page and return it
def get_html(url):
    headers = {
        "User-Agent": UserAgent().chrome
    }
    # Request the page; the headers carry a browser-like User-Agent string
    response = requests.get(url, headers=headers)
    response.encoding = "utf-8"
    if response.status_code == 200:
        return response.text
    else:
        return None
# Parse the index page and return the detail-page URL of every item as a list
def parse_index(html):
    # Parse the HTML document from a string and return the root element
    e = etree.HTML(html)
    # Collect the detail-page URL of every movie
    all_url = e.xpath('//div[@class="channel-detail movie-item-title"]/a/@href')
    return ['http://maoyan.com{}'.format(url) for url in all_url]
# Extract the content of a detail page: name, type, actors
def parse_info(html):
    e = etree.HTML(html)
    name = e.xpath('//h3[@class="name"]/text()')
    type = e.xpath('//li[@class="ellipsis"][1]/text()')
    actors = e.xpath('//li[@class="celebrity actor"]/div[@class="info"]/a/text()')
    actors = format_actors(actors)
    return {
        "name": name,
        "type": type,
        "actors": actors
    }
# Strip surrounding whitespace and deduplicate actor names
def format_actors(actors):
    actor_set = set()
    for actor in actors:
        actor_set.add(actor.strip())
    return actor_set
def main():
    index_url = 'http://maoyan.com/films'
    # Fetch the source of the index page
    html = get_html(index_url)
    # Get the detail-page URL of every item as a list
    movie_urls = parse_index(html)
    # print(movie_urls)
    result = []
    for url in movie_urls:
        movie_html = get_html(url)
        movie = parse_info(movie_html)
        print(movie)
        # Append the movie dict to the end of the result list
        result.append(movie)
    text_save("result.txt", result)
def text_save(filename, data):  # filename is the path of the txt file to write; data is the list of records to save
    file = open(filename, 'a', encoding='utf-8')
    for i in range(len(data)):
        # Strip the brackets; these two lines are optional depending on the data
        s = str(data[i]).replace('[', '').replace(']', '')
        # Strip single quotes and commas, and append a newline to each record
        s = s.replace("'", '').replace(',', '') + '\n'
        file.write(s)
    file.close()
    # with open("result.txt", "w", encoding='utf-8') as file:
    #     file.write(json.dumps(result, indent=2, ensure_ascii=False))

if __name__ == '__main__':
    main()
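
The commented-out lines at the end of text_save hint at a JSON-based alternative. Below is a minimal sketch of such a helper (json_save is a hypothetical name, not part of the original script), assuming data is the result list of dicts built in main(); the actors set is converted to a sorted list because sets are not JSON-serializable.

import json

# Hypothetical helper: save the crawled records as pretty-printed JSON.
# Assumes each record looks like the dict returned by parse_info().
def json_save(filename, data):
    # Convert the actors set to a sorted list so json.dumps can handle it
    serializable = [{**movie, "actors": sorted(movie["actors"])} for movie in data]
    with open(filename, "w", encoding="utf-8") as f:
        f.write(json.dumps(serializable, indent=2, ensure_ascii=False))

# Usage (inside main, instead of text_save): json_save("result.json", result)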