读写文件

最新推荐文章于 2024-11-26 11:57:17 发布

一只迟到的程序猿狗狗

最新推荐文章于 2024-11-26 11:57:17 发布

阅读量259

点赞数

CC 4.0 BY-SA版权

分类专栏： python 文章标签： python

本文链接：https://blog.csdn.net/weixin_41580211/article/details/79089255

python 专栏收录该内容

18 篇文章

订阅专栏

读写文件的基本语法

# coding:utf-8
import os
# 1. Open the file.
# open() takes (1) the name of the file to open and (2) the mode to open it in:
#   'w'   write only       'r'   read only        'a'   append
#   'w+'  read/write       'r+'  read/write       'a+'  read/write
#   'wb+' read/write binary data
# 'w' mode: the file is created if it does not exist; if it does exist, its
#           previous content is discarded (truncated) when it is opened.
# 'a' mode: the file is created if it does not exist; if it does exist, every
#           write is appended to the end and the existing data is kept.
# 'r' mode: the file is never created; opening a missing file raises an error.
#
# os.path.exists(path) returns True when the path exists and False otherwise,
# which can be used to guard an 'r' open:
#
#     if os.path.exists('1.txt'):
#         file_handle = open('1.txt', 'r')

# The with-statement performs step 3 (closing the file) automatically, even if
# one of the writes below raises, so the handle can never be leaked.
with open('1.txt', 'w') as file_handle:
    # 2. Write data with write(); it writes the string exactly as given.
    file_handle.write('hello world\n')
    file_handle.write('你好\n')
    # writelines() writes every string of a list in order and does NOT add
    # line breaks between them.
    name_list = ['张三，李四，王五']
    file_handle.writelines(name_list)


# Read the file content back, but only if the file is actually there
# ('r' mode raises when the file is missing).
if os.path.exists('1.txt'):
    # 1. Open the file. The with-statement also takes care of step 3 (closing
    #    the file), even when reading raises.
    with open('1.txt', 'r') as file_handle:
        # 2. Read the file. There are three ways to do it:
        # 2.1 read() returns everything in the file as one string:
        #         content = file_handle.read()
        # 2.2 readline() returns a single line and leaves the position at the
        #     end of that line, so the next readline() returns the next line:
        #         content = file_handle.readline()
        # 2.3 readlines() reads all lines and returns them as a list, one
        #     element per line (line endings included).
        con_list = file_handle.readlines()
    # A parenthesised single-argument print behaves the same on Python 2 and 3.
    print(con_list)

例子
利用正则表达式爬取糗事百科，将内容存到文件中

# coding: utf-8
import requests
import re

# Scrape every page of the qiushibaike.com "hot" list with regular expressions
# and save the jokes to qsbk.txt.
# NOTE: this script targets Python 2, where requests' `.content` and the
# patterns below are both byte strings; on Python 3 `.content` is bytes and
# matching it against these str patterns would raise TypeError.

# All patterns are compiled once, up front, instead of on every loop iteration.
# Total page count: the first <span> found after the text `class="dots`.
page_count_pattern = re.compile(r'class="dots.*?<span.*?>(.*?)</span>', re.S)
# One joke; the five groups are used below as
# (user name, age, content, vote count, comment count).
# re.S (DOTALL) lets '.' match newlines as well, so a match may span lines.
pattern = re.compile(
    r'<div class=".*?qiushi_tag.*?<h2>(.*?)</h2>'
    r'.*?<div class="articleGender.*?>(.*?)</div>'
    r'.*?<span>(.*?)</span>'
    r'.*?<i class="n.*?>(.*?)</i>'
    r'.*?<i.*?>(.*?)</i>',
    re.S)
# <br/> tags inside the joke text are turned into real line breaks.
br_pattern = re.compile(r'<br/>')

# 1. Prepare the URL of the first page.
url = 'https://www.qiushibaike.com/hot/'
# 2. Send the request and take the HTML source out of the response.
response = requests.get(url)
html = response.content

# Find the total number of pages in the HTML and convert it to an integer.
rs = re.search(page_count_pattern, html)
if rs is None:
    # Without this check a layout change would surface as an opaque
    # AttributeError on `rs.group`.
    raise ValueError('could not find the total page count in %s' % url)
# group(index) returns the text captured by the group with that index.
total_page = int(rs.group(1).strip('\n'))
print('共%s页段子内容！' % total_page)

# Open the output file; the with-statement closes it even if a request or a
# write fails half-way through.
with open('qsbk.txt', 'w') as file_handle:
    # Fetch the HTML source of each page in turn; x is the page number.
    for x in range(1, total_page + 1):
        print('正在爬去第%s页段子...' % x)
        url = 'https://www.qiushibaike.com/hot/page/%s/' % x
        html = requests.get(url).content
        # Find every joke on this page.
        rs = re.findall(pattern, html)
        # Separator line that marks the start of a page.
        file_handle.write('===========第%s页=======\n' % x)
        for detail in rs:
            # strip('\n') removes leading and trailing line breaks.
            name = detail[0].strip('\n')
            age = detail[1]
            content = detail[2].strip('\n')
            content = re.sub(br_pattern, '\n', content)
            vote_number = detail[3]
            comment_number = detail[4]
            # First line of a joke: user information.
            file_handle.write('用户名：%s    年龄：%s\n' % (name, age))
            # Second line of a joke: vote count and comment count.
            file_handle.write(
                '好笑数：%s    评论数：%s\n' % (vote_number, comment_number))
            # The joke itself, followed by an empty line.
            file_handle.write(content)
            file_handle.write('\n\n')
print('数据写入成功！')

例子
改写 NBA 贴吧爬虫，将爬取到的内容存到文件中

# coding: utf-8
import requests
import re
# Scrape one Baidu Tieba thread and save every floor (post) to a text file
# named after the thread title.
# NOTE: this script targets Python 2 (byte-string HTML, `str.decode`).
#
# Shape of each scraped tuple, in the order the loop below reads it
# (name, rank, level, content, "from" text, client, floor, date):
# ('J-Dub', '小吧主', '13', '有的人天之骄子，从选秀便是球队核心有的人甘作绿叶，一心一意干好自己的事有的人能力有限，最后只能被联盟淘汰而有的人
# 虽有天赋，但是球队的处境让他无法让他的天赋得到兑现，而他们可能在离开球队后，便得到突猛进的发展，可能在同位置球员离开后成为球队老大。今天楼主就来和大家谈谈那些从角色球员最终成为球队老大的球员。', '来自', 'iPhone客户端', '1楼', '2016-05-17 19:22')

# All patterns are compiled once, up front, instead of once per floor.
floor_data_pattern = re.compile(
    r'<li class="d_name.*?<a.*?>(.*?)</a>'
    r'.*?<div class="d_badge_tit.*?>(.*?)</div>'
    r'.*?lv">(.*?)</div>'
    r'.*?<cc.*?>(.*?)</cc>'
    r'.*?<span class="tail-info.*?>(.*?)<a.*?>(.*?)</a>'
    r'.*?<span.*?>(.*?)</sp'
    r'.*?<span class="tail.*?>(.*?)</span>',
    re.S)
# Thread title.
pattern = re.compile(r'<title>(.*?)</title>')
# <img> tags embedded in a user name.
replace_img = re.compile(r'<img.*?>')
# Any HTML tag.
strip_ele = re.compile(r'<.*?>', re.S)
# Floor number such as "1楼" (raw string: '\w' is not a valid str escape).
floor_pat = re.compile(r'\w+楼')
# Post date inside a "tail" span.
datetime_pat = re.compile(r'<span class="tail.*?>(.*?)</span>')

url = 'https://tieba.baidu.com/p/4553108519'
html = requests.get(url).content

rs = re.findall(floor_data_pattern, html)

# Find the thread title; it becomes the output file name.
tz_rs = re.search(pattern, html)
if tz_rs is None:
    # Without this check a missing <title> would surface as an opaque
    # AttributeError on `tz_rs.group`.
    raise ValueError('could not find a <title> in %s' % url)
tz_name = tz_rs.group(1)
file_name = tz_name + '.txt'
# NOTE(review): the title is used as a file name unchanged; a title containing
# a path separator would make open() fail -- confirm whether to sanitise it.

# 1. Open the file. decode('utf-8') turns the UTF-8 bytes of the title into
#    text so the file gets a readable Chinese name. The with-statement closes
#    the file even if processing a floor fails.
with open(file_name.decode('utf-8'), 'w') as file_handle:
    # Walk through the data of every floor.
    for floor in rs:
        # User name, with embedded <img> tags replaced by '-'.
        name = re.sub(replace_img, '-', floor[0])
        # Rank (title badge) and level.
        rank = floor[1]
        level = floor[2]
        # Turn <br> into line breaks, then drop every remaining tag and the
        # surrounding whitespace.
        content = floor[3].replace('<br>', '\n')
        content = re.sub(strip_ele, '', content).strip()
        # "from <client>" text.
        from_device = floor[4] + floor[5]
        if 'tail' in from_device:
            # Groups 4/5 contain "tail" markup themselves; the script treats
            # this as a post from the PC web version and takes the floor
            # number and date from that captured text instead of groups 6/7.
            floor_match = re.search(floor_pat, from_device)
            time_match = re.search(datetime_pat, from_device)
            # Fall back to an empty field rather than aborting the whole
            # dump when one floor does not have the expected markup.
            floor_num = floor_match.group() if floor_match else ''
            post_time = time_match.group(1) if time_match else ''
            from_device = '来自PC电脑版'
        else:
            floor_num = floor[6]
            post_time = floor[7]
        # Write this floor.
        file_handle.write(
            '层主姓名：%s   等级：%s   头衔：%s\n' % (name, level, rank))
        file_handle.write(content)
        file_handle.write('\n')
        file_handle.write(
            '%s    楼层：%s   日期：%s\n' % (from_device, floor_num, post_time))
        file_handle.write('****************************************\n')