一、文件操作习题
1、文件拷贝
f1 = 'test.txt'
f2 = 'test1.txt'
lines = ['abc', '123', 'magedu']

# Create the sample source file and echo it back.  The context manager
# closes the handle even if writing or reading raises.
with open(f1, 'w+') as f:
    # A single write(): writelines() on a str would iterate it char by char.
    f.write('\n'.join(lines))
    f.seek(0)
    print(f.read())


def copy(src, dst):
    """Copy the text file *src* to *dst*, overwriting *dst* if it exists.

    Both files are opened in text mode with the platform default encoding.
    The content is streamed line by line, so the whole file is never held
    in memory at once.

    :param src: path of the file to read
    :param dst: path of the file to (re)create
    :raises OSError: if either file cannot be opened
    """
    with open(src) as fin, open(dst, 'w') as fout:
        fout.writelines(fin)


copy(f1, f2)
除非当前不想关闭,否则尽量用with语法;
单纯拷贝文件时还可考虑以二进制模式读写;文件较大时可逐行(或分块)读入,也可调整buffering的大小,例如改成4096或其他值;
2、统计文件单词top10
方法一:原始算法,直接按默认空格切
from collections import defaultdict
# Word -> number of occurrences.  Missing keys start at 0 so "+= 1" works
# without a membership test.
d = defaultdict(int)
with open("sample.txt", encoding='utf-8') as f:
    for line in f:
        # Naive tokenisation: split on runs of whitespace only.
        for word in line.split():
            d[word] += 1
# Most frequent first; the exercise asks for the top 10 only.
print(sorted(d.items(), key=lambda x: x[1], reverse=True)[:10])
方法二:去掉干扰字符
from collections import defaultdict
# Word -> number of occurrences; missing keys start at 0.
d = defaultdict(int)


def makekey(src: str, chars=set("""\\/''"".,~$*^()[]{}<>?-+—=_&%@»""")):
    """Split *src* into lower-cased words, treating *chars* as separators.

    The text is lower-cased first, every character found in *chars* is
    replaced by a blank, and the result is split on whitespace.

    :param src: text to tokenise
    :param chars: container of single characters to treat like whitespace
                  (the default set is only read, never modified)
    :return: list of words, possibly empty
    """
    # The backslash in the default set is escaped explicitly: "\/" is an
    # invalid escape sequence and triggers a warning on recent Pythons.
    cleaned = ''.join(' ' if c in chars else c for c in src.lower())
    return cleaned.split()
# Process the file line by line, so only one line is tokenised at a time.
with open("sample.txt", encoding='utf-8') as f:
    for line in f:
        for word in makekey(line):
            d[word] += 1
# Most frequent first; the exercise asks for the top 10 only.
print(sorted(d.items(), key=lambda x: x[1], reverse=True)[:10])
# If the whole text is short it can be tokenised in one go.
# Start from an empty counter: the shared dict may already hold the counts
# of a previous pass over the same file, which would double every total.
d.clear()
with open("sample.txt", encoding='utf-8') as f:
    for word in makekey(f.read()):
        d[word] += 1
# Most frequent first; the exercise asks for the top 10 only.
print(sorted(d.items(), key=lambda x: x[1], reverse=True)[:10])
方法三:利用切片(一般会用到索引和enumerate),每次在迭代中就把想要的数据抛出来,可列举一些极端情况、如前后端有特殊字符,以完善分支条件
from collections import defaultdict
# Word -> number of occurrences; missing keys start at 0.
d = defaultdict(int)


def makekey1(src: str, chars=set(""" \\//\n\r''"".,~$*^()[]{}<>?-+—=_&%@»""")):
    """Split *src* into lower-cased words by slicing between separators.

    The text is scanned once.  ``start`` marks where the current word
    begins; whenever a separator is met, the slice collected so far (if
    any) is emitted and ``start`` moves past the separator.

    :param src: text to tokenise
    :param chars: container of separator characters (only read)
    :return: list of lower-cased words, possibly empty
    """
    words = []
    start = 0
    for i, c in enumerate(src):
        if c not in chars:
            continue
        if i > start:
            # Non-empty slice between two separators.
            words.append(src[start:i].lower())
        # Consecutive or leading separators just advance the start.
        start = i + 1
    if start < len(src):
        # Trailing word not followed by a separator; lower-cased like the
        # others so that "Word" and "word" are counted together.
        words.append(src[start:].lower())
    return words
# The sample text is short, so tokenise the whole file in one call.
with open("sample.txt", encoding='utf-8') as f:
    for word in makekey1(f.read()):
        d[word] += 1
# Ten most frequent words, most frequent first.
print(sorted(d.items(), key=lambda x: x[1], reverse=True)[:10])
#方法四:合理利用python3的惰性求值,并进一步封装成函数,通过传参将ignore的单词放在stopwords中,让代码更简洁
from collections import defaultdict
# Word -> number of occurrences; missing keys start at 0.
d = defaultdict(int)


def makekey1(src: str, chars=set(""" \\//\n\r''"".,~$*^()[]{}<>?-+—=_&%@»""")):
    """Lazily yield the words of *src*, using *chars* as separators.

    Words are yielded exactly as they appear in *src*; no case conversion
    is done here.

    :param src: text to tokenise
    :param chars: container of separator characters (only read)
    :return: generator of non-empty word slices
    """
    start = 0
    for i, c in enumerate(src):
        if c not in chars:
            continue
        if i > start:
            # Non-empty slice between two separators.
            yield src[start:i]
        # Consecutive or leading separators just advance the start.
        start = i + 1
    if start < len(src):
        # Trailing word not followed by a separator.
        yield src[start:]
def topn(n, filename, stopwords=set(), encoding='utf-8', reverse=True):
    """Yield the *n* leading ``(word, count)`` pairs of a text file.

    The file is read line by line; each line is tokenised with
    ``makekey1`` and lower-cased lazily.  Words found in *stopwords* are
    ignored.

    :param n: number of pairs to yield
    :param filename: path of the text file to analyse
    :param stopwords: words to ignore (only read, never modified)
    :param encoding: text encoding used to open the file
    :param reverse: True yields the most frequent words first, False the
                    least frequent
    :raises OSError: if the file cannot be opened
    """
    # A counter local to this call: accumulating into a module-level dict
    # would make every later call include the totals of the earlier ones.
    counts = defaultdict(int)
    with open(filename, encoding=encoding) as f:
        for line in f:
            # map() and the generator are both lazy: one word at a time.
            for word in map(str.lower, makekey1(line)):
                if word not in stopwords:
                    counts[word] += 1
    yield from sorted(counts.items(), key=lambda x: x[1], reverse=reverse)[:n]
# Print the ten most frequent words, skipping common English stop words.
for rank in topn(10, "sample.txt", {'the', 'is', 'a', 'if', 'and', 'to', 'on', 'of', 'or', 'in', 'an', 'for', 'this'}):
    print(rank)
# 运行结果:
('path', 138)
('os', 50)
('return', 30)
('windows', 25)
('file', 24)
('pathname', 17)
('true', 17)
('drive', 17)
('directory', 16)
('unix', 16)
# 方法五:正则表达式
from collections import defaultdict
import re
# Word -> number of occurrences; missing keys start at 0.
d = defaultdict(int)

# One or more characters that are neither word characters nor hyphens.
# Compiled once at import time; the raw string keeps "\w" a regex escape
# instead of an invalid string escape.
_WORD_SEPARATOR = re.compile(r'[^\w-]+')


def makekey3(src: str):
    """Split *src* on runs of characters that are not ``\\w`` or ``-``.

    No case conversion is done.  As with any ``re.split``, the result may
    contain empty strings (for example when *src* starts or ends with a
    separator), so callers should skip falsy items.

    :param src: text to tokenise
    :return: list of strings
    """
    return _WORD_SEPARATOR.split(src)
# The sample text is short, so tokenise the whole file in one call.
with open("sample.txt", encoding='utf-8') as f:
    for word in makekey3(f.read()):
        # re.split can return empty strings at the edges; skip them.
        if word:
            d[word] += 1
# Ten most frequent words, most frequent first.
print(sorted(d.items(), key=lambda x: x[1], reverse=True)[:10])
3、判断文件类型,若为目录判断是否为空
from pathlib import Path
p = Path()
p /= 'a/b/c/d'
print(len(p.parents))
# The last entry of p.parents for this relative path is '.', so the loop
# walks the entries of the current working directory.
for x in p.parents[len(p.parents) - 1].iterdir():
    print(x, end='\t')
    if x.is_dir():
        # any() stops at the first entry, so large directories are not
        # listed completely just to learn that they are not empty.
        flag = any(x.iterdir())
        print('dir', 'Not Empty' if flag else 'Empty', sep='\t')
    elif x.is_file():
        print('file')
    else:
        print('other file')
4、将一个ini配置文件转为json格式
from configparser import ConfigParser
import json
cfg = ConfigParser()
# read() silently skips files that do not exist; d is then simply empty.
cfg.read('test.ini')
# One JSON object per section: {section: {option: value, ...}, ...}
d = {section: dict(cfg.items(section)) for section in cfg.sections()}
print(json.dumps(d))
# json.dump() needs a writable file object, not a file name.
with open('test.json', 'w', encoding='utf-8') as f:
    json.dump(d, f)
二、argparse模块及其运用
自学argparse模块,实现ls命令功能:
实现-l、-a和--all、-h选项
实现显示路径下的文件列表
-a和--all 显示包含以.开头的文件
-l 详细列表显示
-h 和-l配合,人性化显示文件大小,例如1K、1G、1T等,可以认为1G=1000M
类型字符:
c 字符
d 目录
- 普通文件
l 软链接
b 块设备
s socket文件
p pipe文件,即FIFO
-rw-rw-r-- 1 python python 5 Oct 25 00:07 test4
mode 硬链接 属主 属组 字节 时间 文件名
按照文件名排序输出,可以和ls的顺序不一样,但要求文件名排序 要求详细列表显示时,时间可以按照“年-月-日 时:分:秒” 格式显示
# Ding
from pathlib import Path
import argparse
def mode_handle(mode):
    """Render an ``st_mode`` value as an ``ls -l`` style mode string.

    The first character is the file type, followed by the nine permission
    characters produced by ``mode2_handle``.

    :param mode: integer ``st_mode`` as returned by ``stat()``
    :return: 10-character string such as ``'-rw-r--r--'``
    """
    import stat

    file_dict = {
        stat.S_IFDIR: 'd',   # directory
        stat.S_IFCHR: 'c',   # character device
        stat.S_IFBLK: 'b',   # block device
        stat.S_IFREG: '-',   # regular file
        stat.S_IFIFO: 'p',   # fifo (named pipe)
        stat.S_IFLNK: 'l',   # symbolic link
        stat.S_IFSOCK: 's',  # socket file
    }
    # S_IFMT() isolates exactly the file-type bits; an unknown type is
    # shown as '?' rather than the text 'None'.
    file_type = file_dict.get(stat.S_IFMT(mode), '?')
    file_mode = mode2_handle(mode)
    return '{}{}'.format(file_type, file_mode)
def mode2_handle(mode):
    """Return the nine ``rwx`` permission characters encoded in *mode*.

    Only the low nine permission bits are looked at (user, group, other).

    :param mode: integer ``st_mode`` or plain permission value
    :return: string such as ``'rw-r--r--'``
    """
    # Index = 3-bit permission value (r=4, w=2, x=1).
    triads = ('---', '--x', '-w-', '-wx', 'r--', 'r-x', 'rw-', 'rwx')
    # Shifts 6, 3 and 0 select the user, group and other triads in turn.
    return ''.join(triads[(mode >> shift) & 0o7] for shift in (6, 3, 0))
def atime_handle(atime):
    """Format a POSIX timestamp as ``YYYY-MM-DD HH:MM:SS`` in local time.

    :param atime: seconds since the epoch (for example ``st_atime``)
    :return: formatted date and time string
    """
    import datetime

    # Explicit directives: '%F' is not available on every platform and
    # '%X' depends on the locale.
    return datetime.datetime.fromtimestamp(atime).strftime('%Y-%m-%d %H:%M:%S')
def size_handle(size):
    """Return *size* (bytes) in human-readable form using 1000-based units.

    Sizes below 1000 are printed as whole bytes with a ``B`` suffix; larger
    sizes are printed with one decimal and the unit K, M, G or T.

    :param size: size in bytes
    :return: string such as ``'999B'``, ``'1.5K'`` or ``'3.0T'``
    """
    K = 1000
    # Largest unit first, so the first threshold reached is the right one.
    units = (('T', K ** 4), ('G', K ** 3), ('M', K ** 2), ('K', K))
    for unit, factor in units:
        if size >= factor:
            return '{:.1f}{}'.format(size / factor, unit)
    return '{}B'.format(size)
def info(p, stat):
    """Collect the columns of one long-format listing row.

    :param p: ``pathlib.Path`` of the entry
    :param stat: stat result of the entry (``st_mode``, ``st_nlink``,
                 ``st_uid``, ``st_gid``, ``st_size`` and ``st_atime`` are
                 read)
    :return: tuple ``(mode, nlink, owner, group, size, atime, name)``
    """
    mode = mode_handle(stat.st_mode)
    nlink = stat.st_nlink
    # Path.owner()/group() raise KeyError when the id has no entry in the
    # account database and NotImplementedError where unsupported; fall back
    # to the numeric id so one odd entry does not abort the whole listing.
    try:
        owner = p.owner()
    except (KeyError, NotImplementedError):
        owner = str(stat.st_uid)
    try:
        group = p.group()
    except (KeyError, NotImplementedError):
        group = str(stat.st_gid)
    size = size_handle(stat.st_size)
    atime = atime_handle(stat.st_atime)
    # Only the final path component: the row shows the file name, not the
    # directory it was found in.
    name = p.name
    return mode, nlink, owner, group, size, atime, name
def _ls(p):
    """Print a long-format row for every non-hidden entry of directory *p*.

    Entries are printed in sorted order; names starting with a dot are
    skipped.

    :param p: directory to list, as a ``pathlib.Path`` or a path string
    :raises OSError: if the directory cannot be read
    """
    # Path() accepts both str and Path, so callers may pass either.
    for i in sorted(Path(p).iterdir()):
        # Test the entry name, not the whole path: 'dir/.hidden' does not
        # start with a dot as a string.
        if i.name.startswith('.'):
            continue
        st = i.stat()
        # mode - hard links - owner - group - size - time - file name
        print('{:<10}\t{:<1}\t{}\t{}\t{:>7}\t{:<20}\t{}'.format(*info(i, st)))
def ls():
    """Parse the command line and list the requested directory.

    Recognised arguments: ``-a/--all``, ``-l/--list``, ``-h`` and an
    optional positional path (default ``'./'``).
    """
    # add_help=False frees '-h' for our own option; ArgumentParser has no
    # 'help' keyword and raises TypeError when given one.
    parser = argparse.ArgumentParser(add_help=False)
    parser.add_argument('-a', '--all', action="store_true",
                        help='Include directory entries whose names begin with a dot (.).')
    parser.add_argument('-l', '--list', action='store_true',
                        help='List in long format.')
    parser.add_argument('-h', action='store_true',)
    parser.add_argument('file', nargs='?', default='./')
    args = parser.parse_args()
    # NOTE(review): -a, -l and -h are parsed but not acted upon yet, because
    # _ls() only takes the path.
    p = Path(args.file)
    _ls(p)
# Teacher Wayne
import argparse
from pathlib import Path
import stat
import datetime
def iterdir(p: str, all=False, detail=True, human=True):
    """Yield one tuple per entry of directory *p*, sorted by file name.

    With ``detail=False`` each tuple is ``(name,)``.  With ``detail=True``
    it is ``(mode, nlink, (uid, owner), (gid, group), size, atime, name)``
    where *mode* is the ``ls -l`` style string from ``stat.filemode`` and
    *size* is a string.

    :param p: directory to list
    :param all: include entries whose name starts with a dot
    :param detail: produce long-format tuples instead of bare names
    :param human: with *detail*, abbreviate sizes with K/M/G/T/P (1024-based)
    :raises OSError: if the directory cannot be read
    """

    def _gethuman(size: int):
        """Abbreviate *size* using 1024-based units; plain bytes get none."""
        # Leading blank = "no unit" for sizes below 1024 (stripped below).
        units = ' KMGTP'
        index = 0
        # Stop at the largest known unit instead of indexing past it.
        while size >= 1024 and index < len(units) - 1:
            size = size // 1024
            index += 1
        return '{}{}'.format(size, units[index].rstrip())

    def _name_or_id(lookup, numeric_id):
        """Return lookup() or, if the name cannot be resolved, the id."""
        try:
            return lookup()
        except (KeyError, NotImplementedError, OSError):
            return str(numeric_id)

    # stat.filemode() already yields the type character plus the nine
    # permission characters, so no separate file-type or bit-shifting
    # helper is needed here.
    def _listdir(p: str, all=False, detail=True, human=True):
        """Yield the (unsorted) tuples for the entries of *p*."""
        path = Path(p)
        for x in path.iterdir():
            if not all and x.name.startswith('.'):  # skip hidden entries
                continue
            if not detail:
                # 1-tuple so that x[-1] is the name in both formats.
                yield (x.name,)
                continue
            # Describe the entry itself, not the directory being listed.
            # lstat() does not follow symbolic links, so links show up
            # as 'l' and dangling links do not abort the listing.
            st = x.lstat()
            mode = stat.filemode(st.st_mode)
            atime = datetime.datetime.fromtimestamp(st.st_atime).strftime('%Y/%m/%d %H:%M:%S')
            # Separate name: rebinding the 'human' flag to the size string
            # would make every later row ignore human=False.
            size = _gethuman(st.st_size) if human else str(st.st_size)
            owner = _name_or_id(x.owner, st.st_uid)
            group = _name_or_id(x.group, st.st_gid)
            yield mode, st.st_nlink, (st.st_uid, owner), (st.st_gid, group), size, atime, x.name

    # Forward the caller's options; sort on the last field (the name).
    yield from sorted(_listdir(p, all, detail, human), key=lambda x: x[-1])
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        prog='ls',
        description='List information about the FILES',
        add_help=False,  # frees -h for --human-readable
    )  # obtain an argument parser
    # The commented lines below are the sample from the official docs.
    # parser.add_argument('integers', metavar='N', type=int, nargs='+',
    #                     help='an integer for the accumulator')
    # parser.add_argument('--sum', dest='accumulate', action='store_const',
    #                     const=sum, default=max,
    #                     help='sum the integers (default: find the max)')
    # Positional argument.  nargs='*' always produces a list when paths are
    # given, so the default is a list as well.
    parser.add_argument('path', nargs='*', default=['.'], help='filepath')
    parser.add_argument('-l', dest='list', action='store_true', help='use a long listing format')
    parser.add_argument('-a', '--all', action='store_true', help='do not ignore entries starting with .')
    parser.add_argument('-h', '--human-readable', dest='human', action='store_true', help='with -l,print sizes in human readable format')
    args = parser.parse_args()  # parse sys.argv; no path means ['.']
    # print(args)  # show the namespace holding the collected arguments
    # parser.print_help()  # show the help text
    # List every requested path in turn; iterdir() takes a single path.
    for path in args.path:
        for i in iterdir(path, args.all, args.list, args.human):
            print(i)
三、二叉树和堆排序
1、打印树
# By me,算出最底端的总宽度,然后就能得到每行空格的宽度,每行依次打印
import math

origin = [30, 20, 80, 40, 50, 10, 60, 70, 90]


def tree(iterable):
    """Print a level-order sequence as a roughly centred binary tree.

    The width of the bottom level fixes the total width; on level ``i``
    every node is printed with ``width / 2**i - 1`` blanks on both sides.
    Positions past the end of the sequence are printed as empty strings.

    :param iterable: sequence (needs ``len()`` and indexing) holding the
                     tree in level order
    """
    length = len(iterable)
    # Number of levels of a complete binary tree with `length` nodes.
    # bit_length() equals ceil(log2(length + 1)) and is exact, whereas a
    # floating-point logarithm can be off by one for large inputs.
    depth = length.bit_length()
    width = 2 ** (depth - 1) * 8
    for i in range(1, depth + 1):
        space = int(width / 2 ** i - 1)
        pad = space * ' '
        # Level i holds the 1-based node numbers 2**(i-1) .. 2**i - 1.
        for j in range(2 ** (i - 1), 2 ** i):
            node = iterable[j - 1] if j <= length else ''
            print(pad, node, pad, sep='', end='')
        print()


tree(origin)
# 方法一:居中对齐
import math
def print_tree(array, unit_width=2):  # values have two digits, hence 2
    """Print a level-order list as a tree, centring each node in its cell.

    On every level each node is centred in a field of
    ``width * unit_width`` characters and followed by ``unit_width``
    blanks; ``width`` starts at ``2**depth - 1`` and is halved per level.

    :param array: list holding the tree in level order
    :param unit_width: number of characters reserved for one value
    """
    length = len(array)
    depth = math.ceil(math.log2(length + 1))
    width = 2 ** depth - 1  # cells in a row, e.g. 15 for a depth of 4
    gap = ' ' * unit_width
    first = 0
    for level in range(depth):
        # Level `level` holds up to 2**level nodes; the slice is shorter
        # on an incomplete last level.
        for value in array[first:first + 2 ** level]:
            # The trailing gap stands for the cell that the parent node
            # occupies on the rows above.
            print('{:^{}}'.format(value, width * unit_width), end=gap)
        first += 2 ** level
        width = width // 2
        print()


print_tree([x + 1 for x in range(15)])
# 方法二:投影栅格
import math
def print_tree(array, unit_width=2):  # values have two digits, hence 2
    """Print a level-order list as a tree laid out on a projection grid.

    Every value is assumed to be ``unit_width`` characters wide.  A row
    that is ``i`` levels above the bottom starts with ``2**i - 1`` blank
    cells and separates its nodes by ``2 * (2**i - 1) + 1`` blank cells.

    :param array: list holding the tree in level order
    :param unit_width: width of one grid cell in characters
    """
    length = len(array)
    depth = math.ceil(math.log2(length + 1))
    index = 0
    sep = ' ' * unit_width
    # i counts the remaining levels below the current row.
    for i in range(depth - 1, -1, -1):
        pre = 2 ** i - 1
        print(sep * pre, end='')  # leading blank cells
        offset = 2 ** (depth - i - 1)  # number of nodes on this level
        line = array[index:index + offset]
        index = index + offset
        intervalspace = sep * (2 * pre + 1)
        print(intervalspace.join(map(str, line)))


origin = [30, 20, 80, 40, 50, 10, 60, 70, 90]
print_tree(origin)
2、二叉树的遍历
广度优先遍历:层序遍历
深度优先遍历:前序遍历(先根遍历DLR)、中序遍历(中根遍历LDR)、后序遍历(后根遍历LRD);都是递归遍历
遍历序列:将树中所有元素遍历一遍后得到的元素序列,实现了将非线性的层次结构转换成线性结构sequence。
3、堆排序Heap Sort
由于每次都从无序区选出一个极值,故堆排序属于选择排序
1)堆是一个完全二叉树;
2)大顶堆:每个非叶子结点的值均大于等于左右孩子结点的值,根节点一定是大顶堆的最大值;次大值一定在第二层;
3)小顶堆:每个非叶子结点的值均小于等于左右孩子结点的值,根节点一定是小顶堆的最小值;
第一步:构建一个完全二叉树
将待排序数字首位凑0,组成一个层序遍历的新序列
第二步:构建大顶堆–核心算法
起点结点的选择:从最后一层的最右边叶子结点的父结点开始;根据二叉树的性质5,结点数为n,则起始结点的编号为n//2;
下一结点的选择:从起始结点开始向左找同层结点,到头后再从上一层的最右边结点开始继续向左逐个查找,直至根节点;
调整成大顶堆:通过交换,确保每个结点的值都不小于其左右孩子结点的值;
第三步:排序
将大顶堆根节点这个最大值和最后一个叶子结点交换,然后将这个叶子结点排除在待排序结点之外;
从新的根节点开始,重新调整为大顶堆后,重复上一步;
# 堆排序
def heap_sort(array):
    """Sort a 1-based list in ascending order with heap sort.

    ``array[0]`` is a placeholder so that the children of node ``i`` sit
    at ``2*i`` and ``2*i + 1``.  The list is sorted in place.

    :param array: list whose element 0 is a dummy and whose elements
                  1..n are the values to sort
    :return: new list with the sorted values (placeholder dropped)
    """

    def heap_adjust(n, i, array: list):
        '''
        Core step: sift the node at index i down to its place.

        Building the heap starts at n//2, so every node adjusted there
        has at least one child.
        param n: number of elements still taking part in the heap
        param i: index of the node to adjust
        param array: data being sorted
        '''
        # Walk a single node downwards.
        while 2 * i <= n:  # the node has at least a left child
            # 2i is the left child, 2i+1 the right child.
            lchild_index = 2 * i
            max_child_index = lchild_index
            # n > 2i means a right child exists; pick the larger child.
            if n > lchild_index and array[lchild_index + 1] > array[lchild_index]:
                max_child_index = lchild_index + 1
            # Compare the larger child with the subtree root.
            if array[max_child_index] > array[i]:
                array[i], array[max_child_index] = array[max_child_index], array[i]
                i = max_child_index  # the moved value may need to sink further
            else:
                break

    # Build the max heap.
    def max_heap(total, array: list):
        for i in range(total // 2, 0, -1):
            # Adjust the list that was passed in, not a module-level name.
            heap_adjust(total, i, array)
        return array

    # Sort.
    def sort(array: list):
        total = len(array) - 1  # number of elements to sort
        max_heap(total, array)  # start from a max heap
        while total > 1:
            array[1], array[total] = array[total], array[1]
            total -= 1  # the swap broke the heap property at the root
            # Small optimisation: with two elements left that are already
            # in ascending order there is nothing more to do.
            if total == 2 and array[total] >= array[total - 1]:
                break
            # The heap was broken at the top, so sift down from the top;
            # this brings the next maximum to the root.
            heap_adjust(total, 1, array)
        return array

    return sort(array)[1:]
# heap_sort() uses 1-based heap indices, so index 0 is padded with a dummy 0.
origin=[0,30,20,80,40,50,10,60,70,90]
print(heap_sort(origin))
1)哪怕最后只剩3个元素,也要做堆调整,算法整体能优化的点并不多;
2)堆排序的时间复杂度是O(nlogn);空间复杂度是O(1),因为只使用了一个交换用的空间,使用空间是常量,不随规模变化而变化;
3)由于堆排序对原始记录的排序状态并不敏感,因此无论是最好、最差还是平均情况,时间复杂度均为O(nlogn);
4)稳定性
稳定性一般指等值情况下谁先谁后;
堆排序是不稳定的排序算法,因为每次得到的大顶堆不一样,每次只要有一两个数不一样,都会调整出不一样的大顶堆;
# 练习
def heap_sort(array):
    """Return the elements of *array* in ascending order (heap sort).

    The input list is not modified: a copy with a leading placeholder is
    made so that the children of node ``i`` sit at ``2*i`` and ``2*i + 1``.

    :param array: list of mutually comparable values
    :return: new sorted list
    """

    def heap_adjust(array, i, n):
        """Sift the node at index i down within the first n elements."""
        while 2 * i <= n:  # the node has at least a left child
            left_child_index = 2 * i
            max_child_index = left_child_index
            # Prefer the right child when it exists and is larger.
            if 2 * i + 1 <= n and array[2 * i + 1] > array[left_child_index]:
                max_child_index = 2 * i + 1
            if array[i] < array[max_child_index]:
                array[i], array[max_child_index] = array[max_child_index], array[i]
                i = max_child_index
            else:  # heap property holds here, so stop
                break

    def max_heap(array, n):
        """Turn elements 1..n into a max heap, last parent first."""
        for i in range(n // 2, 0, -1):
            heap_adjust(array, i, n)

    def sort(array):
        array = [0] + array  # placeholder at index 0; also copies the input
        n = len(array) - 1
        max_heap(array, n)
        while n > 1:
            # Move the current maximum behind the shrinking heap.
            array[1], array[n] = array[n], array[1]
            n -= 1
            # Two elements left and already ascending: nothing more to do.
            if n == 2 and array[n] > array[1]:
                break
            heap_adjust(array, 1, n)
        return array

    return sort(array)[1:]
import random
# Demo: ten random integers from randrange(1,50), printed before and after sorting.
l=[random.randrange(1,50) for _ in range(10)]
print(l)
print(heap_sort(l))
四、正则表达式
1、socket模块
快速提取出所有的ip地址后,再用socket模块验证其合法性即可
import socket
line = """192.168.1.150
0.0.0.0
255.255.255.255
17.16.52.100
172.16.0.100
400.400.999.888
001.022.003.000
257.257.255.256"""
for i, ip in enumerate(line.splitlines()):
    # socket.inet_aton converts the str to packed bytes and validates the
    # address on the way; an illegal address raises OSError.
    try:
        net = socket.inet_aton(ip)
    except OSError as e:
        print(i, ip, e)
        continue
    print(i, ip, net, socket.inet_ntoa(net))  # packed bytes back to str
# 运行结果:
0 192.168.1.150 b'\xc0\xa8\x01\x96' 192.168.1.150
1 0.0.0.0 b'\x00\x00\x00\x00' 0.0.0.0
2 255.255.255.255 b'\xff\xff\xff\xff' 255.255.255.255
3 17.16.52.100 b'\x11\x104d' 17.16.52.100
4 172.16.0.100 b'\xac\x10\x00d' 172.16.0.100
5 400.400.999.888 illegal IP address string passed to inet_aton
6 001.022.003.000 b'\x01\x12\x03\x00' 1.18.3.0
7 257.257.255.256 illegal IP address string passed to inet_aton
2、日志数据提取和分析
import datetime
line = '''42.120.74.236 - - [18/Apr/2017:11:03:05 +0800] "GET /app/template/default//images/arr.png HTTP/1.1" 200 1120 "http://job.magedu.com/app/template/default//style/yun_index4.1.css" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36"'''


# Split the data into fields.
def makekey(line: str, chars=set(' \'"[]')):
    """Yield the fields of an access-log line.

    Fields are separated by the characters in *chars*, but text enclosed
    in ``[...]`` or ``"..."`` is kept together as one field (without the
    enclosing characters).

    :param line: one log line
    :param chars: container of separator characters (only read)
    :return: generator of field strings
    """
    start = 0      # index where the current field begins
    flag = False   # True while inside [...] or "..."
    for i, c in enumerate(line):
        if c not in chars:
            continue
        if c == '[':
            flag = True
            start = i + 1
        if c == ']':
            flag = False
        if c == '"':
            flag = not flag
            if flag:  # opening quote: the field starts right after it
                start = i + 1
        if i == start:
            # Separator directly at the field start: nothing collected yet.
            start = i + 1
            continue
        if not flag:
            yield line[start:i]
            start = i + 1
    if start < len(line):
        # Trailing field that is not followed by a separator.
        yield line[start:]


print(list(makekey(line)))
# Type conversion
def donothing(data):
    """Return *data* unchanged (converter for fields kept as text)."""
    return data


# Field names and converters, in the order makekey() yields the fields.
names = ('remote', '', '', 'datetime', 'request', 'status', 'length', 'css', 'useragent')
options = (donothing, donothing, donothing,
           lambda timestr: datetime.datetime.strptime(timestr, '%d/%b/%Y:%H:%M:%S %z'),
           lambda request: dict(zip(('method', 'url', 'protocol'), request.split())),
           int, int, donothing, donothing)


def extract(line: str):
    """Parse one log line into a dict of converted fields.

    Each field yielded by ``makekey`` is paired with its name and its
    converter.  zip() stops at the shortest input, and a name that occurs
    more than once keeps the value of its last occurrence.

    :param line: one log line
    :return: dict mapping field name to converted value
    """
    return {name: convert(raw)
            for name, convert, raw in zip(names, options, makekey(line))}


print(extract(line))
# 运行结果:
['42.120.74.236', '-', '-', '18/Apr/2017:11:03:05 +0800', 'GET /app/template/default//images/arr.png HTTP/1.1', '200', '1120', 'http://job.magedu.com/app/template/default//style/yun_index4.1.css', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36']
{'remote': '42.120.74.236', '': '-', 'datetime': datetime.datetime(2017, 4, 18, 11, 3, 5, tzinfo=datetime.timezone(datetime.timedelta(seconds=28800))), 'request': {'method': 'GET', 'url': '/app/template/default//images/arr.png', 'protocol': 'HTTP/1.1'}, 'status': 200, 'length': 1120, 'css': 'http://job.magedu.com/app/template/default//style/yun_index4.1.css', 'useragent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36'}
# 正则表达式
import datetime
import re
line = '''42.120.74.236 - - [18/Apr/2017:11:03:05 +0800] "GET /app/template/default//images/arr.png HTTP/1.1" 200 1120 "http://job.magedu.com/app/template/default//style/yun_index4.1.css" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36"'''
# Converters for the groups that should not stay plain text.
options = {'datetime': lambda timestr: datetime.datetime.strptime(timestr, '%d/%b/%Y:%H:%M:%S %z'),
           'status': int, 'size': int}

# Literal [ and ] are escaped; negated classes such as [^"] keep each group
# inside its own quotes.  Raw string so the backslashes reach the regex
# engine untouched; compiled once instead of on every call.
_LOG_PATTERN = re.compile(
    r'(?P<remote>[\d.]{7,15}) - - \[(?P<datetime>.*)\] "(?P<method>[^" ]+) (?P<url>[^" ]+) (?P<protocol>[^" ]+)" (?P<status>\d+) (?P<size>\d+) "(?P<css>[^"]*)" "(?P<useragent>[^"]*)"'
)


def extract(line: str):
    """Parse one log line with a regular expression.

    The matched text and the raw group dict are printed, then the groups
    are returned with the converters from ``options`` applied.

    :param line: one log line
    :return: dict mapping group name to converted value
    :raises ValueError: if *line* does not match the log format
    """
    matcher = _LOG_PATTERN.match(line)
    if matcher is None:
        raise ValueError('line does not match the expected log format: {!r}'.format(line))
    # Slice with start()/end(): printing the match object itself shows
    # only the first 50 characters.
    print(line[matcher.start():matcher.end()])
    print(matcher.groupdict())
    return {k: options.get(k, lambda x: x)(v) for k, v in matcher.groupdict().items()}


print(extract(line))
# 运行结果:
42.120.74.236 - - [18/Apr/2017:11:03:05 +0800] "GET /app/template/default//images/arr.png HTTP/1.1" 200 1120 "http://job.magedu.com/app/template/default//style/yun_index4.1.css" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36"
{'remote': '42.120.74.236', 'datetime': '18/Apr/2017:11:03:05 +0800', 'method': 'GET', 'url': '/app/template/default//images/arr.png', 'protocol': 'HTTP/1.1', 'status': '200', 'size': '1120', 'css': 'http://job.magedu.com/app/template/default//style/yun_index4.1.css', 'useragent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36'}
{'remote': '42.120.74.236', 'datetime': datetime.datetime(2017, 4, 18, 11, 3, 5, tzinfo=datetime.timezone(datetime.timedelta(seconds=28800))), 'method': 'GET', 'url': '/app/template/default//images/arr.png', 'protocol': 'HTTP/1.1', 'status': 200, 'size': 1120, 'css': 'http://job.magedu.com/app/template/default//style/yun_index4.1.css', 'useragent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36'}