1,numpy
import numpy as np
# Random number generation
np.random.seed(1)  # fix the seed so the printed values are reproducible
print(np.random.random(10))  # 10 floats drawn uniformly from [0, 1)
print(np.random.randint(1, 9, 10))  # 10 integers in [1, 9): the upper bound is exclusive
print(np.random.rand(10))  # 10 uniform samples from [0, 1)
print(np.random.randn(10))  # 10 samples from the standard normal distribution
# The shape has to be passed as ``size``; a positional (3, 3) is taken as the
# mean (``loc``) and would yield only two samples centred on 3.
gaussian = np.random.normal(size=(3, 3))
print(gaussian)  # 3x3 samples from the standard normal distribution
print(np.random.uniform(-1, 1, (3, 3)))  # 3x3 uniform samples from [-1, 1)

# Joining arrays
c = [[1, 2, 3, 4], [5, 6, 7, 8]]
print(np.stack((c,), axis=0))  # adds a new leading axis: shape (1, 2, 4)
print(np.vstack((c, c)))  # stack row-wise, still 2-D: shape (4, 4)
print(np.hstack((c, c)))  # stack column-wise, still 2-D: shape (2, 8)

# Rounding
arr = np.array([1.22, 4.44, 5.5])
print(np.ceil(arr))  # round up
print(np.floor(arr))  # round down
print(np.rint(arr))  # round to the nearest integer (ties go to the even value)

# Missing values and de-duplication
arr = np.array([1, 2, 3, np.nan, 4])  # np.nan: the np.NaN alias was removed in NumPy 2.0
print(np.isnan(arr))  # boolean mask, True where the value is NaN
print(arr[~np.isnan(arr)])  # keep only the non-NaN entries
print(np.unique([1, 1, 2]))  # sorted unique values
2,matplotlib
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
# Use a font that can render the Chinese labels below, and keep the minus
# sign drawable once that font is active.
mpl.rcParams['font.sans-serif'] = ['SimHei']
mpl.rcParams['axes.unicode_minus'] = False

# 1. Line chart
x = [1, 2, 3, 4, 5, 6]
y1 = [100, 200, 300, 350, 410, 400]
y2 = [120, 220, 320, 370, 430, 390]
plt.figure()
plt.plot(x, y1, 'r--', label='出口')
plt.plot(x, y2, 'b-', label='进口')
plt.legend(loc='best')  # 'loss' is not a valid legend location
plt.title('上半年进出口数据')
plt.xlabel('月份')
plt.ylabel('进出口/美元(亿)')
plt.xticks(np.arange(1, 7, 1), ['1月', '2月', '3月', '4月', '5月', '6月'])
plt.yticks(np.arange(100, 600, 100))

# Hide the right and top borders. The string 'none' means "no colour";
# passing None would reset the spine to its default edge colour instead.
ax = plt.gca()
ax.spines['right'].set_color('none')
ax.spines['top'].set_color('none')

plt.savefig('zitu.jpg')  # save before show(); the figure is released afterwards
plt.show()
# Multiple subplots, added to a figure one at a time (2 rows x 1 column)
fig = plt.figure()
ax1 = fig.add_subplot(211)
ax1.plot(np.random.randn(50).cumsum(), 'r--')
ax2 = fig.add_subplot(212)
ax2.plot(np.random.randn(50).cumsum(), 'b--')

# Multiple subplots, created as a 2x2 grid and filled in a loop.
# ``ax`` is a 2x2 array of Axes; ``ax.flat`` walks it row by row, which is the
# same order as nested ``for i`` / ``for j`` loops.
fig, ax = plt.subplots(2, 2)
for panel in ax.flat:
    panel.plot(np.random.randn(50).cumsum(), 'r--')
# savefig() writes the current figure, i.e. the 2x2 grid.
# NOTE: the line-chart snippet saves to the same file name, so that image is overwritten.
plt.savefig('zitu.jpg')
plt.show()
# 2. Grouped bar chart
height = np.array([100, 200, 300, 400])
left = np.arange(1, 5, 1)  # x position of the centre bar of each group
n = 4  # divisor for the bar width: 0.8 of an x unit is split n ways
width = 0.8 / n  # width of a single bar
plt.figure()
# Three series per group: one centred on the tick, one shifted a bar width to
# each side.
plt.bar(left, height, width=width, color='lightskyblue', align='center', label='成都')
plt.bar(left - width, height + 20, width=width, color='red', align='center', label='杭州')
plt.bar(left + width, height + 10, width=width, color='yellowgreen', align='center', label='兰州')
plt.legend(loc='upper left')
plt.title('人数')
plt.xlabel('地点')
plt.ylabel('人数')
plt.xticks(np.arange(1, 5, 1), ['1月', '2月', '3月', '4月'])
plt.yticks(np.arange(100, 600, 100))

# Hide the right and top borders. The string 'none' means "no colour";
# passing None would reset the spine to its default edge colour instead.
ax = plt.gca()
ax.spines['right'].set_color('none')
ax.spines['top'].set_color('none')
plt.show()
# 3. Histogram
mu, sigma = 100, 20  # mean and standard deviation of the sample
# Shift and scale standard-normal draws to get N(mu, sigma). Multiplying by
# mu * sigma would only scale the spread and leave the mean at 0.
x = mu + sigma * np.random.randn(10000)
plt.figure()
plt.hist(x, 1000, alpha=0.5, color='red')  # 1000 bins, half-transparent bars
plt.title('直方图')
plt.grid(True)
plt.show()
# 4. Scatter plot
# 1000 points with standard-normal coordinates.
x = np.random.randn(1000)
y = np.random.randn(1000)
# Per-point colour values and marker areas, drawn in this order so the random
# stream is consumed exactly as when they are built inline in the call.
point_colors = np.random.rand(1000)
point_sizes = np.random.rand(1000) * 50
plt.scatter(x, y, c=point_colors, s=point_sizes, alpha=0.7)
plt.title('散点图')
plt.show()
# 5. Pie chart
x = [15, 25, 30, 30]  # wedge sizes
color = ['r', 'yellow', 'green', 'blue']
explode = [0, 0.1, 0, 0]  # pull the second wedge out of the pie
# NOTE(review): '欧洲' appears twice below; one of them is probably meant to be
# a different region. Confirm with the author.
labels = ['亚洲', '欧洲', '欧洲', '非洲']

# Collect the styling options in one place and hand them to pie() together.
pie_options = {
    'explode': explode,
    'labels': labels,
    'colors': color,
    'startangle': 90,
    'shadow': True,
    'labeldistance': 1.1,
    'autopct': '%.1f%%',  # print each share with one decimal place
}
plt.figure()
plt.pie(x, **pie_options)
plt.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
plt.title('饼图')
plt.show()
# 实际应用,散点图和线图
import numpy as np
import matplotlib.pyplot as plt
SIZE = 13  # number of sample points
Y = np.linspace(-6, 6, SIZE)
X = np.linspace(-2, 3, SIZE)
fig = plt.figure()
ax1 = fig.add_subplot(1, 1, 1)
ax1.set_title("SCATTER PLOT")

# Jitter every point of the straight line by uniform noise in [-1, 1).
# One vectorized draw per axis replaces the element-by-element append loops;
# the x noise is still drawn before the y noise.
RANDOM_X = X + np.random.uniform(-1, 1, SIZE)  # x coordinates of the scatter points
RANDOM_Y = Y + np.random.uniform(-1, 1, SIZE)  # y coordinates of the scatter points

ax1.scatter(RANDOM_X, RANDOM_Y)
ax1.set_xlabel("x")
ax1.set_ylabel("y")
ax1.plot(X, Y)  # the noise-free line the points were scattered around
plt.show()
3,pandas
3.1 Series
import numpy as np
import pandas as pd

# Create a Series with an explicit label index
ser01 = pd.Series(np.random.randint(1, 9, 5), index=['a', 'b', 'c', 'd', 'e'])
ser01.name = 'series'
ser01.index.name = '编号'
print(ser01)
print(ser01.index)  # the index labels
print(ser01.empty)  # True when the Series has no elements
print(ser01.ndim)  # number of dimensions
print(ser01.size)  # number of elements
print(ser01.head(3))  # first 3 rows
print(ser01.tail(2))  # last 2 rows
# pop() takes an index *label*, not a position; with the string labels above
# pop(0) raises KeyError, so look up the first label explicitly.
ser01.pop(ser01.index[0])  # remove the first element

# Missing values
ser01['m'] = np.nan  # add an entry with a missing value (np.NaN was removed in NumPy 2.0)
print(ser01)
print(ser01.notnull())  # boolean mask, True where a value is present
print(ser01[ser01.notnull()])  # keep only the non-missing values
3.2 DataFrame,二维数据
import numpy as np
import pandas as pd

# Build a DataFrame from a list of rows
li = [[1, 2, 3, 4], [5, 6, 7, 8]]
df01 = pd.DataFrame(li)
print(df01)
print(df01.shape)  # (number of rows, number of columns)
print(df01.index)  # row labels
print(df01.columns)  # column labels
print(df01.values)  # the underlying values
print(df01.head(3))  # first 3 rows
print(df01.tail(2))  # last 2 rows
print(df01.loc[0])  # row with label 0
print(df01[0:2])  # a slice inside [] selects rows; use .loc for label-based lookup
print(df01[0])  # a single key inside [] selects the column labelled 0
print(df01.loc[0])  # row selected by label
print(df01.iloc[0])  # row selected by integer position

# Dropping rows and handling missing values (every call returns a new frame)
df02 = df01.drop(0)  # drop the row labelled 0
df02 = df01.dropna()  # drop every row that contains NaN
df01 = df01.fillna(0)  # replace NaN with 0
df01 = df01.ffill()  # fill NaN from the previous row; fillna(method=...) is deprecated
df01 = df01.bfill()  # fill NaN from the next row
# Per-column fill values, keyed by column label. This frame's columns are
# labelled 0..3, so the keys 'b' and 'c' match nothing here.
df01 = df01.fillna({'b': 100, 'c': 100})
df01 = df01.replace({np.nan: 'haha'})  # replace NaN with a marker value
4, time模块
import time
print(time.time())  # current timestamp: seconds elapsed since 1970-01-01 00:00:00
print(time.localtime())  # local time as a struct_time
print(time.localtime().tm_year)  # year field of the local struct_time
print(time.localtime(time.time()))  # timestamp -> struct_time
print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()))  # struct_time -> formatted string (common)
print(time.mktime(time.localtime()))  # struct_time -> timestamp (common)
# Sample date string for the parsing examples; it has to be defined before use.
time_str = "2018-12-14 0:00:00"
print(time.strptime(time_str, '%Y-%m-%d %H:%M:%S'))  # string -> struct_time (common)
# Parse with one layout, then print with another (common)
print(time.strftime('%Y/%m/%d %H/%M/%S', time.strptime(time_str, '%Y-%m-%d %H:%M:%S')))
time.sleep(2) # pause the main thread for 2 seconds
5,bs4
from bs4 import BeautifulSoup
import requests
starURL = 'http://www.runoob.com/python/python-100-examples.html'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
}
# Fetch the page. The timeout stops a stalled connection from hanging forever,
# and raise_for_status() prevents an HTTP error page from being parsed as content.
page = requests.get(starURL, headers=headers, timeout=10)
page.raise_for_status()
response = page.content.decode('utf-8')
soup = BeautifulSoup(response, 'lxml')

# Collect the href of every <a> inside the <ul> of the element with id="content".
# If the page layout changes and that node is missing, fall back to an empty list.
content = soup.find(id='content')
anchors = content.ul.find_all('a', href=True) if content is not None and content.ul is not None else []
link = [anchor['href'] for anchor in anchors]
if link:
    print(link[0])
6,xml
6.1 etree
from xml.etree import ElementTree as ET
# Path of the XML file to read. This is an example location; point it at your
# own data. It must be defined before ET.parse() is called.
xml_file = r"D:\Deep_Learning_data\gen_data\test_xml\2007_000323.xml"
root = ET.parse(xml_file).getroot()
# findtext() returns None when there is no <filename> child, instead of
# raising AttributeError the way ``root.find("filename").text`` would.
img_name = root.findtext("filename")
6.2 lxml
from lxml import etree
# Read the page inside a ``with`` block so the file handle is closed again.
# etree.HTML() is noted in the original as tolerant of malformed markup.
with open('web.html', encoding='utf-8') as page_file:
    html = etree.HTML(page_file.read())

# Selecting nodes with XPath
# print(len(html.xpath('//div')))
# print(len(html.xpath('/html/body/div')))  # absolute path, starting from the root node
# print(len(html.xpath('//div/a')))  # search anywhere in the document
# print(len(html.xpath('//div/a/..')))  # '..' selects the parent node, '.' the node itself
# print(html.xpath("//div[@class='left']/a"))  # [@class='xxx'] filters on an attribute value
7,python删除指定的字符
>>> # Whitespace stripping
>>> s = ' hello world \n'
>>> s.strip()
'hello world'
>>> s.lstrip()
'hello world \n'
>>> s.rstrip()
' hello world'
>>>
>>> # Character stripping
>>> t = '-----hello====='
>>> t.lstrip('-')
'hello====='
>>> t.strip('-=')
'hello'
>>>