python numpy, pandas, matplotlib, time, bs4, lxml读取

最新推荐文章于 2022-11-03 19:51:43 发布

原创 · 最新推荐文章于 2022-11-03 19:51:43 发布 · 206 阅读

0 ·

CC 4.0 BY-SA版权

python 专栏收录该内容

104 篇文章

订阅专栏

本文详细介绍Python在数据科学领域的核心库，包括Numpy的数学运算、Matplotlib的数据可视化、Pandas的数据处理，以及时间处理和网页解析等技术。通过实例演示如何生成随机数、绘制图表、操作数据框和解析网页。

1，numpy

import numpy as np

# Random number generation
np.random.seed(1)                      # fix the seed so the output is reproducible
print(np.random.random(10))            # 10 floats in [0, 1)
print(np.random.randint(1, 9, 10))     # 10 integers in [1, 9): the upper bound is exclusive
print(np.random.rand(10))              # 10 uniform samples in [0, 1)
print(np.random.randn(10))             # 10 standard-normal samples

# The shape has to be passed as ``size``. The first positional argument of
# normal() is the mean, so normal((3, 3)) would return two samples centred
# on 3 rather than a 3x3 matrix.
gauss = np.random.normal(size=(3, 3))
print(gauss)                             # 3x3 Gaussian samples
print(np.random.uniform(-1, 1, (3, 3)))  # 3x3 uniform samples in [-1, 1)


c = [[1, 2, 3, 4], [5, 6, 7, 8]]
print(np.stack((c,), axis=0))          # adds a new leading axis: (2, 4) -> (1, 2, 4)
print(np.vstack((c, c)))               # stack vertically, still 2-D: (4, 4)
print(np.hstack((c, c)))               # stack horizontally, still 2-D: (2, 8)

arr = np.array([1.22, 4.44, 5.5])
print(np.ceil(arr))                    # round up
print(np.floor(arr))                   # round down
print(np.rint(arr))                    # round to the nearest integer


# np.nan is the supported spelling; the np.NaN alias was removed in NumPy 2.0.
arr = np.array([1, 2, 3, np.nan, 4])
print(np.isnan(arr))                   # boolean mask, True where the value is NaN
print(arr[~np.isnan(arr)])             # keep only the non-NaN values

print(np.unique([1, 1, 2]))            # sorted unique values

2，matplotlib

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np


# Configure matplotlib so Chinese labels render: select the SimHei font and
# turn off the Unicode minus sign for axis tick labels.
mpl.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})


# 1. Line chart: two series plotted against the same month axis
x = [1, 2, 3, 4, 5, 6]
y1 = [100, 200, 300, 350, 410, 400]
y2 = [120, 220, 320, 370, 430, 390]
plt.figure()
plt.plot(x, y1, 'r--', label='出口')
plt.plot(x, y2, 'b-', label='进口')
# 'loss' is not a valid legend location; 'best' lets matplotlib pick the
# position that overlaps the data least.
plt.legend(loc='best')
plt.title('上半年进出口数据')
plt.xlabel('月份')
plt.ylabel('进出口/美元(亿)')
plt.xticks(np.arange(1, 7, 1), ['1月', '2月', '3月', '4月', '5月', '6月'])
plt.yticks(np.arange(100, 600, 100))
ax = plt.gca()
# Hide the right and top borders. The colour must be the string 'none';
# passing None only resets the spine to its default colour.
ax.spines['right'].set_color('none')
ax.spines['top'].set_color('none')
plt.savefig('zitu.jpg')                   # save before show(), which may clear the figure
plt.show()

# Several plots in one figure: two stacked axes added one at a time
fig = plt.figure()
ax1 = fig.add_subplot(2, 1, 1)
ax1.plot(np.random.randn(50).cumsum(), 'r--')
ax2 = fig.add_subplot(2, 1, 2)
ax2.plot(np.random.randn(50).cumsum(), 'b--')

# Several plots in one figure: a 2x2 grid filled in a loop.
# ``ax`` is a 2x2 array of axes; ``ax.flat`` walks it row by row.
fig, ax = plt.subplots(2, 2)
for panel in ax.flat:
    walk = np.random.randn(50).cumsum()
    panel.plot(walk, 'r--')
# NOTE(review): the line-chart example writes 'zitu.jpg' too, so this call
# overwrites that file when both snippets run. Confirm that is intended.
plt.savefig('zitu.jpg')
plt.show()


# 2. Grouped bar chart: three series drawn side by side at each x position
height = np.array([100, 200, 300, 400])
left = np.arange(1, 5, 1)
n = 4                 # divisor for the 0.8-wide group; each bar is 0.8 / n wide
width = 0.8 / n       # derived from n instead of repeating the literal 4
plt.figure()
plt.bar(left, height, width=width, color='lightskyblue', align='center', label='成都')
# Shift the other two series by one bar width so the bars do not overlap.
plt.bar(left - width, height + 20, width=width, color='red', align='center', label='杭州')
plt.bar(left + width, height + 10, width=width, color='yellowgreen', align='center', label='兰州')
plt.legend(loc='upper left')
plt.title('人数')
plt.xlabel('地点')
plt.ylabel('人数')
plt.xticks(np.arange(1, 5, 1), ['1月', '2月', '3月', '4月'])
plt.yticks(np.arange(100, 600, 100))
ax = plt.gca()
# Hide the right and top borders. The colour must be the string 'none';
# passing None only resets the spine to its default colour.
ax.spines['right'].set_color('none')
ax.spines['top'].set_color('none')
plt.show()


# 3. Histogram of normally distributed samples
mu, sigma = 100, 20
# Scale standard-normal samples to mean ``mu`` and standard deviation
# ``sigma``. The original computed randn * mu * sigma, which is not
# N(mu, sigma), and then overwrote that sample with an unrelated cumulative
# sum, so mu and sigma never affected the plot.
x = mu + sigma * np.random.randn(10000)
plt.figure()
plt.hist(x, 1000, alpha=0.5, color='red')   # 1000 bins
plt.title('直方图')
plt.grid(True)
plt.show()


# 4. Scatter plot: 1000 standard-normal points with a random colour value
# and a random marker size for each point
x = np.random.randn(1000)
y = np.random.randn(1000)
point_colors = np.random.rand(1000)        # per-point colour values in [0, 1)
point_sizes = np.random.rand(1000) * 50    # per-point marker sizes in [0, 50)
plt.scatter(x, y, c=point_colors, s=point_sizes, alpha=0.7)
plt.title('散点图')
plt.show()


# 5. Pie chart
x = [15, 25, 30, 30]
color = ['r', 'yellow', 'green', 'blue']
explode = [0, 0.1, 0, 0]                  # pull the second wedge out of the pie
# NOTE(review): '欧洲' appears twice in the labels; confirm whether one of
# them was meant to be a different region.
labels = ['亚洲', '欧洲', '欧洲', '非洲']
plt.figure()
pie_options = dict(
    explode=explode,
    labels=labels,
    colors=color,
    startangle=90,
    shadow=True,
    labeldistance=1.1,
    autopct='%.1f%%',                     # percentage text, one decimal place
)
plt.pie(x, **pie_options)
plt.axis('equal')                         # equal axis scaling keeps the pie circular
plt.title('饼图')
plt.show()


# 实际应用，散点图和线图
import numpy as np
import matplotlib.pyplot as plt


SIZE = 13
Y = np.linspace(-6, 6, SIZE)
X = np.linspace(-2, 3, SIZE)

fig = plt.figure()
ax1 = fig.add_subplot(1, 1, 1)
ax1.set_title("SCATTER PLOT")

# Jitter every grid value by an independent uniform offset in [-1, 1).
# All x offsets are drawn before any y offset, one draw per point.
random_x = [x_val + np.random.uniform(-1, 1) for x_val in X]
random_y = [y_val + np.random.uniform(-1, 1) for y_val in Y]
RANDOM_X = np.array(random_x)  # x coordinates of the scatter points
RANDOM_Y = np.array(random_y)  # y coordinates of the scatter points

# Draw the noisy points, then the noise-free line through (X, Y).
ax1.scatter(RANDOM_X, RANDOM_Y)
ax1.set_xlabel("x")
ax1.set_ylabel("y")
ax1.plot(X, Y)
plt.show()

3，pandas

3.1 Series

# This snippet needs both libraries, and pandas was never imported in the
# original section.
import numpy as np
import pandas as pd

# Create a Series with an explicit string index
ser01 = pd.Series(np.random.randint(1, 9, 5), index=['a', 'b', 'c', 'd', 'e'])
ser01.name = 'series'
ser01.index.name = '编号'
print(ser01)
print(ser01.index)    # the index labels
print(ser01.empty)    # True when the Series has no elements
print(ser01.ndim)     # number of dimensions
print(ser01.size)     # number of elements
print(ser01.head(3))  # first 3 rows
print(ser01.tail(2))  # last 2 rows
# pop() takes an index label, not a position. The labels here are strings,
# so pop(0) raises KeyError; look up the first label instead.
ser01.pop(ser01.index[0])   # remove the first row

# Missing values
ser01['m'] = np.nan            # add a missing entry (np.NaN was removed in NumPy 2.0)
print(ser01)
print(ser01.notnull())         # boolean mask, True where a value is present
print(ser01[ser01.notnull()])  # only the non-missing values

3.2 DataFrame，二维数据

# This snippet needs both libraries, and pandas was never imported in the
# original section.
import numpy as np
import pandas as pd

# Build a DataFrame from a nested list; each inner list becomes one row
li = [[1, 2, 3, 4], [5, 6, 7, 8]]
df01 = pd.DataFrame(li)
print(df01)
print(df01.shape)     # (rows, columns)
print(df01.index)     # row index
print(df01.columns)   # column index
print(df01.values)    # underlying values as an array
# head/tail are called on the DataFrame; the original called them on the
# Series from the previous section by mistake.
print(df01.head(3))   # first 3 rows (only 2 exist here)
print(df01.tail(2))   # last 2 rows
print(df01[0:2])      # row slice by position; for string labels use .loc
print(df01[0])        # the column labelled 0
print(df01.loc[0])    # the row whose index label is 0
print(df01.iloc[0])   # the row at position 0
df02 = df01.drop(0)     # new frame without the row labelled 0
df02 = df01.dropna()    # new frame without rows that contain NaN
df01 = df01.fillna(0)   # replace every NaN with 0
# fillna(method=...) is deprecated in recent pandas; ffill()/bfill() are the
# direct replacements.
df01 = df01.ffill()     # fill each NaN with the previous value in its column
df01 = df01.bfill()     # fill each NaN with the next value in its column
# The dict keys must be existing column labels. This frame's columns are
# 0..3, so the original keys 'b' and 'c' matched nothing.
df01 = df01.fillna({1: 100, 2: 100})     # per-column replacement for NaN
df01 = df01.replace({np.nan: 'haha'})    # replace NaN through replace()

4， time模块

import time

print(time.time())  # current timestamp: seconds elapsed since 1970-01-01 00:00:00

print(time.localtime())            # current local time as a struct_time
print(time.localtime().tm_year)    # year field of the local struct_time
print(time.localtime(time.time())) # convert a timestamp to a local struct_time

print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())) # format a struct_time as text (common)

print(time.mktime(time.localtime()))   # convert a local struct_time back to a timestamp (common)

# ``time_str`` was used below without ever being defined, which raised
# NameError. Define it once and reuse it for both the parse and the
# re-format example.
time_str = "2018-12-14 0:00:00"
parsed = time.strptime(time_str, '%Y-%m-%d %H:%M:%S')  # parse text into a struct_time (common)
print(parsed)
print(time.strftime('%Y/%m/%d %H/%M/%S', parsed))      # re-format the parsed time (common)

time.sleep(2) # suspend the calling thread (the main thread in this script) for 2 seconds

5，bs4

from bs4 import BeautifulSoup
import requests


starURL = 'http://www.runoob.com/python/python-100-examples.html'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
}

# requests has no default timeout, so set one to avoid waiting forever on a
# stalled connection. raise_for_status() turns an HTTP error response into
# an exception instead of letting the error page be parsed.
page = requests.get(starURL, headers=headers, timeout=10)
page.raise_for_status()
response = page.content.decode('utf-8')
soup = BeautifulSoup(response, 'lxml')

# Collect the href of every <a> inside the first <ul> under id="content"
content = soup.find(id='content')
if content is None or content.ul is None:
    # find() returns None when nothing matches; fail with a clear message
    # rather than an AttributeError on None.
    raise RuntimeError('expected an element with id="content" containing a <ul>')
# href=True skips anchors without an href, which would raise KeyError below.
link = [anchor['href'] for anchor in content.ul.find_all('a', href=True)]
print(link[0] if link else 'no links found')

6，xml

6.1 etree

from xml.etree import ElementTree as ET

# Path of the XML file to read. This is the author's local sample path, so
# point it at your own file. The assignment was commented out in the
# original, which made the next line raise NameError.
xml_file = r"D:\Deep_Learning_data\gen_data\test_xml\2007_000323.xml"
root = ET.parse(xml_file).getroot()
# find() returns None when the root has no direct <filename> child; report
# that explicitly instead of failing with an AttributeError on ``.text``.
filename_node = root.find("filename")
if filename_node is None:
    raise ValueError(f"no <filename> element found in {xml_file}")
img_name = filename_node.text

6.2 lxml

from lxml import etree

# Read the page inside a ``with`` block so the file handle is closed, then
# parse it with etree.HTML, which tolerates malformed markup.
with open('web.html', encoding='utf-8') as page_file:
    html = etree.HTML(page_file.read())

# Selecting nodes with XPath (uncomment a line to try it)
# print(len(html.xpath('//div')))
# print(len(html.xpath('/html/body/div')))     # absolute path, starting at the root node
# print(len(html.xpath('//div/a')))            # search the whole document
# print(len(html.xpath('//div/a/..')))         # '..' selects the parent node, '.' the node itself
# print(html.xpath("//div[@class='left']/a"))  # [@class='xxx'] filters by attribute value

7，python删除指定的字符

>>> # Whitespace stripping
>>> s = ' hello world \n'
>>> s.strip()
'hello world'
>>> s.lstrip()
'hello world \n'
>>> s.rstrip()
' hello world'
>>>
>>> # Character stripping
>>> t = '-----hello====='
>>> t.lstrip('-')
'hello====='
>>> t.strip('-=')
'hello'
>>>