series操作:
import pandas as pd
import numpy as np
# A Series is a one-dimensional array whose elements carry labels.

# Empty Series: no data, no labels.
s = pd.Series()
print(s)

# Build from a list.
# `index` sets the labels; when omitted they default to 0, 1, 2, ...
s = pd.Series(
    [1, 2, 3, 4],
    index=["a", "b", "c", "d"],
)
print(s)

# Build from a NumPy ndarray.
a = np.array([1, 2, 3, 4, 5, 6])
s = pd.Series(data=a)
print(s)

# Build from a dict: the dict keys become the labels.
dic = {"a": 1, "b": 2, "c": 3}
s = pd.Series(data=dic)
print(s)

# Element access: look a value up by its label.
item = s.loc["b"]
print(item)
# Iterating over a Series.
dic = {"a": 1, "b": 2, "c": 3}
s = pd.Series(dic)

# items() yields (label, value) pairs.
for idx, value in s.items():
    print(idx, value)

# Iterate over the labels and look each value up by label.
for idx in s.index:
    print(idx, s[idx])

# Iterate over the raw values only (labels are not available here).
for value in s.values:
    print(value)
dataframe创建:
import pandas as pd
# Ways to create a DataFrame.

# Empty DataFrame.
df = pd.DataFrame()
print(df)

# From a list of dicts (one dict per row).
# When the dicts have different keys, the columns are the union of all keys
# and the missing cells are filled with NaN.
data = [
    {"name": "zhangsan", "age": 20, "address": "四川省"},
    {"name": "lisi", "age": 30, "sex": 0},
]
df = pd.DataFrame(data=data)
print(df)

# From a dict of lists (one list per column).
data = {
    "name": ["zhangsan", "lisi", "wangwu"],
    "age": [20, 30, 19],
}
df = pd.DataFrame(data=data)
print(df)

# Custom row labels and column names.
# With dict input the column names come from the keys, so the columns are
# renamed after construction by assigning to `df.columns`.
df = pd.DataFrame(data=data, index=["a", "b", "c"])
df.columns = ["name1", "age1"]
print(df)

# From a dict of Series: rows are aligned on the union of the Series labels,
# so "one" gets NaN for label "d".
data = {
    "one": pd.Series([1, 2, 3], index=["a", "b", "c"]),
    "two": pd.Series([1, 2, 3, 4], index=["a", "b", "c", "d"]),
}
result = pd.DataFrame(data)
print(result)
dataframe列操作:
import numpy as np
import pandas as pd
# Accessing a single column of a DataFrame.
data = {
    "one": pd.Series([1, 2, 3], index=["a", "b", "c"]),
    "two": pd.Series([1, 2, 3, 4], index=["a", "b", "c", "d"]),
}
result = pd.DataFrame(data=data)

# Index the DataFrame (not the input dict) so the printed column reflects the
# row alignment done by the constructor: label "d" is present and holds NaN.
d = result["one"]
print(d)

# Adding a column: assign to a new column name.
# The new data must have exactly as many elements as the DataFrame has rows,
# otherwise pandas raises an error.
result["three"] = [1, 2, 3, 4]
print(result)
# assign(): add new columns.
# Each keyword argument is `new_column_name=data`. assign() returns a new
# DataFrame, so calls can be chained or applied one after another.
data = {
    "one": pd.Series([1, 2, 3], index=["a", "b", "c"]),
    "two": pd.Series([1, 2, 3, 4], index=["a", "b", "c", "d"]),
}
df = pd.DataFrame(data=data)
df = df.assign(three=[1, 2, 3, 4])
df = df.assign(four=[5, 6, 7, 8])
print(df)

# insert(): add a column at a given position, modifying the DataFrame in place.
#   loc    -- positional index at which to insert
#   column -- name of the new column
#   value  -- data for the new column
df.insert(loc=1, column="five", value=[10, 20, 30, 40])
print(df)
# Modifying data: assigning to an existing column name overwrites that column.
data = {
    "one": pd.Series([1, 2, 3], index=["a", "b", "c"]),
    "two": pd.Series([1, 2, 3, 4], index=["a", "b", "c", "d"]),
}
df = pd.DataFrame(data=data)

# Alternative: assign a Series carrying the same row labels, e.g.
#   pd.Series([10, 20, 30, 40], index=["a", "b", "c", "d"])
df["two"] = [10, 20, 30, 40]
print(df)
# Renaming columns.
# rename(columns=...) takes a dict mapping old name -> new name and returns a
# new DataFrame; the original is left unchanged.
# Assigning to `df.columns` instead renames the original DataFrame directly.
data = {
    "one": pd.Series([1, 2, 3], index=["a", "b", "c"]),
    "two": pd.Series([1, 2, 3, 4], index=["a", "b", "c", "d"]),
}
df = pd.DataFrame(data=data)
column_map = {"one": "A", "two": "B"}
df1 = df.rename(columns=column_map)
print(df1)
print(df)  # still has the original column names

# astype(): convert the data type of a single column.
df1["B"] = df1["B"].astype("float32")
print(df1)

# dtypes: the data type of every column.
print(df1.dtypes)
# Deleting rows or columns with drop().
#   labels / index / columns -- labels of the rows or columns to remove
#   axis    -- with `labels`: 0 removes rows, 1 removes columns
#   inplace -- True modifies the DataFrame itself; False (the default)
#              returns a new DataFrame and leaves the original untouched

# Drop a column.
data = {
    "one": pd.Series([1, 2, 3], index=["a", "b", "c"]),
    "two": pd.Series([1, 2, 3, 4], index=["a", "b", "c", "d"]),
}
df = pd.DataFrame(data=data)
df1 = df.drop(columns=["one"])
print(df1)

# Drop rows.
data = {
    "one": pd.Series([1, 2, 3], index=["a", "b", "c"]),
    "two": pd.Series([1, 2, 3, 4], index=["a", "b", "c", "d"]),
}
df = pd.DataFrame(data=data)
df1 = df.drop(index=["c", "d"])
print(df1)

# Drop in place: `df` itself loses the column.
df.drop(columns=["one"], inplace=True)
print(df)
dataframe行操作:
import pandas as pd
# import numpy as np
# Row access with .loc, which selects by label.
data = {
    "A": [1, 2, 3, 4],
    "B": [5, 6, 7, 8],
    "C": [9, 10, 11, 12],
}
df = pd.DataFrame(data=data, index=["a", "b", "c", "d"])

# Row "a".
print(df.loc["a", :])

# Rows "a" through "c"; label slices INCLUDE the end label.
print(df.loc["a":"c", :])

# Single cell: row "a", column "B".
print(df.loc["a", "B"])

# Label slices on both rows and columns.
print(df.loc["a":"c", "A":"B"])

# Explicit lists of row labels and column labels.
print(df.loc[["a", "c"], ["A", "C"]])
# Row access with .iloc, which selects by integer position.
data = {
    "A": [1, 2, 3, 4],
    "B": [5, 6, 7, 8],
    "C": [9, 10, 11, 12],
}
df = pd.DataFrame(data=data, index=["a", "b", "c", "d"])

# First row.
print(df.iloc[0, :])

# First through third rows; positional slices EXCLUDE the end position.
print(df.iloc[:3])

# Positional slices on both rows and columns.
print(df.iloc[:2, :2])
# Adding a row: assigning to a row label that does not exist yet via .loc
# appends a new row with that label.
data = {
    "A": [1, 2, 3, 4],
    "B": [5, 6, 7, 8],
    "C": [9, 10, 11, 12],
}
df = pd.DataFrame(data=data, index=["a", "b", "c", "d"])
df.loc["e"] = [13, 14, 15]
print(df)
# concat(): join DataFrames along an axis.

# Row-wise (axis=0). Columns are unioned by default, so rows coming from a
# frame that lacks a column get NaN there. ignore_index=True renumbers the
# resulting rows 0..n-1.
df1 = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
df2 = pd.DataFrame({"A": [7, 8, 9], "B": [10, 11, 12], "c": [13, 14, 15]})
df3 = pd.concat([df1, df2], ignore_index=True)
print(df3)

# Column-wise (axis=1) with the default join: frames are placed side by side,
# aligned on their row labels.
df1 = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
df2 = pd.DataFrame({"C": [7, 8, 9], "D": [10, 11, 12]})
df3 = pd.concat([df1, df2], axis="columns")
print(df3)

# `join` defaults to "outer" (union of labels); "inner" keeps the intersection.
# Column-wise with join="inner": only row labels present in both frames
# (here 1 and 2) are kept.
df1 = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=[0, 1, 2])
df2 = pd.DataFrame({"A": [7, 8, 9], "B": [10, 11, 12]}, index=[1, 2, 3])
df3 = pd.concat([df1, df2], axis="columns", join="inner")
print(df3)

# Row-wise with join="inner": only columns present in both frames
# (here "A" and "B") are kept.
df1 = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=[0, 1, 2])
df2 = pd.DataFrame(
    {"A": [7, 8, 9], "B": [10, 11, 12], "C": [11, 12, 13]},
    index=[1, 2, 3],
)
df3 = pd.concat([df1, df2], axis="index", join="inner")
print(df3)
# Statistics: var() computes the variance.
# pandas defaults to the sample variance, whereas NumPy defaults to the
# population variance.
data = {
    "A": [1, 2, 3, 4, 5],
    "B": [10, 20, 30, 40, 50],
    "C": [100, 200, 300, 400, 500],
}
df = pd.DataFrame(data=data)

# axis=1 / "columns": one variance per row, computed across the columns.
print(df.var(axis="columns"))
dataframe函数:
import pandas as pd
# reindex(): conform a DataFrame to a new set of labels.

# Row labels.
# Labels that did not exist before ("d") produce rows filled with NaN, and the
# rows follow the order of the new label list.
data = {"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]}
df = pd.DataFrame(data=data, index=["a", "b", "c"])
new_index = ["b", "a", "c", "d"]
df1 = df.reindex(index=new_index)
print(df1)

# Column labels.
# Labels that did not exist before ("D") produce columns filled with NaN, and
# the columns follow the order of the new label list.
data = {"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]}
df = pd.DataFrame(data=data, index=["a", "b", "c"])
idx = ["A", "B", "C", "D"]
df1 = df.reindex(columns=idx)
print(df1)

# `method` fills the holes created by reindexing:
#   "ffill" -- forward fill, "bfill" -- backward fill.
data = {"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]}
df = pd.DataFrame(data=data, index=["a", "b", "c"])
idx = ["A", "B", "C", "D"]
df1 = df.reindex(columns=idx, method="ffill")
print(df1)

# `fill_value` replaces the NaN of newly created rows/columns with a constant.
data = {"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]}
df = pd.DataFrame(data=data, index=["a", "b", "c"])
idx = ["A", "B", "C", "D"]
df1 = df.reindex(columns=idx, fill_value=0)
print(df1)
# Iterating over a DataFrame.
data = {
    "A": [1, 2, 3],
    "B": [4, 5, 6],
    "C": [7, 8, 9],
}
df = pd.DataFrame(data)

# Iterating the DataFrame directly yields the column names.
for i in df:
    print(i)

# itertuples(): iterate over rows; each row is a named tuple whose first
# field is the row index.
data = {
    "A": [1, 2, 3],
    "B": [4, 5, 6],
}
df = pd.DataFrame(data)
for row in df.itertuples():
    print(row)

# index=False drops the index field so each tuple holds only the row data.
for row in df.itertuples(index=False):
    print(row)

# items(): iterate over columns as (column name, column Series) pairs.
data = {
    "A": [1, 2, 3],
    "B": [4, 5, 6],
}
df = pd.DataFrame(data)
for col_index, col_value in df.items():
    print(f"index:{col_index}, value:{col_value}")

# Cell-by-cell traversal using the index and columns attributes.
data = {
    "A": [1, 2, 3],
    "B": [4, 5, 6],
}
df = pd.DataFrame(data)
for idx in df.index:
    for col in df.columns:
        print(df.loc[idx, col])
# Sorting with sort_values(), which orders rows by the values of one or more
# columns and returns a new DataFrame.
data = {
    "A": [2, 3, 1],
    "B": [5, 6, 4],
    "C": [9, 8, 7],
}
df = pd.DataFrame(data=data, index=["b", "c", "a"])

# Sort by a single column.
df1 = df.sort_values(by=["A"])
print(df1)

# Sort by several columns; later columns break ties in earlier ones.
df2 = df.sort_values(by=["A", "B"])
print(df2)

# `ascending` picks the direction: True (default) ascending, False descending.
df2 = df.sort_values(by=["A", "B"], ascending=False)
print(df2)

df = pd.DataFrame(
    {
        "Name": ["Alice", "Bob", "Charlie", "David", "Eve"],
        "Age": [25, 30, 25, 35, 30],
        "Score": [85, 90, 80, 95, 88],
    }
)
# Age descending first; rows with equal Age are ordered by Score ascending.
# `ascending` takes one flag per sort column.
df1 = df.sort_values(by=["Age", "Score"], ascending=[False, True])
print(df1)