一、Pandas基础
1.核心数据结构


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# Sample 100 evenly spaced points on [0, 2*pi], evaluate sin at each of them,
# and draw the resulting curve with matplotlib.
x = np.linspace(0,2*np.pi, num=100)
print(x)
y = np.sin(x)
print(y)
plt.plot(x,y)
# A Series built from a plain list gets the default integer labels 0..5.
# The missing value is spelled ``np.nan``: the ``np.NaN`` alias was removed in
# NumPy 2.0 and raises AttributeError there, while ``np.nan`` works everywhere.
s = pd.Series([1, 3, 5, np.nan, 8, 4])
s
# Six consecutive daily timestamps starting at 2020-08-23.
dates = pd.date_range("20200823", periods=6)
dates
# 6x4 frame of standard-normal samples, indexed by those dates, columns A-D.
data = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list("ABCD"))
data.head(3)
data.shape
data.values
# Building a frame from a dict: the scalar "A" and the Timestamp "B" are
# broadcast to the length of the sequence-valued columns "C" and "D" (4 rows).
df = pd.DataFrame({"A":1, "B":pd.Timestamp("2020-8-23 00:00:00"), "C":range(4), "D":np.arange(4)})
df
df["A"]
df.head(3)
df.tail(3)
# Transpose: rows become columns and vice versa.
df.T
# Order the columns by label, descending (D, C, B, A).
data.sort_index(axis=1, ascending=False)
# Label-based selection: every row of columns B and C, then a single cell.
data.loc[:,["B","C"]]
data.loc["2020-08-24","B"]
# .at is the scalar-only label accessor; it addresses the same cell as above.
data.at[pd.Timestamp("2020-08-24"), "B"]
%timeit df.iat[1,1]
%timeit data.iat[1,1]
# Boolean mask over the whole frame: cells that are not > 0 show up as NaN.
data[data>0]
# Work on a copy so that adding a column leaves `data` itself untouched.
data2 = data.copy()
tag = ["a"]*2 + ["b"]*2 + ["c"]*2
data2["TAG"]=tag
data2
# Keep only the rows whose TAG is "a" or "c".
data2[data2.TAG.isin(["a","c"])]
# The assignments below modify `data` in place.
# One cell, addressed by integer position:
data.iat[0,0]=100
data
# A whole column from an iterable of matching length:
data["A"] = range(6)
data
# A whole existing column from a scalar, via attribute access:
data.B = 100
data
# A positional column slice; `data` was built with four columns, so 2:5 is
# clipped to the last two of them.
data.iloc[:,2:5]=1000
data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Fresh 6x4 random frame indexed by six daily timestamps.
dates = pd.date_range("20200823", periods=6)
df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list("ABCD"))
df
# reindex keeps the first four dates and adds a column "E"; "E" does not exist
# in df, so it is filled with NaN.
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ["E"])
df1
# Give "E" a value on dates[1] and dates[2] only; the other rows stay NaN.
df1.loc[dates[1:3],"E"]=2
df1
# dropna / fillna return new frames here (results are displayed, not stored).
df1.dropna()
df1.fillna(value=5)
# True if at least one cell anywhere in df1 is missing.
pd.isnull(df1).any().any()
# Column means.
df1.mean()
df
# apply calls the function once per column: running total, then value range.
df.apply(np.cumsum)
df.apply(lambda x: x.max() - x.min())
def _sum(x):
    """Return ``x.sum()``, printing the type of *x* first.

    Written as a ``DataFrame.apply`` callback: the ``print`` makes visible
    what kind of object ``apply`` hands to the function on each call.

    Args:
        x: Any object that provides a ``sum()`` method (e.g. a pandas Series).

    Returns:
        Whatever ``x.sum()`` returns.
    """
    print(type(x))
    return x.sum()
# Apply _sum across the frame; _sum prints the type of each object it is given.
df.apply(_sum)
# 20 random integers drawn from [10, 20).
s = pd.Series(np.random.randint(10,20,size=20))
s
# Frequency of each distinct value, then the most frequent value(s).
s.value_counts()
s.mode()
df2 = pd.DataFrame(np.random.randn(10,4), columns=list("ABCD"))
df2
# Cut the frame into three consecutive row slices ...
df2.iloc[:3]
df2.iloc[3:7]
df2.iloc[7:]
# ... and check that concatenating them in order reproduces the original.
df3 = pd.concat([df2.iloc[:3],df2.iloc[3:7],df2.iloc[7:]])
(df2 == df3).all().all()
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Pair the i-th outer label with the i-th inner label, giving tuples such as
# ("bar", "one"); these become the rows of a two-level index.
axb = list(zip(
    ['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
    ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two'],
))
axb
# pd.MultiIndex.from_tuples(tuples, sortorder=None, names=None) converts a
# list of tuples into a multi-level index:
#   tuples    -- the tuples, one per row, one element per level
#   sortorder -- optional sort order
#   names     -- optional names for the index levels
index = pd.MultiIndex.from_tuples(axb, names=['first', 'second'])
index
df = pd.DataFrame(np.random.randn(8,2), index=index, columns=["A", "B"])
df
# stack() moves the column labels A/B into a new innermost index level, so the
# 8x2 frame becomes a 16-element Series with a three-level index.
stacked = df.stack()
stacked
stacked.index
# Each unstack() moves the current innermost index level back to the columns.
stacked.unstack().unstack()
# Twelve rows: three label columns (A, B, C) and two random numeric columns.
df = pd.DataFrame({
    "A": ["one", "one", "two", "three"] * 3,
    "B": ["A", "B", "C"] * 4,
    "C": ["foo", "foo", "foo", "bar", "bar", "bar"] * 2,
    "D": np.random.randn(12),
    "E": np.random.randn(12),
})
df
# Mean of D (the default aggregation) for every (A, B) pair, one column per
# distinct value of C.
df.pivot_table(values="D", index=["A", "B"], columns=["C"])
# Mean of the numeric columns for the rows where A == "one", grouped by C.
# numeric_only=True is required: the selection still contains the string
# columns A and B, and pandas >= 2.0 raises TypeError when asked to average
# them instead of silently dropping them.
df[df["A"] == "one"].groupby("C").mean(numeric_only=True)
# Raw string: in a normal literal "\T", "\P", "\M", "\p" and "\J" are invalid
# escape sequences (a SyntaxWarning on recent Python versions). The resulting
# path is unchanged.
data = pd.read_csv(r"D:\Tools\PythonProject\MyTest\pandas_basic\James_Harden.csv")
data.head()
# Column names used below: 对手 = opponent, 主客场 = home/away,
# 胜负 = win/loss, 得分 = points, 篮板 = rebounds, 助攻 = assists.
# NOTE(review): with no `values`, pivot_table aggregates every remaining
# column with the default mean; if the CSV has non-numeric columns besides the
# index keys this raises on pandas >= 2.0 -- confirm against the file's schema.
data.pivot_table(index=["对手"])
data.pivot_table(index=["对手", "主客场"])
# Average points / rebounds / assists per (home-away, win-loss) combination.
data.pivot_table(values=["得分", "篮板", "助攻"], index=["主客场", "胜负"])
# Same table with both mean and sum. The aggregations are given by name:
# passing the NumPy callables np.mean / np.sum is deprecated in pandas >= 2.1.
data.pivot_table(values=["得分", "篮板", "助攻"], index=["主客场", "胜负"], aggfunc=["mean", "sum"])
# Total points per opponent, split by home/away. Missing combinations are
# filled with the integer 0; the string "0" would mix text into numeric columns.
data.pivot_table(values="得分", index="主客场", columns="对手", aggfunc=["sum"], fill_value=0)
# 600 timestamps at one-second spacing starting 2016-03-01 00:00:00.
rng = pd.date_range("20160301", periods=600, freq="s")
rng
# One random integer from [0, 500) per second.
s = pd.Series(np.random.randint(0,500,len(rng)), index=rng)
s
# Downsample to two-minute bins, summing the values that fall in each bin.
s.resample("2Min").sum()
# Quarterly periods from 2000Q1 through 2016Q1; note that `rng` is rebound.
rng = pd.period_range("2000Q1","2016Q1", freq="Q")
rng
# Convert the periods to timestamps.
rng.to_timestamp()
# Timestamp arithmetic: the difference of two timestamps, and a timestamp
# shifted by a five-day Timedelta.
pd.Timestamp("20160301") - pd.Timestamp("20160201")
pd.Timestamp("20160301") + pd.Timedelta(days=5)
df = pd.DataFrame({"id": [1, 2, 3, 4, 5, 6], "raw_grade": ["a", "b", "b", "a", "a", "d"]})
df
# Store the grades as a categorical column; the categories are the distinct
# values a, b, d.
df["grade"] = df.raw_grade.astype("category")
df
df.grade
# Rename the categories positionally: a -> "very good", b -> "good",
# d -> "bad". Assigning to `.cat.categories` was removed in pandas 2.0;
# rename_categories returns a new Series and works on old and new versions.
df["grade"] = df["grade"].cat.rename_categories(["very good", "good", "bad"])
df
# Sorting a categorical column follows the category order, not the
# alphabetical order of the labels.
df.sort_values(by="grade", ascending=False)
# 1000 standard-normal samples on consecutive days starting 2000-01-01.
s = pd.Series(np.random.randn(1000), index=pd.date_range("20000101", periods=1000))
s
# Call cumsum(): without the parentheses `s` was rebound to the bound method
# object rather than to the running-total Series, so any later Series
# operation on `s` (such as plotting) failed with AttributeError.
s = s.cumsum()
# Draw `s` through the pandas plotting interface (rendered by matplotlib).
s.plot()
2.练习:MovieLens电影数据分析
import pandas as pd
import numpy as np
# Load the three MovieLens-1M tables. The files have no header row and use
# "::" as the field separator; a multi-character separator needs the Python
# parsing engine, hence engine="python".
fpath = r"D:\WinterIsComing\python\New_Wave\pandas_basic\pandas-learn-code\datas\movielens-1m\users.dat"
users = pd.read_csv(fpath,
header=None,
engine="python",
sep="::",
names=["user_id", "gender", "age", "occupation", "zip"]
)
print(len(users))
users.head(5)
rating_names = ['user_id', 'movie_id', 'rating', 'timestamp']
fpath = r"D:\WinterIsComing\python\New_Wave\pandas_basic\pandas-learn-code\datas\movielens-1m\ratings.dat"
ratings = pd.read_table(fpath, sep='::', header=None, names=rating_names, engine='python')
movie_names = ['movie_id', 'title', 'genres']
fpath = r"D:\WinterIsComing\python\New_Wave\pandas_basic\pandas-learn-code\datas\movielens-1m\movies.dat"
# NOTE(review): the stock MovieLens-1M movies.dat is Latin-1 encoded; if this
# copy is too, the call below needs an explicit encoding -- confirm.
movies = pd.read_table(fpath, sep='::', header=None, names=movie_names, engine='python')
print(len(ratings))
ratings.head(5)
print(len(movies))
movies.head(5)
# pd.merge without `on` joins on the column names the two frames share:
# users and ratings share user_id; that result and movies share movie_id.
data = pd.merge(pd.merge(users, ratings), movies)
print(len(data))
data.head(10)
# All ratings submitted by user 1.
data[data.user_id == 1]
# Mean rating of every title, one column per gender. The aggregation is given
# by name: passing the NumPy callable np.mean is deprecated in pandas >= 2.1,
# and "mean" produces the same table with the same column labels.
ratings_by_gender = data.pivot_table(values="rating", index="title", columns="gender", aggfunc=["mean"])
ratings_by_gender.head(10)
data.user_id
print(type(ratings_by_gender), "\n", type(data))
3.pandas核心数据结构
import pandas as pd
import numpy as np
# Two Series whose indexes overlap only partly ("a" and "e" are shared).
s1 = pd.Series(np.random.randn(3), index=["a", "c", "e"])
s2 = pd.Series(np.random.randn(3), index=["a", "d", "e"])
print(f'{s1}\n\n{s2}')
# Addition aligns on the index labels; a label present in only one operand
# ("c", "d") produces NaN.
s1 + s2
# DataFrame from a dict of Series: the row index is the union of the Series
# indexes, so column "one" has no value for row "d".
df = pd.DataFrame({
"one": pd.Series([1,2,3], index=["a", "b", "c"]),
"two": pd.Series([1,2,3,4], index=["a", "b", "c", "d"])
})
df
c = {
"one": pd.Series([1,2,3], index=["a", "b", "c"]),
"two": pd.Series([1,2,3,4], index=["a", "b", "c", "d"])
}
# An explicit `index` selects and orders the rows taken from the Series.
df = pd.DataFrame(c, index=["d", "b", "a"])
df
# An explicit `columns` selects the dict keys to use; "three" is not a key of
# `c`, so that column is entirely NaN.
df = pd.DataFrame(c, columns=["two", "three"])
df
# DataFrame from a dict of equal-length lists: default integer row labels.
d = {"one": [1,2,3,4],
"two": [21,22,23,24]}
df = pd.DataFrame(d)
df
# DataFrame from a list of tuples: each tuple is one row.
data = [(1,2.2,"Hello"), (2,3.,"world")]
df = pd.DataFrame(data, index=["one", "two"], columns=list("ABC"))
df
df = pd.DataFrame(np.random.randn(6,4), columns=['one', 'two', 'three', 'four'])
df
# Overwrite a column with a value computed from two others.
df["three"] = df["one"] + df["two"]
df
# Remove a column in place.
del df["three"]
df
# New boolean column from a comparison.
df["flag"] = df["one"] > 0.2
df
# A scalar is broadcast to every row.
df["five"] = 5
df
# pop removes the column from df and returns it as a Series.
s = df.pop('four')
s
df
# insert places the new column at position 1 instead of appending it.
df.insert(1,'bar', df['one'] + df['two'])
df
# assign returns a new frame with the extra column; the results below are
# only displayed, so df itself is not changed.
df.assign(Ratio = df["one"]/ df["two"])
# A callable is evaluated against the frame being assigned to.
# NOTE(review): this column is named Ratio but computes a difference.
df.assign(Ratio = lambda x: x.one - x.two)
# Chained assign: the second call can use the column created by the first.
df.assign(ABRatio = df.one / df.two).assign(BarValue = lambda x: x.ABRatio * x.bar)
# Two frames of different widths, keyed by item name.
data = {"Item1": pd.DataFrame(np.random.randn(4, 3)),
        "Item2": pd.DataFrame(np.random.randn(4, 2))}
# pd.Panel was removed in pandas 0.25, so pd.Panel(data) raises AttributeError
# on any current release. Show the same item-by-row-by-column data as a single
# DataFrame whose columns are a two-level MultiIndex (item, original column).
print(pd.concat(data, axis=1))
4.pandas基础运算
import pandas as pd
import numpy as np
s = pd.Series([1,3,5,6,8], index=list("acefh"))
s
s.index
# Reindexing to a..h adds the labels b, d and g, which get NaN.
s = s.reindex(list("abcdefgh"))
s
# Replace those NaN values with 0, modifying s in place.
s.fillna(0, inplace=True)
s
# fill_value is used for labels that the reindex adds (here i, j, k).
s.reindex(list("abcdefghijk"), fill_value=10)
s = pd.Series([1,3,5,7,9], index=list("acegi"))
s
# Without a fill method the added labels are NaN ...
s.reindex(list("abcdefghi"))
# ... with method="ffill" each added label takes the value of the closest
# preceding existing label.
s.reindex(list("abcdefghi"),method="ffill")
df = pd.DataFrame(np.random.randn(4,6), index=["A","D", "F", "H"], columns=["one", "two", "three", "four", "five", "six"])
df
# Reindex the rows to A..H; the rows that did not exist are all-NaN.
df2 = df.reindex(index=list("ABCDEFGH"))
df2
# Replace the NaN cells with 0, modifying df2 in place.
df2.fillna(0, inplace=True)
df2
# Reindex the columns: keeps one/three/five and adds "seven", which is NaN
# in the first call and 0 in the second.
df.reindex(columns=["one", "three", "five", "seven"])
df.reindex(columns=["one", "three", "five", "seven"], fill_value=0)
# method="bfill": each added row takes the values of the next existing row.
df.reindex(index=list("ABCDEFGH"), method="bfill")
# drop returns a new frame without row "A" / without columns two and four;
# the results are not assigned, so df is displayed unchanged afterwards.
df.drop("A")
df.drop(["two", "four"], axis=1)
df
# 4x3 frame holding the integers 0..11 in row-major order.
df = pd.DataFrame(np.arange(12).reshape(4,3), index=["one", "two", "three", "four"], columns=list("ABC"))
df
# Default axis: the lambda receives each column, giving max - min per column.
df.apply(lambda x: x.max() - x.min())
# axis=1: the lambda receives each row, giving max - min per row.
df.apply(lambda x: x.max() - x.min(),axis=1)
df.apply?
def min_max(x):
    """Return the smallest and largest value of *x* as a two-element Series.

    Args:
        x: An object providing ``min()`` and ``max()`` (e.g. a pandas Series).

    Returns:
        A ``pd.Series`` with index ``["min", "max"]`` holding ``x.min()`` and
        ``x.max()``.
    """
    return pd.Series([x.min(), x.max()], index=["min", "max"])
# Row-wise apply; min_max returns a Series labelled "min"/"max" for each row.
df.apply(min_max, axis=1)
df = pd.DataFrame(np.random.randn(4,3), index=["one", "two", "three", "four"], columns=list("ABC"))
df
# apply hands each column to formater, which rounds it to 3 decimal places.
formater = lambda x: round(x,3)
df.apply(formater)
df = pd.DataFrame(np.random.randn(4,3), index=list("ABCD"), columns=["one", "two", "three"])
# Sort the rows by the values of column "one", largest first.
df.sort_values(by="one",ascending=False)
# The value 6 appears twice, which makes the tie-handling visible.
s = pd.Series([3,6,2,6,4])
s
# method="first": tied values are ranked in order of appearance.
s.rank(method="first")
# method="average": tied values share the mean of the ranks they span.
s.rank(method="average")
# For a DataFrame, rank is computed within each column.
df.rank(method="first")
s = pd.Series(list("abbcdabacad"))
s
# Count of each distinct value, the distinct values themselves, and a boolean
# mask marking the elements that are one of 'a', 'c' or 'd'.
s.value_counts()
s.unique()
s.isin(['a', "c", "d"])