pandas常用操作

最新推荐文章于 2025-04-18 08:30:03 发布
原创最新推荐文章于 2025-04-18 08:30:03 发布 · 344 阅读
CC 4.0 BY-SA版权
文章标签：
##read file
# 鼠标放在内建函数上，Ctrl+B，看源码
import pandas as pd
df = pd.DataFrame({'ID':[1,2,3],'name':['time','victor','nick']})
df.to_excel("D:\\pycharmfile\\output.xlsx")
print('zhuzhu')# it will generate index column automatically
#-------------------------
import pandas as pd
df = pd.DataFrame({'ID':[1,2,3],'name':['time','victor','nick']})
df = df.set_index('ID')
df.to_excel("D:\\pycharmfile\\output.xlsx")
print('zhuzhu')#set 'ID' column as index column
#---------------------------------------------------
import pandas as pd
people = pd.read_excel("D:\\pycharmfile\\people.xlsx")
print(people.shape)
print(people.columns)
print(people.tail(5))
people = pd.read_excel("D:\\pycharmfile\\people.xlsx",header=1)
print(people.columns)#set the firet column that ie not null as header
#---------------------------------------------------
import pandas as pd
people =pd.read_excel("D:\\pycharmfile\\people.xlsx",header=None)
people.columns = ['id','type','first','firsname','middlename','lastname']
people = people.set_index('id')#set id as index column
people.set_index('id',inplace  = True)#set id as index column
print(people.columns)
people.to_excel('D:\\pycharmfile\\people1.xlsx')

df = pd.read_excel('D:\\pycharmfile\\output.xlsx',index_col = 'ID')
df.to_excel('D:\\pycharmfile\\output2.xlsx')
print('done')
#---------------------------------
import pandas as pd
d = {'x':100,'y':200,'z':399}#
print(d.keys())
print(d.values())
print(d['x'])
s1 = pd.Series(d)# tanslate dictionary  into series
print(s1)
print(s1.index)
print(s1.data)
L1 = [100,200,200]
L2 = ['x','y','z']
S1 = pd.Series(L1, index = L2)
print(s1)#序列#one- dimensional ndarray with axis labels
s1 = pd.Series([1,2,3],index=[1,2,3],name = 'A')
s2 = pd.Series([1,22,33],index=[1,2,3],name = 'B')
s3 = pd.Series([1,24,35],index=[1,2,3],name = 'C')
df = pd.DataFrame({s1.name:s1,s2.name:s2,s3.name:s3})# as a column
print(df)
df= pd.DataFrame([s1,s2,s3])# as a row
print(df)
#----------------------------------------
s1 = pd.Series([1,2,3],index=[1,2,3],name = 'A')
s2 = pd.Series([1,22,33],index=[1,2,3],name = 'B')
s3 = pd.Series([1,24,35],index=[1,2,4],name = 'C')
df = pd.DataFrame({s1.name:s1,s2.name:s2,s3.name:s3})# as a column
print(df)
df.to_excel('D:\\pycharmfile\\datefram.xlsx')
#------------------------------------------
from datetime import date,timedelta
books = pd.read_excel('D:\\pycharmfile\\Books.xlsx')
books = pd.read_excel('D:\\pycharmfile\\Books.xlsx',skiprows=3,usecols='C:F', \
                      index_col= None,dtype ={'ID':str,'InStore':str,'Date':str})
print(books)
print(type(books['ID']))
books['ID'].at[0] =100
print(books['ID'])
def add_month(d,md):
    yd  = md//12
    m = d.month+ md%12
    if m!=12:
        yd +=m//12
        m = m%12
    return date(d.year+yd,m,d.day)

start =date(2018,1,1)
for i in books.index:
    books['ID'].at[i] = i+1
    books['InStore'].at[i] = 'yes' if i%2==0 else 'no'
    books['Date'].at[i] =start+timedelta(days = i)#只能加天一下
    books['Date'].at[i] = date(start.year +i, start.month,start.day)#add year
    books['Date'].at[i] = add_month(start,i)#add momth
books.set_index('ID',inplace = True)
print(books)
books.to_excel('D:\\pycharmfile\\Books2.xlsx')
#--------------------------------------------------
start =date(2018,1,1)
for i in books.index:
    books.at[i,'ID'] = i+1
    books.at[i,'InStore'] = 'yes' if i%2==0 else 'no'
    #books['Date'].at[i] =start+timedelta(days = i)#只能加天一下
    #books['Date'].at[i] = date(start.year +i, start.month,start.day)#add year
    books.at[i,'Date'] = add_month(start,i)#add momth
books.set_index('ID',inplace = True)
print(books)
books.to_excel('D:\\pycharmfile\\Books3.xlsx')
#-------------------------------------------------------------------
import pandas as pd
books = pd.read_excel('D:\\pycharmfile\\Books.xlsx',index_col = 'ID')
print(books.head)
books['Price'] = books['ListPrice'] * books['Discount']
print(books)
#--------------------------------------------------------------------
import pandas as pd
books = pd.read_excel('D:\\pycharmfile\\Books.xlsx',index_col = 'ID')
for i in books.index:
    books['Price'].at[i] = books['ListPrice'].at[i]* books['Discount'].at[i]#单元格与单元格计算
print(books)
#——--------------------------------------------------------------------------
import pandas as pd
books = pd.read_excel('D:\\pycharmfile\\Books.xlsx',index_col = 'ID')
for i in range(5,16):
    books['Price'].at[i] = books['ListPrice'].at[i]* books['Discount'].at[i]#单元格与单元格计算
print(books)
#------------------------------------------------------------------------------------
import pandas as pd
books = pd.read_excel('D:\\pycharmfile\\Books.xlsx',index_col = 'ID')
books['ListPrice'] = books['ListPrice']+2
print(books)
#----------------------------------------------------------------------------------------\
import pandas as pd
def add_2(x):
    return x+2
books = pd.read_excel('D:\\pycharmfile\\Books.xlsx',index_col = 'ID')
books['ListPrice'] = books['ListPrice'].apply(add_2)
print(books)
#---------------------------------------------------------------------------------、
import pandas as pd
books = pd.read_excel('D:\\pycharmfile\\Books.xlsx',index_col = 'ID')
books['ListPrice'] = books['ListPrice'].apply(lambda x:x+2)
print(books)
#-----------------------------------------------------------------------------
import pandas as pd
products  = pd.read_excel('D:\\pycharmfile\\List.xlsx',index_col = 'ID')
products.sort_index(by = 'Price',inplace= True,ascending=False)#orderring in thie datefram,not  in a new one
print(products)
#--------------------------------------------------------------------------------------------
import pandas as pd
products  = pd.read_excel('D:\\pycharmfile\\List.xlsx',index_col = 'ID')
products.sort_index(by = ['Worthy','Price'],inplace= True,ascending=[True,False])#orderring in thie datefram,not  in a new one
print(products)
#-----------------------------
import statistics
statistics.mean([1,2,3,4,5,6,7,8.9])#计算平均数
statistics.mean(range(1,10))
import fractions
x = [(1,2),(1,21),(5,6),(1,7)]
y = [fractions.Fraction(*item) for item in x]
statistics.mean(y)
import decimal
x = ('0.5','0.75','0.625','0.374')
y = map(decimal.Decimal,x)
statistics.mean(y)

#-------------------------------------------------------------
import pandas as pd
def age_18_to_30(a):
    return a>=18 and a<30# 18<=a<30
def level_a(s):
    return 85<=s<=100
student = pd.read_excel('D:\\pycharmfile\\Students.xlsx',index_col = 'ID')
# student = student.loc[student['Age'].apply(age_18_to_30)].loc[student.Score.apply(level_a)]#两种语法
print(student)
student = student.loc[student.Age.apply(lambda a: 18<=a<30)] \
    .loc[student.Score.apply(lambda s:85<=s<=100)]#两种语法 换行的话：先打一个空格，再打\
print(student)
#----------------------------------------------------------------------------------
import pandas as pd
import matplotlib.pyplot as plt
students = pd.read_excel('D:\\pycharmfile\\Student.xlsx')
students.sort_values(by ='Number', inplace=True, ascending=False)
print(students)
students.plot.bar(x='Field', y='Number', color='blue', title='International Students by Field')
plt.tight_layout()
plt.show()
#------------------------------------------------------------------------------------
import pandas as pd
import matplotlib.pyplot as plt
students = pd.read_excel('D:\\pycharmfile\\Student.xlsx')
students.sort_values(by='Number', inplace=True, ascending=False)
students.index = range(0, len(students))
print(students)
plt.bar(students['Field'], students['Number'], color='orange', width=0.7)
plt.xticks(students['Field'], rotation='90')
plt.title('International Student by Field', fontsize=16)
plt.xlabel('Field')
plt.ylabel('Number')
plt.tight_layout()#显示出标签
plt.show()

#------------------------------------------------------------------------------------------
import pandas as pd
import matplotlib.pyplot as plt

students = pd.read_excel('C:/Temp/Students.xlsx')
students.sort_values(by='2017', inplace=True, ascending=False)
print(students)
students.plot.bar('Field', ['2016', '2017'], color=['orange', 'Red'])
plt.title('International Students by Field', fontsize=16)
plt.xlabel('Field', fontweight='bold')
plt.ylabel('Number', fontweight='bold')
# plt.tight_layout()
ax = plt.gca()
ax.set_xticklabels(students['Field'], rotation=40, ha='right')
plt.gcf().subplots_adjust(left=0.2, bottom=0.42)
plt.show()
#-------------------------------------------------------------------------------------------------
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

students = pd.read_excel('../Students.xlsx')
students.sort_values(by='2015 to 16', inplace=True, ascending=False)
students.index = range(0, len(students))
print(students)

bar_width = 0.7
x_pos = np.arange(len(students) * 2, step=2)
plt.bar(x_pos, students['2016 to 17'], color='green', width=bar_width)
plt.bar(x_pos + bar_width, students['2015 to 16'], color='blue', width=bar_width)

plt.xticks(x_pos + bar_width / 2, students['Field'], rotation='90')
plt.title('International Student by Field', fontsize=16)
plt.xlabel('Field')
plt.ylabel('Number')
plt.tight_layout()
plt.show()
#--------------------------------------------------------------
#21旋转行列
import pandas as pd
pd.options.display.max_columns = 999#显示出所有的行
videos = pd.read_excel('../Videos.xlsx', index_col='Month')
# table = videos.transpose()
table = videos.T
print(table)
#--------------------------------------------------------
#22读取多种文件的数据
import pandas as pd
#若是文件中的数据使用逗号分隔的，应保存为csv文件
# 若是文件中的数据是用teb分隔的，应保存为tsv
# 若是特殊文件中的分隔符实现的，用txt保存
students1 = pd.read_csv("../Students.csv",index_col='ID')
print(students1)
students2 = pd.read_csv("../Students.txt", sep = '|',index_col = 'ID')
print(students2)
students3 = pd.read_csv('../Students.tsv ',sep = '\t',index_col="ID")
print(students3)
#-------------------------------------------------------------
#23认识数据分类汇总，透视表、分组、聚合
import pandas as pd
import numpy as np
pd.options.display.max_columns = 999
orders = pd.read_excel("../Orders.xlsx")
# orders['Year']= pd.DatetimeIndex(orders['Date']).year
orders['Year'] = pd.DatetimeIndex(orders.Date).year#yield Year column
# print(orders)
print(orders.Date.dtype)
print(orders.head())
#使用pivot_table()
pt1 = orders.pivot_table(index = 'Category',columns='Year',values='Total',aggfunc=np.sum)
print(pt1)
#手动制作pivot table
groups = orders.groupby(['Category','Year'])
s = groups['Total'].sum()
c = groups['ID'].count()
pt2 = pd.DataFrame({'Sum':s,'Count':c})
print(pt2)
#-------------------------------------------------------------------------
#24 线性回归，数据预测
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import linregress
sales = pd.read_excel("../Sales.xlsx",dtype={'Month':str})#将month设置为字符类型，以防后续自动设置为浮点类型
print(sales)
slope,intercept,r,p,std_err = linregress(sales.index,sales.Revenue)
exp = sales.index* slope+intercept
print(slope*35+intercept)

plt.scatter(sales.index,sales.Revenue)
plt.plot(sales.index,exp,color = 'orange')
plt.title(f'y = {slope}*x+{intercept}')
plt.xticks(sales.index,sales.Month,rotation = 90)
plt.tight_layout()
plt.show()
#----------------------------------------------------------------------
#25条件格式  需要在jupyter notebook中进行运行
# apply和applymap的区别:

# apply是指根据轴进行匹配,从左往右是用轴为1即axis=1,从上至下是用轴为0即axis=0,默认是从上往下进行列匹配
#
# applymap是指无差别的进行指定区域匹配
import pandas as pd

def low_score_red(s):
    color = 'red' if s<60 else 'black'
    return f'color:{color}'
def highest_score_grow(col):
    return['background-color:lime' if s == col.max() else "background-color:white" for s in col]


students = pd.read_excel('D:/pycharmfile/Students25.xlsx')
students.style.applymap(low_score_red, subset=['Test_1', 'Test_2', 'Test_3'])\
    .apply(highest_score_grow,subset =['Test_1', 'Test_2', 'Test_3'] )#默认值为axis= 0

import pandas as pd
students = pd.read_excel('D:/pycharmfile/Students25.xlsx')
students.style.bar(color = 'orange', subset=['Test_1', 'Test_2', 'Test_3'])

import pandas as pd
import  seaborn as sns
color_map = sns.light_palette('green',as_cmap = True)
students = pd.read_excel('D:/pycharmfile/Students25.xlsx')
students.style.background_gradient(cmap  = color_map, subset=['Test_1', 'Test_2', 'Test_3'])
#-----------------------------------------------------------------------------------------------
#27 行操作集锦
import pandas as pd

page_001 = pd.read_excel('../Students27.xlsx', sheet_name='Page_001')
page_002 = pd.read_excel('../Students27.xlsx', sheet_name='Page_002')
print(page_001)
print(page_002)
students  = page_001.append(page_002)#这种方式数据显示有问题
students  = page_001.append(page_002).reset_index(drop = True)#将原来的索引删除
stu = pd.Series({'ID':41,'Name':"Abel",'Score':99})
students = students.append(stu,ignore_index = True)#忽视索引
#替换数据的两种方法：
#1)
students.at[39,'Name'] = 'Bailey'
students.at[39,'Score'] = '120'
#2)
stu = pd.Series({'ID':39,'Name':'zhuzhu','Score':'130'})
students.iloc[38]= stu
#切片 插入数据
stu  = pd.Series({'ID':101,'Name':'Danni',"Score":101})
part1 = students[:20]
part2 = students[20:]
students = part1.append(stu,ignore_index = True).append(part2).reset_index(drop= True)
print(students)
#删除操作
# pandas 中 inplace 参数在很多函数中都会有，它的作用是：是否在原对象基础上进行修改
#  inplace = True：不创建新的对象，直接对原始对象进行修改；
#  inplace = False：对数据进行修改，创建并返回新的对象承载其修改结果。
# 默认是False，即创建新的对象进行修改，原对象不变，和深复制和浅复制有些类似。
students.drop(index = [0,1,2],inplace = True)
students.drop(index = range(0,9),inplace = True)
print(students)
#用切片进行删除
students.drop(index = students[0:10].index,inplace = True)
print(students)
#设置空值
for i in range(5,16):
    students['Name'].at[i] = ' '
#去掉空值

missing = students.loc[students['Name']==' ']
students.drop(index = missing.index,inplace = True)
studets  = students.reset_index(drop = True)
print(students)
#-------------------------------------------------------------------------------------------------
#28 列操作
import pandas as pd

page_001 = pd.read_excel('../Students28.xlsx', sheet_name='Page_001')
page_002 = pd.read_excel('../Students28.xlsx', sheet_name='Page_002')
# students = pd.concat((page_001,page_002)).reset_index(drop = True)#append相似
students = pd.concat((page_001,page_002),axis = 1).reset_index(drop = True)#append相似

print(students)
#追加列
students = pd.concat((page_001,page_002)).reset_index(drop = True)
# students['Age'] =np.repeat(25,len(students))
students['Age'] = np.arange(0,len(students))
#删除列
students.drop(columns= ['Age','Score'],inplace = True)
print(students)
#插入列
students.insert(1,column = 'FOO',value = np.repeat('FOO',len(students)))#从0列开始
students.rename(columns = {'FOO':'FOO','Name':'NAME'},inplace = True)
print(students)

students['ID'] = students['ID'].astype(float)#只有将int转化为float 才能设置为nan
for i in range(5,15):
    students['ID'].at[i] = np.nan
students.dropna(inplace =True)#
print(students)
from numpy  import NaN#None是一个python特殊的数据类型， 但是NaN却是用一个特殊的float
#https://blog.youkuaiyun.com/ialexanderi/article/details/78493607
type(None)
type(NaN)
#------------------------------------------------------------------------------------------------------
#29链接数据库
#-----------------------------------------------------------------------------------------------
#30复杂运算
import pandas as pd
import numpy as np


def get_circumcircle_area(l, h):
    r = np.sqrt(l ** 2 + h ** 2) / 2
    return r ** 2 * np.pi


def wrapper(row):
    return get_circumcircle_area(row['Length'], row['Height'])


rects = pd.read_excel('../Rectangles.xlsx', index_col='ID')
rects['Circumcircle Area'] = rects.apply(wrapper, axis=1)
print(rects)
#方法二
import pandas as pd
import numpy as np

def get_circumcircle_area(l, h):
    r = np.sqrt(l ** 2 + h ** 2) / 2
    return r ** 2 * np.pi
rects['Circumcircle Area'] = rects.apply(lambda row: get_circumcircle_area(row['Length'], row['Height']), axis=1)