《使用pandas进行数据分析》网课笔记(24到31)

博客围绕Pandas展开,涵盖创建虚拟变量、处理日期时间、查找和移除重复行、避免警告、更改显示选项、创建DataFrame以及应用函数到DataFrame或Series等常见操作的相关问题。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

二十四. How do I create dummy variables in Pandas?


# coding: utf-8

# In[1]:


#24
import pandas as pd


# In[2]:


# Load the training CSV from the short URL.
train = pd.read_csv('http://bit.ly/kaggletrain')


# In[3]:
# Preview the first rows.
train.head()


# In[4]:
# Hand-rolled dummy column: map the two Sex labels onto 0/1.
train['Sex_male'] = train['Sex'].map({'female': 0, 'male': 1})


# In[5]:
train


# In[8]:
# get_dummies builds one indicator column per distinct value of Sex.
pd.get_dummies(train['Sex'])


# In[12]:
# Count how often each Embarked value occurs.
train['Embarked'].value_counts()


# In[20]:
# Encode several columns in one call; drop_first=True omits the first
# level of each encoded column.
pd.get_dummies(train, columns=['Sex', 'Embarked'], drop_first=True)


# In[15]:


# Dummy-encode Embarked with an 'Embarked_' prefix and drop the first
# indicator column via iloc[:, 1:]. The result is bound to a name here
# because the concat cell below needs it.
embarked_dummies = pd.get_dummies(train.Embarked, prefix='Embarked').iloc[:, 1:]


# In[17]:


# Append the Embarked indicator columns to the original frame, column-wise.
pd.concat([train, embarked_dummies], axis=1)


# In[18]:


# Same recipe for Sex. Kept under its own name so the Embarked dummies
# are not overwritten by unrelated data.
sex_dummies = pd.get_dummies(train.Sex, prefix='Sex').iloc[:, 1:]


# In[19]:


sex_dummies

二十五. How do I work with dates and times in Pandas?


# coding: utf-8

# In[1]:


#25
import pandas as pd


# In[2]:


# Load the UFO reports CSV from the short URL.
ufo = pd.read_csv('http://bit.ly/uforeports')


# In[3]:
ufo.head()


# In[4]:
# Inspect the column dtypes before any conversion.
ufo.dtypes


# In[6]:
# String-based approach: cut characters [-5:-3] out of the Time text and
# cast them to int (presumably the hour digits; verify against the data).
ufo['Time'].str[-5:-3].astype(int).head()


# In[7]:
# Replace the Time column with parsed datetime values.
ufo['Time'] = pd.to_datetime(ufo['Time'])


# In[8]:
ufo.head()


# In[9]:
# Check the dtypes again after the conversion.
ufo.dtypes


# In[10]:
# With a datetime column, the .dt accessor exposes components such as hour.
ufo['Time'].dt.hour


# In[11]:


# Weekday label for each timestamp. The old `dt.weekday_name` attribute was
# removed in pandas 1.0; `dt.day_name()` is its replacement.
ufo.Time.dt.day_name()


# In[12]:


# Day of the year for every timestamp.
ufo['Time'].dt.dayofyear


# In[13]:
ufo['Time'].dt.dayofyear.head()


# In[14]:
# Parse a single date string into a timestamp to compare against.
ts = pd.to_datetime('1/1/1999')


# In[15]:
# Keep only the rows whose Time is on or after ts.
ufo.loc[ufo['Time'] >= ts, :]


# In[16]:
# Subtracting two timestamps gives a timedelta; .days is its whole-day part.
(ufo['Time'].max() - ufo['Time'].min()).days


# In[17]:
# Notebook-only: enable inline matplotlib output (needs an IPython session).
get_ipython().run_line_magic('matplotlib', 'inline')


# In[18]:
# Store the year of each timestamp in its own column.
ufo['Year'] = ufo['Time'].dt.year


# In[19]:
ufo.head()


# In[24]:
# Rows per year, ordered by year, drawn as a line plot.
ufo['Year'].value_counts().sort_index().plot()

二十六. How do I find and remove duplicate rows in Pandas?


# coding: utf-8

# In[1]:


#26
import pandas as pd


# In[3]:


# The file has no header row, so column names are supplied explicitly and
# user_id becomes the index.
user_cols = ['user_id', 'age', 'gender', 'occupation', 'zip_code']
users = pd.read_table(
    'http://bit.ly/movieusers',
    sep='|',
    header=None,
    names=user_cols,
    index_col='user_id',
)


# In[4]:
users.head()


# In[5]:
users.shape


# In[8]:
# Boolean Series: True where a zip_code value has already appeared in an
# earlier row (the first occurrence is not flagged).
users['zip_code'].duplicated()


# In[10]:
# Number of rows that repeat an earlier row across all columns.
users.duplicated().sum()


# In[13]:
# keep=False flags every member of a duplicate group, so this shows all
# duplicated rows, including the first occurrences.
users.loc[users.duplicated(keep=False), :]

二十七. How do I avoid a SettingWithCopyWarning in pandas?


# coding: utf-8

# In[1]:


#27
import pandas as pd


# In[2]:


# Load the IMDb ratings CSV from the short URL.
movies = pd.read_csv('http://bit.ly/imdbratings')


# In[3]:
movies.head()


# In[5]:
# How many content_rating values are missing?
movies['content_rating'].isnull().sum()


# In[6]:
# Show the rows with a missing content_rating.
movies[movies['content_rating'].isnull()]


# In[8]:
movies['content_rating'].value_counts()


# In[9]:
# Rows explicitly labelled 'NOT RATED'.
movies[movies['content_rating'] == 'NOT RATED']


# In[12]:
import numpy as np


# In[18]:
# Set every 'NOT RATED' entry to np.nan. A single .loc[rows, column]
# assignment writes to `movies` itself rather than to a temporary copy.
movies.loc[movies['content_rating'] == 'NOT RATED', 'content_rating'] = np.nan


# In[19]:
# Recount the missing values after the replacement.
movies['content_rating'].isnull().sum()


# In[23]:
# Take an explicit copy of the filtered rows so that later edits to
# top_movies are unambiguous.
top_movies = movies.loc[movies['star_rating'] >= 9, :].copy()


# In[24]:
top_movies


# In[25]:
# Overwrite duration for index label 0 (assumes label 0 is in top_movies).
top_movies.loc[0, 'duration'] = 150


# In[26]:
top_movies

二十八. How do I change display options in Pandas?


# coding: utf-8

# In[1]:


#28
import pandas as pd


# In[2]:


# Load the drinks-by-country CSV from the short URL.
drinks = pd.read_csv('http://bit.ly/drinksbycountry')


# In[3]:
drinks


# In[4]:
# Current limit on the number of rows shown.
pd.get_option('display.max_rows')


# In[6]:
# None removes the row limit, so the whole frame is displayed.
pd.set_option('display.max_rows', None)
drinks


# In[7]:
# Current limit on the number of columns shown.
pd.get_option('display.max_columns')


# In[8]:
train = pd.read_csv('http://bit.ly/kaggletrain')


# In[9]:
train.head()


# In[11]:
# Current maximum width of a displayed cell.
pd.get_option('display.max_colwidth')


# In[12]:
# Raise the cell width limit to 100 characters.
pd.set_option('display.max_colwidth', 100)
train.head()


# In[13]:
# Show floats with 2 decimal places (display only).
pd.set_option('display.precision', 2)


# In[14]:
drinks.head()


# In[15]:
# Two scaled-up columns, used below to try out the number formatting.
drinks['x'] = drinks['wine_servings'] * 1000


# In[16]:
drinks['y'] = drinks['total_litres_of_pure_alcohol'] * 1000


# In[19]:
drinks.head()


# In[18]:
# Render floats through '{:,}'.format, which adds thousands separators.
pd.set_option('display.float_format', '{:,}'.format)


# In[20]:
# Check which columns are float and which are int.
drinks.dtypes


# In[22]:
# Print the documentation of every option whose name matches 'rows'.
pd.describe_option('rows')


# In[23]:
# Restore every option to its default value.
pd.reset_option('all')

二十九. How do I create a Pandas DataFrame for another project?


# coding: utf-8

# In[20]:


#29
import pandas as pd


# In[21]:


# Build a DataFrame from a dict of columns; `columns` fixes the column
# order and `index` supplies the row labels.
df = pd.DataFrame(
    {'id': [100, 101, 102], 'color': ['red', 'blue', 'red']},
    columns=['id', 'color'],
    index=['a', 'b', 'c'],
)


# In[22]:
# A list of lists is interpreted row by row.
pd.DataFrame([[100, 'red'], [101, 'blue'], [102, 'red']])


# In[23]:
import numpy as np


# In[24]:
# 4x2 array of random floats.
arr = np.random.rand(4, 2)
arr


# In[25]:
# Wrap the array in a DataFrame and name its two columns.
pd.DataFrame(arr, columns=['one', 'two'])


# In[26]:
# Ten student ids (100..109) with random integer scores drawn from
# [60, 101), indexed by student.
pd.DataFrame(
    {
        'student': np.arange(100, 110),
        'test': np.random.randint(60, 101, 10),
    }
).set_index('student')


# In[27]:
# A named Series whose index covers only some of df's row labels.
s = pd.Series(['round', 'square'], index=['c', 'b'], name='shape')
s


# In[28]:
df


# In[31]:
# Column-wise concat aligns on the index; the Series name becomes the
# new column's name.
pd.concat([df, s], axis=1)

三十. How do I apply a function to a Pandas DataFrame or Series?


# coding: utf-8

# In[1]:


#30
import pandas as pd


# In[3]:


# Load the training CSV from the short URL and preview it.
train = pd.read_csv('http://bit.ly/kaggletrain')
train.head()


# In[4]:
# Series.map with a dict translates each value through the mapping.
train['Sex_num'] = train['Sex'].map({'female': 1, 'male': 0})


# In[5]:
# Compare the original and the mapped column for row labels 0 through 4.
train.loc[0:4, ['Sex', 'Sex_num']]


# In[6]:
# Series.apply calls the function (here len) on every element.
train['Name_length'] = train['Name'].apply(len)


# In[7]:
train.loc[0:4, ['Name', 'Name_length']]


# In[14]:
# Split each name on the comma; every element becomes a list of pieces.
train['Name'].str.split(',').head()


# In[9]:


def get_element(my_list, position):
    """Return the item of ``my_list`` found at index ``position``.

    Works with anything that supports ``my_list[position]``.
    """
    element = my_list[position]
    return element


# In[12]:


# Extra keyword arguments given to apply are forwarded to the function,
# so position=0 picks the first piece of every split name.
train['Name'].str.split(',').apply(get_element, position=0).head()


# In[13]:
# The same selection written as a lambda.
train['Name'].str.split(',').apply(lambda parts: parts[0]).head()


# In[15]:
drinks = pd.read_csv('http://bit.ly/drinksbycountry')
drinks.head()


# In[16]:
# axis=0 hands each column to max, giving one maximum per column.
drinks.loc[:, 'beer_servings':'wine_servings'].apply(max, axis=0)


# In[19]:


import numpy as np
# For every row, report WHICH column holds the largest value.
# idxmax(axis=1) returns the column label. The earlier
# `.apply(np.argmax, axis=1)` form returns an integer position instead of
# the label on pandas >= 1.0.
drinks.loc[:, 'beer_servings':'wine_servings'].idxmax(axis=1)

三十一. Changes in Pandas: the deprecated ix indexer and its loc/iloc alternatives


# coding: utf-8

# In[1]:


#31
#changes in Pandas


# In[2]:


import pandas as pd
pd.__version__


# In[4]:


# Load the drinks data with the country column as the row index.
drinks = pd.read_csv('http://bit.ly/drinksbycountry', index_col='country')
drinks.head()


# In[5]:
# loc selects by label: row 'Angola', column 'spirit_servings'.
drinks.loc['Angola', 'spirit_servings']


# In[6]:
# iloc selects by integer position: row 4, column 1.
drinks.iloc[4, 1]


# In[7]:


# ix accepts a label OR a position (here: row label, column position).
# NOTE: .ix was deprecated in pandas 0.20 and removed in pandas 1.0; on
# pandas >= 1.0 this line raises AttributeError. It is kept only to show
# the legacy form that the cells below replace.
drinks.ix['Angola',1]


# In[8]:


# Alternative with loc: turn the column position into its label first.
drinks.loc['Angola',drinks.columns[1]]


# In[9]:


# Alternative with iloc: turn the row label into its position first.
drinks.iloc[drinks.index.get_loc('Angola'),1]


# In[10]:


# ix again, this time with a row position and a column label.
# NOTE: raises AttributeError on pandas >= 1.0, where .ix no longer exists.
drinks.ix[4,'spirit_servings']


# In[12]:


# Alternative with loc: turn the row position into its label first.
drinks.loc[drinks.index[4],'spirit_servings']


# In[14]:


# Alternative with iloc: turn the column label into its position first.
drinks.iloc[4,drinks.columns.get_loc('spirit_servings')]

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值