二十四. How do I create dummy variables in pandas?
# coding: utf-8
# In[1]:
# Lesson 24: creating dummy (indicator) variables.
import pandas as pd
# In[2]:
train = pd.read_csv('http://bit.ly/kaggletrain')
# In[3]:
train.head()
# In[4]:
# Manual encoding of a two-level column: female -> 0, male -> 1.
train['Sex_male'] = train.Sex.map({'female': 0, 'male': 1})
# In[5]:
train.head()
# In[6]:
# get_dummies builds one indicator column per distinct value.
pd.get_dummies(train.Sex)
# In[7]:
train.Embarked.value_counts()
# In[8]:
# Encode several columns at once; drop_first=True drops the first level of each.
pd.get_dummies(train, columns=['Sex', 'Embarked'], drop_first=True)
# In[9]:
# Keep the Embarked dummies (minus the first column) so they can be joined
# back onto the frame below. This assignment has to run before the concat.
embarked_dummies = pd.get_dummies(train.Embarked, prefix='Embarked').iloc[:, 1:]
embarked_dummies
# In[10]:
pd.concat([train, embarked_dummies], axis=1)
# In[11]:
# Same recipe for Sex, stored under a name that matches its content.
sex_dummies = pd.get_dummies(train.Sex, prefix='Sex').iloc[:, 1:]
# In[12]:
sex_dummies
二十五. How do I work with dates and times in pandas?
# coding: utf-8
# In[1]:
# Lesson 25: working with dates and times.
import pandas as pd
# In[2]:
ufo = pd.read_csv('http://bit.ly/uforeports')
# In[3]:
ufo.head()
# In[4]:
ufo.dtypes
# In[6]:
# Before conversion, Time is handled through the .str accessor: slice two
# characters near the end of the text and cast them to int.
ufo.Time.str.slice(-5, -3).astype(int).head()
# In[7]:
# Convert the column to datetime so the .dt accessor becomes available.
ufo['Time'] = pd.to_datetime(ufo.Time)
# In[8]:
ufo.head()
# In[9]:
ufo.dtypes
# In[10]:
ufo.Time.dt.hour
# In[11]:
# Series.dt.weekday_name was removed in pandas 1.0; day_name() replaces it.
ufo.Time.dt.day_name()
# In[12]:
ufo.Time.dt.dayofyear
# In[13]:
ufo.Time.dt.dayofyear.head()
# In[14]:
ts = pd.to_datetime('1/1/1999')
# In[15]:
# Timestamps compare directly, so this keeps the rows at or after ts.
ufo.loc[ufo.Time >= ts, :]
# In[16]:
# Subtracting two timestamps gives a Timedelta; .days is its whole-day part.
(ufo.Time.max() - ufo.Time.min()).days
# In[17]:
get_ipython().run_line_magic('matplotlib', 'inline')
# In[18]:
ufo['Year'] = ufo.Time.dt.year
# In[19]:
ufo.head()
# In[24]:
# Count rows per year, order by year, and draw the counts as a line plot.
ufo.Year.value_counts().sort_index().plot()
二十六. How do I find and remove duplicate rows in pandas?
# coding: utf-8
# In[1]:
# Lesson 26: finding duplicate rows.
import pandas as pd
# In[3]:
# The file is pipe-delimited with no header row, so column names are supplied
# here and user_id becomes the index.
user_cols = ['user_id', 'age', 'gender', 'occupation', 'zip_code']
users = pd.read_table(
    'http://bit.ly/movieusers',
    sep='|',
    header=None,
    names=user_cols,
    index_col='user_id',
)
# In[4]:
users.head()
# In[5]:
users.shape
# In[8]:
# Boolean Series: True where a zip code has already appeared in an earlier row.
users['zip_code'].duplicated()
# In[10]:
# Number of rows that repeat an earlier row in full.
users.duplicated().sum()
# In[13]:
# keep=False marks every member of a duplicate group, not just the repeats.
users.loc[users.duplicated(keep=False)]
二十七. How do I avoid a SettingWithCopyWarning in pandas?
# coding: utf-8
# In[1]:
# Lesson 27: avoiding SettingWithCopyWarning.
import pandas as pd
import numpy as np
# In[2]:
movies = pd.read_csv('http://bit.ly/imdbratings')
# In[3]:
movies.head()
# In[5]:
movies['content_rating'].isnull().sum()
# In[6]:
movies[movies['content_rating'].isnull()]
# In[8]:
movies['content_rating'].value_counts()
# In[9]:
movies[movies['content_rating'] == 'NOT RATED']
# In[18]:
# Set every 'NOT RATED' content rating to np.nan. A single .loc call with the
# row mask and the column label assigns on the original frame.
movies.loc[movies['content_rating'] == 'NOT RATED', 'content_rating'] = np.nan
# In[19]:
movies['content_rating'].isnull().sum()
# In[23]:
# .copy() makes top_movies an independent frame, so the assignment below
# changes top_movies only.
top_movies = movies.loc[movies['star_rating'] >= 9].copy()
# In[24]:
top_movies
# In[25]:
top_movies.loc[0, 'duration'] = 150
# In[26]:
top_movies
二十八. How do I change display options in pandas?
# coding: utf-8
# In[1]:
# Lesson 28: changing display options.
import pandas as pd
# In[2]:
drinks = pd.read_csv('http://bit.ly/drinksbycountry')
# In[3]:
drinks
# In[4]:
pd.get_option('display.max_rows')
# In[6]:
# Passing None as the value removes the row limit for displayed frames.
pd.set_option('display.max_rows', None)
drinks
# In[7]:
pd.get_option('display.max_columns')
# In[8]:
train = pd.read_csv('http://bit.ly/kaggletrain')
# In[9]:
train.head()
# In[11]:
pd.get_option('display.max_colwidth')
# In[12]:
# Allow up to 100 characters per cell before the text is truncated.
pd.set_option('display.max_colwidth', 100)
train.head()
# In[13]:
pd.set_option('display.precision', 2)
# In[14]:
drinks.head()
# In[15]:
drinks['x'] = drinks['wine_servings'] * 1000
# In[16]:
drinks['y'] = drinks['total_litres_of_pure_alcohol'] * 1000
# In[19]:
drinks.head()
# In[18]:
# Format floats with thousands separators.
pd.set_option('display.float_format', '{:,}'.format)
# In[20]:
drinks.dtypes
# In[22]:
# Print the documentation of every option whose name matches 'rows'.
pd.describe_option('rows')
# In[23]:
# Restore every option to its default value.
pd.reset_option('all')
二十九. How do I create a pandas DataFrame from another object?
# coding: utf-8
# In[20]:
# Lesson 29: building DataFrames from dicts, lists and arrays.
import pandas as pd
import numpy as np
# In[21]:
# From a dict of columns, with explicit column order and row labels.
df = pd.DataFrame(
    {'id': [100, 101, 102], 'color': ['red', 'blue', 'red']},
    columns=['id', 'color'],
    index=['a', 'b', 'c'],
)
# In[22]:
# From a list of rows; no column names are supplied here.
pd.DataFrame([[100, 'red'], [101, 'blue'], [102, 'red']])
# In[24]:
arr = np.random.rand(4, 2)
arr
# In[25]:
# From a 2-D NumPy array, naming the two columns.
pd.DataFrame(arr, columns=['one', 'two'])
# In[26]:
# Ten student ids (100..109) with random integer scores drawn from 60..100.
pd.DataFrame(
    {
        'student': np.arange(100, 110, 1),
        'test': np.random.randint(60, 101, 10),
    }
).set_index('student')
# In[27]:
s = pd.Series(['round', 'square'], index=['c', 'b'], name='shape')
s
# In[28]:
df
# In[31]:
# Attach the Series as a new column; rows are matched on index labels.
pd.concat([df, s], axis=1)
三十. How do I apply a function to a pandas DataFrame or Series?
# coding: utf-8
# In[1]:
# Lesson 30: applying functions to a Series or DataFrame.
import pandas as pd
# In[3]:
train = pd.read_csv('http://bit.ly/kaggletrain')
train.head()
# In[4]:
# Series.map with a dict replaces each value by its dictionary entry.
train['Sex_num'] = train['Sex'].map({'female': 1, 'male': 0})
# In[5]:
train.loc[0:4, ['Sex', 'Sex_num']]
# In[6]:
# Series.apply calls the function once per element; here, len of each name.
train['Name_length'] = train['Name'].apply(len)
# In[7]:
train.loc[0:4, ['Name', 'Name_length']]
# In[14]:
# Split each name on commas; every element becomes a list of pieces.
train['Name'].str.split(',').head()
# In[9]:
def get_element(my_list, position):
    """Return the item of ``my_list`` found at index ``position``.

    Intended for use with ``Series.apply``, where each element is itself a
    sequence (for example the list produced by ``str.split``).
    """
    return my_list[position]
# In[12]:
# Extra keyword arguments given to apply are forwarded to the function.
train.Name.str.split(',').apply(get_element, position=0).head()
# In[13]:
# Same result with an inline lambda instead of a named helper.
train.Name.str.split(',').apply(lambda x: x[0]).head()
# In[15]:
drinks = pd.read_csv('http://bit.ly/drinksbycountry')
drinks.head()
# In[16]:
# axis=0 applies the function to each column.
drinks.loc[:, 'beer_servings':'wine_servings'].apply(max, axis=0)
# In[19]:
import numpy as np
# axis=1 applies the function to each row. idxmax returns the column *label*
# of the largest value; np.argmax returns an integer position on pandas >= 1.0.
drinks.loc[:, 'beer_servings':'wine_servings'].apply(pd.Series.idxmax, axis=1)
三十一. Changes in pandas: replacing the deprecated .ix indexer
# coding: utf-8
# In[1]:
#31
# Changes in pandas: the .ix indexer was deprecated in pandas 0.20 and removed
# in pandas 1.0. The cells below show the .loc / .iloc replacements.
# In[2]:
import pandas as pd
pd.__version__
# In[4]:
drinks = pd.read_csv('http://bit.ly/drinksbycountry', index_col='country')
drinks.head()
# In[5]:
# loc accesses by label
drinks.loc['Angola', 'spirit_servings']
# In[6]:
# iloc accesses by position
drinks.iloc[4, 1]
# In[7]:
# ix accessed by label OR position. It exists only on pandas < 1.0, so fall
# back to the equivalent .loc lookup when the attribute is missing.
drinks.ix['Angola', 1] if hasattr(drinks, 'ix') else drinks.loc['Angola', drinks.columns[1]]
# In[8]:
# alternative: use loc, translating the column position into its label
drinks.loc['Angola', drinks.columns[1]]
# In[9]:
# alternative: use iloc, translating the row label into its position
drinks.iloc[drinks.index.get_loc('Angola'), 1]
# In[10]:
# ix with a row position and a column label; same fallback as above
drinks.ix[4, 'spirit_servings'] if hasattr(drinks, 'ix') else drinks.loc[drinks.index[4], 'spirit_servings']
# In[12]:
# alternative: use loc, translating the row position into its label
drinks.loc[drinks.index[4], 'spirit_servings']
# In[14]:
# alternative: use iloc, translating the column label into its position
drinks.iloc[4, drinks.columns.get_loc('spirit_servings')]