Pandas Notes
July 2, 2017
In [144]: """ useful data analysis tool """
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
In [145]: s = pd.Series([1,3,5,np.nan,6,8])
s
Out[145]: 0 1.0
1 3.0
2 5.0
3 NaN
4 6.0
5 8.0
dtype: float64
0.0.1 Data Frame
Data frame can be thought of as a dict-like container for Series objects.
In [146]: d = {'one' : pd.Series([1., 2., 3.], index=['a', 'b', 'c']), # Series
'two' : pd.Series([1., 2., 3., 4.], index=['a', 'b', 'c', 'd'])} # S
df = pd.DataFrame(d)
df
Out[146]: one two
a 1.0 1.0
b 2.0 2.0
c 3.0 3.0
d NaN 4.0
In [147]: pd.DataFrame(d, index=['d', 'b', 'a'], columns=['two', 'three'])
Out[147]: two three
d 4.0 NaN
b 2.0 NaN
a 1.0 NaN
In [148]: dates = pd.date_range("20130101", periods=6) # starting date, total perio
dates
1
Out[148]: DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
'2013-01-05', '2013-01-06'],
dtype='datetime64[ns]', freq='D')
Initialization
In [149]: df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list('ABCD')
df
Out[149]: A B C D
2013-01-01 -0.376828 0.904175 -0.716216 1.232942
2013-01-02 -0.280776 1.233227 1.878106 0.010433
2013-01-03 -1.255698 -0.846547 -2.094493 0.211614
2013-01-04 -0.832713 0.094939 -1.514394 0.024210
2013-01-05 -2.173683 0.121913 1.157189 0.115356
2013-01-06 1.330829 0.845027 1.826736 1.175270
Pass in a dict
In [150]: m_dict = {'A' : 1.,
'B' : pd.Timestamp('20130102'),
'C' : pd.Series(1,index=list(range(4)),dtype='float32'),
'D' : np.array([3] * 4,dtype='int32'),
'E' : pd.Categorical(["test","train","test","train"]),
'F' : 'foo'}
df2 = pd.DataFrame(m_dict)
df2
Out[150]: A B C D E F
0 1.0 2013-01-02 1.0 3 test foo
1 1.0 2013-01-02 1.0 3 train foo
2 1.0 2013-01-02 1.0 3 test foo
3 1.0 2013-01-02 1.0 3 train foo
In [151]: df2.dtypes # call dtypes of each coln
Out[151]: A float64
B datetime64[ns]
C float32
D int32
E category
F object
dtype: object
View Data
In [152]: df.head(3) # show first 3 lines, default 5
2
Out[152]: A B C D
2013-01-01 -0.376828 0.904175 -0.716216 1.232942
2013-01-02 -0.280776 1.233227 1.878106 0.010433
2013-01-03 -1.255698 -0.846547 -2.094493 0.211614
In [153]: df.tail(4)
Out[153]: A B C D
2013-01-03 -1.255698 -0.846547 -2.094493 0.211614
2013-01-04 -0.832713 0.094939 -1.514394 0.024210
2013-01-05 -2.173683 0.121913 1.157189 0.115356
2013-01-06 1.330829 0.845027 1.826736 1.175270
In [154]: df.index # index
Out[154]: DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
'2013-01-05', '2013-01-06'],
dtype='datetime64[ns]', freq='D')
In [155]: df.columns # coln names
Out[155]: Index(['A', 'B', 'C', 'D'], dtype='object')
In [156]: df.values # value matrix
Out[156]: array([[-0.37682797, 0.90417517, -0.71621554, 1.23294192],
[-0.28077627, 1.23322733, 1.87810555, 0.01043258],
[-1.2556979 , -0.84654659, -2.09449308, 0.21161436],
[-0.83271278, 0.09493884, -1.51439392, 0.0242103 ],
[-2.17368304, 0.1219127 , 1.1571895 , 0.11535622],
[ 1.33082938, 0.84502689, 1.82673574, 1.17527 ]])
In [157]: df.describe() # info of ONLY numeric colns
Out[157]: A B C D
count 6.000000 6.000000 6.000000 6.000000
mean -0.598145 0.392122 0.089488 0.461638
std 1.169646 0.757682 1.752058 0.579919
min -2.173683 -0.846547 -2.094493 0.010433
25% -1.149952 0.101682 -1.314849 0.046997
50% -0.604770 0.483470 0.220487 0.163485
75% -0.304789 0.889388 1.659349 0.934356
max 1.330829 1.233227 1.878106 1.232942
In [158]: df.T # transpose
Out[158]: 2013-01-01 2013-01-02 2013-01-03 2013-01-04 2013-01-05 2013-01-06
A -0.376828 -0.280776 -1.255698 -0.832713 -2.173683 1.330829
B 0.904175 1.233227 -0.846547 0.094939 0.121913 0.845027
C -0.716216 1.878106 -2.094493 -1.514394 1.157189 1.826736
D 1.232942 0.010433 0.211614 0.024210 0.115356 1.175270
3
In [159]: df.sort_index(axis=0, ascending=False) #axis=0/1 -> sort row/coln name
Out[159]: A B C D
2013-01-06 1.330829 0.845027 1.826736 1.175270
2013-01-05 -2.173683 0.121913 1.157189 0.115356
2013-01-04 -0.832713 0.094939 -1.514394 0.024210
2013-01-03 -1.255698 -0.846547 -2.094493 0.211614
2013-01-02 -0.280776 1.233227 1.878106 0.010433
2013-01-01 -0.376828 0.904175 -0.716216 1.232942
In [160]: df.sort_values(by="B") # by the value of coln 'B'
Out[160]: A B C D
2013-01-03 -1.255698 -0.846547 -2.094493 0.211614
2013-01-04 -0.832713 0.094939 -1.514394 0.024210
2013-01-05 -2.173683 0.121913 1.157189 0.115356
2013-01-06 1.330829 0.845027 1.826736 1.175270
2013-01-01 -0.376828 0.904175 -0.716216 1.232942
2013-01-02 -0.280776 1.233227 1.878106 0.010433
Selection
In [161]: df.A # directly call col name
Out[161]: 2013-01-01 -0.376828
2013-01-02 -0.280776
2013-01-03 -1.255698
2013-01-04 -0.832713
2013-01-05 -2.173683
2013-01-06 1.330829
Freq: D, Name: A, dtype: float64
In [162]: # same as
df["A"]
Out[162]: 2013-01-01 -0.376828
2013-01-02 -0.280776
2013-01-03 -1.255698
2013-01-04 -0.832713
2013-01-05 -2.173683
2013-01-06 1.330829
Freq: D, Name: A, dtype: float64
In [163]: # same as
df.get("A")
Out[163]: 2013-01-01 -0.376828
2013-01-02 -0.280776
2013-01-03 -1.255698
4
2013-01-04 -0.832713
2013-01-05 -2.173683
2013-01-06 1.330829
Freq: D, Name: A, dtype: float64
In [164]: df[0:3]
Out[164]: A B C D
2013-01-01 -0.376828 0.904175 -0.716216 1.232942
2013-01-02 -0.280776 1.233227 1.878106 0.010433
2013-01-03 -1.255698 -0.846547 -2.094493 0.211614
In [165]: # same as
df['20130102':'20130104']
Out[165]: A B C D
2013-01-02 -0.280776 1.233227 1.878106 0.010433
2013-01-03 -1.255698 -0.846547 -2.094493 0.211614
2013-01-04 -0.832713 0.094939 -1.514394 0.024210
In [166]: df.loc[dates[0]] # select the first date
Out[166]: A -0.376828
B 0.904175
C -0.716216
D 1.232942
Name: 2013-01-01 00:00:00, dtype: float64
In [167]: df.loc[:,['A','B']] # coln that matches "A", "B"
Out[167]: A B
2013-01-01 -0.376828 0.904175
2013-01-02 -0.280776 1.233227
2013-01-03 -1.255698 -0.846547
2013-01-04 -0.832713 0.094939
2013-01-05 -2.173683 0.121913
2013-01-06 1.330829 0.845027
In [168]: df.loc["20130101":'20130103',["C","B"]]
Out[168]: C B
2013-01-01 -0.716216 0.904175
2013-01-02 1.878106 1.233227
2013-01-03 -2.094493 -0.846547
In [169]: # same as
df.loc[dates[:-3],["C","B"]]
Out[169]: C B
2013-01-01 -0.716216 0.904175
2013-01-02 1.878106 1.233227
2013-01-03 -2.094493 -0.846547
5
In [170]: df.loc['20130103',["C","B"]] # reduction of dim
Out[170]: C -2.094493
B -0.846547
Name: 2013-01-03 00:00:00, dtype: float64
In [171]: df.loc[dates[0],'A']
Out[171]: -0.37682797203913287
In [172]: # same as
df.at[dates[0],'A']
Out[172]: -0.37682797203913287
In [173]: """ Selection by Position """
""" loc find row/coln by value, iloc by index"""
df.iloc[3]
Out[173]: A -0.832713
B 0.094939
C -1.514394
D 0.024210
Name: 2013-01-04 00:00:00, dtype: float64
In [174]: df.iloc[3:5,0:2]
Out[174]: A B
2013-01-04 -0.832713 0.094939
2013-01-05 -2.173683 0.121913
In [175]: df.iloc[1,1]
Out[175]: 1.2332273329710621
In [176]: # same as
df.iat[1,1]
Out[176]: 1.2332273329710621
Boolean Indexing
In [177]: # similar to R
df[df.A > 0] # returns a bunchs of True/False
Out[177]: A B C D
2013-01-06 1.330829 0.845027 1.826736 1.17527
In [178]: df[df>0] # for <0 will still display, but the value is missing
6
Out[178]: A B C D
2013-01-01 NaN 0.904175 NaN 1.232942
2013-01-02 NaN 1.233227 1.878106 0.010433
2013-01-03 NaN NaN NaN 0.211614
2013-01-04 NaN 0.094939 NaN 0.024210
2013-01-05 NaN 0.121913 1.157189 0.115356
2013-01-06 1.330829 0.845027 1.826736 1.175270
In [179]: df.iloc[1,1][False] # if True, a value will be displayed
Out[179]: array([], dtype=float64)
In [180]: df2 = df.copy()
df2["E"] = ["one", "one", "two", "three", "four", "three"]
df2
Out[180]: A B C D E
2013-01-01 -0.376828 0.904175 -0.716216 1.232942 one
2013-01-02 -0.280776 1.233227 1.878106 0.010433 one
2013-01-03 -1.255698 -0.846547 -2.094493 0.211614 two
2013-01-04 -0.832713 0.094939 -1.514394 0.024210 three
2013-01-05 -2.173683 0.121913 1.157189 0.115356 four
2013-01-06 1.330829 0.845027 1.826736 1.175270 three
In [181]: # isin() for filtering
df2[df2['E'].isin(["two","four"])]
Out[181]: A B C D E
2013-01-03 -1.255698 -0.846547 -2.094493 0.211614 two
2013-01-05 -2.173683 0.121913 1.157189 0.115356 four
Setting
In [182]: s1 = pd.Series([1,2,3,4,5,6], index=pd.date_range('20130102', periods=6))
s1
Out[182]: 2013-01-02 1
2013-01-03 2
2013-01-04 3
2013-01-05 4
2013-01-06 5
2013-01-07 6
Freq: D, dtype: int64
In [183]: df['F'] = s1 # since a coln in df is a Series
df
Out[183]: A B C D F
2013-01-01 -0.376828 0.904175 -0.716216 1.232942 NaN
7
2013-01-02 -0.280776 1.233227 1.878106 0.010433 1.0
2013-01-03 -1.255698 -0.846547 -2.094493 0.211614 2.0
2013-01-04 -0.832713 0.094939 -1.514394 0.024210 3.0
2013-01-05 -2.173683 0.121913 1.157189 0.115356 4.0
2013-01-06 1.330829 0.845027 1.826736 1.175270 5.0
In [185]: # set by label
df.at[dates[0],'A']=0
df
Out[185]: A B C D F
2013-01-01 0.000000 0.904175 -0.716216 1.232942 NaN
2013-01-02 -0.280776 1.233227 1.878106 0.010433 1.0
2013-01-03 -1.255698 -0.846547 -2.094493 0.211614 2.0
2013-01-04 -0.832713 0.094939 -1.514394 0.024210 3.0
2013-01-05 -2.173683 0.121913 1.157189 0.115356 4.0
2013-01-06 1.330829 0.845027 1.826736 1.175270 5.0
In [104]: # set by location
df.iloc[0,1] = 0
df
Out[104]: A B C D F
2013-01-01 0.000000 0.000000 0.411997 -0.591356 NaN
2013-01-02 0.353742 0.271770 0.489375 -2.474150 1.0
2013-01-03 -0.438979 -1.275026 1.044026 -0.356895 2.0
2013-01-04 -1.811731 0.719650 0.080504 -0.242658 3.0
2013-01-05 -0.239879 0.374453 -0.116906 -0.493383 4.0
2013-01-06 0.139179 1.299168 -1.235911 0.998853 5.0
In [107]: # assign w/ a numpy array
df.loc[:,'D'] = np.array([5] *len(df))
df
Out[107]: A B C D F
2013-01-01 0.000000 0.000000 0.411997 5 NaN
2013-01-02 0.353742 0.271770 0.489375 5 1.0
2013-01-03 -0.438979 -1.275026 1.044026 5 2.0
2013-01-04 -1.811731 0.719650 0.080504 5 3.0
2013-01-05 -0.239879 0.374453 -0.116906 5 4.0
2013-01-06 0.139179 1.299168 -1.235911 5 5.0
In [108]: df2 = df.copy()
df2[df2 > 0] = -df2
df2
Out[108]: A B C D F
2013-01-01 0.000000 0.000000 -0.411997 -5 NaN
2013-01-02 -0.353742 -0.271770 -0.489375 -5 -1.0
8
2013-01-03 -0.438979 -1.275026 -1.044026 -5 -2.0
2013-01-04 -1.811731 -0.719650 -0.080504 -5 -3.0
2013-01-05 -0.239879 -0.374453 -0.116906 -5 -4.0
2013-01-06 -0.139179 -1.299168 -1.235911 -5 -5.0
0.0.2 Missing Data
In [113]: df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])
df1
Out[113]: A B C D F E
2013-01-01 0.000000 0.000000 0.411997 5 NaN NaN
2013-01-02 0.353742 0.271770 0.489375 5 1.0 NaN
2013-01-03 -0.438979 -1.275026 1.044026 5 2.0 NaN
2013-01-04 -1.811731 0.719650 0.080504 5 3.0 NaN
In [115]: # assign value to 'E'
df1.loc[dates[0]:dates[1], 'E'] = 1
df1
Out[115]: A B C D F E
2013-01-01 0.000000 0.000000 0.411997 5 NaN 1.0
2013-01-02 0.353742 0.271770 0.489375 5 1.0 1.0
2013-01-03 -0.438979 -1.275026 1.044026 5 2.0 NaN
2013-01-04 -1.811731 0.719650 0.080504 5 3.0 NaN
In [118]: # drop row w/ NA
df1.dropna(how="any") # return a new df
Out[118]: A B C D F E
2013-01-02 0.353742 0.27177 0.489375 5 1.0 1.0
In [120]: # fill na
df1.fillna(value=5) # fill na's w/ 5
Out[120]: A B C D F E
2013-01-01 0.000000 0.000000 0.411997 5 5.0 1.0
2013-01-02 0.353742 0.271770 0.489375 5 1.0 1.0
2013-01-03 -0.438979 -1.275026 1.044026 5 2.0 5.0
2013-01-04 -1.811731 0.719650 0.080504 5 3.0 5.0
In [121]: # find na's
pd.isnull(df1)
Out[121]: A B C D F E
2013-01-01 False False False False True False
2013-01-02 False False False False False False
2013-01-03 False False False False False True
2013-01-04 False False False False False True
9
0.0.3 Operations
Stats
In [122]: df.mean() # mean of each coln
Out[122]: A -0.332945
B 0.231669
C 0.112181
D 5.000000
F 3.000000
dtype: float64
In [123]: df.mean(0) # same
Out[123]: A -0.332945
B 0.231669
C 0.112181
D 5.000000
F 3.000000
dtype: float64
In [125]: df.mean(1) # mean of each row
Out[125]: 2013-01-01 1.352999
2013-01-02 1.422977
2013-01-03 1.266004
2013-01-04 1.397685
2013-01-05 1.803534
2013-01-06 2.040487
Freq: D, dtype: float64
Apply
In [126]: df.apply(np.cumsum) #cumulative sum of each row
Out[126]: A B C D F
2013-01-01 0.000000 0.000000 0.411997 5 NaN
2013-01-02 0.353742 0.271770 0.901372 10 1.0
2013-01-03 -0.085237 -1.003256 1.945398 15 3.0
2013-01-04 -1.896968 -0.283605 2.025902 20 6.0
2013-01-05 -2.136847 0.090848 1.908996 25 10.0
2013-01-06 -1.997667 1.390015 0.673085 30 15.0
In [132]: df.apply(lambda x:x.max() - x.min(), axis = 0) # (0)/1 -> coln/row
# lambda func <=> function(x): {return x.max() - x.min()}, here x is ret
Out[132]: A 2.165474
B 2.574193
C 2.279937
D 0.000000
F 4.000000
dtype: float64
10
In [133]:
Out[133]: <function __main__.<lambda>>
In [ ]:
11