# Before using any pandas methods, you have to import the library
import pandas as pd
Series: It is like a single column of a table, or a one-dimensional array.
a = [1, 7, 2]
myvar = pd.Series(a)
print(myvar)
# In the output, the first column is the index and the second column holds the values of a
print(myvar[0])
# Use the index argument to give the index proper names.
myvar = pd.Series(a,index = ["row1","row2","row3"])
print(myvar)
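Once the index has proper labels, a value can also be accessed by its label (a small illustration using the labels defined above):
print(myvar["row2"])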
A DataFrame is similar to either
a. a table in SQL, or
b. a 2-dimensional array
healthdata = {
"weight": [25, 60, 84,103,76,40],
"height": [140,182,161,182,192,123]
}
myhealth = pd.DataFrame(healthdata)
print(myhealth)
# By using the index argument you can give the index proper names, called labels, if needed.
# Row1, Row2 ... Row6 are labels.
healthdata = {
"weight": [25, 60, 84,103,76,40],
"height": [140,182,161,182,192,123]
}
mydf = pd.DataFrame(healthdata, index=["Row1","Row2","Row3","Row4","Row5","Row6"])
print(mydf)
# To subset the data we can apply Boolean indexing.
# This indexing is commonly known as a filter.
# For example, if we want to subset the rows in which the weight is greater than 73:
# Filter the rows with weight above 73
mydf_sub = mydf[ mydf['weight'] > 73 ]
print(mydf_sub)
Like >, you can also use ==, <, <=, >=. As an exercise, filter the rows with height less than 125; one possible solution is shown below.
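A possible solution for the height exercise (mydf_sub2 is just an example name):
# Filter the rows with height below 125
mydf_sub2 = mydf[ mydf['height'] < 125 ]
print(mydf_sub2)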
#There are a number of ways to subset the Data Frame:
# one or more columns
# one or more rows
# a subset of rows and columns
# Rows and columns can be selected by their position or label
#
# When selecting one column, it is possible to use a single set of brackets,
# but the resulting object will be a Series (not a DataFrame):
#Select column height, this will return a series
mydf['height']
# When we need to select more than one column and/or want the output to be a DataFrame,
# we should use double brackets:
#Select columns 'weight' and 'height':
mydf[['weight','height']]
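To confirm this difference, you can check the type of each result (a quick illustrative check):
print(type(mydf['height']))    # <class 'pandas.core.series.Series'>
print(type(mydf[['height']]))  # <class 'pandas.core.frame.DataFrame'>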
# If we need to select a range of rows, we can specify the range using ":"
#Select rows by their position:
mydf[1:3]
# Notice that the first row has position 0, and the last value in the range is omitted:
# So for the 0:10 range, the first 10 rows are returned, with positions starting at 0 and ending at 9
#If we need to select a range of rows using their labels, we can use the method loc:
#Select rows by their labels:
mydf.loc["Row1":"Row3",['weight']]
# If we need to select a range of rows and/or columns using their positions, we can use the method iloc:
# Column position 1 is 'height'
# Remember the position starts from 0.
mydf.iloc[2:4,[1]]
Below is a set of iloc and loc code snippets for self-practice. They are different variations of how we can get a subset of a DataFrame based on position or label.
mydf.iloc[0] # First row of a data frame
i = 2
mydf.iloc[i] #(i+1)th row
mydf.iloc[-1] # Last row
mydf.iloc[:, 0] # First column
mydf.iloc[:, -1] # Last column
mydf.iloc[0:7] #First 7 rows
mydf.iloc[:, 0:2] #First 2 columns
mydf.iloc[1:3, 0:2] #Second through third rows and first 2 columns
mydf.iloc[[0,5], [0,1]] #1st and 6th rows and 1st and 2nd columns
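For practice with loc as well, here are a few label-based equivalents (a sketch assuming the row labels Row1 ... Row6 defined earlier):
mydf.loc["Row1"] # Row with label "Row1"
mydf.loc[:, "weight"] # The 'weight' column as a Series
mydf.loc["Row2":"Row4", ["weight","height"]] # Rows Row2 through Row4 (label slices are inclusive) and both columns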
Let's understand what data cleaning means. It is all about fixing bad data. Bad data could be: empty cells, data in the wrong format, wrong data, and duplicates.
Orders = {
    "Orderno": [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,17,0,21,17],
    "CustomerName": ["Rk","Mike","Ben","Veronica","Maria","Lata","Judiath","Blake","George","Duke","Prish",
                     "Ivan",None,"Nancy","Sarvesh","Jay","Jayant","Margaret","Jay"],
    "OrderAmount": [2400,1432,173,258,3402,7143,143422,1734,2143,12,-23473,17343,593,432,943,999,1843,0,999],
    "OrderQty": [24,32,17,2,4,143,None,172,432,21,2,17,13,12,8,12,6,3,12],
    "OrderStatus": ["Open","Closed","InProgress","Cancelled",None,"Open","Closed","InProgress","Cancelled",
                    "Open","Closed","InProgress","Cancelled",None,"Open","Closed","InProgress","Cancelled","Closed"],
}
ordersdf = pd.DataFrame(Orders)
print(ordersdf)
Let's look at empty cells. There are empty cells at row indexes 4, 6, 12 and 13 (you can verify this with the check below). We can do the following:
1. Remove all rows which have empty values
2. Replace them with some value
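A quick illustrative check to see where the empty cells are (isnull() flags missing values):
print(ordersdf.isnull().sum()) # Count of missing values per column
print(ordersdf[ordersdf.isnull().any(axis=1)]) # Rows containing at least one missing value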
# Remove all rows with empty values; all None values are treated as empty values
newordersdf = ordersdf.dropna()
print(newordersdf)
# To replace empty values with a specific value; all None values are treated as empty values
newordersdf = ordersdf.fillna(19)
print(newordersdf)
# Here this is wrong, as CustomerName and OrderStatus cannot have the value 19.
# Let's change only the OrderStatus values that are NaN to "Open"
ordersdf.fillna({"OrderStatus":"Open"}, inplace = True)
# inplace=True changes the original ordersdf as well.
print(ordersdf)
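If you prefer not to modify in place, the equivalent is to assign the result back (a sketch; running it again here is a harmless no-op):
ordersdf = ordersdf.fillna({"OrderStatus":"Open"})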
# OrderQty has a None value; there we can use the mean or median as the replacement value
OrdMedian = ordersdf["OrderQty"].median()
ordersdf.fillna({"OrderQty":OrdMedian},inplace=True)
print(ordersdf)
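The mean works the same way if preferred over the median (an illustrative alternative; the NaN in OrderQty has already been filled above, so the fillna line is shown commented out):
OrdMean = ordersdf["OrderQty"].mean()
# ordersdf.fillna({"OrderQty":OrdMean},inplace=True)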
Let's now handle wrong data values. Refer to OrderAmount: it has a negative value. This could be a pure mistake, and we can replace the specific value in the DataFrame.
ordersdf.loc[10,"OrderAmount"] = 23473
print(ordersdf)
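If the row position were not known in advance, a boolean filter could locate negative amounts first (illustrative; after the fix above it returns no rows):
print(ordersdf[ordersdf["OrderAmount"] < 0])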
Wrong formats can also be changed. Notice that OrderQty seems to be a float while it should be an integer. This can be changed using type casting.
ordersdf['OrderQty'] = ordersdf['OrderQty'].astype(int)
print(ordersdf)
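A quick way to confirm the cast worked is to inspect the column types (illustrative check):
print(ordersdf.dtypes) # OrderQty should now show an integer dtype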
Let's now check for duplicate entries. Here we will use duplicated() and drop_duplicates().
print(ordersdf.duplicated())
ordersdf.drop_duplicates(inplace = True)
print(ordersdf)
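duplicated() flags rows that are identical across all columns. To flag duplicates based on selected columns only, for example CustomerName and OrderAmount, you can pass the subset argument (an illustrative variation):
print(ordersdf.duplicated(subset=["CustomerName","OrderAmount"]))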