DSBDA ASSIGNMENT 1
Importing Libraries
import pandas as pd
Loading csv file
data= pd.read_csv("C:/Users/Neha/Desktop/FOLDERS/6th
SEM/DSBDA_PRACTICALS/StudentsPerformance.csv")
data.columns
Index(['gender', 'race/ethnicity', 'parental level of education', 'lunch',
'test preparation course', 'math score', 'reading score',
'writing score'],
dtype='object')
data.gender
0 female
1 female
2 female
3 male
4 male
...
995 female
996 male
997 female
998 female
999 female
Name: gender, Length: 1000, dtype: object
data['math score']
0 72
1 69
2 90
3 47
4 76
..
995 88
996 62
997 59
998 68
999 77
Name: math score, Length: 1000, dtype: int64
data.isnull() #checks if there are any null values
gender race/ethnicity parental level of education lunch test
preparation course math score reading score writing score
0 False False False False False False False False
1 False False False False False False False False
2 False False False False False False False False
3 False False False False False False False False
4 False False False False False False False False
... ... ... ... ... ... ... ... ...
995 False False False False False False False False
996 False False False False False False False False
997 False False False False False False False False
998 False False False False False False False False
999 False False False False False False False False
data1= pd.read_csv("C:/Users/Neha/Desktop/FOLDERS/6th
SEM/DSBDA_PRACTICALS/score.csv")
data1.columns
Index(['Name', 'M1 Score', 'M2 Score'], dtype='object')
WRITING TO A CSV FILE USING
1)USING csv.writer
import csv
header=['Name','M1 Score','M2 Score']
data1 = [['Alex', 62, 80], ['Brad', 45, 56], ['Joey', 85, 98]]
filename = 'score.csv'
with open(filename, 'w', newline="") as file:
csvwriter = csv.writer(file) # create a csvwriter object
csvwriter.writerow(header) # write the header
csvwriter.writerows(data1) # write the rest of the data
2) USING csv.writer
import csv
header=['Name','M1 Score','M2 Score']
data1 = [['Alex', 62, 80], ['Brad', 45, 56], ['Joey', 85, 98]]
filename = 'score.csv'
with open(filename, 'w', newline="") as file:
csvwriter=csv.writer(file) #create csvwriter object
csvwriter.writerow(header) #write header
csvwriter.writerows(data) #write the rest of the data
3) USING .writelines()
header=['Name','M1 Score','M2 Score']
data1=[['Alex', 62, 80], ['Brad', 45, 56], ['Joey', 85, 98]]
filename = 'score.csv'
with open(filename, 'w') as file:
for header in header:
file.write(str(header)+',')
file.write('\n')
for row in data1:
for x in row:
file.write(str(x)+',')
file.write('\n')
4)USING pandas
header = ['Name', 'M1 Score', 'M2 Score']
data1 = [['Alex', 62, 80], ['Brad', 56], ['Joey', 85, 98],[85,98]]
data1 = pd.DataFrame(data1, columns=header)
data1.to_csv('score.csv', index=False) #The index=False argument specifies that
the row indices should not be included in the CSV file.
data1.isnull()
Name M1 Score M2 Score
0 False False False
1 False False True
2 False False False
3 False False True
data1.Name.isnull()
0 False
1 False
2 False
3 False
Name: Name, dtype: bool
data1.isnull().sum()
Name 0
M1 Score 0
M2 Score 2
dtype: int64
Count NaN(Not a Number) values in Pandas DataFrame
1) Using Numpy
import pandas as pd
import numpy as np
data2={'set_of_numbers':[1,2,3,4,5,np.nan,6,7,np.nan,np.nan,8,9,10,np.nan]}
df=pd.DataFrame(data2)
print(df)
set_of_numbers
0 1.0
1 2.0
2 3.0
3 4.0
4 5.0
5 NaN
6 6.0
7 7.0
8 NaN
9 NaN
10 8.0
11 9.0
12 10.0
13 NaN
Count number of NaN values in the DataFrame
import pandas as pd
import numpy as np
data3={'first_set':[1,2,3,4,5,np.nan,6,7,np.nan,np.nan],
'second_set':['a','b',np.nan,np.nan,'c','d','e',np.nan,np.nan,'f'],
'third_set':['aa',np.nan,'bb','cc',np.nan,np.nan,'dd',np.nan,np.nan,'ee']}
df1=pd.DataFrame(data3,columns=['first_set','second_set','third_set'])
print(df1)
first_set second_set third_set
0 1.0 a aa
1 2.0 b NaN
2 3.0 NaN bb
3 4.0 NaN cc
4 5.0 c NaN
5 NaN d NaN
6 6.0 e dd
7 7.0 NaN NaN
8 NaN NaN NaN
9 NaN f ee
# Count NaNvalues under a single DataFramecolumn
count_nan=df1['first_set'].isna().sum()
print('count of NaN :'+ str(count_nan))
count of NaN :3
# Count NaN values under the entire DataFrame
count_nan_full=df1.isna().sum().sum()
print('count of entire NaN :'+ str(count_nan_full))
count of entire NaN :12
# Count NaN values across a single DataFramerow
count_nan_row=df1.loc[[7]].isna().sum().sum()
print('count of row NaN :'+ str(count_nan_row))
count of row NaN :2
Remove Duplicates from Pandas DataFrame
import pandas as pd
boxes={'Color': ['Green','Green','Green','Blue','Blue','Red','Red','Red'],
'Shape':
['Rectangle','Rectangle','Square','Rectangle','Square','Square','Square','Rectangle
']}
df2=pd.DataFrame(boxes,columns=['Color','Shape'])
print(df2)
Color Shape
0 Green Rectangle
1 Green Rectangle
2 Green Square
3 Blue Rectangle
4 Blue Square
5 Red Square
6 Red Square
7 Red Rectangle
df2_duplicates_removed=df2.drop_duplicates()
print(df2_duplicates_removed)
#only the distinct values across the two columns remain
Color Shape
0 Green Rectangle
2 Green Square
3 Blue Rectangle
4 Blue Square
5 Red Square
7 Red Rectangle
# to remove the duplicates on a specific column
df2_duplicates_removed=df2.drop_duplicates(subset=['Color'])
print(df2_duplicates_removed)
Color Shape
0 Green Rectangle
3 Blue Rectangle
5 Red Square
df2_duplicates_removed=df2.drop_duplicates(subset=['Shape'])
print(df2_duplicates_removed)
Color Shape
0 Green Rectangle
2 Green Square
# Get the Descriptive Statistics for Pandas DataFram
#step 1 -- collect data
#step 2 --create dataframe
import pandas as pd
data_a = {'product': ['A', 'B', 'C', 'c','D'], 'price': [22000, 27000, 25000,
29000, 35000], 'year': [2014, 2015, 2016, 2017, 2018] }
df_a = pd.DataFrame(data_a)
df_a
product price year
0 A 22000 2014
1 B 27000 2015
2 C 25000 2016
3 c 29000 2017
4 D 35000 2018
#step 3 -- Get the Descriptive Statistics for Pandas DataFrame
# Descriptive Statistics for column price
stats_numeric=df_a['price'].describe()
stats_numeric
count 5.000000
mean 27600.000000
std 4878.524367
min 22000.000000
25% 25000.000000
50% 27000.000000
75% 29000.000000
max 35000.000000
Name: price, dtype: float64
# Descriptive Statistics for column product
stats_categorical= df_a['product'].describe()
stats_categorical
count 5
unique 5
top A
freq 1
Name: product, dtype: object
# Descriptive Statistics for entire dataframe
stats=df_a.describe(include='all')
stats
product price year
count 5 5.000000 5.000000
unique 5 NaN NaN
top A NaN NaN
freq 1 NaN NaN
mean NaN 27600.000000 2016.000000
std NaN 4878.524367 1.581139
min NaN 22000.000000 2014.000000
25% NaN 25000.000000 2015.000000
50% NaN 27000.000000 2016.000000
75% NaN 29000.000000 2017.000000
max NaN 35000.000000 2018.000000