In [1]: import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
In [5]: df = pd.read_csv("FB-1 (1).csv")
df.head()
Out[5]: status_id num_reactions num_comments num_shares num_likes num_loves num_wows num_hahas num_sads
0 246675545449582_1649696485147474 529 512 262 432 92 3 1 1
1 246675545449582_1649426988507757 150 0 0 150 0 0 0 0
2 246675545449582_1648730588577397 227 236 57 204 21 1 1 0
3 246675545449582_1648576705259452 111 0 0 111 0 0 0 0
4 246675545449582_1645700502213739 213 0 0 204 9 0 0 0
Q5) Which of the variables in the dataset is not significant for doing Principal
Component Analysis?
ANS - The variable 'status_id' is not significant for Principal Component Analysis, as it is only a record identifier. Let us drop that column before proceeding.
In [6]: df_new = df.drop(['status_id'],axis = 1)
df_new.head()
Out[6]: num_reactions num_comments num_shares num_likes num_loves num_wows num_hahas num_sads num_angrys status_link status_phot
0 529 512 262 432 92 3 1 1 0 0
1 150 0 0 150 0 0 0 0 0 0
2 227 236 57 204 21 1 1 0 0 0
3 111 0 0 111 0 0 0 0 0 0
4 213 0 0 204 9 0 0 0 0 0
In [ ]:
Q6) After doing z-score scaling on the dataset, what is the value of the 2nd observation of the variable 'num_hahas'?
In [7]: from scipy.stats import zscore
df_new=df_new.apply(zscore)
df_new.head()
Out[7]: num_reactions num_comments num_shares num_likes num_loves num_wows num_hahas num_sads num_angrys status_link status_phot
0 0.646104 0.323350 1.686879 0.482727 1.983266 0.196196 0.076713 0.473570 -0.155748 -0.094957 -1.24599
1 -0.173192 -0.252206 -0.304144 -0.144720 -0.318454 -0.147879 -0.176010 -0.152587 -0.155748 -0.094957 0.80257
2 -0.006738 0.013089 0.129017 -0.024571 0.206938 -0.033187 0.076713 -0.152587 -0.155748 -0.094957 -1.24599
3 -0.257499 -0.252206 -0.304144 -0.231495 -0.318454 -0.147879 -0.176010 -0.152587 -0.155748 -0.094957 0.80257
4 -0.037003 -0.252206 -0.304144 -0.024571 -0.093286 -0.147879 -0.176010 -0.152587 -0.155748 -0.094957 0.80257
ANS - The value of the 2nd observation of the variable 'num_hahas' is -0.176010.
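As a cross-check, the same value can be recomputed from the z-score formula z = (x - mean) / std. This is a minimal sketch against the original (unscaled) dataframe, assuming scipy's default population standard deviation (ddof=0):
In [ ]: col = df['num_hahas']
manual = (col - col.mean()) / col.std(ddof=0) #population std, matching scipy.stats.zscore
manual.iloc[1] #the 2nd observation, approx. -0.176010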
In [ ]:
Q7) Apply PCA taking all features and extract 6 components and Find out the eigenvector of
the 5th component
In [9]: #Apply PCA taking all features
from sklearn.decomposition import PCA
pca = PCA(n_components=6, random_state=123)
pca_transformed = pca.fit_transform(df_new)
In [10]: #Extract eigen vectors
pca.components_
Out[10]: array([[ 0.29363054, 0.34749787, 0.44325444, 0.2517696 , 0.46125508,
0.29634039, 0.30885435, 0.16313058, 0.23724676, -0.00138341,
-0.23261371, 0.01379735],
[ 0.60664114, -0.230746 , -0.20491048, 0.6406539 , -0.16591724,
0.01626203, -0.13903343, -0.11041549, -0.12687418, 0.06418546,
0.03655064, 0.21318874],
[ 0.11200241, -0.087548 , -0.00392859, 0.10570202, 0.05181555,
0.21154873, 0.101801 , -0.04987934, 0.08923166, -0.23521304,
0.64341911, -0.65653464],
[ 0.00104601, -0.01595734, 0.03483879, -0.00173808, 0.03336338,
0.03375172, 0.01780145, -0.25206584, -0.042459 , 0.89259956,
-0.07188694, -0.35877103],
[ 0.08189114, 0.1862877 , -0.06986598, 0.1020903 , -0.13942737,
-0.37729947, -0.13429183, 0.81640504, 0.12355741, 0.20996813,
0.11148599, -0.15861432],
[-0.08520722, -0.43754044, -0.19674073, -0.09669555, -0.00487 ,
0.37224941, 0.05770028, 0.17312055, 0.66984295, 0.19021436,
0.12087046, 0.28612412]])
ANS - [ 0.08189114, 0.1862877, -0.06986598, 0.1020903, -0.13942737, -0.37729947, -0.13429183, 0.81640504, 0.12355741, 0.20996813, 0.11148599, -0.15861432]
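Since pca.components_ stores one eigenvector per row, ordered by decreasing eigenvalue, the answer can also be read off directly by row index; a minimal sketch:
In [ ]: pca.components_.shape #(6, 12): one row per component, one column per feature
pca.components_[4] #eigenvector (loadings) of the 5th principal component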
In [ ]:
Q8) What is the eigenvector associated with the second principal component?
ANS - [ 0.60664114, -0.230746 , -0.20491048, 0.6406539 , -0.16591724, 0.01626203, -0.13903343, -0.11041549, -0.12687418, 0.06418546,
0.03655064, 0.21318874]
In [ ]:
Q9) Using the scaled dataset, find out the eigenvalues.
In [11]: #Check the eigen values
#Note: This is always returned in descending order
pca.explained_variance_
Out[11]: array([3.596288 , 1.78479109, 1.2511225 , 1.02089676, 0.95528279,
0.84959164])
ANS - [3.596288, 1.78479109, 1.2511225, 1.02089676, 0.95528279, 0.84959164]
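These eigenvalues can also be recovered by eigendecomposing the sample covariance matrix of the scaled data; sklearn divides by n - 1, as np.cov does by default. A minimal sketch to verify the six values above:
In [ ]: eigvals = np.linalg.eigvalsh(np.cov(df_new.T)) #eigenvalues of the sample covariance matrix
np.sort(eigvals)[::-1][:6] #largest six, in descending order; should match pca.explained_variance_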
In [ ]:
Q10) Using the given dataset, what are the explained variances?
In [12]: #Check the explained variance for each PC
#Note: Explained variance = (eigen value of each PC)/(sum of eigen values of all PCs)
pca.explained_variance_ratio_
Out[12]: array([0.29964816, 0.14871149, 0.10424542, 0.08506266, 0.07959561,
0.07078926])
ANS - [0.29964816, 0.14871149, 0.10424542, 0.08506266, 0.07959561, 0.07078926]
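Each ratio is the corresponding eigenvalue divided by the total variance of the scaled data (all 12 features), not by the sum of only the six retained eigenvalues. A minimal sketch, assuming the sample variance (ddof=1) that sklearn uses internally:
In [ ]: total_var = df_new.var(ddof=1).sum() #total variance across all 12 scaled features
pca.explained_variance_ / total_var #should reproduce pca.explained_variance_ratio_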
In [ ]: