8/5/24, 11:16 AM Week-5 - Jupyter Notebook
In [1]: import sklearn
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
In [4]: from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn import metrics
In [5]: import warnings
In [6]: warnings.filterwarnings('ignore')
In [11]: #LOAD DATASET
In [7]: df=pd.read_csv('TSLA.csv')
In [9]: df.head()
Out[9]:
Date Open High Low Close Adj Close Volume
0 2010-06-29 19.000000 25.00 17.540001 23.889999 23.889999 18766300
1 2010-06-30 25.790001 30.42 23.299999 23.830000 23.830000 17187100
2 2010-07-01 25.000000 25.92 20.270000 21.959999 21.959999 8218800
3 2010-07-02 23.000000 23.10 18.709999 19.200001 19.200001 5139800
4 2010-07-06 20.000000 20.00 15.830000 16.110001 16.110001 6866900
In [10]: df.tail()
Out[10]:
Date Open High Low Close Adj Close Volume
2411 2020-01-28 568.489990 576.809998 558.080017 566.900024 566.900024 11788500
2412 2020-01-29 575.690002 589.799988 567.429993 580.989990 580.989990 17801500
2413 2020-01-30 632.419983 650.880005 618.000000 640.809998 640.809998 29005700
2414 2020-01-31 640.000000 653.000000 632.520020 650.570007 650.570007 15719300
2415 2020-02-03 673.690002 786.140015 673.520020 780.000000 780.000000 47065000
In [12]: #EXPLORE dimensions
# df.shape is (rows, columns); unpack it once and report columns first, then rows.
n_rows,n_cols=df.shape
print('number of data columns:',n_cols,'\nnumber of data rows:',n_rows)
number of data columns: 7
number of data rows: 2416
localhost:8888/notebooks/Week-5.ipynb# 1/9
8/5/24, 11:16 AM Week-5 - Jupyter Notebook
In [13]: df.describe()
Out[13]:
Open High Low Close Adj Close Volume
count 2416.000000 2416.000000 2416.000000 2416.000000 2416.000000 2.416000e+03
mean 186.271147 189.578224 182.916639 186.403651 186.403651 5.572722e+06
std 118.740163 120.892329 116.857591 119.136020 119.136020 4.987809e+06
min 16.139999 16.629999 14.980000 15.800000 15.800000 1.185000e+05
25% 34.342498 34.897501 33.587501 34.400002 34.400002 1.899275e+06
50% 213.035004 216.745002 208.870002 212.960007 212.960007 4.578400e+06
75% 266.450012 270.927513 262.102501 266.774994 266.774994 7.361150e+06
max 673.690002 786.140015 673.520020 780.000000 780.000000 4.706500e+07
In [14]: df.info() #Summary of dataframe
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2416 entries, 0 to 2415
Data columns (total 7 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Date 2416 non-null object
1 Open 2416 non-null float64
2 High 2416 non-null float64
3 Low 2416 non-null float64
4 Close 2416 non-null float64
5 Adj Close 2416 non-null float64
6 Volume 2416 non-null int64
dtypes: float64(5), int64(1), object(1)
memory usage: 132.2+ KB
In [15]: df['date']=pd.to_datetime(df.Date)
In [16]: df.date.dtype
Out[16]: dtype('<M8[ns]')
EXPLORATORY DATA ANALYSIS
localhost:8888/notebooks/Week-5.ipynb# 2/9
8/5/24, 11:16 AM Week-5 - Jupyter Notebook
In [17]: plt.figure(figsize=(15,5))
# Draw Close against the parsed date column and label the Axes that lineplot returns.
close_ax=sns.lineplot(x='date',y='Close',data=df)
close_ax.set_title('Tesla Close Price',fontsize=15)
close_ax.set_ylabel('Price in dollars')
plt.show()
In [20]: #Check whether Close and Adj Close are identical in every row
In [19]: df[df['Close']==df['Adj Close']].shape
Out[19]: (2416, 8)
In [21]: df.drop(['Adj Close','date'],axis=1,inplace=True)
In [23]: df.head()
Out[23]:
Date Open High Low Close Volume
0 2010-06-29 19.000000 25.00 17.540001 23.889999 18766300
1 2010-06-30 25.790001 30.42 23.299999 23.830000 17187100
2 2010-07-01 25.000000 25.92 20.270000 21.959999 8218800
3 2010-07-02 23.000000 23.10 18.709999 19.200001 5139800
4 2010-07-06 20.000000 20.00 15.830000 16.110001 6866900
In [24]: df.isnull().sum()
Out[24]: Date 0
Open 0
High 0
Low 0
Close 0
Volume 0
dtype: int64
localhost:8888/notebooks/Week-5.ipynb# 3/9
8/5/24, 11:16 AM Week-5 - Jupyter Notebook
In [26]: features=['Open','High','Low','Close','Volume']
# Distribution of each column listed in `features`, one panel per column.
# A single explicit 2x3 grid is created up front; the previous
# plt.subplots()/plt.subplot() mix created an extra full-figure axes
# before the panels were added.
fig,axes=plt.subplots(2,3,figsize=(20,10))
panels=axes.ravel()
for ax,col in zip(panels,features):
    # histplot(kde=True, stat='density') is the replacement for the deprecated
    # distplot: a histogram with a KDE curve on a density scale.
    sns.histplot(df[col],kde=True,stat='density',ax=ax)
    ax.set_title(col)
# Hide any grid cell that has no column to show.
for ax in panels[len(features):]:
    ax.set_visible(False)
plt.show()
In [28]: #Box plots of each feature to check for outliers
In [27]: plt.subplots(figsize=(20,10))
# One box plot per column in `features`; subplot positions are 1-based.
for position,col in enumerate(features,start=1):
    plt.subplot(2,3,position)
    sns.boxplot(df[col])
plt.show()
FEATURE ENGINEERING
localhost:8888/notebooks/Week-5.ipynb# 4/9
8/5/24, 11:16 AM Week-5 - Jupyter Notebook
Feature Construction
In [29]: splitted=df['Date'].str.split('-',expand=True)
In [30]: df['Day']=splitted[2].astype('int')
df['Month']=splitted[1].astype('int')
df['Year']=splitted[0].astype('int')
In [31]: df.drop('Date',axis=1,inplace=True)
In [32]: df.head()
Out[32]:
Open High Low Close Volume Day Month Year
0 19.000000 25.00 17.540001 23.889999 18766300 29 6 2010
1 25.790001 30.42 23.299999 23.830000 17187100 30 6 2010
2 25.000000 25.92 20.270000 21.959999 8218800 1 7 2010
3 23.000000 23.10 18.709999 19.200001 5139800 2 7 2010
4 20.000000 20.00 15.830000 16.110001 6866900 6 7 2010
is_quarter_end is 1 when Month is 3, 6, 9 or 12 (a quarter-end month) and 0 otherwise.
In [33]: df['is_quarter_end']=np.where(df['Month']%3==0,1,0)
In [34]: df.head()
Out[34]:
Open High Low Close Volume Day Month Year is_quarter_end
0 19.000000 25.00 17.540001 23.889999 18766300 29 6 2010 1
1 25.790001 30.42 23.299999 23.830000 17187100 30 6 2010 1
2 25.000000 25.92 20.270000 21.959999 8218800 1 7 2010 0
3 23.000000 23.10 18.709999 19.200001 5139800 2 7 2010 0
4 20.000000 20.00 15.830000 16.110001 6866900 6 7 2010 0
In [39]: df.grouped=df.groupby('Year').mean() #GROUPING BY YEAR
localhost:8888/notebooks/Week-5.ipynb# 5/9
8/5/24, 11:16 AM Week-5 - Jupyter Notebook
In [40]: plt.subplots(figsize=(20,10))
# Bar chart of the yearly mean of each price column, one panel per column.
# The means are computed in this cell so it does not depend on a variable
# left behind by another cell.
yearly_means=df.groupby('Year')[['Open','High','Low','Close']].mean()
for i,col in enumerate(yearly_means.columns):
    plt.subplot(2,2,i+1)
    yearly_means[col].plot.bar()
    plt.title(col)
plt.show()
In [42]: df.groupby('is_quarter_end').mean()
Out[42]:
Open High Low Close Volume Day Month
is_quarter_end
0 185.875081 189.254226 182.449499 186.085081 5.767062e+06 15.710396 6.173886 2014.
1 187.071200 190.232700 183.860262 187.047163 5.180154e+06 15.825000 7.597500 2014.
In [43]: df['open-close']=df['Open']-df['Close']
# Intraday range: High minus Low.
df['high-low']=df['High']-df['Low']
# Label is 1 when the next row's Close is higher than this row's Close, else 0.
# NOTE(review): shift(-1) yields NaN for the final row, so the comparison is
# False there and the last row is labelled 0 without a known next close —
# consider dropping that row before training.
df['target']=np.where(df['Close'].shift(-1) > df['Close'],1,0)
localhost:8888/notebooks/Week-5.ipynb# 6/9
8/5/24, 11:16 AM Week-5 - Jupyter Notebook
In [44]: plt.figure(figsize=(10,10))
sns.heatmap(df.corr()>0.9,annot=True,cbar=False)
plt.show()
localhost:8888/notebooks/Week-5.ipynb# 7/9
8/5/24, 11:16 AM Week-5 - Jupyter Notebook
In [45]: plt.pie(df['target'].value_counts().values,labels=[0,1],autopct='%1.1f%%')
plt.show()
In [46]: features=df[['open-close','high-low','is_quarter_end']]
target=df['target']
In [47]: scaler=StandardScaler()
# Standardise the selected columns; rebinds `features` to the array returned by fit_transform.
# NOTE(review): the scaler is fitted on every row here and the train/test split
# happens in a later cell, so test-set statistics influence the scaling —
# consider splitting first and fitting on the training rows only.
features=scaler.fit_transform(features)
In [48]: #SPLIT DATASET
x_train,x_test,y_train,y_test=train_test_split(features,target,test_size=0.1,random_st
print(x_test.shape,x_train.shape)
(242, 3) (2174, 3)
In [ ]:
localhost:8888/notebooks/Week-5.ipynb# 8/9
8/5/24, 11:16 AM Week-5 - Jupyter Notebook
In [50]: #MODEL DEVELOPMENT & EVALUATION
class CustomXGBClassifier(XGBClassifier):
    """XGBClassifier subclass whose repr is the short string "XGBClassifier"."""

    def __repr__(self):
        # print(model) in the evaluation loop then emits just this name.
        return "XGBClassifier"
models=[LogisticRegression(),SVC(kernel='poly',probability=True),CustomXGBClassifier(
for model in models:
model.fit(x_train,y_train)
training_accuracy=metrics.roc_auc_score(y_train,model.predict_proba(x_train)[:,1]
validation_accuracy=metrics.roc_auc_score(y_test,model.predict_proba(x_test)[:,1]
print(model)
print("Training Accuracy:",training_accuracy)
print("Validation Accuracy:",validation_accuracy)
LogisticRegression()
Training Accuracy: 0.5228802330060918
Validation Accuracy: 0.4923371647509579
SVC(kernel='poly', probability=True)
Training Accuracy: 0.4704775693536028
Validation Accuracy: 0.5374247400109469
XGBClassifier
Training Accuracy: 0.943461732220797
Validation Accuracy: 0.4487889983579639
In [ ]:
localhost:8888/notebooks/Week-5.ipynb# 9/9