[go: up one dir, main page]

0% found this document useful (0 votes)
83 views9 pages

Week-5 - Jupyter Notebook

Uploaded by

pramidibalu2005
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
83 views9 pages

Week-5 - Jupyter Notebook

Uploaded by

pramidibalu2005
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 9

8/5/24, 11:16 AM Week-5 - Jupyter Notebook

In [1]: import sklearn


import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [4]: from sklearn.model_selection import train_test_split


from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn import metrics

In [5]: import warnings

In [6]: warnings.filterwarnings('ignore')

In [11]: #LOAD DATASET

In [7]: df=pd.read_csv('TSLA.csv')

In [9]: df.head()

Out[9]:
Date Open High Low Close Adj Close Volume

0 2010-06-29 19.000000 25.00 17.540001 23.889999 23.889999 18766300

1 2010-06-30 25.790001 30.42 23.299999 23.830000 23.830000 17187100

2 2010-07-01 25.000000 25.92 20.270000 21.959999 21.959999 8218800

3 2010-07-02 23.000000 23.10 18.709999 19.200001 19.200001 5139800

4 2010-07-06 20.000000 20.00 15.830000 16.110001 16.110001 6866900

In [10]: df.tail()

Out[10]:
Date Open High Low Close Adj Close Volume

2411 2020-01-28 568.489990 576.809998 558.080017 566.900024 566.900024 11788500

2412 2020-01-29 575.690002 589.799988 567.429993 580.989990 580.989990 17801500

2413 2020-01-30 632.419983 650.880005 618.000000 640.809998 640.809998 29005700

2414 2020-01-31 640.000000 653.000000 632.520020 650.570007 650.570007 15719300

2415 2020-02-03 673.690002 786.140015 673.520020 780.000000 780.000000 47065000

In [12]: #EXPLORE dimensions


print('number of data columns:',df.shape[1],'\nnumber of data rows:',df.shape[0])

number of data columns: 7


number of data rows: 2416

localhost:8888/notebooks/Week-5.ipynb# 1/9
8/5/24, 11:16 AM Week-5 - Jupyter Notebook

In [13]: df.describe()

Out[13]:
Open High Low Close Adj Close Volume

count 2416.000000 2416.000000 2416.000000 2416.000000 2416.000000 2.416000e+03

mean 186.271147 189.578224 182.916639 186.403651 186.403651 5.572722e+06

std 118.740163 120.892329 116.857591 119.136020 119.136020 4.987809e+06

min 16.139999 16.629999 14.980000 15.800000 15.800000 1.185000e+05

25% 34.342498 34.897501 33.587501 34.400002 34.400002 1.899275e+06

50% 213.035004 216.745002 208.870002 212.960007 212.960007 4.578400e+06

75% 266.450012 270.927513 262.102501 266.774994 266.774994 7.361150e+06

max 673.690002 786.140015 673.520020 780.000000 780.000000 4.706500e+07

In [14]: df.info() #Summary of dataframe

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2416 entries, 0 to 2415
Data columns (total 7 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Date 2416 non-null object
1 Open 2416 non-null float64
2 High 2416 non-null float64
3 Low 2416 non-null float64
4 Close 2416 non-null float64
5 Adj Close 2416 non-null float64
6 Volume 2416 non-null int64
dtypes: float64(5), int64(1), object(1)
memory usage: 132.2+ KB

In [15]: df['date']=pd.to_datetime(df.Date)

In [16]: df.date.dtype

Out[16]: dtype('<M8[ns]')

EXPLORATORY DATA ANALYSIS

localhost:8888/notebooks/Week-5.ipynb# 2/9
8/5/24, 11:16 AM Week-5 - Jupyter Notebook

In [17]: plt.figure(figsize=(15,5))
sns.lineplot(data=df,x='date',y='Close')
plt.title('Tesla Close Price',fontsize=15)
plt.ylabel('Price in dollars')
plt.show()

In [20]: #Check whether Close and Adj Close are identical in every row

In [19]: df[df['Close']==df['Adj Close']].shape

Out[19]: (2416, 8)

In [21]: df.drop(['Adj Close','date'],axis=1,inplace=True)

In [23]: df.head()

Out[23]:
Date Open High Low Close Volume

0 2010-06-29 19.000000 25.00 17.540001 23.889999 18766300

1 2010-06-30 25.790001 30.42 23.299999 23.830000 17187100

2 2010-07-01 25.000000 25.92 20.270000 21.959999 8218800

3 2010-07-02 23.000000 23.10 18.709999 19.200001 5139800

4 2010-07-06 20.000000 20.00 15.830000 16.110001 6866900

In [24]: df.isnull().sum()

Out[24]: Date 0
Open 0
High 0
Low 0
Close 0
Volume 0
dtype: int64

localhost:8888/notebooks/Week-5.ipynb# 3/9
8/5/24, 11:16 AM Week-5 - Jupyter Notebook

In [26]: features=['Open','High','Low','Close','Volume']
plt.subplots(figsize=(20,10))
# Draw one distribution panel per feature column on a 2x3 grid.
for i,col in enumerate(features):
    plt.subplot(2,3,i+1)
    # sns.distplot is deprecated; a density-scaled histogram with a KDE
    # overlay is the documented replacement for its default output.
    sns.histplot(df[col],kde=True,stat='density',kde_kws=dict(cut=3))
plt.show()

In [28]: #For outliers

In [27]: plt.subplots(figsize=(20,10))
# Draw one box plot per feature column on a 2x3 grid to expose outliers.
for i,col in enumerate(features):
    plt.subplot(2,3,i+1)
    # Pass the series as x= explicitly: a bare positional argument is read as
    # `x` by older seaborn releases and as `data` by newer ones.
    sns.boxplot(x=df[col])
plt.show()

FEATURE ENGINEERING

localhost:8888/notebooks/Week-5.ipynb# 4/9
8/5/24, 11:16 AM Week-5 - Jupyter Notebook

Feature Construction

In [29]: splitted=df['Date'].str.split('-',expand=True)

In [30]: df['Day']=splitted[2].astype('int')
df['Month']=splitted[1].astype('int')
df['Year']=splitted[0].astype('int')

In [31]: df.drop('Date',axis=1,inplace=True)

In [32]: df.head()

Out[32]:
Open High Low Close Volume Day Month Year

0 19.000000 25.00 17.540001 23.889999 18766300 29 6 2010

1 25.790001 30.42 23.299999 23.830000 17187100 30 6 2010

2 25.000000 25.92 20.270000 21.959999 8218800 1 7 2010

3 23.000000 23.10 18.709999 19.200001 5139800 2 7 2010

4 20.000000 20.00 15.830000 16.110001 6866900 6 7 2010

is_quarter_end is 1 when Month is 3, 6, 9 or 12 (i.e. divisible by 3), else 0

In [33]: df['is_quarter_end']=np.where(df['Month']%3==0,1,0)

In [34]: df.head()

Out[34]:
Open High Low Close Volume Day Month Year is_quarter_end

0 19.000000 25.00 17.540001 23.889999 18766300 29 6 2010 1

1 25.790001 30.42 23.299999 23.830000 17187100 30 6 2010 1

2 25.000000 25.92 20.270000 21.959999 8218800 1 7 2010 0

3 23.000000 23.10 18.709999 19.200001 5139800 2 7 2010 0

4 20.000000 20.00 15.830000 16.110001 6866900 6 7 2010 0

In [39]: data_grouped=df.groupby('Year').mean() #GROUPING BY YEAR (plain variable: the plotting cell reads data_grouped)

localhost:8888/notebooks/Week-5.ipynb# 5/9
8/5/24, 11:16 AM Week-5 - Jupyter Notebook

In [40]: plt.subplots(figsize=(20,10))
# One bar chart per price column, laid out on a 2x2 grid.
for i,col in enumerate(['Open','High','Low','Close']):
    plt.subplot(2,2,i+1)
    data_grouped[col].plot.bar()
    plt.title(col)  # label each panel so the figure can be read on its own
plt.show()

In [42]: df.groupby('is_quarter_end').mean()

Out[42]:
Open High Low Close Volume Day Month Year

is_quarter_end

0 185.875081 189.254226 182.449499 186.085081 5.767062e+06 15.710396 6.173886 2014.

1 187.071200 190.232700 183.860262 187.047163 5.180154e+06 15.825000 7.597500 2014.

In [43]: df['open-close']=df['Open']-df['Close']
df['high-low']=df['High']-df['Low']
# Label is 1 when the next row's Close is higher than this row's Close, else 0.
# shift(-1) leaves NaN in the final row and NaN > x evaluates to False, so the
# last row is always labelled 0 regardless of the (unknown) next price.
df['target']=np.where(df['Close'].shift(-1) > df['Close'],1,0)

localhost:8888/notebooks/Week-5.ipynb# 6/9
8/5/24, 11:16 AM Week-5 - Jupyter Notebook

In [44]: plt.figure(figsize=(10,10))
sns.heatmap(df.corr()>0.9,annot=True,cbar=False)
plt.show()

localhost:8888/notebooks/Week-5.ipynb# 7/9
8/5/24, 11:16 AM Week-5 - Jupyter Notebook

In [45]: plt.pie(df['target'].value_counts().values,labels=[0,1],autopct='%1.1f%%')
plt.show()

In [46]: features=df[['open-close','high-low','is_quarter_end']]
target=df['target']

In [47]: scaler=StandardScaler()
features=scaler.fit_transform(features)

In [48]: #SPLIT DATASET


x_train,x_test,y_train,y_test=train_test_split(features,target,test_size=0.1,random_st
print(x_test.shape,x_train.shape)

(242, 3) (2174, 3)

In [ ]: ​

localhost:8888/notebooks/Week-5.ipynb# 8/9
8/5/24, 11:16 AM Week-5 - Jupyter Notebook

In [50]: #MODEL DEVELOPMENT & EVALUATION


class CustomXGBClassifier(XGBClassifier):
    """XGBClassifier whose printed representation is just its short name."""

    def __repr__(self):
        """Return the fixed string "XGBClassifier" instead of the default repr."""
        return "XGBClassifier"
models=[LogisticRegression(),SVC(kernel='poly',probability=True),CustomXGBClassifier(
for model in models:
model.fit(x_train,y_train)
training_accuracy=metrics.roc_auc_score(y_train,model.predict_proba(x_train)[:,1]
validation_accuracy=metrics.roc_auc_score(y_test,model.predict_proba(x_test)[:,1]
print(model)
print("Training Accuracy:",training_accuracy)
print("Validation Accuracy:",validation_accuracy)

LogisticRegression()
Training Accuracy: 0.5228802330060918
Validation Accuracy: 0.4923371647509579
SVC(kernel='poly', probability=True)
Training Accuracy: 0.4704775693536028
Validation Accuracy: 0.5374247400109469
XGBClassifier
Training Accuracy: 0.943461732220797
Validation Accuracy: 0.4487889983579639

In [ ]: ​

localhost:8888/notebooks/Week-5.ipynb# 9/9

You might also like