
1. Create NumPy arrays from Python Data Structures, Intrinsic NumPy objects and Random Functions.

import numpy as np

# From Python data structures
list_data = np.array([1, 2, 3, 4, 5])
tuple_data = np.array((1, 2, 3, 4, 5))
dict_data = np.array(list({'a': 1, 'b': 2}.values()))  # use the dict's values; passing the dict itself yields a 0-d object array

# Intrinsic NumPy objects
zeros = np.zeros((3, 3))
ones = np.ones((2, 2))
identity = np.eye(4)
arange = np.arange(0, 10, 2)
linspace = np.linspace(0, 1, 5)

# Random functions
rand = np.random.rand(3, 3)
randint = np.random.randint(1, 10, size=(2, 3))
normal = np.random.normal(0, 1, 5)

# Output
print("From Python Data Structures:")
print("List to Array:", list_data)
print("Tuple to Array:", tuple_data)
print("Dict to Array:", dict_data)

print("\nIntrinsic NumPy Objects:")
print("Zeros Array:", zeros)
print("Ones Array:", ones)
print("Identity Matrix:", identity)
print("Arange:", arange)
print("Linspace:", linspace)

print("\nRandom Functions:")
print("Random Array (Uniform):", rand)
print("Random Integers:", randint)
print("Random Normal Distribution:", normal)

2. Create Pandas Series and DataFrame from various inputs.

import pandas as pd
import numpy as np

# Pandas Series from different inputs
series_from_list = pd.Series([1, 2, 3, 4, 5])
series_from_dict = pd.Series({'a': 1, 'b': 2, 'c': 3})
series_from_numpy = pd.Series(np.array([10, 20, 30, 40]))

# Pandas DataFrame from various inputs
df_from_dict = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
df_from_list = pd.DataFrame([[1, 2], [3, 4], [5, 6]], columns=['X', 'Y'])
df_from_numpy = pd.DataFrame(np.random.rand(3, 4), columns=['W', 'X', 'Y', 'Z'])

# Outputs
print("Series from List:", series_from_list)
print("Series from Dict:", series_from_dict)
print("Series from Numpy Array:", series_from_numpy)
print("\nDataFrame from Dict:")
print(df_from_dict)
print("\nDataFrame from List:")
print(df_from_list)
print("\nDataFrame from Numpy Array:")
print(df_from_numpy)
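A DataFrame can also be built from a list of dicts, where each dict becomes one row; a small sketch with illustrative column names and values:

import pandas as pd

# Each dict becomes one row; keys missing from a dict become NaN in that row
records = [{'name': 'a', 'score': 1}, {'name': 'b', 'score': 2}, {'name': 'c'}]
df_from_records = pd.DataFrame(records)
print(df_from_records)
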
3, 4. Develop a simple linear regression model and perform residual analysis.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from sklearn.datasets import fetch_california_housing  # California housing dataset
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Load the California housing dataset
california = fetch_california_housing()
X = california.data[:, 2].reshape(-1, 1)  # column 2 is 'AveRooms' (average rooms per household)
y = california.target

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the simple linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict the target values for the test set
y_pred = model.predict(X_test)

# Calculate residuals
residuals = y_test - y_pred

# Residual analysis
# 1. Residuals vs. fitted values (predictions)
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
plt.scatter(y_pred, residuals, color='blue')
plt.axhline(y=0, color='red', linestyle='--')
plt.title('Residuals vs Fitted Values')
plt.xlabel('Fitted Values')
plt.ylabel('Residuals')

# 2. Histogram of residuals
plt.subplot(1, 2, 2)
sns.histplot(residuals, kde=True, color='green')
plt.title('Histogram of Residuals')
plt.xlabel('Residuals')
plt.ylabel('Frequency')
plt.tight_layout()
plt.show()

# 3. Checking normality with a Q-Q plot
plt.figure(figsize=(6, 6))
stats.probplot(residuals, dist="norm", plot=plt)
plt.title('Q-Q Plot of Residuals')
plt.show()

# 4. Checking homoscedasticity: residuals vs fitted values (already shown above)

# 5. Model performance metrics
print(f"Mean Squared Error: {mean_squared_error(y_test, y_pred)}")
print(f"R-squared: {r2_score(y_test, y_pred)}")

5. Import any CSV file to a Pandas DataFrame and perform the following:

(a) Handle missing data by detecting and dropping/filling missing values.

(b) Transform data using the apply() and map() methods.


import pandas as pd
import numpy as np

file_path = r'C:\Users\peral\Downloads\test.csv'  # Use a raw string for Windows paths

# Load the CSV file
df = pd.read_csv(file_path)
print(df.head())

# (a) Handle missing data
print("\nMissing Data Detection:")
print(df.isnull().sum())

# Dropping rows with missing values
df_dropped = df.dropna()
print("\nDataFrame after Dropping Rows with Missing Values:")
print(df_dropped.head())

# Filling missing values
df_filled = df.copy()
for column in df.columns:
    if df[column].dtype == 'object':  # Categorical columns: fill with the mode
        df_filled[column] = df[column].fillna(df[column].mode()[0])
    else:  # Numerical columns: fill with the mean
        df_filled[column] = df[column].fillna(df[column].mean())

print("\nDataFrame after Filling Missing Values:")
print(df_filled.head())

# (b) Transform data using apply() and map()
# apply(): bin numerical columns into 'High'/'Low' around the median
for column in df.columns:
    if df[column].dtype != 'object':
        df[column + ' Group'] = df[column].apply(lambda x: 'High' if x > df[column].median() else 'Low')
        print(f"\nDataFrame after Applying Transformation to {column}:")
        print(df[[column, column + ' Group']].head())

# map(): convert string values in categorical columns to uppercase
for column in df.select_dtypes(include=['object']).columns:
    df[column] = df[column].map(lambda x: x.upper() if isinstance(x, str) else x)
    print(f"\nDataFrame after Mapping {column} to Uppercase:")
    print(df[[column]].head())
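map() also accepts a dict instead of a function, which is convenient for recoding labels; a standalone sketch with illustrative values:

import pandas as pd

s = pd.Series(['yes', 'no', 'yes'])
print(s.map({'yes': 1, 'no': 0}))  # dict-based mapping: 'yes' -> 1, 'no' -> 0, unmatched values become NaN
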
6. Visualize data using Line Plots, Bar Plots, Histograms, Density Plots and Scatter Plots.

import seaborn as sns
import matplotlib.pyplot as plt

# Load the built-in Iris dataset
df = sns.load_dataset('iris')

# Line plot
plt.figure(figsize=(10, 6))
sns.lineplot(x=df.index, y=df['sepal_length'])
plt.title('Line Plot of Sepal Length')
plt.show()

# Bar plot
plt.figure(figsize=(10, 6))
sns.barplot(x='species', y='sepal_length', data=df)
plt.title('Bar Plot of Sepal Length by Species')
plt.show()

# Histogram
plt.figure(figsize=(10, 6))
sns.histplot(df['sepal_length'], kde=False, bins=20)
plt.title('Histogram of Sepal Length')
plt.show()

# Density plot (fill=True replaces the deprecated shade=True)
plt.figure(figsize=(10, 6))
sns.kdeplot(df['sepal_length'], fill=True)
plt.title('Density Plot of Sepal Length')
plt.show()

# Scatter plot
plt.figure(figsize=(10, 6))
sns.scatterplot(x='sepal_length', y='sepal_width', hue='species', data=df)
plt.title('Scatter Plot of Sepal Length vs Sepal Width')
plt.show()

7. Manipulation of NumPy arrays - Indexing, Slicing, Reshaping, Joining and Splitting.

import numpy as np

# Indexing
arr = np.array([1, 2, 3, 4, 5])
print(arr[2])

# Slicing
arr2 = np.array([10, 20, 30, 40, 50, 60])
print(arr2[1:4])

# Reshaping
arr3 = np.array([1, 2, 3, 4, 5, 6])
print(arr3.reshape(2, 3))

# Joining
arr4 = np.array([1, 2, 3])
arr5 = np.array([4, 5, 6])
print(np.concatenate((arr4, arr5)))

# Splitting
arr6 = np.array([10, 20, 30, 40, 50, 60])
print(np.split(arr6, 3))
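The same joining and splitting operations extend to two-dimensional arrays; a minimal sketch with illustrative values:

import numpy as np

a = np.array([[1, 2], [3, 4]])
b = np.array([[5, 6], [7, 8]])
print(np.vstack((a, b)))                # stack rows    -> shape (4, 2)
print(np.hstack((a, b)))                # stack columns -> shape (2, 4)
print(np.hsplit(np.hstack((a, b)), 2))  # split back into two (2, 2) arrays
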
8. Import any CSV file to a Pandas DataFrame and perform the following:

(a) Visualize the first and last 10 records.

(b) Get the shape, index and column details.

import pandas as pd

# Import the CSV file
df = pd.read_csv(r"C:\Users\peral\Downloads\train.csv")  # Make sure the path is correct

# (a) Visualize the first and last 10 records
print("First 10 records:")
print(df.head(10))
print("\nLast 10 records:")
print(df.tail(10))

# (b) Get the shape, index and column details
print("\nShape:", df.shape)
print("Index:", df.index)
print("Columns:", df.columns)

9. Residual plots of linear regression.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Load data
df = pd.read_csv(r'C:\Users\peral\Downloads\train.csv')

# Choose the feature and target columns
X = df[['age']].values   # 'age' as the feature (replace with any other numerical column you prefer)
y = df['stroke'].values  # 'stroke' as the target; since it is a binary label, the residuals will form two bands

# Train the linear regression model
model = LinearRegression()
model.fit(X, y)

# Predictions
y_pred = model.predict(X)

# Residuals
residuals = y - y_pred

# Plot residuals
plt.scatter(X, residuals)
plt.axhline(y=0, color='r', linestyle='--')
plt.xlabel('Age')
plt.ylabel('Residuals')
plt.title('Residual Plot')
plt.show()
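Seaborn can produce the same kind of plot in one call by fitting the regression internally; a sketch assuming the same df as above:

import seaborn as sns
import matplotlib.pyplot as plt

sns.residplot(x=df['age'], y=df['stroke'])  # fits y ~ x internally and plots the residuals
plt.title('Residual Plot (seaborn)')
plt.show()
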

10. Computation on NumPy arrays using Universal Functions and Mathematical methods.

import numpy as np

# Create array
arr = np.array([1, 2, 3, 4, 5])

# Universal functions (ufuncs)
print(np.sqrt(arr))
print(np.exp(arr))
print(np.log(arr))

# Mathematical methods
print(np.sum(arr))
print(np.mean(arr))
print(np.median(arr))
print(np.std(arr))
print(np.var(arr))
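The same reductions accept an axis argument on multi-dimensional arrays; a short sketch with an illustrative 2x3 array:

import numpy as np

m = np.array([[1, 2, 3], [4, 5, 6]])
print(np.sum(m, axis=0))   # column sums -> [5 7 9]
print(np.mean(m, axis=1))  # row means   -> [2. 5.]
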
11. Import any CSV file to a Pandas DataFrame and perform the following:

(a) Detect and filter outliers.

(b) Perform vectorized string operations on a Pandas Series.

import pandas as pd
import numpy as np

df = pd.read_csv(r'C:\Users\peral\Downloads\train.csv')

# (a) Detect and filter outliers using the IQR rule on the 'age' column
column_name = 'age'
Q1 = df[column_name].quantile(0.25)
Q3 = df[column_name].quantile(0.75)
IQR = Q3 - Q1
filtered_df = df[(df[column_name] >= (Q1 - 1.5 * IQR)) & (df[column_name] <= (Q3 + 1.5 * IQR))]
print("Filtered DataFrame (Outliers removed):")
print(filtered_df)
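To make the detection step explicit, the same IQR bounds can be used to flag and count the outlying rows before filtering; a small sketch reusing the variables above:

outlier_mask = (df[column_name] < (Q1 - 1.5 * IQR)) | (df[column_name] > (Q3 + 1.5 * IQR))
print("Number of outliers detected:", outlier_mask.sum())
print(df.loc[outlier_mask, [column_name]].head())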

# (b) Vectorized string operations on a Pandas Series
string_column = 'gender'
df[string_column] = df[string_column].str.lower()
df[string_column] = df[string_column].str.replace('old', 'new')
print("\nDataFrame after string operations:")
print(df[[string_column]].head())

12. Download the House Pricing dataset from Kaggle and map the values to aesthetic labels.

import pandas as pd

file_path = 'path_to_your_downloaded_folder/train.csv'
df = pd.read_csv(file_path)

# Map coded quality values to readable aesthetic labels
aesthetic_mapping = {
    'ExterCond': {'Po': 'Poor', 'Fa': 'Fair', 'TA': 'Average', 'Gd': 'Good', 'Ex': 'Excellent'},
    'ExterQual': {'Po': 'Poor', 'Fa': 'Fair', 'TA': 'Average', 'Gd': 'Good', 'Ex': 'Excellent'},
    'PoolQC': {'NA': 'No Pool', 'Ex': 'Excellent', 'Gd': 'Good', 'TA': 'Average', 'Fa': 'Fair'},
}

for column, mapping in aesthetic_mapping.items():
    if column in df.columns:
        df[column] = df[column].map(mapping).fillna(df[column])

print(df[['ExterCond', 'ExterQual', 'PoolQC']].head())
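As an optional check that the mapping was applied, value_counts() can list the resulting labels:

print(df['ExterQual'].value_counts(dropna=False))  # should now show labels such as 'Good' and 'Average'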
