
1. Create NumPy arrays from Python Data Structures, Intrinsic NumPy objects and Random Functions.

import numpy as np

# From Python Data Structures

list_data = np.array([1, 2, 3, 4, 5])

tuple_data = np.array((1, 2, 3, 4, 5))

dict_data = np.array(list({'a': 1, 'b': 2}.values()))  # take the dict's values; np.array(dict) would give a 0-d object array

# Intrinsic NumPy Objects

zeros = np.zeros((3, 3))

ones = np.ones((2, 2))

identity = np.eye(4)

arange = np.arange(0, 10, 2)

linspace = np.linspace(0, 1, 5)

# Random Functions

rand = np.random.rand(3, 3)

randint = np.random.randint(1, 10, size=(2, 3))

normal = np.random.normal(0, 1, 5)

# Output

print("From Python Data Structures:")

print("List to Array:", list_data)

print("Tuple to Array:", tuple_data)

print("Dict to Array:", dict_data)

print("\nIntrinsic NumPy Objects:")

print("Zeros Array:", zeros)

print("Ones Array:", ones)

print("Identity Matrix:", identity)

print("Arange:", arange)

print("Linspace:", linspace)
print("\nRandom Functions:")

print("Random Array (Uniform):", rand)

print("Random Integers:", randint)

print("Random Normal Distribution:", normal)

2. Create Pandas Series and DataFrame from various inputs.

import pandas as pd

import numpy as np

# Pandas Series from different inputs

series_from_list = pd.Series([1, 2, 3, 4, 5])

series_from_dict = pd.Series({'a': 1, 'b': 2, 'c': 3})

series_from_numpy = pd.Series(np.array([10, 20, 30, 40]))

# Pandas DataFrame from various inputs

df_from_dict = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})

df_from_list = pd.DataFrame([[1, 2], [3, 4], [5, 6]], columns=['X', 'Y'])

df_from_numpy = pd.DataFrame(np.random.rand(3, 4), columns=['W', 'X', 'Y', 'Z'])

# Outputs

print("Series from List:", series_from_list)

print("Series from Dict:", series_from_dict)

print("Series from Numpy Array:", series_from_numpy)

print("\nDataFrame from Dict:")

print(df_from_dict)

print("\nDataFrame from List:")

print(df_from_list)

print("\nDataFrame from Numpy Array:")

print(df_from_numpy)
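
Two further constructions that may be worth knowing (a sketch, not part of the original exercise): a DataFrame built from a list of dicts, and a Series with an explicit index.

import pandas as pd

# DataFrame from a list of dicts (keys become column names)
df_from_records = pd.DataFrame([{'A': 1, 'B': 2}, {'A': 3, 'B': 4}])

# Series with a custom index
series_with_index = pd.Series([100, 200, 300], index=['x', 'y', 'z'])

print(df_from_records)
print(series_with_index)
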
3, 4. Develop a simple linear regression model and perform residual analysis.

import numpy as np

import pandas as pd

import matplotlib.pyplot as plt

import seaborn as sns

from sklearn.datasets import fetch_california_housing # Use California housing dataset

from sklearn.linear_model import LinearRegression

from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_squared_error, r2_score

# Load California housing dataset

california = fetch_california_housing()

X = california.data[:, 2].reshape(-1, 1) # 'AveRooms' (average rooms per household) is column 2

y = california.target

# Split the data into training and testing sets

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the simple linear regression model

model = LinearRegression()

model.fit(X_train, y_train)

# Predict the target values using the test set

y_pred = model.predict(X_test)

# Calculate residuals

residuals = y_test - y_pred

# Residual Analysis

# 1. Plotting residuals vs. fitted values (predictions)

plt.figure(figsize=(12, 6))

plt.subplot(1, 2, 1)

plt.scatter(y_pred, residuals, color='blue')

plt.axhline(y=0, color='red', linestyle='--')

plt.title('Residuals vs Fitted Values')


plt.xlabel('Fitted Values')

plt.ylabel('Residuals')

# 2. Plotting histogram of residuals

plt.subplot(1, 2, 2)

sns.histplot(residuals, kde=True, color='green')

plt.title('Histogram of Residuals')

plt.xlabel('Residuals')

plt.ylabel('Frequency')

plt.tight_layout()

plt.show()

# 3. Checking Normality with a Q-Q plot

import scipy.stats as stats

plt.figure(figsize=(6, 6))

stats.probplot(residuals, dist="norm", plot=plt)

plt.title('Q-Q Plot of Residuals')

plt.show()

# 4. Checking Homoscedasticity: Residuals vs Fitted values (already shown above)

# 5. Print model performance metrics

print(f"Mean Squared Error: {mean_squared_error(y_test, y_pred)}")

print(f"R-squared: {r2_score(y_test, y_pred)}")

5. Import any CSV file to Pandas DataFrame and perform the following:

(a) Handle missing data by detecting and dropping/filling missing values.

(b) Transform data using the apply() and map() methods.


import pandas as pd

import numpy as np

file_path = r'C:\Users\peral\Downloads\test.csv' # Use raw string for Windows paths

# Load the CSV file

df = pd.read_csv(file_path)
print(df.head())

# (a) Handle Missing Data

print("\nMissing Data Detection:")

print(df.isnull().sum())

# Dropping rows with missing values

df_dropped = df.dropna()

print("\nDataFrame after Dropping Rows with Missing Values:")

print(df_dropped.head())

# Filling missing values
df_filled = df.copy()
for column in df.columns:
    if df[column].dtype == 'object':  # categorical columns: fill with the mode
        df_filled[column] = df[column].fillna(df[column].mode()[0])
    else:  # numerical columns: fill with the mean
        df_filled[column] = df[column].fillna(df[column].mean())

print("\nDataFrame after Filling Missing Values:")

print(df_filled.head())
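
The same per-column filling can also be written more compactly without an explicit loop; a sketch under the same assumptions about the columns in test.csv:

# Fill numeric columns with their mean in one call, then fill the remaining
# (categorical) gaps with each column's mode
df_compact = df.fillna(df.mean(numeric_only=True))
for col in df_compact.select_dtypes(include='object').columns:
    df_compact[col] = df_compact[col].fillna(df_compact[col].mode()[0])

print("\nCompact fill (same result as above):")
print(df_compact.head())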

# (b) Transform Data using apply() and map()

# Apply transformation for numerical columns

for column in df.columns:
    if df[column].dtype != 'object':  # numerical columns only
        median_value = df[column].median()
        df[column + ' Group'] = df[column].apply(lambda x: 'High' if x > median_value else 'Low')
        print(f"\nDataFrame after Applying Transformation to {column}:")
        print(df[[column, column + ' Group']].head())

# Map categorical columns to uppercase

for column in df.select_dtypes(include=['object']).columns:
    df[column] = df[column].map(lambda x: x.upper() if isinstance(x, str) else x)
    print(f"\nDataFrame after Mapping {column} to Uppercase:")
    print(df[[column]].head())
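
The same uppercase transformation can also be written with the vectorized .str accessor instead of map(); a brief sketch continuing from the df above:

for column in df.select_dtypes(include=['object']).columns:
    df[column] = df[column].str.upper()  # NaN values pass through unchanged

print("\nObject columns after .str.upper():")
print(df.select_dtypes(include=['object']).head())
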
6. Visualize data using Line Plots, Bar Plots, Histograms, Density Plots and Scatter Plots.

import seaborn as sns

import matplotlib.pyplot as plt

# Load the built-in Iris dataset

df = sns.load_dataset('iris')

# Line Plot

plt.figure(figsize=(10, 6))

sns.lineplot(x=df.index, y=df['sepal_length'])

plt.title('Line Plot of Sepal Length')

plt.show()

# Bar Plot

plt.figure(figsize=(10, 6))

sns.barplot(x='species', y='sepal_length', data=df)

plt.title('Bar Plot of Sepal Length by Species')

plt.show()

# Histogram

plt.figure(figsize=(10, 6))

sns.histplot(df['sepal_length'], kde=False, bins=20)

plt.title('Histogram of Sepal Length')

plt.show()

# Density Plot (fill=True replaces the deprecated shade=True)

plt.figure(figsize=(10, 6))

sns.kdeplot(df['sepal_length'], fill=True)

plt.title('Density Plot of Sepal Length')

plt.show()
# Scatter Plot

plt.figure(figsize=(10, 6))

sns.scatterplot(x='sepal_length', y='sepal_width', hue='species', data=df)

plt.title('Scatter Plot of Sepal Length vs Sepal Width')

plt.show()
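
As an optional extra, seaborn's pairplot draws every pairwise scatter plot plus per-feature distributions in one figure; a minimal sketch using the same Iris DataFrame and imports as above:

sns.pairplot(df, hue='species')
plt.suptitle('Pairwise Plots of Iris Features', y=1.02)
plt.show()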

7. Manipulation of NumPy arrays - Indexing, Slicing, Reshaping, Joining and Splitting.

import numpy as np

# Indexing

arr = np.array([1, 2, 3, 4, 5])

print(arr[2])

# Slicing

arr2 = np.array([10, 20, 30, 40, 50, 60])

print(arr2[1:4])

# Reshaping

arr3 = np.array([1, 2, 3, 4, 5, 6])

print(arr3.reshape(2, 3))

# Joining

arr4 = np.array([1, 2, 3])

arr5 = np.array([4, 5, 6])

print(np.concatenate((arr4, arr5)))

# Splitting

arr6 = np.array([10, 20, 30, 40, 50, 60])

print(np.split(arr6, 3))
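
The same operations carry over to 2-D arrays; a short sketch (array values chosen only for illustration):

import numpy as np

mat = np.arange(1, 13).reshape(3, 4)  # 3x4 matrix containing 1..12

print(mat[1, 2])              # indexing: row 1, column 2
print(mat[:, 1:3])            # slicing: all rows, columns 1-2
print(mat.reshape(4, 3))      # reshaping to 4x3
print(np.vstack((mat, mat)))  # joining two arrays vertically
print(np.hsplit(mat, 2))      # splitting into two 3x2 blocks
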
8. Import any CSV file to Pandas DataFrame and perform the following:

(a) Visualize the first and last 10 records.

(b) Get the shape, index and column details.

import pandas as pd

# Import CSV file

df = pd.read_csv(r"C:\Users\peral\Downloads\train.csv") # Make sure the path is correct

# (a) Visualize the first and last 10 records

print("First 10 records:")

print(df.head(10))

print("\nLast 10 records:")

print(df.tail(10))

# (b) Get the shape, index, and column details

print("\nShape:", df.shape)

print("Index:", df.index)

print("Columns:", df.columns)

9. Residual plots of linear regression.

import numpy as np

import pandas as pd

import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_squared_error

# Load data

df = pd.read_csv(r'C:\Users\peral\Downloads\train.csv') # Updated file path

# Choose the feature and target columns

X = df[['age']].values # Use 'age' as the feature (replace with any other numerical column you prefer)

y = df['stroke'].values # Use 'stroke' as the target column

# Train linear regression model

model = LinearRegression()

model.fit(X, y)
# Predictions

y_pred = model.predict(X)

# Residuals

residuals = y - y_pred

# Plot residuals

plt.scatter(X, residuals)

plt.axhline(y=0, color='r', linestyle='--')

plt.xlabel('Age')

plt.ylabel('Residuals')

plt.title('Residual Plot')

plt.show()
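
Seaborn can draw the same kind of plot in one call, fitting its own simple regression internally; a sketch assuming the same 'age' and 'stroke' columns and the matplotlib import above:

import seaborn as sns

sns.residplot(x='age', y='stroke', data=df)
plt.axhline(y=0, color='r', linestyle='--')
plt.title('Residual Plot (seaborn)')
plt.show()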

10. Computation on NumPy arrays using Universal Functions and Mathematical methods.

import numpy as np

# Create array

arr = np.array([1, 2, 3, 4, 5])

# Universal functions (ufuncs)

print(np.sqrt(arr))

print(np.exp(arr))

print(np.log(arr))

# Mathematical methods

print(np.sum(arr))

print(np.mean(arr))

print(np.median(arr))

print(np.std(arr))

print(np.var(arr))
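
Ufuncs and reductions also work along the axes of multi-dimensional arrays; a short sketch:

import numpy as np

mat = np.array([[1, 2, 3], [4, 5, 6]])

print(np.add(mat, 10))   # ufunc applied element-wise
print(mat.sum(axis=0))   # column-wise sums
print(mat.mean(axis=1))  # row-wise means
print(np.cumsum(mat))    # running total over the flattened array
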
11. Import any CSV file to Pandas DataFrame and perform the following:

(a) Detect and filter outliers.

(b) Perform Vectorized String operations on Pandas Series.

import pandas as pd

import numpy as np

df = pd.read_csv(r'C:\Users\peral\Downloads\train.csv')

column_name = 'age'

Q1 = df[column_name].quantile(0.25)

Q3 = df[column_name].quantile(0.75)

IQR = Q3 - Q1

filtered_df = df[(df[column_name] >= (Q1 - 1.5 * IQR)) & (df[column_name] <= (Q3 + 1.5 * IQR))]

print("Filtered DataFrame (Outliers removed):")

print(filtered_df)

string_column = 'gender'

df[string_column] = df[string_column].str.lower()

df[string_column] = df[string_column].str.replace('old', 'new')

print("\nDataFrame after string operations:")

print(df[[string_column]].head())
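
An alternative outlier rule based on z-scores, plus two more vectorized string helpers; a sketch under the same column assumptions:

# Keep rows whose age lies within 3 standard deviations of the mean
z_scores = (df['age'] - df['age'].mean()) / df['age'].std()
print("Rows kept by the z-score rule:", df[np.abs(z_scores) < 3].shape[0])

# Vectorized string length and substring test on the 'gender' column
print(df['gender'].str.len().head())
print(df['gender'].str.contains('male', na=False).head())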

12. Download the House Pricing dataset from Kaggle and map the values to 23 Aesthetics.

import pandas as pd

file_path = 'path_to_your_downloaded_folder/train.csv'

df = pd.read_csv(file_path)

aesthetic_mapping = {
    'ExterCond': {'Po': 'Poor', 'Fa': 'Fair', 'TA': 'Average', 'Gd': 'Good', 'Ex': 'Excellent'},
    'ExterQual': {'Po': 'Poor', 'Fa': 'Fair', 'TA': 'Average', 'Gd': 'Good', 'Ex': 'Excellent'},
    'PoolQC': {'NA': 'No Pool', 'Ex': 'Excellent', 'Gd': 'Good', 'TA': 'Average', 'Fa': 'Fair'},
}

for column, mapping in aesthetic_mapping.items():
    if column in df.columns:
        # Map coded values to readable labels; leave unmapped values unchanged
        df[column] = df[column].map(mapping).fillna(df[column])

print(df[['ExterCond', 'ExterQual', 'PoolQC']].head())
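
To confirm the mapping took effect, the category counts can be inspected afterwards; a brief sketch:

for column in ['ExterCond', 'ExterQual', 'PoolQC']:
    if column in df.columns:
        print(f"\n{column} value counts after mapping:")
        print(df[column].value_counts(dropna=False))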
