Experimenting With Data Analysis Packages and Statistical Operations
Date: 7/8/2024
To explore and utilize data analysis packages like NumPy, SciPy, Jupyter, Statsmodels, and Pandas for
data manipulation and statistical analysis on a chosen dataset, focusing on descriptive analytics and key
statistical measures.
1. Exploring NumPy:
NumPy module
The NumPy module in Python enables fast mathematical operations, such as matrix
multiplication and inversion, by storing data in efficient array structures known as NumPy arrays.
Code:
# Build a small one-dimensional NumPy array and inspect its element type.
import numpy as np

arr = np.array([12, 26, 27, 28, 30])  # integer array
print(arr)
print(arr.dtype)  # platform-default integer dtype (e.g. int64)
Output:
Code:
Output:
Code:
# Descriptive statistics over a small example array.
array = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])

sum_value = np.sum(array)          # total of all elements
min_value = np.min(array)          # smallest element
max_value = np.max(array)          # largest element
range_value = np.ptp(array)        # peak-to-peak: max - min
cumsum_value = np.cumsum(array)    # running totals
cumprod_value = np.cumprod(array)  # running products

# Report each measure with its label.
for label, value in [
    ("Sum:", sum_value),
    ("Min:", min_value),
    ("Max:", max_value),
    ("Range:", range_value),
    ("Cumulative Sum:", cumsum_value),
    ("Cumulative Product:", cumprod_value),
]:
    print(label, value)
Output:
Code:
# Central tendency and spread of a small sample.
data = np.array([10, 20, 30, 40, 50])

mean = np.mean(data)      # arithmetic average
print("Mean:", mean)

median = np.median(data)  # middle value of the sorted sample
print("Median:", median)

std_dev = np.std(data)    # population standard deviation (ddof=0)
print("Standard Deviation:", std_dev)

variance = np.var(data)   # population variance (ddof=0)
print("Variance:", variance)
# Percentiles, covariance, histograms, unique values, dot products, and random data.
# Fixes: a stray page-number artifact ("2") was removed, and `x`, `y` — which the
# original passed to np.cov without ever defining (NameError) — are now defined.
data = np.array([10, 20, 30, 40, 50])  # same sample as the previous cell

# Percentile
percentile_25 = np.percentile(data, 25)
print("25th Percentile:", percentile_25)

# Covariance Matrix of two example 1-D samples.
x = np.array([1, 2, 3, 4, 5])
y = np.array([2, 4, 6, 8, 10])
cov_matrix = np.cov(x, y)
print("Covariance Matrix:\n", cov_matrix)

# Histogram
hist, bin_edges = np.histogram(data, bins=5)
print("Histogram:", hist)
print("Bin Edges:", bin_edges)

# Unique Elements
unique_elements = np.unique(data)
print("Unique Elements:", unique_elements)

# Dot Product
a = np.array([1, 2, 3])
b = np.array([4, 5, 6])
dot_product = np.dot(a, b)
print("Dot Product:", dot_product)

# Random Data (seeded so the output is reproducible)
np.random.seed(0)
random_data = np.random.rand(5)
print("Random Data:", random_data)
Output:
3
Code:
# Transpose, inverse, and determinant of a 3x3 example matrix.
matrix = np.array([[1, 2, 3],
                   [4, 5, 6],
                   [7, 8, 9]])

# Transpose, spelled two equivalent ways.
transpose_matrix = np.transpose(matrix)
transpose_matrix_alt = matrix.T

# Inverse is only defined for square, non-singular matrices, so guard
# against np.linalg.LinAlgError and fall back to a message.
try:
    inverse_matrix = np.linalg.inv(matrix)
except np.linalg.LinAlgError:
    inverse_matrix = "Matrix is singular or not square"

# Determinant (this particular matrix is singular, so this is ~0).
determinant = np.linalg.det(matrix)

print("Transpose:\n", transpose_matrix)
print("Transpose (alternative method):\n", transpose_matrix_alt)
print("Inverse:\n", inverse_matrix)
print("Determinant:", determinant)
4
# NOTE(review): eigenvalues, eigenvectors, U/S/Vt (SVD factors), matrix_product,
# matrix_product_alt and trace are computed on a page not shown here —
# presumably via np.linalg.eig, np.linalg.svd, np.matmul / @, and np.trace.
# Confirm against the full source before relying on these names.
print("Eigenvalues:", eigenvalues)
print("Eigenvectors:\n", eigenvectors)
print("SVD U:\n", U)
print("SVD S:", S)
print("SVD Vt:\n", Vt)
print("Matrix Product:\n", matrix_product)
print("Matrix Product (alternative method):\n", matrix_product_alt)
print("Trace:", trace)
Output:
2. Exploring SciPy:
SciPy module
5
The SciPy module builds on NumPy arrays and provides mathematical algorithms —
such as statistics, integration, and optimization routines — for fast scientific computation.
Code:
# Ordinary least-squares fit of y on x via scipy.stats.linregress.
# NOTE(review): x, y and the scipy `stats`/`integrate` imports come from
# earlier, unshown cells — confirm against the full notebook.
slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
print("Linear Regression - Slope:", slope)
print("Linear Regression - Intercept:", intercept)

# Numerically integrate t^2 over [0, 1]; quad returns (value, abs_error).
integral, error = integrate.quad(lambda t: t ** 2, 0, 1)
print("Integral of x^2 from 0 to 1:", integral)
# Optimization: least-squares line fit by direct minimisation of the residuals.
# Fix: a stray page-number artifact ("6") inside the function body made the
# original snippet a syntax error; it has been removed.
# NOTE(review): x, y and the scipy `optimize` import come from earlier,
# unshown cells — confirm against the full notebook.
def objective_function(params):
    """Sum of squared residuals for the line y = params[0]*x + params[1]."""
    return np.sum((y - (params[0] * x + params[1])) ** 2)

initial_guess = [1, 0]
result = optimize.minimize(objective_function, initial_guess)
print("Optimization result:", result.x)
# Summary statistics via scipy.stats on the sample `data`
# (defined in an earlier, unshown cell).
desc_stats = stats.describe(data)  # n, min/max, mean, variance, skewness, kurtosis
print("Descriptive Statistics:", desc_stats)

iqr = stats.iqr(data)              # interquartile range: Q3 - Q1
print("Interquartile Range:", iqr)

z_scores = stats.zscore(data)      # standardised values
print("Z-scores:", z_scores)
Output:
3. Exploring Statsmodels:
Statsmodels module
The statsmodels module provides classes and functions for estimating statistical models
and for summarizing the final results of a trained (fitted) model.
Code:
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.datasets import load_iris
import pandas as pd

# Load the Iris dataset into a pandas DataFrame.
# (A stray page-number artifact ("7") between the original statements was removed.)
iris = load_iris()
iris_df = pd.DataFrame(data=iris.data, columns=iris.feature_names)

# Replace the integer class labels (0/1/2) with their species names.
iris_df['species'] = iris.target
iris_df['species'] = iris_df['species'].map({i: species for i, species in enumerate(iris.target_names)})

# Logistic Regression (predicting species from sepal length and petal length)
# Converting species to a binary outcome for simplicity:
# 1 if versicolor, 0 otherwise.
iris_df['species_binary'] = (iris_df['species'] == 'versicolor').astype(int)
Output:
4. Exploring Pandas:
Pandas module
8
This module provides data structures and functions for working with datasets, which are
very helpful for data scientists when analysing data.
Code:
import pandas as pd
import numpy as np

# Common pandas operations: creation, inspection, cleaning, aggregation,
# merging, sorting, filtering, and renaming.
# Fix: stray page-number artifacts ("9", "10") embedded in the code were removed.
# NOTE(review): `data` (a dict of columns) and `df` (the working DataFrame)
# are created in an earlier, unshown cell — confirm against the full notebook.

# 1. DataFrame Creation
df_created = pd.DataFrame(data)
print("\nDataFrame Created from Dictionary:")
print(df_created)

# 3. Summary Information
print("\nDataFrame Info:")
print(df.info())

# 4. Sampling
print("\nRandom Sample of 2 Rows:")
print(df.sample(2))

# 5. Drop rows containing missing values
df_dropped = df.dropna()
print("\nDataFrame with Missing Values Dropped:")
print(df_dropped)

# 6. Data Aggregation
mean_values = df.groupby('B').mean()
print("\nMean Values Grouped by 'B':")
print(mean_values)

# 7. Merging DataFrames (left join on column 'B')
df2 = pd.DataFrame({'B': ['a', 'b'], 'E': [1, 2]})
merged_df = pd.merge(df, df2, on='B', how='left')
print("\nMerged DataFrame:")
print(merged_df)

# 8. Sorting (descending by column 'A')
sorted_df = df.sort_values(by='A', ascending=False)
print("\nDataFrame Sorted by 'A':")
print(sorted_df)

# 9. Filtering
filtered_df = df[df['A'] > 2]
print("\nFiltered DataFrame (A > 2):")
print(filtered_df)

# 10. Renaming columns
df_renamed = df.rename(columns={'A': 'Column_A', 'B': 'Column_B'})
print("\nDataFrame with Renamed Columns:")
print(df_renamed)
Output:
11
Reading from Text File, CSV File, Excel File and Web File:
12
# Read the whole text file and print its contents.
example1 = "/content/Data Analytic Lab.txt"
# Fix: use a context manager so the file handle is closed even if read()
# raises — the original called open() and never closed the file.
with open(example1, "r") as file:
    FileContent = file.read()
FileContent        # bare expression: echoes the content in a notebook cell
print(FileContent)
Output:
Code:
# Load the Music & Mental Health survey CSV into a DataFrame.
import pandas as pd
df = pd.read_csv("/content/mxmh_survey_results.csv")
df.head()  # notebook echo: preview of the first five rows
df.info()  # column names, dtypes, and non-null counts
Output:
Code:
13
#EXCEL
# Read an Excel workbook (first sheet by default) into a DataFrame.
df1 = pd.read_excel("/content/DAEX1.xlsx")
df1.head()  # notebook echo: preview of the first five rows
Output:
Code:
#WEB FILE
# Read a CSV directly from a URL (UCI blood-transfusion dataset).
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/blood-transfusion/transfusion.data'
df2 = pd.read_csv(url)
df2.head()  # notebook echo: preview of the first five rows
Output:
Code:
import pandas as pd
# Reload the survey CSV, keeping the path in a named variable.
file_path = '/content/mxmh_survey_results.csv'
df = pd.read_csv(file_path)
df.head()  # notebook echo: preview of the first five rows
Output:
Code:
df.describe()
14
Output:
Code:
Output:
Code:
15
# Columns with missing values
# NOTE(review): `missing_values` (presumably df.isnull().sum()) is computed
# in an unshown cell — confirm against the full notebook.
missing_values[missing_values > 0]
Output:
Code:
# Age Distribution: histogram with a KDE overlay.
plt.figure(figsize=(10, 6))
ages = df['Age'].dropna()  # drop NaNs before binning
sns.histplot(ages, kde=True, bins=20)
plt.title('Age Distribution')
plt.xlabel('Age')
plt.ylabel('Frequency')
plt.show()
Output:
16
Code:
Output:
Code:
17
import seaborn as sns
import matplotlib.pyplot as plt

# Correlation heatmap over the numeric survey columns only
# (non-numeric columns cannot enter .corr()).
numeric_dtypes = ['float64', 'int64']
numeric_df = df.select_dtypes(include=numeric_dtypes)
correlation_matrix = numeric_df.corr()

plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0)
plt.show()
Output:
Result:
Successfully explored data analysis packages and applied statistical operations on the chosen dataset,
calculating descriptive measures such as mean, median, and standard deviation. Identified data insights
through interpretation of variance, skewness, and kurtosis.
18