ASSIGNMENT
1. Linear Regression
Code:
# Linear regression script: predicts 'Production' from 'Area', 'Season',
# 'Crop' and 'Crop_Year' read from dataset.csv and reports MAE, MSE and R^2
# on a held-out 20% test split.

# Import required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Load dataset
data = pd.read_csv('dataset.csv')

# Display basic info
print(data.head())
# DataFrame.info() prints its report itself and returns None, so it is not
# wrapped in print() (that would emit a stray "None" line).
data.info()

# Handle missing values (drop rows with missing values)
data = data.dropna()
# Drop rows where 'Production' holds the '=' placeholder. The explicit copy
# makes the later column assignments act on an independent frame instead of
# a slice of the original.
data = data[data['Production'] != '='].copy()

# Verify the rows are removed (prints an empty frame)
print(data[data['Production'] == '='])

# The '=' placeholders mean the column may have been read as text; make the
# regression target explicitly numeric before modelling.
data['Production'] = data['Production'].astype(float)

# Encode categorical features; the fitted encoders are kept so the integer
# codes can be mapped back to the original labels later.
categorical_cols = ['State_Name', 'District_Name', 'Crop', 'Season']
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

# Define features and target variable
X = data[['Area', 'Season', 'Crop', 'Crop_Year']]  # Example features
y = data['Production']

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Scale the features: fit on the training split only, then apply the same
# transformation to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train the model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict on test data
y_pred = model.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Absolute Error: {mae}")
print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")
Dataset:
Input:
Output:
2. Logistic Regression
Code:
# Logistic regression script: classifies the 'status' column of
# studyhours.csv from all remaining columns and reports accuracy and the
# confusion matrix on a held-out 40% test split.
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

# Read the dataset using pandas (replace 'studyhours.csv' with your actual
# file path)
data = pd.read_csv('studyhours.csv')
print(data)

# The target column is 'status'; all other columns are used as features
X = data.drop(columns=['status'])  # Drop the target column to get features
y = data['status']  # Target variable

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=20
)

# Initialize the Logistic Regression model
model = LogisticRegression()

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# Print results
print("Accuracy.", accuracy)
print("Confusion Matrix.")
print(conf_matrix)
Dataset:
Input:
Output:
3. Random Forest Classification
Code:
# Random forest script: predicts 'Survived' on the Titanic dataset and
# reports accuracy, the confusion matrix and a classification report on a
# held-out 20% test split.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder

# Load the Titanic dataset
file_path = 'titanic.csv'  # Replace with your Titanic dataset file path
data = pd.read_csv(file_path)

# Display the first few rows of the dataset
print("Dataset Preview:")
print(data.head())

# Drop columns not relevant for the model (errors='ignore' tolerates CSV
# variants that lack some of these columns)
data = data.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1,
                 errors='ignore')

# Fill missing values. The result is assigned back to the column rather than
# calling fillna(inplace=True) on the selected column: that chained form is
# deprecated and does not update the frame under pandas Copy-on-Write.
data['Age'] = data['Age'].fillna(data['Age'].median())
data['Embarked'] = data['Embarked'].fillna(data['Embarked'].mode()[0])

# Encode categorical features; the fitted encoders are kept so the integer
# codes can be mapped back to the original labels later.
categorical_cols = ['Sex', 'Embarked']
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

# Define features and target variable
X = data.drop(['Survived'], axis=1)
y = data['Survived']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Initialize the Random Forest Classifier
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Display results
print("\nModel Evaluation:")
print(f"Accuracy: {accuracy:.2f}")
print("\nConfusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(class_report)
Dataset:
Input:
Output:
4. Decision Tree (ID3)
Code:
# Decision tree script: label-encodes the weather dataset, fits an
# entropy-criterion decision tree on an 80% training split (last column is
# the target) and plots the fitted tree.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Load the weather dataset
filename = "weather.csv"  # Update this path to your CSV file
df = pd.read_csv(filename)
print(df)

# Remove the 'Day' feature if present
df = df.drop(columns=['Day'], errors='ignore')

# Display the first few rows of the dataset (a bare df.head() only renders
# in a notebook; print it so the script shows it too)
print(df.head())

# Encode categorical features using LabelEncoder
label_encoders = {}
for column in df.columns:
    # Apply encoding only to text columns. Checking both object and string
    # dtypes covers pandas versions that infer a dedicated string dtype
    # instead of 'object'.
    is_text = (pd.api.types.is_object_dtype(df[column])
               or pd.api.types.is_string_dtype(df[column]))
    if is_text:
        le = LabelEncoder()
        df[column] = le.fit_transform(df[column])
        label_encoders[column] = le
print("----------------------------After fit and transform------------------------------------------")
print(df)

# Define features and target
X = df.iloc[:, :-1]  # All columns except the last as features
y = df.iloc[:, -1]  # Last column as target

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Build the decision tree classifier using the entropy criterion
model = DecisionTreeClassifier(criterion='entropy', random_state=42)
model.fit(X_train, y_train)

# Derive display names for exactly the classes the model was trained on, in
# the model's own class order. If the target was label-encoded, map the codes
# back to the original labels; otherwise use the raw class values.
target_column = df.columns[-1]
if target_column in label_encoders:
    decoded = label_encoders[target_column].inverse_transform(model.classes_)
else:
    decoded = model.classes_
class_names = [str(label) for label in decoded]

# Visualize the decision tree. Names are passed as plain lists because some
# scikit-learn releases reject a pandas Index / NumPy array here.
plt.figure(figsize=(10, 6))
plot_tree(model, feature_names=list(X.columns),
          class_names=class_names,
          filled=True, rounded=True, fontsize=10)
plt.title("Simple ID3 Decision Tree for Weather Dataset")
plt.show()
Dataset:
Input:
Output:
5. K-Means Clustering
Code:
# K-Means script: clusters students by their standardized 'Subject1' and
# 'Subject2' marks, names the clusters by performance, saves the labelled
# data, prints clustering quality metrics and plots the clusters.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, davies_bouldin_score

# Load dataset from CSV file
df = pd.read_csv('student_marks.csv')  # Ensure the file exists

# Selecting relevant features
marks = df[['Subject1', 'Subject2']].values

# Standardizing the data
scaler = StandardScaler()
marks_scaled = scaler.fit_transform(marks)

# Applying K-Means Clustering
k = 2  # Number of clusters
kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
df['Cluster'] = kmeans.fit_predict(marks_scaled)

# Get centroids (in the standardized feature space)
centroids = kmeans.cluster_centers_

# Assign cluster names based on performance. K-Means label ids are arbitrary,
# so the names are attached by ranking the centroids on their mean
# standardized mark (highest first) instead of assuming cluster 0 is the
# high-performing one. Extend performance_labels if k is increased; clusters
# without a label map to NaN.
performance_labels = ['High Performers', 'Low Performers']  # Modify as needed
ranked_clusters = np.argsort(centroids.mean(axis=1))[::-1]
cluster_names = {
    int(cluster_id): label
    for cluster_id, label in zip(ranked_clusters, performance_labels)
}
df['Cluster Name'] = df['Cluster'].map(cluster_names)

# Save clustered data to CSV
df.to_csv('student_marks_clustered.csv', index=False)

# Performance Metrics
inertia = kmeans.inertia_  # SSE
silhouette_avg = silhouette_score(marks_scaled, df['Cluster'])
db_index = davies_bouldin_score(marks_scaled, df['Cluster'])
print(f"Inertia (SSE): {inertia:.2f}")
print(f"Silhouette Score: {silhouette_avg:.2f}")
print(f"Davies-Bouldin Index: {db_index:.2f}")

# Display cluster-wise information
print("\nCluster Information:")
print(df.groupby('Cluster Name')[['Subject1', 'Subject2']].mean())

# Plot the clusters
plt.figure(figsize=(8, 6))
plt.scatter(marks_scaled[:, 0], marks_scaled[:, 1], c=df['Cluster'],
            cmap='viridis', marker='o', edgecolors='k', label='Students')
plt.scatter(centroids[:, 0], centroids[:, 1], s=200, c='red', marker='X',
            label='Centroids')
plt.xlabel('Subject 1 (Scaled)')
plt.ylabel('Subject 2 (Scaled)')
plt.title('K-Means Clustering of Student Marks')
plt.legend()
plt.show()
Dataset:
Input:
Output:
6. Support Vector Machine (SVM)
Code:
# SVM script: trains a linear-kernel SVM and a logistic regression on
# Crop_recommendation.csv (last column is the label, all others are
# features) and prints the test accuracy of both models.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Load the CSV and separate the feature matrix from the label vector
df = pd.read_csv('Crop_recommendation.csv')
X, y = df.iloc[:, :-1].values, df.iloc[:, -1].values
for array in (X, y):
    print(array)

# Hold out 20% of the rows for testing
split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split

# Standardize: learn the statistics from the training split only, then apply
# them to both splits
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Support Vector Machine: fit, predict, score
svm_model = SVC(kernel='linear', random_state=42).fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)
svm_accuracy = accuracy_score(y_test, y_pred_svm)

# Logistic Regression: fit, predict, score
logreg_model = LogisticRegression(random_state=42).fit(X_train, y_train)
y_pred_logreg = logreg_model.predict(X_test)
logreg_accuracy = accuracy_score(y_test, y_pred_logreg)

# Report both accuracies
print(f'SVM Accuracy: {svm_accuracy:.4f}')
print(f'Logistic Regression Accuracy: {logreg_accuracy:.4f}')
Dataset:
Input:
Output: