ML LAB EXPERIMENTS, SHORTENED WITH THE SAME OUTPUT
1>>>
import pandas as pd, matplotlib.pyplot as plt, seaborn as sns
from sklearn.datasets import fetch_california_housing as fch
d = fch(as_frame=True).frame
d.info(); print("\n", d.head())
for c in d:
    # one histogram and one boxplot per feature
    f, a = plt.subplots(1, 2, figsize=(12, 4))
    a[0].hist(d[c], bins=30, color='skyblue', edgecolor='k')
    a[0].set(title=f'Hist of {c}', xlabel=c, ylabel='Freq'); a[0].grid(axis='y', alpha=.7)
    sns.boxplot(x=d[c], ax=a[1], color='lightgreen')
    a[1].set(title=f'Box of {c}', xlabel=c)
    plt.tight_layout(); plt.show()
print("\nOutliers:")
for c in d:
    # IQR rule: flag values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]
    q1, q3 = d[c].quantile([.25, .75]); i = q3 - q1
    o = d[(d[c] < q1 - 1.5*i) | (d[c] > q3 + 1.5*i)]
    print(f"{c}: {len(o)} outliers detected")
2>>> ALREADY SHORT
3>>>
import numpy as np, matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
d = load_iris(); X = StandardScaler().fit_transform(d.data); y = d.target; n = d.target_names
fig = plt.figure(figsize=(11,5)); a1 = fig.add_subplot(121, projection='3d')
for i in np.unique(y): a1.scatter(*X[y==i,:3].T, label=n[i])
a1.set(title='3D Before PCA', xlabel=d.feature_names[0], ylabel=d.feature_names[1],
zlabel=d.feature_names[2]); a1.legend()
X2 = PCA(2).fit_transform(X); a2 = fig.add_subplot(122)
for i in np.unique(y): a2.scatter(*X2[y==i].T, label=n[i])
a2.set(title='2D After PCA', xlabel='PC1', ylabel='PC2'); a2.legend()
plt.tight_layout(); plt.show()
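Optionally, to see how much of the total variance the two components retain, a small check reusing the same scaled X (the pca name is introduced only for this check; explained_variance_ratio_ is the standard scikit-learn attribute):
pca = PCA(2).fit(X)
print("Explained variance ratio:", pca.explained_variance_ratio_,
      "total:", pca.explained_variance_ratio_.sum())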
4>>> ALREADY SHORT
5>>>
import numpy as np, pandas as pd
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.metrics import accuracy_score as acc
import warnings; warnings.filterwarnings('ignore')
np.random.seed(42)
v = np.random.rand(100); print("Randomly generated values are\n", v)
l = ['Class1' if i <= .5 else 'Class2' for i in v[:50]] + [None]*50
print("Generated Labels are", l)
d = {"Point": [f"x{i+1}" for i in range(100)], "Value": v, "Label": l}; print(d)
df = pd.DataFrame(d); print(df.head())
X, y = df[df.Label.notna()][["Value"]], df[df.Label.notna()]["Label"]
Xt = df[df.Label.isna()][["Value"]]
yt = ['Class1' if i <= .5 else 'Class2' for i in v[50:]]
accs = {}
for k in [1,2,3,4,5,20,30]:
    # fit on the 50 labelled points, predict the 50 unlabelled ones
    p = KNN(k).fit(X, y).predict(Xt)
    df.loc[df.Label.isna(), f"Label_k{k}"] = p
    accs[k] = acc(yt, p)*100
    print(f"Accuracy for k={k}: {accs[k]:.2f}%")
print(p)
print(df[df.Label.isna()].drop("Label", axis=1))
print("\nAccuracies for different k values:")
for k in accs: print(f"k={k}: {accs[k]:.2f}%")
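A tiny optional extra (plain Python, no new libraries) to pull the best-scoring k out of the accs dictionary built above; best_k is just a name added for this sketch:
best_k = max(accs, key=accs.get)
print(f"Best k: {best_k} with accuracy {accs[best_k]:.2f}%")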
6>>>
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
def kernel(point, X, k):
    # diagonal matrix of Gaussian weights, one per training point
    return np.diag(np.exp(-np.sum((X - point)**2, axis=1) / (2 * k**2)))
def localWeight(point, X, y, k):
    # weighted normal equations: theta = (X^T W X)^-1 X^T W y
    return np.linalg.inv(X.T @ kernel(point, X, k) @ X) @ (X.T @ kernel(point, X, k) @ y)
data = pd.read_csv('C:\\Users\\sdmit\\data10_tips.csv')
X = np.hstack((np.ones((len(data), 1)), data.total_bill.values.reshape(-1, 1)))
ypred = [X[i] @ localWeight(X[i], X, data.tip.values.reshape(-1, 1), 0.5) for i in range(X.shape[0])]
plt.scatter(data.total_bill, data.tip, color='green')
plt.plot(np.sort(X[:, 1]), np.array(ypred)[np.argsort(X[:, 1])], color='red', linewidth=4)
plt.show()
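The 0.5 passed to localWeight is the kernel bandwidth: smaller values follow the data more closely, larger values smooth the curve. A hedged optional sketch reusing X and data from above, if you want to compare a few bandwidths (the tau loop variable and the chosen values are additions for illustration, not part of the original program):
for tau in [0.5, 1, 5]:
    yp = [X[i] @ localWeight(X[i], X, data.tip.values.reshape(-1, 1), tau) for i in range(len(X))]
    plt.plot(np.sort(X[:, 1]), np.array(yp)[np.argsort(X[:, 1])], label=f"tau={tau}")
plt.scatter(data.total_bill, data.tip, color='green', alpha=0.3)
plt.legend(); plt.show()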
7>>>
import numpy as np, pandas as pd, matplotlib.pyplot as plt, seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import warnings; warnings.filterwarnings("ignore")
# Boston Housing
df = pd.read_csv("C:\\Users\\sdmit\\Downloads\\BostonHousing.csv")
print("shape of the dataset",boston_df.shape)
print(boston_df.head())
X, y = StandardScaler().fit_transform(df.drop("medv", axis=1)), df["medv"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)
m = LinearRegression().fit(X_train, y_train)
p = m.predict(X_test)
print(f"MSE:{mean_squared_error(y_test,p):.2f},
RMSE:{np.sqrt(mean_squared_error(y_test,p)):.2f}, R²:{r2_score(y_test,p):.2f}")
sns.scatterplot(x=y_test, y=p); plt.plot([y.min(), y.max()], [y.min(), y.max()], 'r--');
plt.xlabel("Actual Prices")
plt.ylabel("Predicted Prices")
plt.title("actual vs predicted price")
plt.show()
# Auto MPG
u = "https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data"
c = ["mpg","cyl","disp","hp","wt","acc","year","origin","name"]
d = pd.read_csv(u, names=c, sep=r"\s+", na_values="?").dropna()
X, y = d[["hp"]].astype(float), d["mpg"]
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=.2, random_state=42)
pf, lm = PolynomialFeatures(2), LinearRegression()
p = lm.fit(pf.fit_transform(X_tr), y_tr).predict(pf.transform(X_te))
print(f"MPG MSE:{mean_squared_error(y_te,p):.2f},
RMSE:{np.sqrt(mean_squared_error(y_te,p)):.2f}, R²:{r2_score(y_te,p):.2f}")
s = X_te.values.flatten().argsort()
plt.scatter(X_te, y_te); plt.plot(X_te.values.flatten()[s], p[s], 'r-');
plt.xlabel("Horsepower")
plt.ylabel("MPG")
plt.title("auto mpg polynomial regression")
plt.show()
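If you want to check whether degree 2 is actually a good choice for the MPG data, a small optional comparison reusing X_tr, X_te, y_tr, y_te from above (the deg loop and the poly name are additions for illustration):
for deg in [1, 2, 3]:
    poly = PolynomialFeatures(deg)
    pred = LinearRegression().fit(poly.fit_transform(X_tr), y_tr).predict(poly.transform(X_te))
    print(f"degree {deg}: R² = {r2_score(y_te, pred):.3f}")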
8>>>
import pandas as pd, numpy as np, matplotlib.pyplot as plt, warnings
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score, classification_report
warnings.filterwarnings('ignore')
df = pd.read_csv("C:\\Users\\sdmit\\breastcancer.csv").drop('id', axis=1)
print(df.head())
print("Shape of the dataset",df.shape)
print(df.info())
df['diagnosis'] = df['diagnosis'].map({'M':1, 'B':0})
X, y = df.drop('diagnosis', axis=1), df['diagnosis']
Xt, Xs, yt, ys = train_test_split(X, y, test_size=0.2, random_state=42)
m = DecisionTreeClassifier(criterion='entropy').fit(Xt, yt)
def ig(d, f, t):
    # entropy of a label column
    e = lambda c: -sum((p := c.value_counts(normalize=True)) * np.log2(p))
    # information gain = entropy(target) - weighted entropy after splitting on f
    return e(d[t]) - sum((d[f] == v).mean() * e(d[d[f] == v][t]) for v in d[f].unique())
for f in X: print(f"IG {f}: {ig(df,f,'diagnosis'):.4f}")
plt.figure(figsize=(12, 8));
plot_tree(m, feature_names=X.columns, class_names=['B','M'], filled=False, rounded=True);
plt.show()
print("prediction",p)
p = m.predict(Xs)
print("Accuracy:", accuracy_score(ys, p)*100)
print("Report:\n", classification_report(ys, p))
new = [[12.5,19.2,80,500,0.085,0.1,0.05,0.02,0.17,0.06,0.4,1,2.5,40,0.006,0.02,0.03,0.01,0.02,0.003,16,25,105,900,0.13,0.25,0.28,0.12,0.29,0.08]]
print("Prediction:", ["Benign", "Malignant"][m.predict(new)[0]])
9>>>
import numpy as np, matplotlib.pyplot as plt
from sklearn.datasets import fetch_olivetti_faces
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, confusion_matrix
d = fetch_olivetti_faces()
X, y = d.data, d.target
Xt, Xs, yt, ys = train_test_split(X, y, test_size=0.3, random_state=46)
m = GaussianNB().fit(Xt, yt)
yp = m.predict(Xs)
print(f"Data: {X.shape}, Target: {y.shape}, Persons: {len(np.unique(y))}, Img: 64x64")
print("Train:", Xt.shape, "Test:", Xs.shape)
print("Confusion Matrix:\n", confusion_matrix(ys, yp))
a = accuracy_score(ys, yp)
print(f"Naive Bayes Accuracy: {round(a*100,2)}%")
print(f"Misclassified: {(yp!=ys).sum()}, Total: {len(ys)}, Accuracy: {round(a*100,2)}%")
def show(title, f):
    plt.figure(figsize=(20,15)); plt.suptitle(title, fontsize=16, y=1.05)
    for i in range(len(Xs)):
        plt.subplot(12,10,i+1)
        plt.imshow(Xs[i].reshape(64,64), cmap='gray')
        plt.title(f(i)); plt.axis('off')
    plt.tight_layout(); plt.show()
show("Actual", lambda i: f"A:{ys[i]}")
show("Predicted", lambda i: f"P:{yp[i]}")
show("Actual vs Predicted", lambda i: f"T:{ys[i]}\nP:{yp[i]}")
10>>>
import pandas as pd, numpy as np, matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import warnings; warnings.filterwarnings('ignore')
df = pd.read_csv("C:\\Users\\sdmit\\Downloads\\Wisconsin Breast Cancer
dataset.csv").drop(['id','Unnamed: 32'], axis=1)
print(df.head()); print(df.shape); df.info()
df['diagnosis'] = df['diagnosis'].map({'M':1, 'B':0})
X = StandardScaler().fit_transform(df.drop('diagnosis', axis=1))
X_pca = PCA(n_components=2).fit_transform(X)
wcss = [KMeans(n_clusters=k, random_state=42, n_init=10).fit(X_pca).inertia_ for k in range(1, 11)]
plt.plot(range(1,11), wcss, 'o-'); plt.xlabel("k"); plt.ylabel("WCSS"); plt.title("Elbow Method");
plt.show()
k=2
km = KMeans(n_clusters=k, random_state=42, n_init=10).fit(X_pca)
plt.scatter(*X_pca.T, c=km.labels_, cmap="viridis", alpha=0.6)
plt.scatter(*km.cluster_centers_.T, s=200, c='r', marker='X', label="Centroids")
plt.title("K-Means Clustering after PCA"); plt.legend();plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2"); plt.show()
NOTE:
1. All the programs have been reduced by 50% or more of their original size; these reductions were made after checking with Pradeep Rao sir.
2. Paste the code into Jupyter or VS Code for better readability.
3. The datasets for the programs will be sent by Pradeep sir via Google Classroom or the WhatsApp group.
4. Please don't misuse this.
HAPPY LEARNING :)