DATA MINING-II
SUBMITTED BY: SANJEEVAN
SECTION - A
ROLL NO.: 23BAA030

SUBMITTED TO: Dr. Rama Bansal
INDEX
Serial No. | Practical | Page | Remarks

1 | Perform partitioning, hierarchical, and density-based clustering algorithms on a downloaded dataset and evaluate the cluster quality by changing the algorithm's parameters. | 1-4 |

2 | Perform the following text mining preprocessing steps on a text document: (a) Stop Word Removal, (b) Stemming, (c) Removal of punctuation marks, (d) Compute the inverse document frequency of the words in the document. | 5-8 |

3 | Use the Decision Tree classification algorithm to construct a classifier on two datasets. Evaluate the classifier's performance by dividing the dataset into a training set (75%) and a test set (25%). Compare the performance with that of: (a) a Bagging ensemble consisting of 3, 5, 7, 9 Decision Tree classifiers, (b) an AdaBoost ensemble consisting of 3, 5, 7, 9 Decision Tree classifiers. | 9-13 |

4 | Download a dataset and check whether outliers are present in the dataset. Use different methods of outlier detection and compare their performance. | 14-16 |

5 | Perform the CluStream algorithm on any time series data from Kaggle and compare its output with that of K-means clustering. Evaluate the cluster quality by changing the algorithm's parameters. | 17-19 |
Q1. Perform partitioning, hierarchical, and density-based
clustering algorithms on a downloaded dataset and evaluate
the cluster quality by changing the algorithm's parameters.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
from scipy.cluster.hierarchy import dendrogram, linkage
# Load dataset (Replace with your dataset file)
df = pd.read_csv("Dataset.csv") # Ensure dataset has numerical features
# Selecting numerical columns for clustering
X = df.select_dtypes(include=[np.number])
X = X.dropna() # Drop missing values
# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Partitioning Clustering: K-Means
k_values = [2, 3, 4, 5]
kmeans_scores = {}
for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=42)
    labels = kmeans.fit_predict(X_scaled)
    score = silhouette_score(X_scaled, labels)
    kmeans_scores[k] = score
    print(f'K-Means (k={k}) Silhouette Score: {score:.3f}')
//OUTPUT//
K-Means (k=2) Silhouette Score: 0.437
K-Means (k=3) Silhouette Score: 0.463
K-Means (k=4) Silhouette Score: 0.558
K-Means (k=5) Silhouette Score: 0.513
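In addition to the silhouette sweep above, cluster quality for different k can be checked with the elbow method. The snippet below is a minimal sketch added for illustration; it reuses k_values and X_scaled from the code above, and inertia is K-Means' within-cluster sum of squares.
# Hypothetical elbow-method check: plot K-Means inertia for each k and look
# for the "elbow" where the curve flattens.
inertias = []
for k in k_values:
    km = KMeans(n_clusters=k, random_state=42)
    km.fit(X_scaled)
    inertias.append(km.inertia_)
plt.plot(k_values, inertias, marker='o')
plt.xlabel('Number of clusters (k)')
plt.ylabel('Inertia')
plt.title('Elbow Method for K-Means')
plt.show()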
# Hierarchical Clustering
hierarchical = AgglomerativeClustering(n_clusters=3)
h_labels = hierarchical.fit_predict(X_scaled)
print(f'Hierarchical Clustering Silhouette Score: {silhouette_score(X_scaled, h_labels):.3f}')
# Dendrogram
linked = linkage(X_scaled, method='ward')
plt.figure(figsize=(10, 5))
dendrogram(linked)
plt.title("Hierarchical Clustering Dendrogram")
plt.show()
//OUTPUT//
Hierarchical Clustering Silhouette Score: 0.463
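The agglomerative run above fixes n_clusters=3 with the default ward linkage. As a hedged addition (not part of the original run), the linkage parameter can also be varied to evaluate cluster quality:
# Hypothetical linkage sweep for agglomerative clustering with 3 clusters.
for method in ['ward', 'complete', 'average', 'single']:
    hc = AgglomerativeClustering(n_clusters=3, linkage=method)
    labels = hc.fit_predict(X_scaled)
    print(f'Hierarchical ({method} linkage) Silhouette Score: '
          f'{silhouette_score(X_scaled, labels):.3f}')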
# Density-Based Clustering: DBSCAN
dbscan = DBSCAN(eps=0.5, min_samples=5)
db_labels = dbscan.fit_predict(X_scaled)
# Exclude noise (-1 labels) for silhouette score calculation
if len(set(db_labels) - {-1}) > 1:  # need at least two non-noise clusters for a silhouette score
    dbscan_score = silhouette_score(X_scaled[db_labels != -1], db_labels[db_labels != -1])
    print(f'DBSCAN Silhouette Score: {dbscan_score:.3f}')
else:
    print("DBSCAN found only noise points. Try adjusting parameters.")
# Visualizing K-Means Clusters
best_k = max(kmeans_scores, key=kmeans_scores.get)
kmeans_best = KMeans(n_clusters=best_k, random_state=42).fit(X_scaled)
plt.scatter(X_scaled[:, 0], X_scaled[:, 1], c=kmeans_best.labels_, cmap='viridis', alpha=0.6)
plt.title(f'K-Means Clustering with k={best_k}')
plt.show()
//OUTPUT//
DBSCAN found only noise points. Try adjusting parameters.
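Since DBSCAN labelled every point as noise with eps=0.5 and min_samples=5, the sketch below (an addition; the parameter grid is an arbitrary choice) shows how cluster quality changes as eps and min_samples are varied:
# Hypothetical DBSCAN parameter sweep: a larger eps and smaller min_samples make
# the algorithm less strict, so fewer points are labelled as noise (-1).
for eps in [0.5, 1.0, 1.5, 2.0]:
    for min_samples in [3, 5, 10]:
        labels = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(X_scaled)
        clusters = set(labels) - {-1}
        if len(clusters) > 1:
            mask = labels != -1
            score = silhouette_score(X_scaled[mask], labels[mask])
            print(f'eps={eps}, min_samples={min_samples}: '
                  f'{len(clusters)} clusters, silhouette={score:.3f}')
        else:
            print(f'eps={eps}, min_samples={min_samples}: fewer than 2 clusters found')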
Q2. Perform the following text mining preprocessing
steps on a text document:
a. Stop Word Removal
b. Stemming
c. Removal of punctuation marks
d. Compute the inverse document frequency of the
words in the document.
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
# Step 1: Download NLTK data files (needed only the first time)
nltk.download('punkt')
nltk.download('stopwords')
# Step 2: Sample Text Document (You can modify this)
document = """
Natural language processing (NLP) is a sub-field of artificial intelligence (AI).
It is focused on enabling computers to understand and process human languages.
"""
# Step 3: Remove Punctuation
def remove_punctuation(text):
    return ''.join([char for char in text if char not in string.punctuation])
doc_no_punct = remove_punctuation(document.lower())
# Step 4: Tokenization
tokens = nltk.word_tokenize(doc_no_punct)
# Step 5: Stop Word Removal
stop_words = set(stopwords.words('english'))
tokens_no_stopwords = [word for word in tokens if word not in stop_words]
# Step 6: Stemming
stemmer = PorterStemmer()
stemmed_tokens = [stemmer.stem(word) for word in tokens_no_stopwords]
print("Stemmed Tokens after Preprocessing:")
print(stemmed_tokens)
# Step 7: Computing Inverse Document Frequency (IDF)
# Let's create a small corpus with variations of the document for IDF
corpus = [
    document,
    "NLP helps machines understand human language.",
    "Artificial intelligence involves machine learning and NLP."
]
# Use TfidfVectorizer to compute IDF
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(corpus)
# Extract IDF scores
idf_scores = dict(zip(vectorizer.get_feature_names_out(), vectorizer.idf_))
print("\nInverse Document Frequency (IDF) of Words:")
for word, score in idf_scores.items():
print(f"{word}: {score:.4f}")
//OUTPUT//
[nltk_data] Downloading package punkt to
[nltk_data] C:\Users\jitu\AppData\Roaming\nltk_data...
[nltk_data] Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data] C:\Users\jitu\AppData\Roaming\nltk_data...
[nltk_data] Unzipping corpora\stopwords.zip.
Stemmed Tokens after Preprocessing:
['natur', 'languag', 'process', 'nlp', 'subfield', 'artifici', 'intellig', 'ai', 'focus', 'enabl', 'comput',
'understand', 'process', 'human', 'languag']
Inverse Document Frequency (IDF) of Words:
ai: 1.6931
artificial: 1.2877
computers: 1.6931
enabling: 1.6931
field: 1.6931
focused: 1.6931
helps: 1.6931
human: 1.2877
intelligence: 1.2877
involves: 1.6931
language: 1.2877
languages: 1.6931
learning: 1.6931
machine: 1.6931
machines: 1.6931
natural: 1.6931
nlp: 1.0000
process: 1.6931
processing: 1.6931
sub: 1.6931
understand: 1.2877
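The IDF values above follow scikit-learn's smoothed formula idf(t) = ln((1 + n) / (1 + df(t))) + 1, where n is the number of documents (here 3) and df(t) is the number of documents containing the term. The short check below is an addition that reproduces the printed scores by hand:
import math

# Smoothed IDF as computed by TfidfVectorizer (smooth_idf=True is the default).
n_docs = 3
for df_t in (1, 2, 3):  # document frequency of a term
    idf = math.log((1 + n_docs) / (1 + df_t)) + 1
    print(f"df={df_t}: idf = {idf:.4f}")
# df=1 -> 1.6931 (e.g. 'ai'), df=2 -> 1.2877 (e.g. 'human'), df=3 -> 1.0000 ('nlp')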
Q3. Use the Decision Tree classification algorithm to
construct a classifier on two datasets. Evaluate the
classifier's performance by dividing the dataset into a
training set (75%) and a test set (25%). Compare the
performance with that of:
a. Bagging ensemble consisting of 3,5,7,9 Decision
tree classifiers
b. Adaboost ensemble consisting of 3,5,7,9 Decision
tree classifiers
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
df1 = pd.read_csv("Dataset 1.csv")
df2 = pd.read_csv("Dataset 2.csv")
df1
df2
def process_and_evaluate(df, dataset_name):
    print(f"Evaluating on {dataset_name}")
    # Selecting numerical features and target
    X = df.select_dtypes(include=[np.number]).drop(columns=['label'], errors='ignore')
    y = df['label']
    # Standardizing features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    # Splitting dataset (75% train, 25% test)
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.25, random_state=42)
    # Decision Tree Classifier
    dt = DecisionTreeClassifier(random_state=42)
    dt.fit(X_train, y_train)
    y_pred_dt = dt.predict(X_test)
    acc_dt = accuracy_score(y_test, y_pred_dt)
    print(f"Decision Tree Accuracy: {acc_dt:.3f}")
    # Bagging with Decision Trees
    # (note: scikit-learn >= 1.2 renames base_estimator to estimator)
    for n in [3, 5, 7, 9]:
        bagging = BaggingClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=n,
                                    random_state=42)
        bagging.fit(X_train, y_train)
        y_pred_bag = bagging.predict(X_test)
        acc_bag = accuracy_score(y_test, y_pred_bag)
        print(f"Bagging ({n} estimators) Accuracy: {acc_bag:.3f}")
    # AdaBoost with Decision Trees
    for n in [3, 5, 7, 9]:
        adaboost = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=n,
                                      random_state=42)
        adaboost.fit(X_train, y_train)
        y_pred_ada = adaboost.predict(X_test)
        acc_ada = accuracy_score(y_test, y_pred_ada)
        print(f"AdaBoost ({n} estimators) Accuracy: {acc_ada:.3f}")
    print("\n")
# Evaluate on both datasets
process_and_evaluate(df1, "Dataset 1")
process_and_evaluate(df2, "Dataset 2")
//OUTPUT//
Evaluating on Dataset 1
Decision Tree Accuracy: 0.400
Bagging (3 estimators) Accuracy: 0.400
Bagging (5 estimators) Accuracy: 0.400
Bagging (7 estimators) Accuracy: 0.600
Bagging (9 estimators) Accuracy: 0.800
AdaBoost (3 estimators) Accuracy: 0.200
AdaBoost (5 estimators) Accuracy: 0.200
AdaBoost (7 estimators) Accuracy: 0.200
AdaBoost (9 estimators) Accuracy: 0.200
Evaluating on Dataset 2
Decision Tree Accuracy: 0.400
Bagging (3 estimators) Accuracy: 0.400
Bagging (5 estimators) Accuracy: 0.400
Bagging (7 estimators) Accuracy: 0.400
Bagging (9 estimators) Accuracy: 0.400
AdaBoost (3 estimators) Accuracy: 0.800
AdaBoost (5 estimators) Accuracy: 0.800
AdaBoost (7 estimators) Accuracy: 0.800
AdaBoost (9 estimators) Accuracy: 0.800
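As a hedged addition, the accuracies printed above for Dataset 1 can be placed side by side in a bar chart to make the comparison easier to read (the values below are copied from the output above):
# Bar chart of the Dataset 1 accuracies reported above.
methods = ['DT', 'Bag-3', 'Bag-5', 'Bag-7', 'Bag-9', 'Ada-3', 'Ada-5', 'Ada-7', 'Ada-9']
accuracies = [0.400, 0.400, 0.400, 0.600, 0.800, 0.200, 0.200, 0.200, 0.200]
plt.figure(figsize=(8, 4))
plt.bar(methods, accuracies, color='steelblue')
plt.ylabel('Accuracy')
plt.title('Dataset 1: Decision Tree vs. Bagging vs. AdaBoost')
plt.show()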
Q4. Download a dataset and check whether outliers are
present in the dataset. Use different methods of outlier
detection and compare their performance.
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import IsolationForest
from scipy.stats import zscore
# Step 1: Download Dataset
data = fetch_california_housing()
df = pd.DataFrame(data.data, columns=data.feature_names)
# Step 2: Detect Outliers Using Different Methods
# 1. Z-score Method (Threshold: |Z| > 3)
def detect_outliers_zscore(df, threshold=3):
    z_scores = np.abs(zscore(df))
    outliers = (z_scores > threshold).sum(axis=1)
    return outliers > 0
z_outliers = detect_outliers_zscore(df)
# 2. IQR Method
def detect_outliers_iqr(df):
    Q1 = df.quantile(0.25)
    Q3 = df.quantile(0.75)
    IQR = Q3 - Q1
    outliers = ((df < (Q1 - 1.5 * IQR)) | (df > (Q3 + 1.5 * IQR))).sum(axis=1)
    return outliers > 0
iqr_outliers = detect_outliers_iqr(df)
# 3. Isolation Forest Method
iso_forest = IsolationForest(contamination=0.05, random_state=42)
iso_forest.fit(df)
# Predict on the same DataFrame used for fitting, so feature names stay consistent
outliers_iso = iso_forest.predict(df)
isolation_outliers = outliers_iso == -1  # Isolation Forest labels outliers as -1
# Step 3: Compare Results
print("Outliers detected by Z-score:", np.sum(z_outliers))
print("Outliers detected by IQR:", np.sum(iqr_outliers))
print("Outliers detected by Isolation Forest:", np.sum(isolation_outliers))
# Visualizing Outlier Distribution
plt.figure(figsize=(10, 5))
sns.boxplot(data=df)
plt.xticks(rotation=90)
plt.title("Boxplot of Features for Outlier Detection")
plt.show()
# Display outlier comparison
df_outliers = pd.DataFrame({
"Z-Score": z_outliers,
"IQR": iqr_outliers,
"Isolation Forest": isolation_outliers
})
print("\nComparison of Outlier Detection Methods:")
print(df_outliers.mean())
//OUTPUT//
Outliers detected by Z-score: 846
Outliers detected by IQR: 3798
Outliers detected by Isolation Forest: 1032
Comparison of Outlier Detection Methods:
Z-Score 0.040988
IQR 0.184012
Isolation Forest 0.050000
dtype: float64
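The three methods flag very different numbers of rows. As an added sketch (it reuses the boolean masks z_outliers, iqr_outliers and isolation_outliers from the code above), their pairwise agreement can be quantified with the Jaccard index of the flagged sets:
# Pairwise Jaccard agreement between detectors: intersection over union of flagged rows.
masks = {
    "Z-Score": np.asarray(z_outliers),
    "IQR": np.asarray(iqr_outliers),
    "Isolation Forest": np.asarray(isolation_outliers),
}
names = list(masks)
for i in range(len(names)):
    for j in range(i + 1, len(names)):
        a, b = masks[names[i]], masks[names[j]]
        jaccard = (a & b).sum() / (a | b).sum()
        print(f"{names[i]} vs {names[j]}: Jaccard agreement = {jaccard:.3f}")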
Q5. Perform the CluStream algorithm on any time series
data from Kaggle and compare its output with that of
K-means clustering. Evaluate the cluster quality by
changing the algorithm's parameters.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, davies_bouldin_score
# Step 1: Load and preprocess dataset
df = pd.read_csv("timeseries.csv")
df.dropna(inplace=True)
# Normalize the dataset
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df.iloc[:, 1:]) # Assuming first column is time index
# Step 2: CluStream Approximation (Using MiniBatchKMeans for Streaming)
mb_kmeans = MiniBatchKMeans(n_clusters=5, batch_size=100, random_state=42)
mb_kmeans.partial_fit(df_scaled) # Single incremental update over the full data (a chunked streaming variant is sketched after the output)
clustream_labels = mb_kmeans.predict(df_scaled) # Get final cluster assignments
# Step 3: Apply K-Means Clustering
kmeans = KMeans(n_clusters=5, random_state=42)
kmeans_labels = kmeans.fit_predict(df_scaled)
# Step 4: Evaluate Clustering Quality
def evaluate_clustering(data, labels, method_name):
    silhouette = silhouette_score(data, labels)
    db_index = davies_bouldin_score(data, labels)
    print(f"\n{method_name} Clustering Evaluation:")
    print(f"Silhouette Score: {silhouette:.4f}")
    print(f"Davies-Bouldin Index: {db_index:.4f}")
evaluate_clustering(df_scaled, clustream_labels, "CluStream (Approx)")
evaluate_clustering(df_scaled, kmeans_labels, "K-Means")
# Step 5: Visualize Clusters
plt.figure(figsize=(10, 5))
sns.scatterplot(x=df_scaled[:, 0], y=df_scaled[:, 1], hue=kmeans_labels, palette="viridis",
s=20)
plt.title("K-Means Clustering Results")
plt.show()
plt.figure(figsize=(10, 5))
sns.scatterplot(x=df_scaled[:, 0], y=df_scaled[:, 1], hue=clustream_labels,
palette="coolwarm", s=20)
plt.title("CluStream (Approx) Clustering Results")
plt.show()
//OUTPUT//
CluStream (Approx) Clustering Evaluation:
Silhouette Score: 0.3766
Davies-Bouldin Index: 0.8387
K-Means Clustering Evaluation:
Silhouette Score: 0.4363
Davies-Bouldin Index: 0.6774
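A caveat on the CluStream approximation: partial_fit above is called once on the entire dataset, so it is effectively a single mini-batch update. The sketch below is an added, hedged variant that feeds the data in chunks, which is closer to CluStream's online phase (the chunk size and cluster count are arbitrary choices):
# Hypothetical chunked streaming pass: update the model batch by batch, then
# assign final cluster labels and evaluate with the same helper as above.
stream_model = MiniBatchKMeans(n_clusters=5, random_state=42)
chunk_size = 100  # arbitrary stream batch size
for start in range(0, len(df_scaled), chunk_size):
    stream_model.partial_fit(df_scaled[start:start + chunk_size])
stream_labels = stream_model.predict(df_scaled)
evaluate_clustering(df_scaled, stream_labels, "CluStream (chunked approx)")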