scikit-learn
diff --git a/examples/datasets/generate_missing_values.py b/examples/datasets/generate_missing_values.py
@@ -0,0 +1,156 @@
+"""
+=============================================
+Generating MNAR / MCAR missing_values in data
+=============================================
+
+This example illustrates how :func:`sklearn.datasets.drop_values` can be
+used to generate missing values that are either correlated or uncorrelated
+with the target.
+
+The function supports generating missing values incrementally, so that an
+exact fraction of missing values can be introduced when benchmarking
+missing-value handling strategies and evaluating how such strategies
+perform with respect to the type and extent of missingness in the data.
+
+MNAR or Missing Not At Random refers to the case when the missingness in the
+data is correlated with the target value(s).
+
+MCAR or Missing Completely At Random refers to the case when the missingness
+in the data is completely random and does not correlate with the target
+value(s).
+"""
+# Author: Raghav RV <rvraghav93@gmail.com>
+#
+# License: BSD 3 clause
+
+from __future__ import print_function
+from sklearn.datasets import drop_values
+import numpy as np
+
+print(__doc__)
+
+
+# Toy dataset: 15 samples with 3 features each, and one target label
+# (0, 1 or 2) per sample.
+X = [
+    [0, 1, 2], [3, 4, 5], [6, 7, 8],
+    [9, 0, 1], [2, 3, 4], [8, 9, 8],
+    [1, 0, 5], [7, 8, 9], [5, 4, 3],
+    [2, 1, 1], [3, 4, 5], [2, 3, 4],
+    [8, 9, 8], [1, 0, 5], [7, 8, 9],
+]
+y = [0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 2, 2, 2, 2, 2]
+
+# First pass: drop 10% of the values across all features, for samples
+# whose target class label is randomly chosen.
+#
+# The missing mask and the chosen label(s) are requested back as well,
+# because the following steps reuse both of them.
+X, y, mm, labels = drop_values(
+    X, y,
+    drop_fraction=0.1,
+    return_missing_mask=True,
+    return_labels=True,
+    copy=False,
+    verbose=True,
+    random_state=42)
+
+print("After dropping 10%% of values when class label(s) are %r\n" % labels)
+print("y \t X")
+print("------------------------")
+# One output line per sample: its target followed by its feature row.
+for row_idx, target in enumerate(y):
+    print(target, '\t', X[row_idx])
+    
+    
+print("\n\n")
+# Second pass: drop another 10% of the values across all features, again
+# for samples whose target class label is the one chosen in the first pass.
+# NOTE(review): drop_fraction=0.2 appears to be the cumulative fraction
+# (10% from before + 10% now) -- confirm against the drop_values docs.
+#
+# NOTE: the missing mask returned by the previous step is passed back in
+# to avoid it getting recomputed.
+X, y, mm = drop_values(
+    X, y,
+    missing_mask=mm,
+    drop_fraction=0.2,
+    labels=labels,
+    return_labels=False,
+    return_missing_mask=True,
+    copy=False,
+    verbose=True,
+    random_state=42)
+
+header = "After dropping another 10%% of values when class label(s) are %r\n"
+print(header % labels)
+print("y \t X")
+print("------------------------")
+# One output line per sample: its target followed by its feature row.
+for row_idx, target in enumerate(y):
+    print(target, '\t', X[row_idx])
+
+print('\n\n')
+    
+# Third pass: drop another 10%, but this time from class 1 (see the
+# explicit ``labels=[1]`` below).
+#
+# This time X is not modified in place: only the missing mask is requested
+# (missing_mask_only=True) and the missing values are set manually
+# afterwards.
+#
+# The previous missing mask is deliberately not passed in, so it gets
+# computed on the fly.
+
+# Keep a copy of the old missing mask so that the entries dropped in this
+# step can be told apart from the ones dropped earlier.
+mm_old = mm.copy()
+
+X, y, mm = drop_values(X, y,
+                       drop_fraction=0.3,
+                       # Explicitly ask for missing values tied to class 1
+                       labels=[1],
+                       return_labels=False,
+                       return_missing_mask=True,
+                       missing_mask_only=True,
+                       copy=False,
+                       verbose=True,
+                       random_state=42)
+
+# The message used to claim the values "are set", which contradicted
+# missing_mask_only=True and the "X is not modified" output that follows.
+print("NOTE that the missing_values are not set. "
+      "Only the missing mask is updated...")
+print("y \t missing_mask")
+print("------------------------")
+# One output line per sample: its target followed by its mask row.
+for row_idx, target in enumerate(y):
+    print(target, '\t', mm[row_idx])
+
+print('\n\n')
+# Show X again to demonstrate that the mask-only call left it untouched.
+print('\nThe X is not modified')
+print("y \t X")
+print("------------------------")
+# One output line per sample: its target followed by its feature row.
+for row_idx, target in enumerate(y):
+    print(target, '\t', X[row_idx])
+
+print('\n\n')
+
+    
+# Manually apply the mask to X, but only for the newly missing values.
+#
+# XOR-ing the old and the new mask keeps the entries whose missing status
+# changed in the last step, i.e. the newly dropped ones (this assumes the
+# new mask still flags everything the old mask did).
+newly_missing = mm_old ^ mm
+X[newly_missing] = np.nan
+
+print("After manually updating the new missing values")
+print("y \t X")
+print("------------------------")
+# One output line per sample: its target followed by its feature row.
+for row_idx, target in enumerate(y):
+    print(target, '\t', X[row_idx])
+
+print('\n\n')
+
+# Final pass: add an additional 10% of random missing values.
+# NOTE(review): label_correlation=0 presumably requests missingness that is
+# not correlated with any target class (the MCAR case described in the
+# module docstring) -- confirm against the drop_values docs.
+X, y = drop_values(
+    X, y,
+    drop_fraction=0.4,
+    label_correlation=0,
+    copy=False,
+    verbose=True,
+    random_state=42)
+
+print("y \t X")
+print("------------------------")
+# One output line per sample: its target followed by its feature row.
+for row_idx, target in enumerate(y):
+    print(target, '\t', X[row_idx])