EXA Add example to illustrate generation of MCAR/MNAR missing vals · scikit-learn/scikit-learn@fcc9d11 · GitHub
[go: up one dir, main page]

Skip to content

Commit fcc9d11

Browse files
committed
EXA Add example to illustrate generation of MCAR/MNAR missing vals
1 parent 4b2ed1f commit fcc9d11

File tree

1 file changed

+156
-0
lines changed

1 file changed

+156
-0
lines changed
Lines changed: 156 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,156 @@
"""
=============================================
Generating MNAR / MCAR missing_values in data
=============================================

This example illustrates how the :func:`sklearn.datasets.drop_values` can
be used to generate missing values that are correlated/non-correlated
with the target.

This function provisions generating missing values incrementally so that an
exact fraction of missing values can be introduced for benchmarking
missing-value handling strategies and evaluating the performance of such
strategies with respect to the type and extent of missingness in data.

MNAR or Missing Not At Random refers to the case when the missingness in the
data is correlated with the target value(s).

MCAR or Missing Completely At Random refers to the case when the missingness
in the data is completely random and does not correlate with the target
value(s).
"""
# Author: Raghav RV <rvraghav93@gmail.com>
#
# License: BSD 3 clause

from __future__ import print_function

import numpy as np

from sklearn.datasets import drop_values

print(__doc__)
# A small toy dataset: 15 samples with 3 features each.
X = [
    [0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 0, 1], [2, 3, 4],
    [8, 9, 8], [1, 0, 5], [7, 8, 9], [5, 4, 3], [2, 1, 1],
    [3, 4, 5], [2, 3, 4], [8, 9, 8], [1, 0, 5], [7, 8, 9],
]
# One class label (0, 1 or 2) per sample.
y = [0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 2, 2, 2, 2, 2]
# Drop 10% of values across all features for samples
# where the target class label is randomly chosen

X, y, mm, labels = drop_values(
    X, y, drop_fraction=0.1, return_missing_mask=True, return_labels=True,
    copy=False, verbose=True, random_state=42)

# Report which labels were targeted and show the (partially masked) data.
print("After dropping 10%% of values when class label(s) are %r\n" % labels)
print("y \t X")
print("------------------------")
for target, row in zip(y, X):
    print(target, '\t', row)


print("\n\n")
# Drop 10% of values across all features for samples
# where the target class label is same as what was chosen before

# NOTE We can now pass the missing mask from the previous step
# to avoid it getting recomputed.

X, y, mm = drop_values(
    X, y, missing_mask=mm, drop_fraction=0.2, labels=labels,
    return_labels=False, return_missing_mask=True, copy=False,
    verbose=True, random_state=42)

# Show the data again after the cumulative 20% drop on the same labels.
print("After dropping another 10%% of values when class label(s) are %r\n" % labels)
print("y \t X")
print("------------------------")
for target, row in zip(y, X):
    print(target, '\t', row)

print('\n\n')
# Now drop another 10%, but this time from class 1
# This time let us not modify X inplace and instead return the missing mask
# and manually set the missing_values

# This time we are not passing the previous missing_mask and allowing it to
# get computed on the fly

# Let us store the old missing mask, so the newly dropped entries can be
# identified later by comparing against the updated mask.
mm_old = mm.copy()

X, y, mm = drop_values(X, y,
                       drop_fraction=0.3,
                       # Explicitly specify we want missing values
                       # correlated to class 1
                       labels=[1, ],
                       return_labels=False,
                       return_missing_mask=True,
                       # Only compute/update the mask; leave X untouched.
                       missing_mask_only=True,
                       copy=False,
                       verbose=True,
                       random_state=42)

print("NOTE that the missing_values are not set. "
      "Only the missing mask is updated...")
print("y \t missing_mask")
print("------------------------")
for i in range(y.shape[0]):
    print(y[i], '\t', mm[i])

print('\n\n')
print('\nThe X is not modified')
print("y \t X")
print("------------------------")
for i in range(y.shape[0]):
    print(y[i], '\t', X[i])

print('\n\n')
# Manually update the missing values from the mask
# only for the newly missing values

# Entries that are in the updated mask but not the old one.
newly_missing = np.logical_xor(mm_old, mm)
X[newly_missing] = np.nan

print("After manually updating the new missing values")
print("y \t X")
print("------------------------")
for target, row in zip(y, X):
    print(target, '\t', row)

print('\n\n')
# Now let us add additional 10% of random missing values

X, y = drop_values(X, y,
                   drop_fraction=0.4,
                   # label_correlation=0 presumably makes the newly dropped
                   # values uncorrelated with the labels (MCAR) — confirm
                   # against the drop_values API
                   label_correlation=0,
                   copy=False,
                   verbose=True,
                   random_state=42)

# Final report of labels alongside the masked data.
print("y \t X")
print("------------------------")
for i in range(y.shape[0]):
    print(y[i], '\t', X[i])

0 commit comments

Comments (0)