40
40
41
41
42
42
def fetch_kddcup99 (subset = None , shuffle = False , random_state = None ,
43
- percent10 = False ):
44
- """Load and return the kddcup 99 dataset (regression ).
43
+ percent10 = False , download_if_missing = True ):
44
+ """Load and return the kddcup 99 dataset (classification ).
45
45
46
46
The KDD Cup '99 dataset was created by processing the tcpdump portions
47
47
of the 1998 DARPA Intrusion Detection System (IDS) Evaluation dataset,
@@ -93,7 +93,7 @@ def fetch_kddcup99(subset=None, shuffle=False, random_state=None,
93
93
94
94
================ ==========================================
95
95
Samples total 699691
96
- Dimensionality 40
96
+ Dimensionality 4
97
97
Features discrete (int) or continuous (float)
98
98
Targets str, 'normal.' or name of the anomaly type
99
99
================ ==========================================
@@ -102,7 +102,7 @@ def fetch_kddcup99(subset=None, shuffle=False, random_state=None,
102
102
103
103
================ ==========================================
104
104
Samples total 619052
105
- Dimensionality 39
105
+ Dimensionality 3
106
106
Features discrete (int) or continuous (float)
107
107
Targets str, 'normal.' or name of the anomaly type
108
108
================ ==========================================
@@ -111,7 +111,7 @@ def fetch_kddcup99(subset=None, shuffle=False, random_state=None,
111
111
112
112
================ ==========================================
113
113
Samples total 95373
114
- Dimensionality 39
114
+ Dimensionality 3
115
115
Features discrete (int) or continuous (float)
116
116
Targets str, 'normal.' or name of the anomaly type
117
117
================ ==========================================
@@ -135,6 +135,10 @@ def fetch_kddcup99(subset=None, shuffle=False, random_state=None,
135
135
percent10 : bool, default=False
136
136
Whether to load only 10 percent of the data.
137
137
138
+ download_if_missing : bool, default=True
139
+ If False, raise an IOError if the data is not locally available
140
+ instead of trying to download the data from the source site.
141
+
138
142
Returns
139
143
-------
140
144
data : Bunch
@@ -153,13 +157,14 @@ def fetch_kddcup99(subset=None, shuffle=False, random_state=None,
153
157
Intrusions in Unlabeled Data (2002) by Eleazar Eskin, Andrew Arnold,
154
158
Michael Prerau, Leonid Portnoy, Sal Stolfo
155
159
"""
156
- kddcup99 = _fetch_brute_kddcup99 (shuffle = shuffle , percent10 = percent10 )
160
+ kddcup99 = _fetch_brute_kddcup99 (shuffle = shuffle , percent10 = percent10 ,
161
+ download_if_missing = download_if_missing )
157
162
158
163
data = kddcup99 .data
159
164
target = kddcup99 .target
160
165
161
166
if subset == 'SA' :
162
- s = target == 'normal.'
167
+ s = target == b 'normal.'
163
168
t = np .logical_not (s )
164
169
normal_samples = data [s , :]
165
170
normal_targets = target [s ]
@@ -187,13 +192,13 @@ def fetch_kddcup99(subset=None, shuffle=False, random_state=None,
187
192
data [:, 5 ] = np .log ((data [:, 5 ] + 0.1 ).astype (float ))
188
193
189
194
if subset == 'http' :
190
- s = data [:, 2 ] == 'http'
195
+ s = data [:, 2 ] == b 'http'
191
196
data = data [s ]
192
197
target = target [s ]
193
198
data = np .c_ [data [:, 0 ], data [:, 4 ], data [:, 5 ]]
194
199
195
200
if subset == 'smtp' :
196
- s = data [:, 2 ] == 'smtp'
201
+ s = data [:, 2 ] == b 'smtp'
197
202
data = data [s ]
198
203
target = target [s ]
199
204
data = np .c_ [data [:, 0 ], data [:, 4 ], data [:, 5 ]]
0 commit comments