BUG: Fixed fetch_kddcup99 for Python 3 · scikit-learn/scikit-learn@51addc0 · GitHub
[go: up one dir, main page]

Skip to content

Commit 51addc0

Browse files
author
Nikolay Mayorov
committed
BUG: Fixed fetch_kddcup99 for Python 3
BUG: Fixed comparison with bytes in kddcup99.py + test MAINT: Changed default 'percent10' to True in fetch_kddcup99
1 parent f63ac6e commit 51addc0

File tree

2 files changed

+58
-10
lines changed

2 files changed

+58
-10
lines changed

sklearn/datasets/kddcup99.py

Lines changed: 17 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424

2525
from .base import get_data_home
2626
from .base import Bunch
27-
from ..externals import joblib
27+
from ..externals import joblib, six
2828
from ..utils import check_random_state
2929
from ..utils import shuffle as shuffle_method
3030

@@ -40,8 +40,8 @@
4040

4141

4242
def fetch_kddcup99(subset=None, shuffle=False, random_state=None,
43-
percent10=False):
44-
"""Load and return the kddcup 99 dataset (regression).
43+
percent10=True, download_if_missing=True):
44+
"""Load and return the kddcup 99 dataset (classification).
4545
4646
The KDD Cup '99 dataset was created by processing the tcpdump portions
4747
of the 1998 DARPA Intrusion Detection System (IDS) Evaluation dataset,
@@ -93,7 +93,7 @@ def fetch_kddcup99(subset=None, shuffle=False, random_state=None,
9393
9494
================ ==========================================
9595
Samples total 699691
96-
Dimensionality 40
96+
Dimensionality 4
9797
Features discrete (int) or continuous (float)
9898
Targets str, 'normal.' or name of the anomaly type
9999
================ ==========================================
@@ -102,7 +102,7 @@ def fetch_kddcup99(subset=None, shuffle=False, random_state=None,
102102
103103
================ ==========================================
104104
Samples total 619052
105-
Dimensionality 39
105+
Dimensionality 3
106106
Features discrete (int) or continuous (float)
107107
Targets str, 'normal.' or name of the anomaly type
108108
================ ==========================================
@@ -111,7 +111,7 @@ def fetch_kddcup99(subset=None, shuffle=False, random_state=None,
111111
112112
================ ==========================================
113113
Samples total 95373
114-
Dimensionality 39
114+
Dimensionality 3
115115
Features discrete (int) or continuous (float)
116116
Targets str, 'normal.' or name of the anomaly type
117117
================ ==========================================
@@ -135,6 +135,10 @@ def fetch_kddcup99(subset=None, shuffle=False, random_state=None,
135135
percent10 : bool, default=True
136136
Whether to load only 10 percent of the data.
137137
138+
download_if_missing : bool, default=True
139+
If False, raise an IOError if the data is not locally available
140+
instead of trying to download the data from the source site.
141+
138142
Returns
139143
-------
140144
data : Bunch
@@ -153,13 +157,14 @@ def fetch_kddcup99(subset=None, shuffle=False, random_state=None,
153157
Intrusions in Unlabeled Data (2002) by Eleazar Eskin, Andrew Arnold,
154158
Michael Prerau, Leonid Portnoy, Sal Stolfo
155159
"""
156-
kddcup99 = _fetch_brute_kddcup99(shuffle=shuffle, percent10=percent10)
160+
kddcup99 = _fetch_brute_kddcup99(shuffle=shuffle, percent10=percent10,
161+
download_if_missing=download_if_missing)
157162

158163
data = kddcup99.data
159164
target = kddcup99.target
160165

161166
if subset == 'SA':
162-
s = target == 'normal.'
167+
s = target == b'normal.'
163168
t = np.logical_not(s)
164169
normal_samples = data[s, :]
165170
normal_targets = target[s]
@@ -187,13 +192,13 @@ def fetch_kddcup99(subset=None, shuffle=False, random_state=None,
187192
data[:, 5] = np.log((data[:, 5] + 0.1).astype(float))
188193

189194
if subset == 'http':
190-
s = data[:, 2] == 'http'
195+
s = data[:, 2] == b'http'
191196
data = data[s]
192197
target = target[s]
193198
data = np.c_[data[:, 0], data[:, 4], data[:, 5]]
194199

195200
if subset == 'smtp':
196-
s = data[:, 2] == 'smtp'
201+
s = data[:, 2] == b'smtp'
197202
data = data[s]
198203
target = target[s]
199204
data = np.c_[data[:, 0], data[:, 4], data[:, 5]]
@@ -320,6 +325,8 @@ def _fetch_brute_kddcup99(subset=None, data_home=None,
320325
file_ = GzipFile(fileobj=f, mode='r')
321326
Xy = []
322327
for line in file_.readlines():
328+
if six.PY3:
329+
line = line.decode()
323330
Xy.append(line.replace('\n', '').split(','))
324331
file_.close()
325332
print('extraction done')
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
"""Test kddcup99 loader. Only 'percent10' mode is tested, as the full data
2+
is too big to use in unit-testing.
3+
4+
The test is skipped if the data wasn't previously fetched and saved to
5+
scikit-learn data folder.
6+
"""
7+
8+
import errno
9+
from sklearn.datasets import fetch_kddcup99
10+
from sklearn.utils.testing import assert_equal, SkipTest
11+
12+
13+
def test_percent10():
    """Check array shapes returned by ``fetch_kddcup99`` for each subset.

    The first load is done with ``download_if_missing=False``; if that
    fails with ``ENOENT`` the test is skipped instead of failing. Any other
    ``IOError`` is re-raised so the real cause is reported.
    """
    try:
        data = fetch_kddcup99(download_if_missing=False)
    except IOError as e:
        if e.errno == errno.ENOENT:
            raise SkipTest("kddcup99 dataset can not be loaded.")
        # Any other I/O failure is a genuine error: surface it here rather
        # than falling through to a NameError on the unbound ``data`` below.
        raise

    assert_equal(data.data.shape, (494021, 41))
    assert_equal(data.target.shape, (494021,))

    # Shuffling must not change the shapes.
    data_shuffled = fetch_kddcup99(shuffle=True, random_state=0)
    assert_equal(data.data.shape, data_shuffled.data.shape)
    assert_equal(data.target.shape, data_shuffled.target.shape)

    data = fetch_kddcup99('SA')
    assert_equal(data.data.shape, (100655, 41))
    assert_equal(data.target.shape, (100655,))

    data = fetch_kddcup99('SF')
    assert_equal(data.data.shape, (73237, 4))
    assert_equal(data.target.shape, (73237,))

    data = fetch_kddcup99('http')
    assert_equal(data.data.shape, (58725, 3))
    assert_equal(data.target.shape, (58725,))

    data = fetch_kddcup99('smtp')
    assert_equal(data.data.shape, (9571, 3))
    assert_equal(data.target.shape, (9571,))

0 commit comments

Comments
 (0)
0