BUG: Fixed comparison with bytes in kddcup99.py + test · scikit-learn/scikit-learn@8497c89 · GitHub
[go: up one dir, main page]

Skip to content

Commit 8497c89

Browse files
author
Nikolay Mayorov
committed
BUG: Fixed comparison with bytes in kddcup99.py + test
1 parent 4b8dc27 commit 8497c89

File tree

2 files changed

+60
-9
lines changed

2 files changed

+60
-9
lines changed

sklearn/datasets/kddcup99.py

Lines changed: 14 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -40,8 +40,8 @@
4040

4141

4242
def fetch_kddcup99(subset=None, shuffle=False, random_state=None,
43-
percent10=False):
44-
"""Load and return the kddcup 99 dataset (regression).
43+
percent10=False, download_if_missing=True):
44+
"""Load and return the kddcup 99 dataset (classification).
4545
4646
The KDD Cup '99 dataset was created by processing the tcpdump portions
4747
of the 1998 DARPA Intrusion Detection System (IDS) Evaluation dataset,
@@ -93,7 +93,7 @@ def fetch_kddcup99(subset=None, shuffle=False, random_state=None,
9393
9494
================ ==========================================
9595
Samples total 699691
96-
Dimensionality 40
96+
Dimensionality 4
9797
Features discrete (int) or continuous (float)
9898
Targets str, 'normal.' or name of the anomaly type
9999
================ ==========================================
@@ -102,7 +102,7 @@ def fetch_kddcup99(subset=None, shuffle=False, random_state=None,
102102
103103
================ ==========================================
104104
Samples total 619052
105-
Dimensionality 39
105+
Dimensionality 3
106106
Features discrete (int) or continuous (float)
107107
Targets str, 'normal.' or name of the anomaly type
108108
================ ==========================================
@@ -111,7 +111,7 @@ def fetch_kddcup99(subset=None, shuffle=False, random_state=None,
111111
112112
================ ==========================================
113113
Samples total 95373
114-
Dimensionality 39
114+
Dimensionality 3
115115
Features discrete (int) or continuous (float)
116116
Targets str, 'normal.' or name of the anomaly type
117117
================ ==========================================
@@ -135,6 +135,10 @@ def fetch_kddcup99(subset=None, shuffle=False, random_state=None,
135135
percent10 : bool, default=False
136136
Whether to load only 10 percent of the data.
137137
138+
download_if_missing : bool, default=True
139+
If False, raise an IOError if the data is not locally available
140+
instead of trying to download the data from the source site.
141+
138142
Returns
139143
-------
140144
data : Bunch
@@ -153,13 +157,14 @@ def fetch_kddcup99(subset=None, shuffle=False, random_state=None,
153157
Intrusions in Unlabeled Data (2002) by Eleazar Eskin, Andrew Arnold,
154158
Michael Prerau, Leonid Portnoy, Sal Stolfo
155159
"""
156-
kddcup99 = _fetch_brute_kddcup99(shuffle=shuffle, percent10=percent10)
160+
kddcup99 = _fetch_brute_kddcup99(shuffle=shuffle, percent10=percent10,
161+
download_if_missing=download_if_missing)
157162

158163
data = kddcup99.data
159164
target = kddcup99.target
160165

161166
if subset == 'SA':
162-
s = target == 'normal.'
167+
s = target == b'normal.'
163168
t = np.logical_not(s)
164169
normal_samples = data[s, :]
165170
normal_targets = target[s]
@@ -187,13 +192,13 @@ def fetch_kddcup99(subset=None, shuffle=False, random_state=None,
187192
data[:, 5] = np.log((data[:, 5] + 0.1).astype(float))
188193

189194
if subset == 'http':
190-
s = data[:, 2] == 'http'
195+
s = data[:, 2] == b'http'
191196
data = data[s]
192197
target = target[s]
193198
data = np.c_[data[:, 0], data[:, 4], data[:, 5]]
194199

195200
if subset == 'smtp':
196-
s = data[:, 2] == 'smtp'
201+
s = data[:, 2] == b'smtp'
197202
data = data[s]
198203
target = target[s]
199204
data = np.c_[data[:, 0], data[:, 4], data[:, 5]]
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
"""Test kddcup99 loader. Only 'percent10' mode is tested, as the full data
2+
is too big to use in unit-testing.
3+
4+
The test is skipped if the data wasn't previously fetched and saved to
5+
the scikit-learn data folder.
6+
"""
7+
8+
import errno
9+
from sklearn.datasets import fetch_kddcup99
10+
from sklearn.utils.testing import assert_equal, SkipTest
11+
12+
13+
def test_percent10():
    """Check the shapes returned by ``fetch_kddcup99`` in 'percent10' mode.

    The unfiltered data and each subset ('SA', 'SF', 'http', 'smtp') are
    loaded with ``percent10=True`` and the shapes of ``data`` and ``target``
    are compared against the expected values.

    Raises
    ------
    SkipTest
        If the initial load (done with ``download_if_missing=False``) fails
        with an ``IOError`` whose ``errno`` is ``errno.ENOENT``, i.e. the
        dataset is not available locally.
    """
    try:
        data = fetch_kddcup99(download_if_missing=False, percent10=True)
    except IOError as e:
        if e.errno == errno.ENOENT:
            raise SkipTest("kddcup99 dataset can not be loaded.")
        # Any other I/O failure is a genuine error: re-raise it instead of
        # swallowing it and failing later with a NameError on ``data``.
        raise

    assert_equal(data.data.shape, (494021, 41))
    assert_equal(data.target.shape, (494021,))

    # Shuffling must only reorder samples, never change the shapes.
    data_shuffled = fetch_kddcup99(shuffle=True, random_state=0,
                                   percent10=True)
    assert_equal(data.data.shape, data_shuffled.data.shape)
    assert_equal(data.target.shape, data_shuffled.target.shape)

    data = fetch_kddcup99('SA', percent10=True)
    assert_equal(data.data.shape, (100655, 41))
    assert_equal(data.target.shape, (100655,))

    data = fetch_kddcup99('SF', percent10=True)
    assert_equal(data.data.shape, (73237, 4))
    assert_equal(data.target.shape, (73237,))

    data = fetch_kddcup99('http', percent10=True)
    assert_equal(data.data.shape, (58725, 3))
    assert_equal(data.target.shape, (58725,))

    data = fetch_kddcup99('smtp', percent10=True)
    assert_equal(data.data.shape, (9571, 3))
    assert_equal(data.target.shape, (9571,))
43+
44+
45+
if __name__ == '__main__':
46+
test_percent10()

0 commit comments

Comments
 (0)
0