Improving Bunch Class to ensure consistent attributes · scikit-learn/scikit-learn@a4d0c0f · GitHub

Commit a4d0c0f

Improving Bunch Class to ensure consistent attributes
Add __setattr__/__getattr__ methods so that attribute access writes to and reads from the same entry as `bunch[key]`. Add a non-regression test for a bug in fetch_20newsgroups.
1 parent 0fe613e commit a4d0c0f

File tree

3 files changed, +44 -7 lines changed

sklearn/datasets/base.py
27 additions, 3 deletions

@@ -18,19 +18,43 @@
 from os.path import isdir
 from os import listdir
 from os import makedirs
+import re
 
 import numpy as np
 
 from ..utils import check_random_state
 
 
 class Bunch(dict):
-    """Container object for datasets: dictionary-like object that
-    exposes its keys as attributes."""
+    """Container object for datasets
+
+    Dictionary-like object that exposes its keys as attributes.
+
+    >>> b = Bunch(a=1, b=2)
+    >>> b['b']
+    2
+    >>> b.b
+    2
+    >>> b.a = 3
+    >>> b['a']
+    3
+    >>> b.c = 6
+    >>> b['c']
+    6
+
+    """
 
     def __init__(self, **kwargs):
         dict.__init__(self, kwargs)
-        self.__dict__ = self
+
+    def __setattr__(self, key, value):
+        self[key] = value
+
+    def __getattr__(self, key):
+        return self[key]
+
+    def __getstate__(self):
+        return self.__dict__
 
 
 def get_data_home(data_home=None):
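
For readers who want to poke at the new behaviour without checking out the branch, here is a minimal, self-contained copy of the class from the diff above, plus a few sanity checks. The remarks in the comments (why __getstate__ is there, the KeyError for missing attributes) are inferences from the code, not statements made by the commit.

# Minimal sketch: the Bunch class copied from the diff above, exercised
# stand-alone (no scikit-learn import needed).


class Bunch(dict):
    """Dictionary-like container that exposes its keys as attributes."""

    def __init__(self, **kwargs):
        dict.__init__(self, kwargs)

    def __setattr__(self, key, value):
        # Attribute writes go into the dict itself, so b.x and b['x']
        # always name the same entry (the instance __dict__ stays empty).
        self[key] = value

    def __getattr__(self, key):
        # Called only when normal attribute lookup fails; fall back to the
        # dict entry.  Note that a missing key surfaces as KeyError here,
        # not AttributeError.
        return self[key]

    def __getstate__(self):
        # Inference: gives pickle an explicit instance state so its probe
        # for '__getstate__' does not have to go through __getattr__ above.
        return self.__dict__


b = Bunch(a=1)
b.b = 2                      # attribute write ...
assert b['b'] == 2           # ... visible through the key
b['a'] = 3                   # key write ...
assert b.a == 3              # ... visible through the attribute
assert b.__dict__ == {}      # nothing is stored on the instance itself

try:
    b.missing
except KeyError:             # not AttributeError -- see __getattr__ above
    pass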

sklearn/datasets/tests/test_20news.py
13 additions, 0 deletions

@@ -38,6 +38,19 @@ def test_20news():
         entry2 = data.data[np.where(data.target == label)[0][0]]
         assert_equal(entry1, entry2)
 
+def test_20news_length_consistency():
+    """Checks the length consistencies within the bunch"""
+    try:
+        data = datasets.fetch_20newsgroups(
+            subset='all', download_if_missing=False, shuffle=False)
+    except IOError:
+        raise SkipTest("Download 20 newsgroups to run this test")
+    # Extract the full dataset
+    data = datasets.fetch_20newsgroups(subset='all')
+    assert_equal(len(data['data']), len(data.data))
+    assert_equal(len(data['target']), len(data.target))
+    assert_equal(len(data['filenames']), len(data.filenames))
+
 
 def test_20news_vectorized():
     # This test is slow.
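
The inconsistency this test guards against is easiest to reproduce with the previous Bunch implementation. The sketch below is a plausible reconstruction, not code from the repository: OldBunch is a hypothetical stand-in for the removed `self.__dict__ = self` version, and the pickle round trip stands in for the on-disk cache that fetch_20newsgroups reads back before filling in the merged 'all' fields.

# Hypothetical reconstruction of the pre-fix failure mode.  'OldBunch'
# mirrors the removed `self.__dict__ = self` implementation; it is not
# part of scikit-learn.
import pickle


class OldBunch(dict):
    def __init__(self, **kwargs):
        dict.__init__(self, kwargs)
        self.__dict__ = self        # keys and attributes share one dict...


b = OldBunch(data=['doc-1', 'doc-2'])

# ...but a pickle round trip (what the 20newsgroups on-disk cache amounts to)
# rebuilds the instance with a *separate* attribute __dict__.
b = pickle.loads(pickle.dumps(b))

b.data = ['doc-1', 'doc-2', 'doc-3']   # attribute write, like the
                                       # `data.data = data_lst` assignment
                                       # replaced in twenty_newsgroups.py below

print(len(b.data), len(b['data']))     # typically "3 2" on CPython: attribute
                                       # and key no longer agree, which is what
                                       # the new length-consistency test catches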

sklearn/datasets/twenty_newsgroups.py
4 additions, 4 deletions

@@ -161,7 +161,7 @@ def fetch_20newsgroups(data_home=None, subset='train', categories=None,
         for the test set, 'all' for both, with shuffled ordering.
 
     data_home: optional, default: None
-        Specify an download and cache folder for the datasets. If None,
+        Specify a download and cache folder for the datasets. If None,
         all scikit-learn data is stored in '~/scikit_learn_data' subfolders.
 
     categories: None or collection of string or unicode
@@ -231,9 +231,9 @@ def fetch_20newsgroups(data_home=None, subset='train', categories=None,
             target.extend(data.target)
             filenames.extend(data.filenames)
 
-        data.data = data_lst
-        data.target = np.array(target)
-        data.filenames = np.array(filenames)
+        data['data'] = data_lst
+        data['target'] = np.array(target)
+        data['filenames'] = np.array(filenames)
     else:
         raise ValueError(
            "subset can only be 'train', 'test' or 'all', got '%s'" % subset)
