-
Notifications
You must be signed in to change notification settings - Fork 236
Closed
Labels
Description
Creating a boxplot of a continuous value against a categorical x axis fails when any category has only one sample in the data frame.
For example, this works fine:
df = pd.DataFrame(
{
'weight': np.random.normal(size=20),
# Creating two categories, one with 18 samples, one with 2 samples
'category': pd.Categorical(18 * [0] + 2 * [1], categories=[0,1], ordered=True)
}
)
(
ggplot(df, aes(x='category', y='weight'))
+ geom_boxplot()
)
However, this example fails:
df = pd.DataFrame(
{
'weight': np.random.normal(size=20),
# Creating two categories, one with 19 samples, one with 1 sample
'category': pd.Categorical(19 * [0] + 1 * [1], categories=[0,1], ordered=True)
}
)
(
ggplot(df, aes(x='category', y='weight'))
+ geom_boxplot()
)
Below is the trace from the error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
~/.virtualenvs/pandas/lib/python3.6/site-packages/IPython/core/formatters.py in __call__(self, obj)
691 type_pprinters=self.type_printers,
692 deferred_pprinters=self.deferred_printers)
--> 693 printer.pretty(obj)
694 printer.flush()
695 return stream.getvalue()
~/.virtualenvs/pandas/lib/python3.6/site-packages/IPython/lib/pretty.py in pretty(self, obj)
378 if callable(meth):
379 return meth(obj, self, cycle)
--> 380 return _default_pprint(obj, self, cycle)
381 finally:
382 self.end_group()
~/.virtualenvs/pandas/lib/python3.6/site-packages/IPython/lib/pretty.py in _default_pprint(obj, p, cycle)
493 if _safe_getattr(klass, '__repr__', None) is not object.__repr__:
494 # A user-provided repr. Find newlines and replace them with p.break_()
--> 495 _repr_pprint(obj, p, cycle)
496 return
497 p.begin_group(1, '<')
~/.virtualenvs/pandas/lib/python3.6/site-packages/IPython/lib/pretty.py in _repr_pprint(obj, p, cycle)
691 """A pprint that just redirects to the normal repr function."""
692 # Find newlines and replace them with p.break_()
--> 693 output = repr(obj)
694 for idx,output_line in enumerate(output.splitlines()):
695 if idx:
~/.virtualenvs/pandas/lib/python3.6/site-packages/plotnine/ggplot.py in __repr__(self)
81 Print/show the plot
82 """
---> 83 self.draw()
84 plt.show()
85 return '<ggplot: (%d)>' % self.__hash__()
~/.virtualenvs/pandas/lib/python3.6/site-packages/plotnine/ggplot.py in draw(self)
138 # assign a default theme
139 self = deepcopy(self)
--> 140 self._build()
141
142 # If no theme we use the default
~/.virtualenvs/pandas/lib/python3.6/site-packages/plotnine/ggplot.py in _build(self)
234
235 # Apply and map statistics
--> 236 layers.compute_statistic(layout)
237 layers.map_statistic(self)
238
~/.virtualenvs/pandas/lib/python3.6/site-packages/plotnine/layer.py in compute_statistic(self, layout)
92 def compute_statistic(self, layout):
93 for l in self:
---> 94 l.compute_statistic(layout)
95
96 def map_statistic(self, plot):
~/.virtualenvs/pandas/lib/python3.6/site-packages/plotnine/layer.py in compute_statistic(self, layout)
369 data = self.stat.use_defaults(data)
370 data = self.stat.setup_data(data)
--> 371 data = self.stat.compute_layer(data, params, layout)
372 self.data = data
373
~/.virtualenvs/pandas/lib/python3.6/site-packages/plotnine/stats/stat.py in compute_layer(cls, data, params, layout)
194 return cls.compute_panel(pdata, pscales, **params)
195
--> 196 return groupby_apply(data, 'PANEL', fn)
197
198 @classmethod
~/.virtualenvs/pandas/lib/python3.6/site-packages/plotnine/utils.py in groupby_apply(df, cols, func, *args, **kwargs)
615 # do not mark d as a slice of df i.e no SettingWithCopyWarning
616 d.is_copy = None
--> 617 lst.append(func(d, *args, **kwargs))
618 return pd.concat(lst, axis=axis, ignore_index=True)
619
~/.virtualenvs/pandas/lib/python3.6/site-packages/plotnine/stats/stat.py in fn(pdata)
192 return pdata
193 pscales = layout.get_scales(pdata['PANEL'].iat[0])
--> 194 return cls.compute_panel(pdata, pscales, **params)
195
196 return groupby_apply(data, 'PANEL', fn)
~/.virtualenvs/pandas/lib/python3.6/site-packages/plotnine/stats/stat.py in compute_panel(cls, data, scales, **params)
221 for _, old in data.groupby('group'):
222 old.is_copy = None
--> 223 new = cls.compute_group(old, scales, **params)
224 unique = uniquecols(old)
225 missing = unique.columns.difference(new.columns)
~/.virtualenvs/pandas/lib/python3.6/site-packages/plotnine/stats/stat_boxplot.py in compute_group(cls, data, scales, **params)
69 labels = ['x', 'y']
70 X = np.array(data[labels])
---> 71 res = boxplot_stats(X, whis=params['coef'], labels=labels)[1]
72 try:
73 n = data['weight'].sum()
~/.virtualenvs/pandas/lib/python3.6/site-packages/matplotlib/cbook.py in boxplot_stats(X, whis, bootstrap, labels, autorange)
1998 labels = repeat(None)
1999 elif len(labels) != ncols:
-> 2000 raise ValueError("Dimensions of labels and X must be compatible")
2001
2002 input_whis = whis
ValueError: Dimensions of labels and X must be compatible
This can be worked around by removing the categories with only a single sample from the data frame and overlaying only those with a geom_point(), as there is no interesting boxplot for them anyway, but it's a bit of a hassle and it would be nicer if it just worked.