Fix for DataFrame.hist() with by- and weights-keyword by Twizzledrizzle · Pull Request #11441 · pandas-dev/pandas · GitHub
[go: up one dir, main page]

Skip to content

Fix for DataFrame.hist() with by- and weights-keyword #11441

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 6 commits into from
Closed
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
changed weights= to end of functions
Uses dropna(subset=...) to drop the rows that contain NaNs in the supplied columns.
This is also done at the beginning, so that the logic does not have to be duplicated.
  • Loading branch information
nosterlu committed Oct 28, 2015
commit cbe68ecb97cb68c632f65f0e788e2fa30095a3b1
84 changes: 36 additions & 48 deletions pandas/tools/plotting.py
Original file line number Diff line number Diff line change
Expand Up @@ -2770,9 +2770,9 @@ def plot_group(group, ax):
return fig


def hist_frame(data, column=None, weights=None, by=None, grid=True,
xlabelsize=None, xrot=None, ylabelsize=None, yrot=None, ax=None,
sharex=False, sharey=False, figsize=None, layout=None, bins=10,
def hist_frame(data, column=None, by=None, grid=True, xlabelsize=None,
xrot=None, ylabelsize=None, yrot=None, ax=None, sharex=False,
sharey=False, figsize=None, layout=None, bins=10, weights=None,
**kwds):
"""
Draw histogram of the DataFrame's series using matplotlib / pylab.
Expand All @@ -2782,8 +2782,6 @@ def hist_frame(data, column=None, weights=None, by=None, grid=True,
data : DataFrame
column : string or sequence
If passed, will be used to limit data to a subset of columns
weights : string or sequence
If passed, will be used to weight the data
by : object, optional
If passed, then used to form histograms for separate groups
grid : boolean, default True
Expand All @@ -2810,38 +2808,42 @@ def hist_frame(data, column=None, weights=None, by=None, grid=True,
layout: (optional) a tuple (rows, columns) for the layout of the histograms
bins: integer, default 10
Number of histogram bins to be used
weights : string or sequence
If passed, will be used to weight the data
kwds : other plotting keyword arguments
To be passed to hist function
"""
subset_cols_drop_nan = []
if weights is not None:
if isinstance(weights, np.ndarray):
# weights supplied as an array instead of a part of the dataframe
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, this will not work if weights is a >1 dimensional ndarray... Need to think on this

if 'weights' in data.columns:
raise NameError('weights already in data.columns. Could not ' +
'add dummy column')
data = data.copy()
data['weights'] = weights
weights = 'weights'
subset_cols_drop_nan.append(weights)
if column is not None:
subset_cols_drop_nan.append(column)
data = data.dropna(subset=subset_cols_drop_nan)

if by is not None:
axes = grouped_hist(data, column=column, weights=weights, by=by, ax=ax, grid=grid, figsize=figsize,
axes = grouped_hist(data, column=column, by=by, ax=ax, grid=grid, figsize=figsize,
sharex=sharex, sharey=sharey, layout=layout, bins=bins,
xlabelsize=xlabelsize, xrot=xrot, ylabelsize=ylabelsize, yrot=yrot,
**kwds)
weights=weights, **kwds)
return axes

inx_na = np.zeros(len(data), dtype=bool)
if weights is not None:
# first figure out if given by column name, or by an array
if isinstance(weights, str):
weights = data[weights]
if isinstance(weights, np.ndarray) == False:
weights = weights.values
# remove fields where we have nan in weights OR in group
# for both data sets
inx_na = (np.isnan(weights))
weights = data[weights]
weights = weights._get_numeric_data()

if column is not None:
if not isinstance(column, (list, np.ndarray, Index)):
column = [column]
data = data[column]
data = data._get_numeric_data()
inx_na |= np.isnan(data.T.values)[0]

data = data.ix[~inx_na]
if weights is not None:
weights = weights[~inx_na]

naxes = len(data.columns)

fig, axes = _subplots(naxes=naxes, ax=ax, squeeze=False,
Expand Down Expand Up @@ -2935,18 +2937,17 @@ def hist_series(self, by=None, ax=None, grid=True, xlabelsize=None,
return axes


def grouped_hist(data, column=None, weights=None, by=None, ax=None, bins=50,
def grouped_hist(data, column=None, by=None, ax=None, bins=50,
figsize=None, layout=None, sharex=False, sharey=False, rot=90,
grid=True, xlabelsize=None, xrot=None, ylabelsize=None,
yrot=None, **kwargs):
yrot=None, weights=None, **kwargs):
"""
Grouped histogram

Parameters
----------
data: Series/DataFrame
column: object, optional
weights: object, optional
by: object, optional
ax: axes, optional
bins: int, default 50
Expand All @@ -2956,6 +2957,7 @@ def grouped_hist(data, column=None, weights=None, by=None, ax=None, bins=50,
sharey: boolean, default False
rot: int, default 90
grid: bool, default True
weights: object, optional
kwargs: dict, keyword arguments passed to matplotlib.Axes.hist

Returns
Expand All @@ -2965,25 +2967,20 @@ def grouped_hist(data, column=None, weights=None, by=None, ax=None, bins=50,
def plot_group(group, ax, weights=None):
if isinstance(group, np.ndarray) == False:
group = group.values
inx_na = np.isnan(group)
if weights is not None:
# remove fields where we have nan in weights OR in group
# for both data sets
if isinstance(weights, np.ndarray) == False:
weights = weights.values
inx_na |= (np.isnan(weights))
weights = weights[~inx_na]
group = group[~inx_na]
if len(group) > 0:
# if length is 0, we had only NaN's for this group
# nothing to print!
ax.hist(group, weights=weights, bins=bins, **kwargs)

xrot = xrot or rot

fig, axes = _grouped_plot(plot_group, data, column=column, weights=weights,
by=by, sharex=sharex, sharey=sharey, ax=ax,
figsize=figsize, layout=layout, rot=rot)
fig, axes = _grouped_plot(plot_group, data, column=column, by=by,
sharex=sharex, sharey=sharey, ax=ax,
figsize=figsize, layout=layout, rot=rot,
weights=weights)

_set_ticks_props(axes, xlabelsize=xlabelsize, xrot=xrot,
ylabelsize=ylabelsize, yrot=yrot)
Expand Down Expand Up @@ -3068,9 +3065,9 @@ def boxplot_frame_groupby(grouped, subplots=True, column=None, fontsize=None,
return ret


def _grouped_plot(plotf, data, column=None, weights=None, by=None,
def _grouped_plot(plotf, data, column=None, by=None,
numeric_only=True, figsize=None, sharex=True, sharey=True,
layout=None, rot=0, ax=None, **kwargs):
layout=None, rot=0, ax=None, weights=None, **kwargs):
from pandas import DataFrame

if figsize == 'default':
Expand All @@ -3079,22 +3076,13 @@ def _grouped_plot(plotf, data, column=None, weights=None, by=None,
"size by tuple instead", FutureWarning, stacklevel=4)
figsize = None

added_weights_dummy_column = False
if isinstance(weights, np.ndarray):
# weights supplied as an array instead of a part of the dataframe
data['weights'] = weights
weights = 'weights'
added_weights_dummy_column = True

grouped = data.groupby(by)

if weights is not None:
weights = grouped[weights]
if column is not None:
if weights is not None:
weights = grouped[weights]
grouped = grouped[column]

if added_weights_dummy_column:
data = data.drop('weights', axis=1)

naxes = len(grouped)
fig, axes = _subplots(naxes=naxes, figsize=figsize,
sharex=sharex, sharey=sharey, ax=ax,
Expand Down
0