"""
Functions to get KDE smoothed historgams
"""
from __future__ import absolute_import, division
from kde.cudakde import gaussian_kde, bootstrap_kde
import numpy as np
from uncertainties import unumpy as unp
import copy
from pisa.core.binning import OneDimBinning, MultiDimBinning
__all__ = ["get_hist", "kde_histogramdd", "test_kde_histogramdd"]
__author__ = "P. Eller"
__license__ = """Copyright (c) 2014-2017, The IceCube Collaboration
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License."""
[docs]
def get_hist(
sample,
binning,
weights=None,
bw_method="scott",
adaptive=True,
alpha=0.3,
use_cuda=False,
coszen_reflection=0.25,
coszen_name="coszen",
oversample=1,
bootstrap=False,
bootstrap_niter=10,
):
"""Helper function for histograms from KDE
For description of args see kde_histogramdd()
Handling the reflctions at the coszen edges
ToDo:
----
* Handle zenith like coszen? Or better: Define set of variables to perform
reflection on and reflection parameters (e.g. `reflect_fract` or somesuch
to stand in for for `coszen_reflection` and `reflect_dims` as standin for
`coszen_name`; also need some way to specify whether to reflect about lower
and/or upper edge); each such parameter can either be a single value, or a
sequence with one value per variable.
* Any good reason for 0.25 and 'scott' defaults? If not, don't define a
default and force the user to explicitly set this when function is called.
"""
if bootstrap and oversample > 1:
# Because the errors within a bin are highly correlated, they could not just
# be added in quadrature to create an oversampled histogram with errors.
raise ValueError("Bootstrapping cannot be combined with oversampling.")
# the KDE implementation expects an empty weights array instead of `None`
if weights is None:
weights = []
# Get the overall normalization here, because the KDE will be normalized
# to one and we'll need to rescale in the end
if len(weights) == 0:
norm = sample.shape[0]
else:
norm = np.sum(np.nan_to_num(weights))
binning = binning.oversample(oversample)
# Flip around to satisfy the kde implementation
x = sample.T
# Must have same amount of dimensions as binning dimensions
assert x.shape[0] == len(binning)
# TODO: What if coszen isn't in binning? Does this fail?
# Yes, coszen is expected
cz_bin = binning.index(coszen_name)
# Swap out cz bin to first place (index 0)
if cz_bin != 0:
# Also swap binning:
new_binning = [binning[coszen_name]]
for b in binning:
if b.name != coszen_name:
new_binning.append(b)
binning = MultiDimBinning(new_binning)
x[[0, cz_bin]] = x[[cz_bin, 0]]
# Check if edge needs to be reflected
reflect_lower = binning[coszen_name].bin_edges[0] == -1
reflect_upper = binning[coszen_name].bin_edges[-1] == 1
# Get the kernel weights
kde_kwargs = dict(
weights=np.nan_to_num(weights),
bw_method=bw_method,
adaptive=adaptive,
alpha=alpha,
use_cuda=use_cuda,
)
if bootstrap:
kernel_weights_adaptive = bootstrap_kde(x, niter=bootstrap_niter, **kde_kwargs)
else:
kernel_weights_adaptive = gaussian_kde(x, **kde_kwargs)
# Get the bin centers, where we're going to evaluate the KDEs, and extend
# the bin range for reflection
bin_points = []
for b in binning:
c = b.weighted_centers.m
if b.name == coszen_name:
# how many bins to add for reflection
l = int(len(c) * float(coszen_reflection))
if reflect_lower:
c0 = 2 * c[0] - c[1 : l + 1][::-1]
else:
c0 = []
if reflect_upper:
c1 = 2 * c[-1] - c[-l - 1 : -1][::-1]
else:
c1 = []
c = np.concatenate([c0, c, c1])
bin_points.append(c)
# Shape including reflection edges
megashape = (
binning.shape[0] + (int(reflect_upper) + int(reflect_lower)) * l,
binning.shape[1],
)
# Shape of the reflection edges alone
minishape = (binning.shape[0] - l, binning.shape[1])
# Create a set of points
grid = np.meshgrid(*bin_points, indexing="ij")
points = np.array([g.ravel() for g in grid])
# Evaluate KDEs at given points
if bootstrap:
hist, errors = kernel_weights_adaptive(points)
# variances can simply be added together when we apply reflections, we take
# the root afterwards
variances = errors ** 2
else:
hist = kernel_weights_adaptive(points)
# Reshape 1d array into nd
hist = hist.reshape(megashape)
if bootstrap:
variances = variances.reshape(megashape)
def apply_reflection(hist_):
# Cut off the reflection edges, mirror them, fill up remaining space with
# zeros and add to histo
if reflect_lower:
hist0 = hist_[0:l, :]
hist0_0 = np.zeros(minishape)
hist0 = np.flipud(np.concatenate([hist0_0, hist0]))
hist_ = hist_[l:, :]
else:
hist0 = 0
if reflect_upper:
hist1 = hist_[-l:, :]
hist1_0 = np.zeros(minishape)
hist1 = np.flipud(np.concatenate([hist1, hist1_0]))
hist_ = hist_[:-l, :]
else:
hist1 = 0
hist_ = hist_ + hist1 + hist0
return hist_
hist = apply_reflection(hist)
if bootstrap:
variances = apply_reflection(variances)
errors = np.sqrt(variances)
# Bin volumes
volume = binning.bin_volumes(attach_units=False)
hist = hist * volume
if bootstrap:
errors = errors * volume
# Downsample, not applicable when bootstrapping
if oversample != 1:
for i, b in enumerate(binning):
hist = np.add.reduceat(
hist, np.arange(0, len(b.bin_edges) - 1, oversample), axis=i
)
# Swap back the axes
if cz_bin != 0:
hist = np.swapaxes(hist, 0, cz_bin)
if bootstrap:
errors = np.swapaxes(errors, 0, cz_bin)
if bootstrap:
return hist * norm, errors * norm
else:
return hist * norm
[docs]
def kde_histogramdd(
sample,
binning,
weights=None,
bw_method="scott",
adaptive=True,
alpha=0.3,
use_cuda=False,
coszen_reflection=0.25,
coszen_name="coszen",
oversample=1,
stack_pid=True,
bootstrap=False,
bootstrap_niter=10
):
"""Run kernel density estimation (KDE) for an array of data points, and
then evaluate them on a histogram-like grid to effectively produce a
histogram-like output.
Handles reflection at coszen edges, and will expect coszen to be in the binning
Based on Sebastian Schoenen's KDE implementation:
http://code.icecube.wisc.edu/svn/sandbox/schoenen/kde
Parameters
----------
sample : array
Shape (N_evts, vars), with vars in the right order corresponding to the
binning order.
binning : MultiDimBinning
A coszen dimension is expected
weights : None or array
Same shape as `sample`
bw_method: string
'scott' or 'silverman' (see kde module)
adaptive : bool
(see kde module)
alpha : float
A parameter for the KDEs (see kde module)
use_cuda : bool
Run on GPU (only works with <= 2d)
coszen_reflection : float
Part (number between 0 and 1) of binning that is reflected at the
coszen -1 and 1 edges
coszen_name : string
Binning name to identify the coszen bin that needs to undergo special
treatment for reflection
oversample : int
Evaluate KDE at more points per bin, takes longer, but is more accurate
stack_pid : bool
Treat each pid bin separately, not as another dimension of the KDEs
Only supported for two additional dimensions, pid binning must be named `pid`
bootstrap : bool
Use the ``bootstrap_kde`` class to produce error estimates on the KDE histograms.
Slow, not recommended during fits.
bootstrap_niter : int
Number of bootstrap iterations.
Returns
-------
histogram : numpy.ndarray
ToDo:
-----
* Maybe return Map with binnings attached insted of nd-array?
* Generalize to handle any dimensions with any reflection criterias
"""
if weights is not None and len(weights) != sample.shape[0]:
raise ValueError(
"Length of sample (%s) and weights (%s) incompatible"
% (sample.shape[0], len(weights))
)
if not stack_pid:
return get_hist(
sample=sample,
binning=binning,
weights=weights,
bw_method=bw_method,
adaptive=adaptive,
alpha=alpha,
use_cuda=use_cuda,
coszen_reflection=coszen_reflection,
coszen_name=coszen_name,
oversample=oversample,
bootstrap=bootstrap,
bootstrap_niter=bootstrap_niter
)
# treat pid bins separately
# asuming we're dealing with 2d apart from PID
bin_names = copy.copy(binning.names)
bin_edges = [b.bin_edges.m for b in binning]
pid_bin = bin_names.index("pid")
other_bins = [0, 1, 2]
other_bins.pop(pid_bin)
bin_names.pop(pid_bin)
assert len(bin_names) == 2
pid_bin_edges = bin_edges.pop(pid_bin)
d2d_binning = []
for b in binning:
if b.name != "pid":
d2d_binning.append(b)
d2d_binning = MultiDimBinning(d2d_binning)
pid_stack = []
if bootstrap:
pid_stack_errors = []
for pid in range(len(pid_bin_edges) - 1):
mask_pid = (sample.T[pid_bin] >= pid_bin_edges[pid]) & (
sample.T[pid_bin] < pid_bin_edges[pid + 1]
)
data = np.array(
[sample.T[other_bins[0]][mask_pid], sample.T[other_bins[1]][mask_pid]]
)
if weights is None:
weights_pid = None
else:
weights_pid = weights[mask_pid]
hist_kwargs = dict(
sample=data.T,
weights=weights_pid,
binning=d2d_binning,
coszen_name=coszen_name,
use_cuda=use_cuda,
bw_method=bw_method,
alpha=alpha,
oversample=oversample,
coszen_reflection=coszen_reflection,
adaptive=adaptive,
bootstrap=bootstrap,
bootstrap_niter=bootstrap_niter
)
if bootstrap:
hist, errors = get_hist(**hist_kwargs)
pid_stack.append(hist)
pid_stack_errors.append(errors)
else:
pid_stack.append(get_hist(**hist_kwargs))
hist = np.dstack(pid_stack)
if bootstrap:
errors = np.dstack(pid_stack_errors)
if pid_bin != 2:
hist = np.swapaxes(hist, pid_bin, 2)
if bootstrap:
errors = np.swapaxes(errors, pid_bin, 2)
if bootstrap:
return hist, errors
else:
return hist
# TODO: make the plotting optional but add comparisons against some known
# results. This can be accomplished by seeding before calling random to obtain
# a reference result, and check that the same values are returned when run
# below.
[docs]
def test_kde_histogramdd():
"""Unit tests for kde_histogramdd"""
from argparse import ArgumentParser
from shutil import rmtree
from tempfile import mkdtemp
from pisa import ureg
from pisa.core.map import Map, MapSet
from pisa.utils.log import logging, set_verbosity
from pisa.utils.plotter import Plotter
parser = ArgumentParser()
parser.add_argument("-v", action="count", default=None, help="set verbosity level")
args = parser.parse_args()
set_verbosity(args.v)
temp_dir = mkdtemp()
try:
my_plotter = Plotter(
stamp="",
outdir=temp_dir,
fmt="pdf",
log=False,
annotate=False,
symmetric=False,
ratio=True,
)
b1 = OneDimBinning(
name="coszen", num_bins=20, is_lin=True, domain=[-1, 1], tex=r"\cos(\theta)"
)
b2 = OneDimBinning(
name="energy", num_bins=10, is_log=True, domain=[1, 80] * ureg.GeV, tex=r"E"
)
b3 = OneDimBinning(name="pid", num_bins=2, bin_edges=[0, 1, 2], tex=r"pid")
binning = b1 * b2 * b3
# now let's generate some toy data
N = 100000
cz = np.random.normal(1, 1.2, N)
# cut away coszen outside -1, 1
cz = cz[(cz >= -1) & (cz <= 1)]
e = np.random.normal(30, 20, len(cz))
pid = np.random.uniform(0, 2, len(cz))
data = np.array([cz, e, pid]).T
# make numpy histogram for validation
bins = [unp.nominal_values(b.bin_edges) for b in binning]
raw_hist, _ = np.histogramdd(data, bins=bins)
# get KDE'ed histo
hist = kde_histogramdd(
data,
binning,
bw_method="silverman",
coszen_name="coszen",
oversample=10,
use_cuda=True,
stack_pid=True,
)
# put into mapsets and plot
m1 = Map(name="KDE", hist=hist, binning=binning)
m2 = Map(name="raw", hist=raw_hist, binning=binning)
with np.errstate(divide="ignore", invalid="ignore"):
m3 = m2 / m1
m3.name = "hist/KDE"
m3.tex = m3.name
m4 = m1 - m2
m4.name = "KDE - hist"
m4.tex = m4.name
ms = MapSet([m1, m2, m3, m4])
my_plotter.plot_2d_array(ms, fname="test_kde", cmap="summer")
except:
rmtree(temp_dir)
raise
else:
logging.warning(
"Inspect and manually clean up output(s) saved to %s" % temp_dir
)
if __name__ == "__main__":
test_kde_histogramdd()