Source code for pisa.utils.kde_hist

"""
Functions to get KDE smoothed historgams
"""


from __future__ import absolute_import, division

from kde.cudakde import gaussian_kde, bootstrap_kde
import numpy as np
from uncertainties import unumpy as unp
import copy

from pisa.core.binning import OneDimBinning, MultiDimBinning


__all__ = ["get_hist", "kde_histogramdd", "test_kde_histogramdd"]

__author__ = "P. Eller"

__license__ = """Copyright (c) 2014-2017, The IceCube Collaboration

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at

   http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License."""



[docs]
def get_hist(
    sample,
    binning,
    weights=None,
    bw_method="scott",
    adaptive=True,
    alpha=0.3,
    use_cuda=False,
    coszen_reflection=0.25,
    coszen_name="coszen",
    oversample=1,
    bootstrap=False,
    bootstrap_niter=10,
):
    """Helper function for histograms from KDE

    For description of args see kde_histogramdd()

    Handling the reflctions at the coszen edges

    ToDo:
    ----
    * Handle zenith like coszen? Or better: Define set of variables to perform
      reflection on and reflection parameters (e.g. `reflect_fract` or somesuch
      to stand in for for `coszen_reflection` and `reflect_dims` as standin for
      `coszen_name`; also need some way to specify whether to reflect about lower
      and/or upper edge); each such parameter can either be a single value, or a
      sequence with one value per variable.
    * Any good reason for 0.25 and 'scott' defaults? If not, don't define a
      default and force the user to explicitly set this when function is called.
    """

    if bootstrap and oversample > 1:
        # Because the errors within a bin are highly correlated, they could not just
        # be added in quadrature to create an oversampled histogram with errors.
        raise ValueError("Bootstrapping cannot be combined with oversampling.")

    # the KDE implementation expects an empty weights array instead of `None`
    if weights is None:
        weights = []

    # Get the overall normalization here, because the KDE will be normalized
    # to one and we'll need to rescale in the end
    if len(weights) == 0:
        norm = sample.shape[0]
    else:
        norm = np.sum(np.nan_to_num(weights))

    binning = binning.oversample(oversample)

    # Flip around to satisfy the kde implementation
    x = sample.T

    # Must have same amount of dimensions as binning dimensions
    assert x.shape[0] == len(binning)

    # TODO: What if coszen isn't in binning? Does this fail?
    # Yes, coszen is expected
    cz_bin = binning.index(coszen_name)

    # Swap out cz bin to first place (index 0)
    if cz_bin != 0:
        # Also swap binning:
        new_binning = [binning[coszen_name]]
        for b in binning:
            if b.name != coszen_name:
                new_binning.append(b)
        binning = MultiDimBinning(new_binning)
        x[[0, cz_bin]] = x[[cz_bin, 0]]

    # Check if edge needs to be reflected
    reflect_lower = binning[coszen_name].bin_edges[0] == -1
    reflect_upper = binning[coszen_name].bin_edges[-1] == 1

    # Get the kernel weights
    kde_kwargs = dict(
        weights=np.nan_to_num(weights),
        bw_method=bw_method,
        adaptive=adaptive,
        alpha=alpha,
        use_cuda=use_cuda,
    )
    if bootstrap:
        kernel_weights_adaptive = bootstrap_kde(x, niter=bootstrap_niter, **kde_kwargs)
    else:
        kernel_weights_adaptive = gaussian_kde(x, **kde_kwargs)

    # Get the bin centers, where we're going to evaluate the KDEs, and extend
    # the bin range for reflection
    bin_points = []
    for b in binning:
        c = b.weighted_centers.m
        if b.name == coszen_name:
            # how many bins to add for reflection
            l = int(len(c) * float(coszen_reflection))
            if reflect_lower:
                c0 = 2 * c[0] - c[1 : l + 1][::-1]
            else:
                c0 = []
            if reflect_upper:
                c1 = 2 * c[-1] - c[-l - 1 : -1][::-1]
            else:
                c1 = []
            c = np.concatenate([c0, c, c1])
        bin_points.append(c)

    # Shape including reflection edges
    megashape = (
        binning.shape[0] + (int(reflect_upper) + int(reflect_lower)) * l,
        binning.shape[1],
    )

    # Shape of the reflection edges alone
    minishape = (binning.shape[0] - l, binning.shape[1])

    # Create a set of points
    grid = np.meshgrid(*bin_points, indexing="ij")
    points = np.array([g.ravel() for g in grid])

    # Evaluate KDEs at given points
    if bootstrap:
        hist, errors = kernel_weights_adaptive(points)
        # variances can simply be added together when we apply reflections, we take
        # the root afterwards
        variances = errors ** 2
    else:
        hist = kernel_weights_adaptive(points)

    # Reshape 1d array into nd
    hist = hist.reshape(megashape)
    if bootstrap:
        variances = variances.reshape(megashape)

    def apply_reflection(hist_):
        # Cut off the reflection edges, mirror them, fill up remaining space with
        # zeros and add to histo
        if reflect_lower:
            hist0 = hist_[0:l, :]
            hist0_0 = np.zeros(minishape)
            hist0 = np.flipud(np.concatenate([hist0_0, hist0]))
            hist_ = hist_[l:, :]
        else:
            hist0 = 0

        if reflect_upper:
            hist1 = hist_[-l:, :]
            hist1_0 = np.zeros(minishape)
            hist1 = np.flipud(np.concatenate([hist1, hist1_0]))
            hist_ = hist_[:-l, :]
        else:
            hist1 = 0

        hist_ = hist_ + hist1 + hist0
        return hist_

    hist = apply_reflection(hist)
    if bootstrap:
        variances = apply_reflection(variances)
        errors = np.sqrt(variances)

    # Bin volumes
    volume = binning.bin_volumes(attach_units=False)
    hist = hist * volume
    if bootstrap:
        errors = errors * volume

    # Downsample, not applicable when bootstrapping
    if oversample != 1:
        for i, b in enumerate(binning):
            hist = np.add.reduceat(
                hist, np.arange(0, len(b.bin_edges) - 1, oversample), axis=i
            )

    # Swap back the axes
    if cz_bin != 0:
        hist = np.swapaxes(hist, 0, cz_bin)
        if bootstrap:
            errors = np.swapaxes(errors, 0, cz_bin)

    if bootstrap:
        return hist * norm, errors * norm
    else:
        return hist * norm




[docs]
def kde_histogramdd(
    sample,
    binning,
    weights=None,
    bw_method="scott",
    adaptive=True,
    alpha=0.3,
    use_cuda=False,
    coszen_reflection=0.25,
    coszen_name="coszen",
    oversample=1,
    stack_pid=True,
    bootstrap=False,
    bootstrap_niter=10
):
    """Run kernel density estimation (KDE) for an array of data points, and
    then evaluate them on a histogram-like grid to effectively produce a
    histogram-like output.
    Handles reflection at coszen edges, and will expect coszen to be in the binning

    Based on Sebastian Schoenen's KDE implementation:
    http://code.icecube.wisc.edu/svn/sandbox/schoenen/kde

    Parameters
    ----------
    sample : array
        Shape (N_evts, vars), with vars in the right order corresponding to the
        binning order.

    binning : MultiDimBinning
        A coszen dimension is expected

    weights : None or array
        Same shape as `sample`

    bw_method: string
        'scott' or 'silverman' (see kde module)

    adaptive : bool
        (see kde module)

    alpha : float
        A parameter for the KDEs (see kde module)

    use_cuda : bool
        Run on GPU (only works with <= 2d)

    coszen_reflection : float
        Part (number between 0 and 1) of binning that is reflected at the
        coszen -1 and 1 edges

    coszen_name : string
        Binning name to identify the coszen bin that needs to undergo special
        treatment for reflection

    oversample : int
        Evaluate KDE at more points per bin, takes longer, but is more accurate

    stack_pid : bool
        Treat each pid bin separately, not as another dimension of the KDEs
        Only supported for two additional dimensions, pid binning must be named `pid`

    bootstrap : bool
        Use the ``bootstrap_kde`` class to produce error estimates on the KDE histograms.
        Slow, not recommended during fits.

    bootstrap_niter : int
        Number of bootstrap iterations.

    Returns
    -------
    histogram : numpy.ndarray

    ToDo:
    -----

    * Maybe return Map with binnings attached insted of nd-array?
    * Generalize to handle any dimensions with any reflection criterias

    """
    if weights is not None and len(weights) != sample.shape[0]:
        raise ValueError(
            "Length of sample (%s) and weights (%s) incompatible"
            % (sample.shape[0], len(weights))
        )

    if not stack_pid:
        return get_hist(
            sample=sample,
            binning=binning,
            weights=weights,
            bw_method=bw_method,
            adaptive=adaptive,
            alpha=alpha,
            use_cuda=use_cuda,
            coszen_reflection=coszen_reflection,
            coszen_name=coszen_name,
            oversample=oversample,
            bootstrap=bootstrap,
            bootstrap_niter=bootstrap_niter
        )

    # treat pid bins separately
    # asuming we're dealing with 2d apart from PID
    bin_names = copy.copy(binning.names)
    bin_edges = [b.bin_edges.m for b in binning]
    pid_bin = bin_names.index("pid")
    other_bins = [0, 1, 2]
    other_bins.pop(pid_bin)
    bin_names.pop(pid_bin)
    assert len(bin_names) == 2
    pid_bin_edges = bin_edges.pop(pid_bin)
    d2d_binning = []
    for b in binning:
        if b.name != "pid":
            d2d_binning.append(b)
    d2d_binning = MultiDimBinning(d2d_binning)
    pid_stack = []
    if bootstrap:
        pid_stack_errors = []

    for pid in range(len(pid_bin_edges) - 1):
        mask_pid = (sample.T[pid_bin] >= pid_bin_edges[pid]) & (
            sample.T[pid_bin] < pid_bin_edges[pid + 1]
        )
        data = np.array(
            [sample.T[other_bins[0]][mask_pid], sample.T[other_bins[1]][mask_pid]]
        )

        if weights is None:
            weights_pid = None
        else:
            weights_pid = weights[mask_pid]

        hist_kwargs = dict(
            sample=data.T,
            weights=weights_pid,
            binning=d2d_binning,
            coszen_name=coszen_name,
            use_cuda=use_cuda,
            bw_method=bw_method,
            alpha=alpha,
            oversample=oversample,
            coszen_reflection=coszen_reflection,
            adaptive=adaptive,
            bootstrap=bootstrap,
            bootstrap_niter=bootstrap_niter
        )
        if bootstrap:
            hist, errors = get_hist(**hist_kwargs)
            pid_stack.append(hist)
            pid_stack_errors.append(errors)
        else:
            pid_stack.append(get_hist(**hist_kwargs))

    hist = np.dstack(pid_stack)
    if bootstrap:
        errors = np.dstack(pid_stack_errors)

    if pid_bin != 2:
        hist = np.swapaxes(hist, pid_bin, 2)
        if bootstrap:
            errors = np.swapaxes(errors, pid_bin, 2)

    if bootstrap:
        return hist, errors
    else:
        return hist



# TODO: make the plotting optional but add comparisons against some known
# results. This can be accomplished by seeding before calling random to obtain
# a reference result, and check that the same values are returned when run
# below.



[docs]
def test_kde_histogramdd():
    """Unit tests for kde_histogramdd"""
    from argparse import ArgumentParser
    from shutil import rmtree
    from tempfile import mkdtemp
    from pisa import ureg
    from pisa.core.map import Map, MapSet
    from pisa.utils.log import logging, set_verbosity
    from pisa.utils.plotter import Plotter

    parser = ArgumentParser()
    parser.add_argument("-v", action="count", default=None, help="set verbosity level")
    args = parser.parse_args()
    set_verbosity(args.v)

    temp_dir = mkdtemp()

    try:
        my_plotter = Plotter(
            stamp="",
            outdir=temp_dir,
            fmt="pdf",
            log=False,
            annotate=False,
            symmetric=False,
            ratio=True,
        )

        b1 = OneDimBinning(
            name="coszen", num_bins=20, is_lin=True, domain=[-1, 1], tex=r"\cos(\theta)"
        )
        b2 = OneDimBinning(
            name="energy", num_bins=10, is_log=True, domain=[1, 80] * ureg.GeV, tex=r"E"
        )
        b3 = OneDimBinning(name="pid", num_bins=2, bin_edges=[0, 1, 2], tex=r"pid")
        binning = b1 * b2 * b3

        # now let's generate some toy data

        N = 100000
        cz = np.random.normal(1, 1.2, N)
        # cut away coszen outside -1, 1
        cz = cz[(cz >= -1) & (cz <= 1)]
        e = np.random.normal(30, 20, len(cz))
        pid = np.random.uniform(0, 2, len(cz))
        data = np.array([cz, e, pid]).T

        # make numpy histogram for validation
        bins = [unp.nominal_values(b.bin_edges) for b in binning]
        raw_hist, _ = np.histogramdd(data, bins=bins)

        # get KDE'ed histo
        hist = kde_histogramdd(
            data,
            binning,
            bw_method="silverman",
            coszen_name="coszen",
            oversample=10,
            use_cuda=True,
            stack_pid=True,
        )

        # put into mapsets and plot
        m1 = Map(name="KDE", hist=hist, binning=binning)
        m2 = Map(name="raw", hist=raw_hist, binning=binning)
        with np.errstate(divide="ignore", invalid="ignore"):
            m3 = m2 / m1
        m3.name = "hist/KDE"
        m3.tex = m3.name
        m4 = m1 - m2
        m4.name = "KDE - hist"
        m4.tex = m4.name
        ms = MapSet([m1, m2, m3, m4])
        my_plotter.plot_2d_array(ms, fname="test_kde", cmap="summer")
    except:
        rmtree(temp_dir)
        raise
    else:
        logging.warning(
            "Inspect and manually clean up output(s) saved to %s" % temp_dir
        )



if __name__ == "__main__":
    test_kde_histogramdd()