# Source code for apmapflow.data_processing.histogram

"""
================================================================================
Histogram
================================================================================
| Calculates a simple histogram for a data map. This also serves as the
| super class for variants of a simple histogram.

| Written By: Matthew Stadelman
| Date Written: 2016/02/29
| Last Modified: 2016/10/25

"""
import os

import numpy as np
import scipy as sp

from .. import _get_logger, calc_percentile
from .base_processor import BaseProcessor
logger = _get_logger(__name__)


# NOTE: the class docstring doubles as the command line help text (it is
# passed as ``help=cls.__doc__`` in ``_add_subparser``), so it is left
# unchanged and must stay free of bare '%' characters, which argparse
# interprets as format specifiers.
class Histogram(BaseProcessor):
    r"""
    Performs a basic histogram of the data based on the number of bins
    desired. The first bin contains all values below the 1st percentile
    and the last bin contains all values above the 99th percentile to keep
    axis scales from being bloated by extrema.
    kwargs include:
        num_bins - integer value for the total number of bins
    """
    def __init__(self, field, **kwargs):
        r"""
        Sets up the processor and stores the histogram specific settings.

        field - data field to process, handed unchanged to the
            BaseProcessor constructor
        kwargs - processing arguments merged into ``self.args``;
            ``define_bins`` reads the 'num_bins' entry
        """
        super().__init__(field)
        self.args.update(kwargs)
        self.output_key = 'hist'
        self.action = 'histogram'
        # list of (low, high) tuples, populated by define_bins
        self.bins = []

    @classmethod
    def _add_subparser(cls, subparsers, parent):
        r"""
        Adds a specific action based sub-parser to the supplied arg_parser
        instance.

        subparsers - object exposing ``add_parser`` (presumably the result
            of ``ArgumentParser.add_subparsers`` - confirm with caller)
        parent - parser supplied through ``parents=[parent]``
        """
        parser = subparsers.add_parser(cls.__name__,
                                       aliases=['hist'],
                                       parents=[parent],
                                       help=cls.__doc__)
        #
        parser.add_argument('num_bins', type=int,
                            help='number of bins to utilze in histogram')
        # the class itself is stored as the callable for this sub-command
        parser.set_defaults(func=cls)

    def define_bins(self):
        r"""
        This defines the bins for a regular histogram.

        ``self.data_vector`` is sorted in place, then ``num_bins`` evenly
        spaced edges are placed between the 1st and 99th percentiles. A
        final bin runs from the 99th percentile to just past the largest
        value and, when the smallest value is below the 1st percentile, an
        extra leading bin is added to hold the low tail. The result is
        stored in ``self.bins`` as a list of (low, high) tuples.
        """
        self.data_vector.sort()
        num_bins = self.args['num_bins']
        min_val = calc_percentile(1.0, self.data_vector, False)
        max_val = calc_percentile(99.0, self.data_vector, False)
        # the vector is sorted, so the extrema sit at the two ends
        data_min = self.data_vector[0]
        data_max = self.data_vector[-1]
        #
        # creating the interior bin edges
        edges = list(np.linspace(min_val, max_val, num_bins))
        #
        # the top edge is pushed slightly past the largest value; a negative
        # maximum has to be scaled towards zero, otherwise the edge would
        # fall below the data and produce an inverted final bin
        scale = 1.0001 if data_max >= 0 else 0.9999
        edges.append(data_max * scale)
        #
        # adding lower bin if needed
        if data_min < min_val:
            edges.insert(0, data_min)
        #
        # consecutive edge pairs form the bins
        self.bins = list(zip(edges[:-1], edges[1:]))

    def _process_data(self, preserve_bins=False):
        r"""
        Calculates a histogram from a range of data. This uses the 1st and
        99th percentiles as limits when defining bins

        preserve_bins - when True the bins already stored in ``self.bins``
            are reused instead of being recalculated; if no bins exist yet
            they are defined regardless of this flag

        The result is stored in ``self.processed_data`` as a list of
        (low, high, count) tuples, one per bin.
        """
        #
        # nothing to preserve when no bins were defined yet
        if not preserve_bins or len(self.bins) == 0:
            self.define_bins()
        #
        # populating bins, the edges are the first lower bound followed by
        # the upper bound of every bin
        edges = [self.bins[0][0]]
        edges.extend(high for _, high in self.bins)
        counts, _ = np.histogram(self.data_vector, bins=np.array(edges))
        #
        # storing data
        self.processed_data = [
            (low, high, count)
            for (low, high), count in zip(self.bins, counts)
        ]

    def _output_data(self, filename=None, delim=','):
        r"""
        Creates the output content for histograms

        filename - name to store in ``self.outfile_name``; when None the
            current ``self.outfile_name`` is used with '-' + ``self.action``
            inserted before its extension
        delim - delimiter placed between the values of each data row

        The generated text is stored in ``self.outfile_content``; nothing
        is written to disk here.
        """
        #
        if filename is None:
            #
            # naming output file, splitext only treats a dot in the final
            # path component as the start of the extension and returns an
            # empty extension when there is none
            root, ext = os.path.splitext(self.outfile_name)
            filename = root + '-' + self.action + ext
        self.outfile_name = filename
        #
        # outputting data
        row_fmt = '{0}' + delim + '{1}' + delim + '{2}\n'
        pieces = [
            'Histogram data from file: ' + self.infile + '\n',
            'Low value, High value, # Data Points\n',
        ]
        pieces.extend(row_fmt.format(*dat) for dat in self.processed_data)
        pieces.append('\n')
        #
        self.outfile_content = ''.join(pieces)