# Source code for apmapflow.data_processing.histogram

"""
================================================================================
Histogram
================================================================================
| Calculates a simple histogram for a data map. This also serves as the
| super class for variants of a simple histogram.

| Written By: Matthew Stadelman
| Date Written: 2016/02/29
| Last Modified: 2016/10/25

"""
import os

import numpy as np
import scipy as sp

from .. import _get_logger, calc_percentile
from .base_processor import BaseProcessor
logger = _get_logger(__name__)


class Histogram(BaseProcessor):
    r"""
    Performs a basic histogram of the data based on the number of bins
    desired. The first bin contains all values below the 1st percentile
    and the last bin contains all values above the 99th percentile to keep
    axis scales from being bloated by extrema.
    kwargs include:
        num_bins - integer value for the total number of bins
    """
    # NOTE: the class docstring doubles as the command line help text (it is
    # passed as ``help=cls.__doc__`` in _add_subparser), so keep it
    # user-facing.

    def __init__(self, field, **kwargs):
        r"""
        Initializes the processor for the supplied field, merges any keyword
        arguments into ``self.args`` and starts with an empty bin list.
        """
        super().__init__(field)
        self.args.update(kwargs)
        self.output_key = 'hist'
        self.action = 'histogram'
        self.bins = []

    @classmethod
    def _add_subparser(cls, subparsers, parent):
        r"""
        Adds a specific action based sub-parser to the supplied arg_parser
        instance. The sub-parser is registered under the class name with the
        alias 'hist', takes a positional integer ``num_bins`` and sets
        ``func`` to this class.
        """
        parser = subparsers.add_parser(cls.__name__,
                                       aliases=['hist'],
                                       parents=[parent],
                                       help=cls.__doc__)
        #
        parser.add_argument('num_bins', type=int,
                            help='number of bins to utilze in histogram')
        parser.set_defaults(func=cls)

    def define_bins(self):
        r"""
        This defines the bins for a regular histogram. ``self.data_vector``
        is sorted in place and ``self.bins`` is replaced with a list of
        (low, high) tuples.

        ``num_bins`` evenly spaced lower edges are placed between the 1st and
        99th percentile values; the final bin extends from the 99th
        percentile to just beyond the largest data value. An additional
        leading bin is added when data exists below the 1st percentile value.
        """
        self.data_vector.sort()
        num_bins = self.args['num_bins']
        # NOTE(review): the third argument is presumably a "sort" flag that
        # is disabled because the vector was just sorted - confirm against
        # calc_percentile
        min_val = calc_percentile(1.0, self.data_vector, False)
        max_val = calc_percentile(99.0, self.data_vector, False)
        #
        # creating initial bins
        edges = list(np.linspace(min_val, max_val, num_bins))
        low = list(edges)
        high = edges[1:]
        #
        # padding the final edge away from the maximum value, the factor has
        # to shrink the magnitude of a negative maximum otherwise the edge
        # would land below the data and the edges would not be monotonic
        data_max = self.data_vector[-1]
        pad_factor = 1.0001 if data_max > 0 else 0.9999
        high.append(data_max * pad_factor)
        #
        # adding lower bin if needed
        if self.data_vector[0] < min_val:
            low.insert(0, self.data_vector[0])
            high.insert(0, min_val)
        #
        self.bins = list(zip(low, high))

    def _process_data(self, preserve_bins=False):
        r"""
        Calculates a histogram from a range of data. This uses the 1st and
        99th percentiles as limits when defining bins.

        When ``preserve_bins`` is True the existing ``self.bins`` are reused
        instead of being regenerated by ``define_bins``. The result is stored
        in ``self.processed_data`` as a list of (low, high, count) tuples.
        """
        #
        if not preserve_bins:
            self.define_bins()
        #
        # populating bins, the edge vector is the first lower bound followed
        # by every upper bound
        bin_array = np.array(self.bins)
        edges = np.append(np.array(self.bins[0][0]), bin_array[:, 1])
        counts, _ = np.histogram(self.data_vector, bins=edges)
        #
        # storing data
        self.processed_data = [
            (low, high, count)
            for (low, high), count in zip(self.bins, counts)
        ]

    def _output_data(self, filename=None, delim=','):
        r"""
        Creates the output content for histograms.

        ``filename`` defaults to ``self.outfile_name``; the action name is
        inserted before the file extension and the result is stored back
        on ``self.outfile_name``. ``delim`` separates the columns of each
        data row. The generated text is stored on ``self.outfile_content``.
        """
        #
        if filename is None:
            filename = self.outfile_name
        #
        # naming ouput file, splitext copes with names lacking an extension
        # and with dots in directory names, where rfind('.') would corrupt
        # the path
        root, ext = os.path.splitext(filename)
        filename = root + '-' + self.action + ext
        self.outfile_name = filename
        #
        # outputting data
        fmt_str = '{0}' + delim + '{1}' + delim + '{2}\n'
        pieces = [
            'Histogram data from file: ' + self.infile + '\n',
            'Low value, High value, # Data Points\n',
        ]
        pieces.extend(fmt_str.format(dat[0], dat[1], dat[2])
                      for dat in self.processed_data)
        pieces.append('\n')
        #
        self.outfile_content = ''.join(pieces)