Source code for irfpy.util.fivenumsum

''' Module for five value summary.

The five value summary is defined as below:

-   Median (M) of the dataset, i.e. the (N+1)/2-th data sorted from
    low to high (the mean of the two central data when N is even).
-   The lower 4th value (LF), i.e. the (N+1)/4-th data sorted
    from low to high.
-   The higher 4th value (HF), i.e. the (N+1)/4-th data sorted
    from high to low.
-   The minimum value inside the inner fence (MI).
    Inner fence is determined by [LF-1.5*(HF-LF), HF+1.5*(HF-LF)].
-   The maximum value inside the inner fence (MA).
-   The array of the data in the range between the inner fence
    and the outer fence.  The outer fence is defined by
    [LF-3.0*(HF-LF), HF+3.0*(HF-LF)].
-   The array of the data in the range far out
    (outside of outer fence).

.. codeauthor:: Yoshifumi Futaana

'''

import numpy
from numpy.ma import where
import logging
_logger = logging.getLogger(__name__)


def _widen_integer(value):
    ''' Promote a fixed-width NumPy integer scalar to ``float64``.

    Adding or subtracting two narrow integer scalars (e.g. ``uint8``)
    wraps around instead of growing, so such values are converted before
    arithmetic.  Any other value is returned unchanged.
    '''
    if isinstance(value, numpy.integer):
        return numpy.float64(value)
    return value


def _value_at(data, position):
    ''' Return the value at a 1-based, possibly fractional, position.

    A whole-number position returns that element itself (dtype kept);
    otherwise the two neighbouring elements are linearly interpolated.
    '''
    below = int(position)
    weight = position - below
    if weight == 0:
        return data[below - 1]  # -1 because the index starts from 0
    return data[below - 1] * (1 - weight) + data[below] * weight


def _pick(data, mask):
    ''' Return ``data[mask]``, or an empty float array if nothing matches.

    The empty case is deliberately ``numpy.array([])`` (float64) rather
    than an empty array of the data's dtype: a one-sided selection is
    therefore promoted to float when concatenated, which is what the
    documented example output of :func:`fivenumsum` shows.
    '''
    picked = data[mask]
    if len(picked) == 0:
        return numpy.array([])
    return picked


def fivenumsum(data_array):
    ''' Calculate the five number summary.

    Make a data for box and whisker plot.  Return is
    [median, lower4th, higher4th, minimum_inside, maximum_inside,
    outside(array), farout(array)]

    The lower/higher 4th values sit at the (N+1)/4-th and 3(N+1)/4-th
    position of the sorted data (linearly interpolated when the position
    is fractional).  With D = HF - LF, the inner fence is
    [LF - 1.5 D, HF + 1.5 D] and the outer fence is [LF - 3 D, HF + 3 D].

    :param data_array: 1-D numpy array (or sequence) to be analyzed.
        It is not modified.
    :returns: The five number summary ``(M, LF, HF, MI, MA, OS, FOS)``:
        median, lower 4th, higher 4th, minimum and maximum inside the
        inner fence, array of data between the inner and outer fence,
        and array of data outside the outer fence.  ``None`` is returned
        (and an error is logged) if fewer than 3 data are given.
    :raises ValueError: If ``data_array`` is not one-dimensional.
    :raises RuntimeError: If no datum lies inside the inner fence.

    >>> v = numpy.array([-100, 1, 2, 3, 4, 5, 6, 7, 8, 9, 20, 180])
    >>> M, LF, HF, MI, MA, OS, FOS = fivenumsum(v)
    >>> print(float(M), float(LF), float(HF), int(MI), int(MA))
    5.5 2.25 8.75 1 9
    >>> print(OS.tolist(), FOS.tolist())
    [20.0] [-100, 180]
    '''
    data = numpy.asarray(data_array)
    if data.ndim != 1:
        raise ValueError('data_array must be 1-D (ndim=%d)' % data.ndim)
    data = numpy.sort(data)  # sorted copy; the caller's array is untouched

    ndat = len(data)
    _logger.debug('Number of data=%d', ndat)
    _logger.debug(data)

    if ndat <= 2:
        _logger.error('Data length should be >2 (Current=%d)' % ndat)
        return None

    ### Median
    middle = ndat // 2
    if ndat % 2 == 1:
        M = data[middle]
    else:
        # Widen first: the sum of two narrow integer scalars can wrap.
        M = (_widen_integer(data[middle - 1])
             + _widen_integer(data[middle])) / 2.
    _logger.debug('Median = %f', M)

    ### lower/higher 4-th.
    LF = _value_at(data, (ndat + 1) * 0.25)
    HF = _value_at(data, (ndat + 1) * 0.75)
    _logger.debug('LF=%f : HF=%f', LF, HF)

    # LF/HF are raw data elements when ndat % 4 == 3, so the difference
    # of narrow integers could wrap without widening.
    D = _widen_integer(HF) - _widen_integer(LF)
    _logger.debug('D=%f', D)

    INFENCEL = LF - D * 1.5
    INFENCEH = HF + D * 1.5
    OUTFENCEL = LF - D * 3
    OUTFENCEH = HF + D * 3
    _logger.debug('INFENCE=[%f %f]', INFENCEL, INFENCEH)
    _logger.debug('OUTFENCE=[%f %f]', OUTFENCEL, OUTFENCEH)

    ### lowest value in the fence
    above_low_fence = data[data >= INFENCEL]
    if len(above_low_fence) == 0:
        raise RuntimeError('Strange... It should not happen...')
    MI = above_low_fence[0]
    _logger.debug('Minimum in fence = %f', MI)

    ### Highest value in the fence
    below_high_fence = data[data <= INFENCEH]
    if len(below_high_fence) == 0:
        raise RuntimeError('Strange... It should not happen...')
    MA = below_high_fence[-1]
    _logger.debug('Maximum in fence = %f', MA)

    ### Outside: beyond the inner fence but not beyond the outer fence.
    # data is sorted, so low side followed by high side is already ordered.
    outside = numpy.concatenate([
        _pick(data, (data < INFENCEL) & (data >= OUTFENCEL)),
        _pick(data, (data > INFENCEH) & (data <= OUTFENCEH)),
    ])
    _logger.debug('OUTSIDE = %s', outside)

    ### Far Outside: beyond the outer fence.
    farout = numpy.concatenate([
        _pick(data, data < OUTFENCEL),
        _pick(data, data > OUTFENCEH),
    ])
    _logger.debug('FAR OUT = %s', farout)

    return (M, LF, HF, MI, MA, outside, farout)
import unittest import doctest
def doctests():
    ''' Return a unittest suite wrapping the doctests of this module.
    '''
    suite = unittest.TestSuite()
    suite.addTest(doctest.DocTestSuite())
    return suite
if __name__ == '__main__':
    # Executed as a script: hand control to unittest, which looks up the
    # module-level name 'doctests' and runs the suite it returns.
    unittest.main(defaultTest='doctests')