''' Module for five value summary.
The five value summary is defined as below:
- Median (M) of the dataset, i.e. the N/2-th data.
- The lower 4th value (LF), i.e. the (N+1)/4-th data sorted
from low to high.
- The higher 4th value (HF), i.e. the (N+1)/4-th data sorted
from high to low.
- The minimum value inside the inner fence (MI).
Inner fence is determined by [LF-1.5*(HF-LF), HF+1.5*(HF-LF)].
- The maximum value inside the inner fence (MA).
- The array of the data in the range between inner fence
and outer fence. Outer range is defined by
[LF-3.0*(HF-LF), HF+3.0*(HF-LF)]
- The array of the data in the range far out
(outside of outer fence).
.. codeauthor:: Yoshifumi Futaana
'''
import numpy
from numpy.ma import where
import logging
_logger = logging.getLogger(__name__)


def _fractional_rank(ordered, numerator):
    ''' Return the value at the 1-based rank ``numerator / 4`` of *ordered*.

    When the rank is not an integer, the two neighbouring values are
    linearly interpolated with the fractional part as the weight.

    :param ordered: 1-D numpy array sorted from low to high.
    :param numerator: Integer; the requested rank multiplied by 4.
    :returns: The (possibly interpolated) value at that rank.
    '''
    whole, quarters = divmod(numerator, 4)
    if quarters == 0:
        # Exact rank: return the element itself (no float conversion).
        return ordered[whole - 1]
    # -1 is because the index starts from 0
    weight = quarters / 4.0
    return ordered[whole - 1] * (1.0 - weight) + ordered[whole] * weight


def _pick(ordered, mask):
    ''' Return ``ordered[mask]``, or an empty array if nothing is selected.

    The empty result is ``numpy.array([])`` (float), not an empty array
    of the data's dtype; the dtype of the arrays returned by
    :func:`fivenumsum` depends on this.
    '''
    if not mask.any():
        return numpy.array([])
    return ordered[mask]


def fivenumsum(data_array):
    ''' Calculate the five number summary.

    Make a data for box and whisker plot.

    Return is [median, lower4th, higher4th, minimum_inside,
    maximum_inside, outside(array), farout(array)]

    The lower/higher 4th values are the data at rank (N+1)/4 and
    3(N+1)/4 (counted from 1 in the sorted data), linearly interpolated
    when the rank is fractional.  With D = HF - LF, the inner fence is
    [LF - 1.5 * D, HF + 1.5 * D] and the outer fence is
    [LF - 3 * D, HF + 3 * D].

    :param data_array: 1-D sequence (numpy array, list, tuple, ...)
        to be analyzed.  It is not modified.
    :returns: The five number summary ``(M, LF, HF, MI, MA, OS, FOS)``,
        or ``None`` (after logging an error) if fewer than 3 data are
        given.  ``OS`` holds the data between the inner and the outer
        fence, ``FOS`` the data outside of the outer fence, both sorted
        from low to high.  If either the low or the high side of such
        an array is empty, the array is of float dtype.
    :raises ValueError: If the input is not one-dimensional.
    :raises RuntimeError: If no data is found above the lower or below
        the upper inner fence.

    >>> v = numpy.array([-100, 1, 2, 3, 4, 5, 6, 7, 8, 9, 20, 180])
    >>> print(fivenumsum(v))    # doctest: +NORMALIZE_WHITESPACE
    (5.5, 2.25, 8.75, 1, 9, array([20.]), array([-100, 180]))
    '''
    data = numpy.asarray(data_array)
    if data.ndim != 1:
        raise ValueError('data_array should be 1-D (ndim=%d)' % data.ndim)
    # numpy.sort returns a sorted copy, so the input is left untouched.
    data = numpy.sort(data)
    ndat = len(data)

    _logger.debug('Number of data=%d', ndat)
    _logger.debug('%s', data)

    if ndat <= 2:
        _logger.error('Data length should be >2 (Current=%d)', ndat)
        return None

    ### Median
    half = ndat // 2
    if ndat % 2 == 1:
        M = data[half]
    else:
        M = (data[half - 1] + data[half]) / 2.
    _logger.debug('Median = %f', M)

    ### lower/higher 4-th, at rank (N+1)/4 and 3(N+1)/4.
    LF = _fractional_rank(data, ndat + 1)
    HF = _fractional_rank(data, 3 * (ndat + 1))
    _logger.debug('LF=%f : HF=%f', LF, HF)

    D = HF - LF
    _logger.debug('D=%f', D)

    INFENCEL = LF - D * 1.5
    INFENCEH = HF + D * 1.5
    OUTFENCEL = LF - D * 3
    OUTFENCEH = HF + D * 3
    _logger.debug('INFENCE=[%f %f]', INFENCEL, INFENCEH)
    _logger.debug('OUTFENCE=[%f %f]', OUTFENCEL, OUTFENCEH)

    ### lowest value in the fence
    above_low = data[data >= INFENCEL]
    if above_low.size == 0:
        raise RuntimeError('Strange... It should not happen...')
    MI = above_low[0]
    _logger.debug('Minimum in fence = %f', MI)

    ### Highest value in the fence
    below_high = data[data <= INFENCEH]
    if below_high.size == 0:
        raise RuntimeError('Strange... It should not happen...')
    MA = below_high[-1]
    _logger.debug('Maximum in fence = %f', MA)

    ### Outside: beyond the inner fence but not beyond the outer fence.
    # data is sorted and every low-side value is below every high-side
    # value, so the concatenation is already in ascending order.
    outside = numpy.concatenate([
        _pick(data, (data < INFENCEL) & (data >= OUTFENCEL)),
        _pick(data, (data > INFENCEH) & (data <= OUTFENCEH)),
    ])
    _logger.debug('OUTSIDE = %s', outside)

    ### Far Outside: beyond the outer fence.
    farout = numpy.concatenate([
        _pick(data, data < OUTFENCEL),
        _pick(data, data > OUTFENCEH),
    ])
    _logger.debug('FAR OUT = %s', farout)

    return (M, LF, HF, MI, MA, outside, farout)
import unittest
import doctest
def doctests():
    ''' Build a unittest suite from the doctests.

    ``doctest.DocTestSuite()`` is called without a module argument, so
    it collects the docstring examples of the module that calls it,
    i.e. this one.

    :returns: A :class:`unittest.TestSuite` wrapping the doctest suite.
    '''
    return unittest.TestSuite((
        doctest.DocTestSuite(),
    ))
if __name__ == "__main__":
    # Run as a script: let unittest load the suite built by 'doctests'.
    unittest.main(defaultTest="doctests")