# -*- coding: utf-8 -*-
"""
Read data from a local copy of the ICA data pipeline.

Author: Martin Wieser

Module: irfpy.ica.io

Functions to transparently load ICA pipeline data from the local filesystem.
All functions return a dictionary containing the loaded variables.

Examples of loading data::

    mat=readraw('~/icadata','20150211','20150213',variables=['time_interval','E'])
    mat=readproc('~/icadata','20150211','20150213',variables=['time_instances','E'])
    mat=readlap('~/icadata','20150211',variables=['lap_time','lap_potential'])

Summary:
    The most modern and environmentally friendly way to access ICA data:

    - irfpy.ica.io.readraw
    - irfpy.ica.io.readproc
    - irfpy.ica.io.readaux
    - irfpy.ica.io.readspecial
    - irfpy.ica.io.readlap
    - irfpy.ica.io.readcops
    - irfpy.ica.io.readmag
    - irfpy.ica.io.readbestc
    - irfpy.ica.io.readflux


    For more advanced ways to access the data, use these low-level functions:

    - irfpy.ica.io.loadlevel0
    - irfpy.ica.io.loadlevel1
    - irfpy.ica.io.loadlevel1oktime
    - irfpy.ica.io.loadlevel1okall
    - irfpy.ica.io.loadlevel1filter
    - irfpy.ica.io.loadlevel2
    - irfpy.ica.io.loadlevel3
    - irfpy.ica.io.loadlevelN

    Read any version of a .mat or .h5 file directly, without the data-type
    conversions performed by the load or read functions:

    - irfpy.ica.io.loadmat
    - irfpy.ica.io.savemat
    - irfpy.ica.io.loadh5
    - irfpy.ica.io.saveh5


"""
#analysis:ignore


import gc
import os
import glob
from collections import defaultdict
import datetime as dt
import warnings
import re
import logging
import h5py
import hdf5storage


import numpy as np
from numpy import ma

logging.basicConfig()
_logger = logging.getLogger('ica.io')
_logger.setLevel(logging.DEBUG)


try:
    if __name__ == '__main__':
        import tools as icatools
    else:
        from . import tools as icatools
except ModuleNotFoundError:
    import irfpy.ica.tools as icatools

try:
    import deepdish
    _HAVE_DEEPDISH = True
except ModuleNotFoundError:
    _HAVE_DEEPDISH = False

try:
    import psutil
    _HAVE_PSUTIL = True
except ModuleNotFoundError:
    _HAVE_PSUTIL = False




# %%

# The most modern and environmentally friendly way to access the ica data:


def readraw(datarootpath, from_day=None, to_day='', variables=None,
            verbose=False, flat=False, dataformat='mat', **kwargs):
    r""" Loads one hour or more worth of level0 data, analogous to
    scipy.io.loadmat('xyz.mat') but assembling all hourly files
    corresponding to the specified time interval.

    AUTHOR: Gabriella Stenberg Wieser

    PARAMETERS

    datarootpath (string): local path to the 'root' of the data tree.
        This data tree should contain the file(s) in subdirectories
        level0/xxxx/yyyy/mm/dd/ as it is done in the pipeline.

    from_day (string or datetime.datetime): string is of the form
        '20150115T1011'

    to_day (string or datetime.datetime): optional, string is of the form
        '20150116T1112'. If given, data to the end of the given interval
        is loaded, otherwise one day.

    variables (list of strings): a list of variable names to load.
        Default is to load all variables. If the loaded variables include
        'time_instances' then these are automatically converted to python
        datetime objects.

    flat (boolean): If set to True and no files were found, the load
        function will also attempt to load files located at the root of
        the data tree (without the yyyy/mm/dd/ subdirectories).

    OUTPUT: returns a dictionary with all loaded variables

    EXAMPLES::

        mat=readraw('~/icadata','20150211')
        mat=readraw('~/icadata','20150211',variables=['iontime','ionspectra'])
        mat=readraw('~/icadata','20150211','20150213',variables=['time_interval','E'])
    """
    if variables is None:
        variables = []
    return loadlevel0(datarootpath, from_day, to_day, variables, verbose,
                      flat, dataformat=dataformat, **kwargs)
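# Illustrative sketch (the data root '~/icadata' and the date are assumptions,
# not shipped defaults): load one day of raw spectra and print the covered
# time range. 'iontime' is expected to arrive as python datetime objects after
# the automatic conversion described above.
def _example_readraw_timerange():
    mat = readraw('~/icadata', '20150211',
                  variables=['iontime', 'ionspectra'])
    if 'iontime' in mat:
        print('first record:', mat['iontime'][0])
        print('last record: ', mat['iontime'][-1])
    return mat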
def readproc(datarootpath, from_day=None, to_day='', variables=None,
             verbose=False, flat=False, includeerrors=False,
             partialelevationscan=False, dataformat='mat', filters=None,
             time_instances='time_instances'):
    r""" Loads one day or more worth of proc*.mat data, analogous to
    scipy.io.loadmat('xyz.mat') but assembling all hourly files
    corresponding to that day.

    PARAMETERS

    datarootpath (string): local path to the 'root' of a data directory
        tree. This data tree should contain the file(s) in subdirectories
        level1/xxxx/yyyy/mm/dd/ as it is done in the pipeline.

    from_day (string or datetime.datetime): string is of the form
        '20150115T1011'

    to_day (string or datetime.datetime): optional, string is of the form
        '20150116T1112'. If given, data to the end of the given interval
        is loaded, otherwise one day.

    variables (list of strings): a list of variable names to load.
        Default is to load all variables. If the loaded variables include
        'time_instances' then these are automatically converted to python
        datetime objects.

    flat (boolean): If set to True and no files were found, the load
        function will also attempt to load files located at the root of
        the data tree (without the yyyy/mm/dd/ subdirectories).

    partialelevationscan (boolean): Optional parameter, default False.
        If set to False the loading function will only load complete
        elevation scans. If set to True, the nearest time_instances will
        be loaded regardless of where an elevation scan starts.

    verbose (boolean): Optional parameter, if set to True the loading
        function will print the names of the files it loads for debugging
        purposes.

    includeerrors (boolean), OUTDATED: Optional parameter, if set to True
        the loading function will also load data with decoder or
        formatting errors. This parameter is outdated. Use the filters
        keyword instead, e.g. filters={'time_error_flag':0}

    dataformat (str): 'mat' or 'h5'. If 'h5', the hdf5 data tree is read;
        if 'mat', the matlab data tree is read.

    OUTPUT

    Returns a dictionary with all loaded variables. The dictionary is
    empty if no matching data could be loaded.

    Use like this::

        mat=readproc('~/icadata','20150211')
        mat=readproc('~/icadata','20150211',variables=['time_instances','E'])
        mat=readproc('~/icadata','20150211','20150213',variables=['time_instances','E'])
    """
    if variables is None:
        variables = []
    if includeerrors:
        if filters is None:
            filters = {}
        filters['time_error_flag'] = 0
    else:
        if filters is None:
            filters = {}
        filters['error_flags'] = 0
    return loadlevelN(datarootpath, fileprefix='proc', from_day=from_day,
                      to_day=to_day, variables=variables, verbose=verbose,
                      flat=flat, partialelevationscan=partialelevationscan,
                      dataformat=dataformat, time_instances=time_instances,
                      branch='level1', filters=filters,
                      varshapes=default_varshapes1)
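# Minimal sketch of the recommended replacement for ``includeerrors``: pass a
# ``filters`` dictionary, as suggested in the docstring above. The data root
# and date are illustrative assumptions.
def _example_readproc_filters():
    # Keep only records whose time_error_flag is 0; note that readproc with
    # the default includeerrors=False also adds an error_flags == 0 filter.
    return readproc('~/icadata', '20150211',
                    variables=['time_instances', 'E'],
                    filters={'time_error_flag': 0})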
def readaux(datarootpath, from_day=None, to_day='', variables=None,
            verbose=False, flat=False, includeerrors=False,
            partialelevationscan=False, dataformat='mat', filters=None,
            time_instances='time_instances'):
    r"""Loads one day or more worth of aux*.mat data, analogous to
    scipy.io.loadmat('xyz.mat') but assembling all hourly files
    corresponding to that day.

    PARAMETERS

    datarootpath (string): local path to the 'root' of a data directory
        tree. This data tree should contain the file(s) in subdirectories
        aux/xxxx/yyyy/mm/dd/ as it is done in the pipeline.

    from_day (string or datetime.datetime): string is of the form
        '20150115T1011'

    to_day (string or datetime.datetime): optional, string is of the form
        '20150116T1112'. If given, data to the end of the given interval
        is loaded, otherwise one day.

    variables (list of strings): a list of variable names to load.
        Default is to load all variables. If the loaded variables include
        'time_instances' then these are automatically converted to python
        datetime objects.

    flat (boolean): If set to True and no files were found, the load
        function will also attempt to load files located at the root of
        the data tree (without the yyyy/mm/dd/ subdirectories).

    partialelevationscan (boolean): Optional parameter, default False.
        If set to False the loading function will only load complete
        elevation scans. If set to True, the nearest time_instances will
        be loaded regardless of where an elevation scan starts.

    verbose (boolean): Optional parameter, if set to True the loading
        function will print the names of the files it loads for debugging
        purposes.

    includeerrors (boolean), OUTDATED: Optional parameter, if set to True
        the loading function will also load data with decoder or
        formatting errors. This parameter is outdated. Use the filters
        keyword instead, e.g. filters={'time_error_flag':0}

    dataformat (str): 'mat' or 'h5'. If 'h5', the hdf5 data tree is read;
        if 'mat', the matlab data tree is read.

    OUTPUT

    Returns a dictionary with all loaded variables. The dictionary is
    empty if no matching data could be loaded. If bs_xxx variables are
    wanted and they are not in the aux files loaded, an automatic fallback
    to xaux files is done.

    Use like this::

        mat=readaux('~/icadata','20150211')
        mat=readaux('~/icadata','20150211',variables=['sp_cso','time_instances'])
        mat=readaux('~/icadata','20150211','20150213')
    """
    if variables is None:
        variables = []
    if includeerrors:
        if filters is None:
            filters = {}
        filters['time_error_flag'] = 0
    else:
        if filters is None:
            filters = {}
        filters['error_flags'] = 0
    mat = loadlevelN(datarootpath, fileprefix='aux', from_day=from_day,
                     to_day=to_day, variables=variables, verbose=verbose,
                     flat=flat, partialelevationscan=partialelevationscan,
                     dataformat=dataformat, time_instances=time_instances,
                     branch='aux', filters=filters,
                     varshapes=default_varshapes1)

    # Now check whether bs_xx variables were requested but could not be
    # loaded. If so, try the xaux files.
    xvariables = list()
    if 'bs_cseq' in variables and 'bs_cseq' not in mat:
        xvariables.append('bs_cseq')
    if 'bs_cso' in variables and 'bs_cso' not in mat:
        xvariables.append('bs_cso')
    if len(xvariables) > 0:  # there were some unfulfilled requests
        mat2 = loadlevelN(datarootpath, fileprefix='xaux', from_day=from_day,
                          to_day=to_day, variables=xvariables,
                          verbose=verbose, flat=flat,
                          partialelevationscan=partialelevationscan,
                          dataformat=dataformat,
                          time_instances='time_instances', branch='xaux',
                          filters=filters, varshapes=default_varshapes1)
        # some variables could be obtained from xaux
        if 'bs_cseq' in mat2:
            mat['bs_cseq'] = mat2['bs_cseq']
        if 'bs_cso' in mat2:
            mat['bs_cso'] = mat2['bs_cso']
    return mat
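# Sketch of the xaux fallback described above: when 'bs_cso' is requested but
# absent from the aux files, readaux transparently retries the xaux branch.
# Path and date are illustrative.
def _example_readaux_bs_fallback():
    mat = readaux('~/icadata', '20150211',
                  variables=['time_instances', 'bs_cso'])
    # 'bs_cso' is present if it was found in either the aux or xaux files
    return 'bs_cso' in mat, mat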
def readspecial(datarootpath, generation=None, variables=None, verbose=False):
    r""" Loads special*.mat data, assembling the variables from all files
    of one generation.

    PARAMETERS

    datarootpath (string): local path to the 'root' of a data directory
        tree. This data tree should contain the file(s) in subdirectories
        special/gen??? as it is done in the pipeline.

    generation (int): generation number of the requested files. If None,
        variables are loaded from the newest generation.

    variables (list of strings): a list of variable names to load.
        Default is to load all variables. The variables are collected from
        .npz and .mat files, in this order of priority.

    verbose (boolean): Optional parameter, if set to True the loading
        function will print the names of the files it loads for debugging
        purposes.

    OUTPUT

    Returns a dictionary with all loaded variables. The dictionary is
    empty if no matching data could be loaded.

    Use like this::

        mat=readspecial('~/icadata')
        mat=readspecial('~/icadata',generation=6,verbose=True)
    """
    if variables is None:
        variables = []
    if not isinstance(datarootpath, str):
        raise ValueError(
            "irfpy.ica.io.readspecial: The parameter datarootpath must be of <class 'str'> "
            + "(a string) but is now " + str(type(datarootpath)))
    if generation is not None:
        if not isinstance(generation, int):
            raise ValueError(
                "irfpy.ica.io.readspecial: The parameter generation must be of <class 'int'> "
                + "(an integer) but is now " + str(type(generation)))
        if generation < 0:
            generation = None
    if not isinstance(variables, (list, np.ndarray)):
        raise ValueError(
            "irfpy.ica.io.readspecial: The parameter variables must be of <class 'list'> "
            + "(a list) or <class 'numpy.ndarray'> (a numpy array) but is now "
            + str(type(variables)))

    thevars = list(variables)
    datarootpath = os.path.expanduser(datarootpath)
    tp = datarootpath + os.sep + 'special'
    if generation is None:
        # find the newest generation
        gens = glob.glob(tp + os.sep + 'gen*')
        gens.sort()
        thegen = gens[-1]
        generation = int(thegen[-3:])
    else:
        thegen = tp + os.sep + 'gen{:03d}'.format(generation)
    if verbose:
        print('** Loading special variables corresponding to GENERATION {:d} **'.format(generation))

    thenpz = glob.glob(thegen + os.sep + '*.npz')
    themat = glob.glob(thegen + os.sep + '*.mat')
    thenpz.sort()
    themat.sort()

    allvars = dict()
    allvars['generation'] = generation
    # first load from all npz files, then from all mat files, in sorted order
    for file in thenpz + themat:
        if verbose:
            print('    reading ' + file)
        if file.endswith('.npz'):
            newvars = dict(np.load(file))
        else:
            newvars = loadmat(file)
        for k in newvars:
            if k not in allvars:
                if k in thevars or len(thevars) == 0:
                    allvars[k] = newvars[k]
                    if verbose:
                        print('      loading ' + k)
            else:
                if verbose:
                    print("!!! Duplicate variable ('" + k +
                          "') found. The duplicate will be ignored.")
    return allvars
def readlap(datarootpath, from_day=None, to_day='', variables=None,
            verbose=False, flat=False, dataformat='mat', filters=None,
            time_instances='asw_time'):
    r"""Loads one day or more worth of lap*.mat data, analogous to
    scipy.io.loadmat('xyz.mat') but assembling all hourly files
    corresponding to that day.

    PARAMETERS

    datarootpath (string): local path to the 'root' of a data directory
        tree. This data tree should contain the file(s) in subdirectories
        lap/xxxx/yyyy/mm/dd/ as it is done in the pipeline.

    from_day (string or datetime.datetime): string is of the form
        '20150115T1011'

    to_day (string or datetime.datetime): optional, string is of the form
        '20150116T1112'. If given, data to the end of the given interval
        is loaded, otherwise one day.

    variables (list of strings): a list of variable names to load.
        Default is to load all variables. If the loaded variables include
        'time_instances' then these are automatically converted to python
        datetime objects.

    flat (boolean): If set to True and no files were found, the load
        function will also attempt to load files located at the root of
        the data tree (without the yyyy/mm/dd/ subdirectories).

    verbose (boolean): Optional parameter, if set to True the loading
        function will print the names of the files it loads for debugging
        purposes.

    dataformat (str): 'mat' or 'h5'. If 'h5', the hdf5 data tree is read;
        if 'mat', the matlab data tree is read.

    OUTPUT

    Returns a dictionary with all loaded variables. The dictionary is
    empty if no matching data could be loaded.

    Use like this::

        mat=readlap('~/icadata','20150211')
        mat=readlap('~/icadata','20150211',variables=['lap_time','lap_potential'])
        mat=readlap('~/icadata','20150211','20150213')
    """
    if variables is None:
        variables = []
    if filters is None:
        filters = dict()
    return loadlevelN(datarootpath, fileprefix='lap', from_day=from_day,
                      to_day=to_day, variables=variables, verbose=verbose,
                      flat=flat, partialelevationscan=True,
                      dataformat=dataformat, time_instances=time_instances,
                      branch='lap', filters=filters,
                      varshapes=default_varshapes1)
def readcops(datarootpath, from_day=None, to_day='', variables=None,
             verbose=False, flat=False, dataformat='mat', filters=None,
             time_instances='cops_time'):
    r"""Loads one day or more worth of cops*.mat data, analogous to
    scipy.io.loadmat('xyz.mat') but assembling all hourly files
    corresponding to that day.

    PARAMETERS

    datarootpath (string): local path to the 'root' of a data directory
        tree. This data tree should contain the file(s) in subdirectories
        cops/xxxx/yyyy/mm/dd/ as it is done in the pipeline.

    from_day (string or datetime.datetime): string is of the form
        '20150115T1011'

    to_day (string or datetime.datetime): optional, string is of the form
        '20150116T1112'. If given, data to the end of the given interval
        is loaded, otherwise one day.

    variables (list of strings): a list of variable names to load.
        Default is to load all variables. If the loaded variables include
        'time_instances' then these are automatically converted to python
        datetime objects.

    flat (boolean): If set to True and no files were found, the load
        function will also attempt to load files located at the root of
        the data tree (without the yyyy/mm/dd/ subdirectories).

    verbose (boolean): Optional parameter, if set to True the loading
        function will print the names of the files it loads for debugging
        purposes.

    dataformat (str): 'mat' or 'h5'. If 'h5', the hdf5 data tree is read;
        if 'mat', the matlab data tree is read.

    OUTPUT

    Returns a dictionary with all loaded variables. The dictionary is
    empty if no matching data could be loaded.

    Use like this::

        mat=readcops('~/icadata','20150211')
        mat=readcops('~/icadata','20150211',variables=['cops_time','cops_pressure'])
        mat=readcops('~/icadata','20150211','20150213')
    """
    if variables is None:
        variables = []
    if filters is None:
        filters = dict()
    return loadlevelN(datarootpath, fileprefix='cops', from_day=from_day,
                      to_day=to_day, variables=variables, verbose=verbose,
                      flat=flat, partialelevationscan=True,
                      dataformat=dataformat, time_instances=time_instances,
                      branch='cops', filters=filters,
                      varshapes=default_varshapes1)
def readmag(datarootpath, from_day=None, to_day='', variables=None,
            verbose=False, flat=False, dataformat='mat', filters=None,
            time_instances='mag_time'):
    r"""Loads one day or more worth of mag*.mat data, analogous to
    scipy.io.loadmat('xyz.mat') but assembling all hourly files
    corresponding to that day.

    PARAMETERS

    datarootpath (string): local path to the 'root' of a data directory
        tree. This data tree should contain the file(s) in subdirectories
        mag/xxxx/yyyy/mm/dd/ as it is done in the pipeline.

    from_day (string or datetime.datetime): string is of the form
        '20150115T1011'

    to_day (string or datetime.datetime): optional, string is of the form
        '20150116T1112'. If given, data to the end of the given interval
        is loaded, otherwise one day.

    variables (list of strings): a list of variable names to load.
        Default is to load all variables. If the loaded variables include
        'time_instances' then these are automatically converted to python
        datetime objects.

    flat (boolean): If set to True and no files were found, the load
        function will also attempt to load files located at the root of
        the data tree (without the yyyy/mm/dd/ subdirectories).

    verbose (boolean): Optional parameter, if set to True the loading
        function will print the names of the files it loads for debugging
        purposes.

    dataformat (str): 'mat' or 'h5'. If 'h5', the hdf5 data tree is read;
        if 'mat', the matlab data tree is read.

    OUTPUT

    Returns a dictionary with all loaded variables. The dictionary is
    empty if no matching data could be loaded.

    Use like this::

        mat=readmag('~/icadata','20150211')
        mat=readmag('~/icadata','20150211',variables=['mag_B_cseq'])
        mat=readmag('~/icadata','20150211','20150213')
    """
    if variables is None:
        variables = []
    if filters is None:
        filters = dict()
    return loadlevelN(datarootpath, fileprefix='mag', from_day=from_day,
                      to_day=to_day, variables=variables, verbose=verbose,
                      flat=flat, partialelevationscan=True,
                      dataformat=dataformat, time_instances=time_instances,
                      branch='mag', filters=filters,
                      varshapes=default_varshapes1)
def readbestc(datarootpath, from_day=None, to_day='', variables=None,
              verbose=False, flat=False, partialelevationscan=False,
              dataformat='mat', filters=None,
              time_instances='time_instances'):
    r"""Loads one day or more worth of bestc*.mat data, analogous to
    scipy.io.loadmat('xyz.mat') but assembling all hourly files
    corresponding to that day.

    PARAMETERS

    datarootpath (string): local path to the 'root' of a data directory
        tree. This data tree should contain the file(s) in subdirectories
        level2/xxxx/yyyy/mm/dd/ as it is done in the pipeline.

    from_day (string or datetime.datetime): string is of the form
        '20150115T1011'

    to_day (string or datetime.datetime): optional, string is of the form
        '20150116T1112'. If given, data to the end of the given interval
        is loaded, otherwise one day.

    variables (list of strings): a list of variable names to load.
        Default is to load all variables. If the loaded variables include
        'time_instances' then these are automatically converted to python
        datetime objects.

    flat (boolean): If set to True and no files were found, the load
        function will also attempt to load files located at the root of
        the data tree (without the yyyy/mm/dd/ subdirectories).

    partialelevationscan (boolean): Optional parameter, default False.
        If set to False the loading function will only load complete
        elevation scans. If set to True, the nearest time_instances will
        be loaded regardless of where an elevation scan starts.

    verbose (boolean): Optional parameter, if set to True the loading
        function will print the names of the files it loads for debugging
        purposes.

    dataformat (str): 'mat' or 'h5'. If 'h5', the hdf5 data tree is read;
        if 'mat', the matlab data tree is read.

    OUTPUT

    Returns a dictionary with all loaded variables. The dictionary is
    empty if no matching data could be loaded.

    Use like this::

        mat=readbestc('~/icadata','20150211')
        mat=readbestc('~/icadata','20150211',variables=['time_instances','h_counts'])
        mat=readbestc('~/icadata','20150211','20150213')
    """
    if variables is None:
        variables = []
    if filters is None:
        filters = dict()
    return loadlevelN(datarootpath, fileprefix='bestc', from_day=from_day,
                      to_day=to_day, variables=variables, verbose=verbose,
                      flat=flat, partialelevationscan=partialelevationscan,
                      dataformat=dataformat, time_instances=time_instances,
                      branch='level2', filters=filters,
                      varshapes=default_varshapes2)
def readflux(datarootpath, from_day=None, to_day='', variables=None,
             verbose=False, flat=False, partialelevationscan=False,
             dataformat='mat', filters=None,
             time_instances='time_instances'):
    r"""Loads one day or more worth of flux*.mat data, analogous to
    scipy.io.loadmat('xyz.mat') but assembling all hourly files
    corresponding to that day.

    PARAMETERS

    datarootpath (string): local path to the 'root' of a data directory
        tree. This data tree should contain the file(s) in subdirectories
        level3/xxxx/yyyy/mm/dd/ as it is done in the pipeline.

    from_day (string or datetime.datetime): string is of the form
        '20150115T1011'

    to_day (string or datetime.datetime): optional, string is of the form
        '20150116T1112'. If given, data to the end of the given interval
        is loaded, otherwise one day.

    variables (list of strings): a list of variable names to load.
        Default is to load all variables. If the loaded variables include
        'time_instances' then these are automatically converted to python
        datetime objects.

    flat (boolean): If set to True and no files were found, the load
        function will also attempt to load files located at the root of
        the data tree (without the yyyy/mm/dd/ subdirectories).

    partialelevationscan (boolean): Optional parameter, default False.
        If set to False the loading function will only load complete
        elevation scans. If set to True, the nearest time_instances will
        be loaded regardless of where an elevation scan starts.

    verbose (boolean): Optional parameter, if set to True the loading
        function will print the names of the files it loads for debugging
        purposes.

    dataformat (str): 'mat' or 'h5'. If 'h5', the hdf5 data tree is read;
        if 'mat', the matlab data tree is read.

    OUTPUT

    Returns a dictionary with all loaded variables. The dictionary is
    empty if no matching data could be loaded.

    Use like this::

        mat=readflux('~/icadata','20150211')
        mat=readflux('~/icadata','20150211',variables=['time_instances','flux'])
        mat=readflux('~/icadata','20150211','20150213')
    """
    if variables is None:
        variables = []
    if filters is None:
        filters = dict()
    return loadlevelN(datarootpath, fileprefix='flux', from_day=from_day,
                      to_day=to_day, variables=variables, verbose=verbose,
                      flat=flat, partialelevationscan=partialelevationscan,
                      dataformat=dataformat, time_instances=time_instances,
                      branch='level3', filters=filters,
                      varshapes=default_varshapes3)
# %%
# Below follow all level 0/1/2/3... variable names that need concatenation
# when several files are loaded. Variables not listed here will only be
# copied (the one from the first file is returned).
# Sorted by shape:

# --- level 3 ------
_level3shape1vars = [
    # following stuff is from proc
]
_level3shape2vars = [
]
_level3shape3vars = [
]
_level3shape4vars = [
    # following stuff is from bestc
]
_level3datetimevars = [
    'time_instances'
]

# --- level 2 ------
_level2shape1vars = [
    # following stuff is from proc
    'time_instances',
    'sw_version',
    'cur_pacc',
    'mode',
    # following stuff is from bestc
    'noise_reduction_corrsum',
    'random_noise_corrsum',
    'dead_time_corrsum',
    'flat_mass_corrsum',
    'flat_sector_corrsum',
    'ghost_corrsum',
    'shadowmask',
    'processing_level'
]
_level2shape2vars = [
    'processing_level'
]
_level2shape3vars = [
    'heavy_ions',
    'light_ions'
]
_level2shape4vars = [
    # following stuff is from bestc
    'best_ionspectra',
    'sigma_ionspectra',
    'ghost_ionspectra'
]
_level2datetimevars = [
    'time_instances'
]

# --- level 1 ------
_level1shape1vars = [
    # following stuff is from proc
    'clean_ionspectra',
    'cur_pacc',
    'cur_pacc_table',
    'elevation_step',
    'mode',
    'is_mass_matrix',
    'massbinwidth',
    'time_error_flag',
    'decoder_error_flag',
    'edf_error_flag',
    'fmt_error_flag',
    'error_flags',
    'size_of_lookup_table',
    'time_instances',
    'sw_version',
    # below is for compatibility with pipeline v2.5 files
    # 'orig_ionspectra',
    # following stuff is from aux
    'ilat',
    'ilon',
    'azimbinwidth',
    'elevbinwidth',
    # following stuff is from special
    # following stuff is from lap
    'lap_time',
    'lap_quality',
    # 'lap_electrondensity',  # removed
    # 'lap_electrontemp',     # removed
    'lap_potential',
    'usc_time',
    'usc_potential',
    'usc_potential_q',
    'usc_quality',
    'usc_source',
    'asw_time',
    'asw_electrondensity',
    'asw_electrondensity_q',
    'asw_photosaturationcurrent',
    'asw_photosaturationcurrent_q',
    'asw_bulkionspeed',
    'asw_bulkionspeed_q',
    'asw_electrontemperature',
    'asw_electrontemperature_q',
    'asw_electrontemperature_xcal',
    'asw_electrontemperature_xcal_q',
    'asw_potential',
    'asw_potential_q',
    'asw_quality',
    'ned_time',
    'ned_electrondensity',
    'ned_electrondensity_q',
    'ned_source',
    'ned_quality',
    # following stuff is from cops
    'cops_time',
    'cops_density',
    'cops_pressure_ng',
    'cops_pressure_ng_flag',
    # following stuff is from mag
    'mag_time',
    'mag_time_level_g',
    'mag_quality',
    'mag_quality_level_g',
    # following stuff was moved from bestc to proc
    'noise_reduction',
    'valven_time'
]
_level1shape2vars = [
    # following stuff is from proc
    'sum_orig_ionspectra',
    'shadow',
    # following stuff is from aux
    'cp_eclip',
    'sa_cseqx',
    'sa_cseqy',
    'sa_cseqz',
    'sa_csox',
    'sa_csoy',
    'sa_csoz',
    'sp_cseq',
    'sp_cso',
    'sv_cseq',
    'sv_cso',
    # following stuff is from mag
    'mag_b_cseq',
    'mag_b_cseq_level_g'
]
_level1shape3vars = [
]
_level1shape4vars = [
    'bs_cseq',
    'bs_cso',
    'orig_ionspectra',
    # following stuff is from bestc
    'clean_ionspectra'
]
_level1datetimevars = [
    'time_instances',
    'time_elev_scan_sta',
    'time_elev_scan_sto',
    'lap_time',
    'usc_time',
    'asw_time',
    'ned_time',
    'cops_time',
    'mag_time',
    'mag_time_level_g',
    'valven_time'
]

# --- level 0 ------
_level0shape1vars = [
    'badhvmask',
    'decoder_error',
    'fifo_fill',
    'fmt_len',
    'fmt_len_error',
    'iontime',
    'mode',
    'mtable',
    'pacc_high_low',
    'pacclevel',
    'promsection',
    'shadowmask',
    'hk_cmdstat',
    'hk_defl_hv_ref',
    'hk_defl_lv_ref',
    'hk_deflhv_mon',
    'hk_deflhv_sw',
    'hk_defllv_mon',
    'hk_defllv_sw',
    'hk_direct_cmd_sw',
    'hk_dpu_temp',
    'hk_entr_lower_mon',
    'hk_entr_ref',
    'hk_entr_sw',
    'hk_entr_upper_mon',
    'hk_fifofill',
    'hk_first_word_cmdret',
    'hk_grid_mon',
    'hk_grid_ref',
    'hk_grid_sw',
    'hk_main28_present',
    'hk_main_sw',
    'hk_mcp_curr_ref',
    'hk_mcp_def_ref',
    'hk_mcp_mon',
    'hk_mcp_present',
    'hk_mcp_sw',
    'hk_mode',
    'hk_newcmd',
    'hk_opto_curr_ref',
    'hk_opto_def_ref',
    'hk_opto_mon',
    'hk_opto_present',
    'hk_opto_sw',
    'hk_pacc_high_ref',
    'hk_pacc_hl',
    'hk_pacc_low_ref',
    'hk_pacc_mode',
    'hk_pacc_mon',
    'hk_pacc_sw',
    'hk_sensor_temp',
    'hk_sid',
    'hk_time',
    'hk_tmfifo_ovrf'
]
_level0shape2vars = [
    'azim',
    'elev',
    'ionspectra',
    'masschannel'
]
_level0datetimevars = [
    'iontime',
    'hk_time'
]

# --- black list ------
_blacklistvars = [
    'cal_orig_ionspectra',
    'time_error_mask',
    'fmt_error_mask',
    'decoder_error_mask'
]

# ---------------------
_force_datatype_float = [
    'orig_ionspectra',
    'sum_orig_ionspectra',
    'clean_ionspectra',
    'light_ions',
    'heavy_ions',
    'noise_reduction'
]
_force_datatype_int = [
    'sw_version',
    'mode',
    'cur_pacc',
    'error_flags',
    'time_error_flag',
    'decoder_error_flag',
    'edf_error_flag',
    'fmt_error_flag',
    'version_list',
    'dead_channles',
    'is_matrix',
    'mass_looku_table',
    'shadow',
    'size_of_lookup_table'
]
_force_datatype_stringlist = [
    'ica_processed_data_comment',
    'aux_processed_data_comment',
    'bestc_processed_data_comment',
    'flux_processed_data_comment',
    'lap_processed_data_comment',
    'cops_processed_data_comment',
    'mag_processed_data_comment',
    'mip_processed_data_comment'
]
_force_datatype_unicodestring = [
    'processing_level'
]

# convenient configuration variables:
default_varshapes1 = {'1d': _level1shape1vars,
                      '2d': _level1shape2vars,
                      '3d': _level1shape3vars,
                      '4d': _level1shape4vars,
                      'datetime': _level1datetimevars,
                      'int32': _force_datatype_int,
                      'float32': _force_datatype_float,
                      'unicodestring': _force_datatype_unicodestring,
                      'stringlist': _force_datatype_stringlist,
                      'blacklist': _blacklistvars}

default_varshapes2 = {'1d': _level2shape1vars,
                      '2d': _level2shape2vars,
                      '3d': _level2shape3vars,
                      '4d': _level2shape4vars,
                      'datetime': _level2datetimevars,
                      'int32': _force_datatype_int,
                      'float32': _force_datatype_float,
                      'unicodestring': _force_datatype_unicodestring,
                      'stringlist': _force_datatype_stringlist,
                      'blacklist': _blacklistvars}

default_varshapes3 = {'1d': _level3shape1vars,
                      '2d': _level3shape2vars,
                      '3d': _level3shape3vars,
                      '4d': _level3shape4vars,
                      'datetime': _level3datetimevars,
                      'int32': _force_datatype_int,
                      'float32': _force_datatype_float,
                      'unicodestring': _force_datatype_unicodestring,
                      'stringlist': _force_datatype_stringlist,
                      'blacklist': _blacklistvars}

# %%
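# Sketch of how the varshapes dictionaries above steer concatenation: a custom
# dictionary can be passed to loadlevelN() to handle additional variables.
# 'my_extra_scalar_series' is a hypothetical variable name; path, branch and
# date are illustrative.
def _example_custom_varshapes():
    myshapes = dict(default_varshapes1)  # start from the level1 defaults
    myshapes['1d'] = _level1shape1vars + ['my_extra_scalar_series']
    return loadlevelN('~/icadata', fileprefix='proc', from_day='20150211',
                      branch='level1', varshapes=myshapes)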
def conditional_gc_collect():
    r""" Calls gc.collect() if less than 30% of the system memory is
    available (i.e. memory usage is above 70%). If psutil is not
    installed, gc.collect() is always called.
    """
    if _HAVE_PSUTIL:
        mem = psutil.virtual_memory()
        if mem.percent > 70.0:
            gc.collect()
    else:
        gc.collect()
# %%
def loadmat(file, variable_names=None):
    r""" Loads the variables listed in "variable_names" from a v7 or v7.3
    mat file into a dictionary. If variable_names is not specified,
    everything is loaded::

        filename = 'test.mat'
        m = irfpy.ica.io.loadmat(filename)
        print(m.keys())
    """
    afile = os.path.expanduser(file)

    # check that the requested keys are actually in the file
    fil = None
    try:
        fil = h5py.File(afile, 'r')
        fk = fil.keys()
        existingvariables = [x for x in variable_names if x in fk]
    except Exception:
        # variable_names is None (load everything) or the file is not a
        # v7.3/HDF5 file: fall back to the requested list as given.
        if variable_names is not None:
            existingvariables = list(variable_names)
        else:
            existingvariables = None
    if fil is not None:
        fil.close()

    try:
        mat = hdf5storage.loadmat(afile, variable_names=existingvariables)
    except KeyboardInterrupt:
        mat = {}
        raise
    except Exception:
        warnings.warn("irfpy.ica.io.loadmat: Unable to read data from "
                      + afile, stacklevel=4)
        mat = {}
    return mat
def loadh5(file, variable_names=None):
    r""" Loads the variables listed in "variable_names" from an h5 file
    into a dictionary. If variable_names is not specified, everything is
    loaded. Requires the optional deepdish package.
    """
    afile = os.path.expanduser(file)
    h5 = dict()
    try:
        if variable_names is not None:
            for vv in variable_names:
                try:
                    data = deepdish.io.load(afile, [os.sep + vv])
                    h5[vv] = data[0]
                except ValueError:
                    pass
                conditional_gc_collect()
        else:
            h5 = deepdish.io.load(afile)
    except KeyboardInterrupt:
        raise
    except Exception:
        warnings.warn("irfpy.ica.io.loadh5: Unable to read data from "
                      + afile, stacklevel=4)
    return h5
def savemat(file, data):
    r""" Save the dictionary data in v7.3 mat format to file, analogous
    to scipy.io.savemat::

        a = np.arange(10)
        b = np.arange(33)
        c = np.arange(22) ** 2
        filename = 'test.mat'
        m = dict()
        m['a'] = a
        m['b'] = b
        m['c'] = c
        irfpy.ica.io.savemat(filename, m)
    """
    afile = os.path.expanduser(str(file))
    hdf5storage.savemat(file_name=afile, mdict=data,
                        store_python_metadata=True,
                        matlab_compatible=True,
                        truncate_existing=True,
                        truncate_invalid_matlab=True)
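# Round-trip sketch for savemat()/loadmat(): write a dictionary to a v7.3
# .mat file and read selected variables back. The filename is illustrative.
def _example_mat_roundtrip():
    out = {'a': np.arange(10), 'b': np.arange(33) ** 2}
    savemat('test.mat', out)
    back = loadmat('test.mat', variable_names=['a', 'b'])
    return back.keys()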
def saveh5(file, data):
    r""" Save the dictionary data in hdf5 format using deepdish."""
    afile = os.path.expanduser(str(file))
    deepdish.io.save(afile, data, compression=('blosc', 5))
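# Equivalent round-trip sketch for the hdf5 pair saveh5()/loadh5(). Both need
# the optional deepdish package, so the availability flag set at import time
# is checked first; the filename is illustrative.
def _example_h5_roundtrip():
    if _HAVE_DEEPDISH:
        saveh5('test.h5', {'a': np.arange(10)})
        return loadh5('test.h5', variable_names=['a'])
    return {}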
# ---------------------
def _loaddata(file, variable_names, datafileextension):
    r""" Loads a data dictionary using the selected format."""
    if datafileextension == 'mat':
        data = loadmat(file, variable_names=variable_names)
    else:
        data = loadh5(file, variable_names=variable_names)
    return data
# ---------------------
def minonedim(a):
    r""" Returns an array with at least one dimension based on a."""
    if len(np.shape(a)) >= 1:
        return a  # is already ok
    return np.array([a])  # we need at least one dimension
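# Behaviour sketch for minonedim(): scalars gain one dimension, arrays pass
# through unchanged.
def _example_minonedim():
    assert minonedim(5.0).shape == (1,)           # scalar -> 1-d array
    assert minonedim(np.arange(3)).shape == (3,)  # already at least 1-d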
def flatstring(version):
    r""" Tries hard to make a string out of the argument."""
    return str(np.array(version).squeeze())
def flatstringlist(list_of_lists):
    r""" Tries hard to make a list of strings out of the argument."""
    strlist = list()
    try:
        if isinstance(list_of_lists, np.ndarray):
            # It was a cell array in matlab. Take it apart:
            list_of_lists_sq = np.squeeze(list_of_lists)
            for c in list_of_lists_sq:
                if type(c) == np.str_:
                    strlist.append(str(c))
                else:
                    strlist.append(str(c[0][0]))
        else:
            for c in list_of_lists:
                strlist.append(str(c))
    except Exception:
        strlist.append('Could not decode comment list for some reason :-( ')
    return strlist
# %%
_WANTVERSIONS = dict()
_HAVEVERSIONS = dict()
_VERBOSEVERSION = False
_ALREADYSHOWNVERSIONS = dict()
def expectedversion(proc=None, aux=None, bestc=None, flux=None,
                    lap=None, cops=None, mag=None, mip=None,
                    verbose=True, **kwargs):
    r""" Defines the data versions that the user would like to have.

    If the existing version is different, a warning is printed at the
    first occurrence, explaining the difference between the required
    version and the present version. If no version is specified then all
    versions are allowed.

    All parameters are strings of the form "xx.y" or floating point
    numbers, e.g. proc='2.3' means that version 2.3 is requested for proc.

    The version found in the data files is printed once. If verbose is
    set to False, this is suppressed.

    Usage::

        expectedversion(proc='2.3',lap='0.3')
    """
    global _WANTVERSIONS
    global _HAVEVERSIONS
    global _VERBOSEVERSION
    global _ALREADYSHOWNVERSIONS
    _WANTVERSIONS = dict()
    _HAVEVERSIONS = dict()
    _VERBOSEVERSION = verbose
    _ALREADYSHOWNVERSIONS = dict()
    if proc is not None:
        _WANTVERSIONS['proc'] = str(proc)
    if aux is not None:
        _WANTVERSIONS['aux'] = str(aux)
    if bestc is not None:
        _WANTVERSIONS['bestc'] = str(bestc)
    if flux is not None:
        _WANTVERSIONS['flux'] = str(flux)
    if lap is not None:
        _WANTVERSIONS['lap'] = str(lap)
    if cops is not None:
        _WANTVERSIONS['cops'] = str(cops)
    if mag is not None:
        _WANTVERSIONS['mag'] = str(mag)
    if mip is not None:
        _WANTVERSIONS['mip'] = str(mip)
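# Sketch of a typical session using expectedversion(): declare the versions
# you developed against before loading, so that differing files trigger the
# warning described above. Versions, path and date are illustrative.
def _example_expectedversion():
    expectedversion(proc='2.3', lap='0.3', verbose=True)
    return readproc('~/icadata', '20150211', variables=['time_instances'])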
def _checkdataversion(mat, mattype, versionkey, commentkey):
    r""" Called internally after loading a mat file to check whether the
    version is according to specifications.
    """
    versionstr = 'no version info available'
    if versionkey in mat:
        # remove also whitespace
        versionstr = ''.join(flatstring(mat[versionkey]).split())
    commentlist = list()
    if commentkey in mat:
        commentlist = flatstringlist(mat[commentkey])
    if _VERBOSEVERSION:
        if mattype + versionstr not in _ALREADYSHOWNVERSIONS:
            print(' +++ BEGIN INFO +++++++++++++++++++++++++++++++++++++++++++++++++++++')
            print(' Loading file of type: ' + mattype)
            print(' The version of this file is : ', versionstr)
            _ALREADYSHOWNVERSIONS[mattype + versionstr] = True
            print(' +++ END INFO +++++++++++++++++++++++++++++++++++++++++++++++++++++++')
    if versionkey == '':
        return
    if versionkey not in mat:
        return
    if mattype not in _HAVEVERSIONS:
        if mattype in _WANTVERSIONS:  # check if there is a requirement
            if _WANTVERSIONS[mattype] < versionstr:
                print()
                print('*** BEGIN WARNING ***************************************************')
                print('You expected an earlier version of the ' + mattype + ' data to be present:')
                print('Desired version was : ' + _WANTVERSIONS[mattype])
                print('But the currently loading version is : ' + versionstr)
                print('Please review the differences between the two versions:')
                if len(commentlist) == 0:
                    print('    No version history available to show')
                for c in commentlist:
                    lval = re.sub(r'[^0123456789\.]', '',
                                  flatstring(c).split(':', maxsplit=1)[0])
                    if float(lval) > float(_WANTVERSIONS[mattype]):
                        print('    ' + flatstring(c))
                print('    ---')
                print('*** END WARNING *****************************************************')
                print()
    else:
        if _HAVEVERSIONS[mattype] != versionstr:
            print()
            print('*** BEGIN WARNING ***************************************************')
            print('You are loading different versions of ' + mattype + ' data at the same time:')
            print('Previously loaded version was ' + _HAVEVERSIONS[mattype])
            print('Currently loading version is ' + versionstr)
            print('This may give unpredictable results.')
            print('*** END WARNING *****************************************************')
            print()
    _HAVEVERSIONS[mattype] = versionstr


def _getversionkey(mattype):
    r""" Returns the name of the version key associated with a certain
    mattype, e.g. mattype=='proc' returns 'ica_processed_data_version'.
    """
    if mattype == 'proc':
        return 'ica_processed_data_version'
    return mattype + '_processed_data_version'  # default


def _getcommentkey(mattype):
    r""" Returns the name of the comment key associated with a certain
    mattype, e.g. mattype=='proc' returns 'ica_processed_data_comment'.
    """
    if mattype == 'proc':
        return 'ica_processed_data_comment'
    return mattype + '_processed_data_comment'  # default


# %%
def _filtermatfile(m, filters, partialelevationscan, verbose=False,
                   time_instances='time_instances', varshapes=None):
    r""" Removes time_instances according to the specifications given in
    the filters dictionary. Manages level1, level2 and level3 variables.
    """
    if not filters:
        return m
    if not m:
        return m
    if time_instances not in m:
        if verbose:
            print("    Warning: '" + time_instances + "' was not present "
                  "in the loaded data. Can not apply filters.")
        return m

    # initialize with True
    oktimes = np.ones((m[time_instances].shape[-1],), dtype=bool)
    for filterkey in filters:
        if filterkey not in m:
            if verbose:
                print("    Warning: The filter key '" + filterkey +
                      "' was not present in the loaded data. "
                      "This filter is therefore ignored.")
        else:
            # we have the filterkey loaded; check that it is an applicable
            # key. Remove the matlab extra dimension for 1D variables prior
            # to making filter calculations.
            if filterkey in varshapes['1d']:
                m[filterkey] = minonedim(np.squeeze(m[filterkey]))
            else:
                m[filterkey] = np.squeeze(m[filterkey])
            # apply filter
            if filterkey in varshapes['1d'] + varshapes['2d'] + \
                    varshapes['3d'] + varshapes['4d']:
                oktimes[m[filterkey] != filters[filterkey]] = False  # deselect
                if verbose > 1:
                    print('    filtering using ' + filterkey)
            else:
                if verbose:
                    print("    Warning: filter key '" + filterkey +
                          "' is not a valid filter, it was therefore ignored.")

    if not partialelevationscan:
        # make sure that if an elevation scan contains an error, the whole
        # elevation scan is removed.
        for block in range(len(oktimes) // 16):
            # walk through the error mask in steps of 16
            if not np.all(oktimes[block * 16:(block + 1) * 16]):
                oktimes[block * 16:(block + 1) * 16] = False

    for key in m.keys():
        # prior to filtering, remove the matlab extra dimension for 1D
        # variables.
        if key in varshapes['1d']:
            m[key] = minonedim(np.squeeze(m[key]))
        # now filter:
        theshape = np.shape(m[key])
        if len(theshape) == 1:    # 1D variables containing time series
            if key in varshapes['1d']:
                m[key] = m[key][oktimes]
        elif len(theshape) == 2:  # 2D variables
            if key in varshapes['2d']:
                m[key] = m[key][:, oktimes]
        elif len(theshape) == 3:  # 3D variables
            if key in varshapes['3d']:
                m[key] = m[key][:, :, oktimes]
        elif len(theshape) == 4:  # 4D variables
            if key in varshapes['4d']:
                m[key] = m[key][:, :, :, oktimes]
    conditional_gc_collect()
    return m


# %% Generic load function
def loadlevelN(datarootpath, fileprefix='bestc', from_day=None, to_day='',
               variables=None, verbose=False, flat=False,
               partialelevationscan=False, dataformat='mat', branch=None,
               time_instances='time_instances', filters=None,
               varshapes=None):
    r""" Loads one day or more worth of pipeline data, analogous to
    scipy.io.loadmat('xyz.mat') but assembling all hourly files
    corresponding to that day.

    PARAMETERS

    datarootpath (string): local path to the 'root' of the data tree.
        This data tree should contain the file(s) in subdirectories
        yyyy/mm/dd/ as it is done in the pipeline.

    fileprefix (string): Data files start with this prefix, e.g. 'bestc'.

    from_day (string or datetime.datetime): string is of the form
        '20150115T1011'

    to_day (string or datetime.datetime): optional, string is of the form
        '20150116T1112'. If given, data to the end of the given interval
        is loaded, otherwise one day.

    variables (list of strings): a list of variable names to load.
        Default is to load all variables. If the loaded variables include
        'time_instances' then these are automatically converted to python
        datetime objects.

    flat (boolean): If set to True and no files were found, the load
        function will also attempt to load files located at the root of
        the data tree (without the yyyy/mm/dd/ subdirectories).

    partialelevationscan (boolean): Optional parameter, default False.
        If set to False the loading function will only load complete
        elevation scans. If set to True, the nearest time_instances will
        be loaded regardless of where an elevation scan starts.

    verbose (boolean): Optional parameter, if set to True the loading
        function will print the names of the files it loads for debugging
        purposes.

    dataformat (string): One of 'mat' (default) or 'h5'.

    time_instances (str): Name of the time_instances variable
        (default: 'time_instances').

    branch (str): Directory below datarootpath where the data is stored,
        e.g. 'level1'.

    filters (dict): Filter dictionary.

    varshapes (dict): data structure describing the dimensionality and the
        special data types of the variables to be read. Use the predefined
        defaults, e.g. default_varshapes1 for level1 variable shapes.

    OUTPUT

    Returns a dictionary with all loaded variables. The dictionary is
    empty if no matching data could be loaded.

    Use like this (mirroring what readbestc() does internally)::

        matfile=irfpy.ica.io.loadlevelN(icapath,'bestc','20150211',
                                        branch='level2',
                                        varshapes=default_varshapes2)
        matfile=irfpy.ica.io.loadlevelN(icapath,'bestc','20150211','20150213',
                                        variables=['time_instances','E'],
                                        branch='level2',
                                        varshapes=default_varshapes2)
    """
    # do some sanity checking
    if variables is None:
        variables = []
    if filters is None:
        filters = {}
    if not isinstance(datarootpath, str):
        raise ValueError(
            "irfpy.ica.io.loadlevelN: The parameter datarootpath must be of <class 'str'> "
            + "(a string) but is now " + str(type(datarootpath)))
    if not isinstance(fileprefix, str):
        raise ValueError(
            "irfpy.ica.io.loadlevelN: The parameter fileprefix must be of <class 'str'> "
            + "(a string) but is now " + str(type(fileprefix)))
    if not isinstance(from_day, (str, dt.datetime)):
        raise ValueError(
            "irfpy.ica.io.loadlevelN: The parameter from_day must be of <class 'str'> "
            + "(a string) or of <class 'datetime.datetime'> but is now "
            + str(type(from_day)))
    if not isinstance(to_day, (str, dt.datetime)):
        raise ValueError(
            "irfpy.ica.io.loadlevelN: The parameter to_day must be of <class 'str'> "
            + "(a string) or of <class 'datetime.datetime'> but is now "
            + str(type(to_day)))
    if not isinstance(variables, (list, np.ndarray)):
        raise ValueError(
            "irfpy.ica.io.loadlevelN: The parameter variables must be of <class 'list'> "
            + "(a list) or <class 'numpy.ndarray'> (a numpy array) but is now "
            + str(type(variables)))
    if varshapes is None:
        raise ValueError(
            "irfpy.ica.io.loadlevelN: The parameter varshapes must be of <class 'dict'> "
            + "(a dictionary) but is now None. Add a varshapes=default_varshapes1, ..2 or ..3 parameter")

    def getvarshapes(varshapes, akey, default=None):
        levelNshape = default
        if akey in varshapes:
            levelNshape = varshapes[akey]
        if not isinstance(levelNshape, list):
            raise ValueError(
                "irfpy.ica.io.loadlevelN: The value of varshapes['{:s}'] must be of <class 'list'> (a list).".format(akey))
        return levelNshape

    levelNshape1vars = getvarshapes(varshapes, '1d')
    levelNshape2vars = getvarshapes(varshapes, '2d')
    levelNshape3vars = getvarshapes(varshapes, '3d')
    levelNshape4vars = getvarshapes(varshapes, '4d')
    levelNdatetimevars = getvarshapes(varshapes, 'datetime')
    levelNint32 = getvarshapes(varshapes, 'int32')
    levelNfloat32 = getvarshapes(varshapes, 'float32')
    levelNstringlist = getvarshapes(varshapes, 'stringlist', default=list())
    levelNunicodestring = getvarshapes(varshapes, 'unicodestring', default=list())
    levelNblacklist = getvarshapes(varshapes, 'blacklist', default=_blacklistvars)

    if branch is None:
        raise ValueError(
            "irfpy.ica.io.loadlevelN: The parameter branch must be of <class 'str'> "
            + "(a string) but is now None.")

    if isinstance(from_day, dt.datetime):
        from_day = icatools.datetime2string(from_day)
    if isinstance(to_day, dt.datetime):
        to_day = icatools.datetime2string(to_day)

    datarootpath = os.path.expanduser(datarootpath)
    if datarootpath[-1] != os.sep:
        tp = datarootpath + os.sep
    else:
        tp = datarootpath
    checkdirectory(datarootpath)
    subdir, extension = icatools.get_data_path_info(branch, dataformat)
    tp = tp + subdir

    mat = defaultdict(list)  # a dictionary with lists as elements

    # If year, month or day of from_day and to_day are identical,
    # limit the search pattern for glob accordingly.
    myyear = '*'
    if len(from_day) >= 4:
        if len(to_day) == 0:
            myyear = from_day[0:4]
        elif from_day[0:4] == to_day[0:4]:
            myyear = from_day[0:4]
    mymonth = '*'
    if len(from_day) >= 6:
        if len(to_day) == 0:
            mymonth = from_day[4:6]
        elif from_day[0:6] == to_day[0:6]:
            mymonth = from_day[4:6]
    myday = '*'
    if len(from_day) >= 8:
        if len(to_day) == 0:
            myday = from_day[6:8]
        elif from_day[0:8] == to_day[0:8]:
            myday = from_day[6:8]

    if to_day == '':  # handle the default of only one day
        files = np.array(glob.glob(tp + myyear + os.sep + mymonth + os.sep +
                                   myday + os.sep + fileprefix +
                                   from_day[0:11] + '*' + extension))
        if len(files) == 0:  # nothing found: assume a flat directory tree
            if flat:
                files = np.array(glob.glob(tp + fileprefix +
                                           from_day[0:11] + '*' + extension))
            if len(files) == 0:  # still nothing found
                return {}
        files.sort()
        npfiles = np.array(files)
        usethese = np.array([npfiles > ''])
    else:  # several days: get all candidates
        files = np.array(glob.glob(tp + myyear + os.sep + mymonth + os.sep +
                                   myday + os.sep + fileprefix + '*' + extension))
        if len(files) == 0:  # nothing found: assume a flat directory tree
            if flat:
                files = np.array(glob.glob(tp + fileprefix + from_day +
                                           '*' + extension))
            if len(files) == 0:  # still nothing found
                return {}
        files.sort()
        npfiles = np.array(files)
        filesshort = np.array([os.path.split(f)[1] for f in npfiles])
        # no extension on fromfil here because of the sorting order
        fromfil = fileprefix + from_day[0:8].ljust(8, '0') + 'T' + \
            from_day[9:11].ljust(6, '0')
        tofil = fileprefix + to_day[0:8].ljust(8, '9') + 'T' + \
            to_day[9:11].ljust(6, '9') + '.' + extension
        usethese = np.logical_and([filesshort >= fromfil],
                                  [filesshort <= tofil])

    versionkey = _getversionkey(fileprefix)
    commentkey = _getcommentkey(fileprefix)

    # usethese may have shape (1, n) but we want (n,). np.squeeze() is not
    # used because a (1, 1) shape would then give a scalar.
    if len(usethese.shape) == 2:
        usethese = usethese[0, :]

    if variables == []:
        variablelist = None
    else:
        variablelist = variables.copy()
        if time_instances not in variablelist:
            variablelist.append(time_instances)
        if len(versionkey) > 0 and versionkey not in variablelist:
            variablelist.append(versionkey)
        if len(commentkey) > 0 and commentkey not in variablelist:
            variablelist.append(commentkey)
        # make sure the required filter values are loaded
        for key in filters.keys():
            if key not in variablelist:
                variablelist.append(key)
        if 'sum_orig_ions' in variablelist:
            if verbose:
                print('    Warning: "sum_orig_ions" is calculated on an '
                      'hourly basis and not according to the "from_day" and '
                      '"to_day" parameters of loadlevel1() or readproc()')

    if time_instances != 'time_instances':
        if verbose:
            print('    Using the variable "' + time_instances +
                  '" to select time intervals')

    for file in npfiles[usethese]:
        if verbose:
            print('    loading: ' + file)
        matfile = _loaddata(file, variable_names=variablelist,
                            datafileextension=extension)
        # Check whether the data version is acceptable
        _checkdataversion(matfile, fileprefix, versionkey, commentkey)
        # Remove values that do not match the filters dictionary. Done here
        # to avoid handling large amounts of unwanted data.
        matfile = _filtermatfile(matfile, filters, partialelevationscan,
                                 verbose=verbose,
                                 time_instances=time_instances,
                                 varshapes=varshapes)
        for key in matfile:
            if '__' in key:
                continue
            if key in levelNblacklist:  # the black list
                continue
            if (variables == []) or (key in variablelist):
                if key not in mat:
                    if verbose:
                        print('    adding key: ' + key)
                    # a new key not seen before
                    if np.shape(matfile[key]) == ():
                        mat[key].append(matfile[key])  # scalar
                    else:
                        # handle 2D vars from matlab that are actually 1D
                        if key in levelNshape1vars:
                            mat[key].append(minonedim(np.squeeze(matfile[key])))
                        else:
                            mat[key].append(np.squeeze(matfile[key]))
                else:
                    # something we already have
                    theshape = np.shape(mat[key][0])
                    if len(theshape) == 1:
                        # 1D variables that need to be concatenated
                        if key in levelNshape1vars:
                            mat[key].append(minonedim(np.squeeze(matfile[key])))
                    elif len(theshape) == 2:
                        # 2D variables that need to be concatenated
                        if key in levelNshape2vars:
                            mat[key].append(matfile[key])
                        elif key in ['sum_orig_ions']:
                            # this one needs to be summed up
                            mat[key][0] = np.add(mat[key][0], matfile[key])
                    elif len(theshape) == 3:
                        # 3D variables that need to be concatenated
                        if key in levelNshape3vars:
                            mat[key].append(matfile[key])
                    elif len(theshape) == 4:
                        # 4D variables that need to be concatenated
                        if key in levelNshape4vars:
                            mat[key].append(matfile[key])
                # force data types where needed. Do this before concatenating
                # to minimize the memory footprint.
                if key in levelNdatetimevars:
                    if mat[key][-1].dtype != 'O':  # object
                        mat[key][-1] = icatools.matlab2datetime(mat[key][-1])
                if key in levelNfloat32:
                    if mat[key][-1].dtype != 'float32':
                        mat[key][-1] = mat[key][-1].astype('float32')
                if key in levelNint32:
                    if mat[key][-1].dtype != 'int32':
                        mat[key][-1] = mat[key][-1].astype('int32')
                if key in levelNstringlist:
                    # this is mainly for the comment field
                    mat[key][-1] = flatstringlist(mat[key][-1])
                elif key.endswith('processed_data_comment'):
                    mat[key][-1] = flatstringlist(mat[key][-1])
                if key in levelNunicodestring:
                    # this is mainly for the processing_level field
                    mat[key][-1] = mat[key][-1].decode()
        conditional_gc_collect()

    # generate masks for the first and the last file to make an exact match
    # of the requested time interval
    tmaskfirst = []
    tmasklast = []
    if partialelevationscan:
        boundary = 1
    else:
        boundary = 16  # default is to load only full elevation scans
    if time_instances in mat.keys():
        # modify the first and the last element of mat[key] to fit the
        # exact time interval
        tmaskfirst = icatools.selecttime(
            mat[time_instances][0], from_day, to_day, mod_boundary=boundary)
        if len(mat[time_instances][:]) > 1:
            tmasklast = icatools.selecttime(
                mat[time_instances][-1], from_day, to_day,
                mod_boundary=boundary)

    allmat = {}  # the final result
    # concatenate the lists
    for key in mat.keys():
        # take the shape from the first element
        theshape = np.shape(mat[key][0])
        if len(theshape) == 1:
            # 1D variables that need to be concatenated
            if key in levelNshape1vars:
                if len(tmaskfirst) > 0:
                    mat[key][0] = mat[key][0][tmaskfirst]
                if len(tmasklast) > 0:
                    mat[key][-1] = mat[key][-1][tmasklast]
                allmat[key] = np.concatenate(mat[key], axis=0)
                continue
        elif len(theshape) == 2:
            # 2D variables that need to be concatenated
            if key in levelNshape2vars:
                if len(tmaskfirst) > 0:
                    mat[key][0] = mat[key][0][:, tmaskfirst]
                if len(tmasklast) > 0:
                    mat[key][-1] = mat[key][-1][:, tmasklast]
                allmat[key] = np.concatenate(mat[key], axis=1)
                continue
        elif len(theshape) == 3:
            # 3D variables that need to be concatenated
            if key in levelNshape3vars:
                if len(tmaskfirst) > 0:
                    mat[key][0] = mat[key][0][:, :, tmaskfirst]
                if len(tmasklast) > 0:
                    mat[key][-1] = mat[key][-1][:, :, tmasklast]
                allmat[key] = np.concatenate(mat[key], axis=2)
                continue
        elif len(theshape) == 4:
            # 4D variables that need to be concatenated
            if key in levelNshape4vars:
                if len(tmaskfirst) > 0:
                    mat[key][0] = mat[key][0][:, :, :, tmaskfirst]
                if len(tmasklast) > 0:
                    mat[key][-1] = mat[key][-1][:, :, :, tmasklast]
                allmat[key] = np.concatenate(mat[key], axis=3)
                continue
        # if no match with anything, just copy the first element
        allmat[key] = mat[key][0]
    conditional_gc_collect()
    return allmat
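# Direct use of the generic loader, mirroring what readbestc() does
# internally; branch and varshapes must always be supplied. Path and dates
# are illustrative.
def _example_loadlevelN_direct():
    return loadlevelN('~/icadata', fileprefix='bestc',
                      from_day='20150211', to_day='20150213',
                      variables=['time_instances', 'best_ionspectra'],
                      branch='level2', varshapes=default_varshapes2)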
# %% Loadlevel2 backward compatibility
def loadlevel2(datarootpath, mattype='bestc', from_day=None, to_day='',
               variables=None, verbose=False, flat=False,
               partialelevationscan=False, dataformat='mat', filters=None):
    r""" Loads one day or more worth of level2 data, analogous to
    scipy.io.loadmat('xyz.mat') but assembling all hourly files
    corresponding to that day.

    PARAMETERS

    datarootpath (string): local path to the 'root' of the level2 data
        tree. This data tree should contain the file(s) in subdirectories
        yyyy/mm/dd/ as it is done in the pipeline.

    mattype (string): one of 'bestc'

    from_day (string or datetime.datetime): string is of the form
        '20150115T1011'

    to_day (string or datetime.datetime): optional, string is of the form
        '20150116T1112'. If given, data to the end of the given interval
        is loaded, otherwise one day.

    variables (list of strings): a list of variable names to load.
        Default is to load all variables. If the loaded variables include
        'time_instances' then these are automatically converted to python
        datetime objects.

    flat (boolean): If set to True and no files were found, the load
        function will also attempt to load files located at the root of
        the data tree (without the yyyy/mm/dd/ subdirectories).

    partialelevationscan (boolean): Optional parameter, default False.
        If set to False the loading function will only load complete
        elevation scans. If set to True, the nearest time_instances will
        be loaded regardless of where an elevation scan starts.

    verbose (boolean): Optional parameter, if set to True the loading
        function will print the names of the files it loads for debugging
        purposes.

    dataformat (string): One of 'mat' (default) or 'h5'.

    OUTPUT

    Returns a dictionary with all loaded variables. The dictionary is
    empty if no matching data could be loaded.

    Use like this::

        matfile=irfpy.ica.io.loadlevel2(icapath,'bestc','20150211')
        matfile=irfpy.ica.io.loadlevel2(icapath,'bestc','20150211',
                                        variables=['time_instances','E'])
        matfile=irfpy.ica.io.loadlevel2(icapath,'bestc','20150211','20150213',
                                        variables=['time_instances','E'])
    """
    return loadlevelN(datarootpath, fileprefix=mattype, from_day=from_day,
                      to_day=to_day, variables=variables, verbose=verbose,
                      flat=flat, partialelevationscan=partialelevationscan,
                      dataformat=dataformat, time_instances='time_instances',
                      branch='level2', filters=filters,
                      varshapes=default_varshapes2)
# %% loadlevel1 for backward compatibility
def loadlevel1(datarootpath, mattype='proc', from_day=None, to_day='',
               variables=None, verbose=False, flat=False,
               partialelevationscan=False, dataformat='mat', filters=None):
    r""" Loads one day or more worth of level1 data, analogous to
    scipy.io.loadmat('xyz.mat') but assembling all hourly files
    corresponding to that day.

    Consider using one of the readproc(), readaux(), read...() functions
    instead.

    PARAMETERS

    datarootpath (string): local path to the 'root' of the level1 data
        tree. This data tree should contain the file(s) in subdirectories
        yyyy/mm/dd/ as it is done in the pipeline.

    mattype (string): one of 'proc', 'special' or 'aux'

    from_day (string or datetime.datetime): string is of the form
        '20150115T1011'

    to_day (string or datetime.datetime): optional, string is of the form
        '20150116T1112'. If given, data to the end of the given interval
        is loaded, otherwise one day.

    variables (list of strings): a list of variable names to load.
        Default is to load all variables. If the loaded variables include
        'time_instances' then these are automatically converted to python
        datetime objects.

    flat (boolean): If set to True and no files were found, the load
        function will also attempt to load files located at the root of
        the data tree (without the yyyy/mm/dd/ subdirectories).

    partialelevationscan (boolean): Optional parameter, default False.
        If set to False the loading function will only load complete
        elevation scans. If set to True, the nearest time_instances will
        be loaded regardless of where an elevation scan starts.

    verbose (boolean): Optional parameter, if set to True the loading
        function will print the names of the files it loads for debugging
        purposes.

    dataformat (string): One of 'mat' (default) or 'h5'.

    OUTPUT

    Returns a dictionary with all loaded variables. The dictionary is
    empty if no matching data could be loaded.

    Use like this::

        icapath = '/home/user/icadata'
        matfile=irfpy.ica.io.loadlevel1(icapath,'proc','20150211')
        matfile=irfpy.ica.io.loadlevel1(icapath,'proc','20150211',
                                        variables=['time_instances','E'])
        matfile=irfpy.ica.io.loadlevel1(icapath,'proc','20150211','20150213',
                                        variables=['time_instances','E'])
        matfile=irfpy.ica.io.loadlevel1(icapath,'aux',theinterval,
                                        variables=['sp_cso','time_instances'])
    """
    return loadlevelN(datarootpath, fileprefix=mattype, from_day=from_day,
                      to_day=to_day, variables=variables, verbose=verbose,
                      flat=flat, partialelevationscan=partialelevationscan,
                      dataformat=dataformat, time_instances='time_instances',
                      branch='level1', filters=filters,
                      varshapes=default_varshapes1)
[docs]def loadlevel1oktime(datarootpath, mattype, from_day, to_day='',
                     variables=None, verbose=False, flat=False,
                     partialelevationscan=False, dataformat='mat',
                     filters=None):
    r"""
    Loads one day or more worth of level1 data analog to
    irfpy.ica.io.loadmat('xyz.mat') but with assembling all hourly files
    corresponding to that day. Only data with times where
    time_error_flag == 0 is loaded.

    Consider using one of the readproc(), readaux(), read...() functions
    instead.

    PARAMETERS

    datarootpath (string): local path to the 'root' of the level1 data tree
        This data tree should contain the file(s) in subdirectories
        yyyy/mm/dd/ as it is done in the pipeline.

    mattype (string): filetype to be loaded, one of 'proc' or 'aux'

    from_day (string or datetime.datetime): string is of the form '20150115T1011'

    to_day (string or datetime.datetime): optional, string is of the form
        '20150116T1112'. If given, data to the end of the given interval is
        loaded, otherwise one day.

    variables (list of strings): a list of variable names to load. Default is
        to load all variables. If the loaded variables include
        'time_instances' then these are automatically converted to python
        datetime objects.

    flat (boolean): If flat is set to True, the load function will, if no
        files were found, also attempt to load files which are at the root
        of the data tree (without the yyyy/mm/dd/ subdirectories).

    partialelevationscan (boolean): Optional parameter, default False.
        If set to False the loading function will only load complete
        elevation scans. If set to True, the nearest time_instances will be
        loaded independent of where an elevation scan starts.

    verbose (boolean): Prints what the function does.

    dataformat (string): One of: 'mat' (default) or 'h5'.

    This function is identical to calling loadlevel1 with a filters
    parameter::

        loadlevel1(datarootpath, ..., filters={'time_error_flag': 0})

    OUTPUT

    Returns a dictionary with all loaded variables.

    Limitation: If this function is used to load an aux file, the aux file
    must be version 1.9 or later. This function can not be used to load
    special files.
    """
    if variables is None:
        variables = []
    if filters is None:
        filters = {}
    filters['time_error_flag'] = 0
    m = loadlevelN(datarootpath, fileprefix=mattype,
                   from_day=from_day, to_day=to_day,
                   variables=variables, verbose=verbose, flat=flat,
                   partialelevationscan=partialelevationscan,
                   dataformat=dataformat,
                   time_instances='time_instances',
                   branch='level1', filters=filters,
                   varshapes=default_varshapes1)
    if 'time_error_flag' not in m:
        if verbose:
            print('irfpy.ica.io.loadlevel1oktime() :')
            print('   This function requires access to the time_error_flag which ' +
                  'should be in both aux and proc.')
            print('   Also verify that the dataformat is correct and you have the ' +
                  'corresponding data files.')
            print('   Valid datatypes are "mat" and "h5". You are trying to load "' +
                  dataformat + '"-files.')
        return {}
    conditional_gc_collect()
    return m
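# %% The docstring above states that loadlevel1oktime() is identical to
# loadlevel1() with filters={'time_error_flag': 0}. A minimal sketch of
# that equivalence (the data path is an assumption for the example):
def _example_oktime_equivalence(icapath='~/icadata'):
    """Both calls should return the same error-free time instances."""
    a = loadlevel1oktime(icapath, 'proc', '20150211',
                         variables=['time_instances'])
    b = loadlevel1(icapath, 'proc', '20150211',
                   variables=['time_instances'],
                   filters={'time_error_flag': 0})
    return a, b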
[docs]def loadlevel1okall(datarootpath, mattype='proc', from_day=None, to_day='',
                    variables=None, verbose=False, flat=False,
                    partialelevationscan=False, dataformat='mat',
                    filters=None):
    r"""
    Loads one day or more worth of level1 data analog to
    irfpy.ica.io.loadmat('xyz.mat') but with assembling all hourly files
    corresponding to that day. Only data with times where error_flags == 0
    is loaded.

    Consider using one of the readproc(), readaux(), read...() functions
    instead.

    PARAMETERS

    datarootpath (string): local path to the 'root' of the level1 data tree
        This data tree should contain the file(s) in subdirectories
        yyyy/mm/dd/ as it is done in the pipeline.

    mattype (string): filetype to be loaded, one of 'proc' or 'aux'

    from_day (string or datetime.datetime): string is of the form '20150115T1011'

    to_day (string or datetime.datetime): optional, string is of the form
        '20150116T1112'. If given, data to the end of the given interval is
        loaded, otherwise one day.

    variables (list of strings): a list of variable names to load. Default is
        to load all variables. If the loaded variables include
        'time_instances' then these are automatically converted to python
        datetime objects.

    flat (boolean): If flat is set to True, the load function will, if no
        files were found, also attempt to load files which are at the root
        of the data tree (without the yyyy/mm/dd/ subdirectories).

    partialelevationscan (boolean): Optional parameter, default False.
        If set to False the loading function will only load complete
        elevation scans. If set to True, the nearest time_instances will be
        loaded independent of where an elevation scan starts.

    verbose (boolean): Prints what the function does.

    dataformat (string): One of: 'mat' (default) or 'h5'.

    This function is identical to calling loadlevel1 with a filters
    parameter::

        loadlevel1(datarootpath, ..., filters={'error_flags': 0})

    OUTPUT

    Returns a dictionary with all loaded variables.
    """
    if variables is None:
        variables = []
    if filters is None:
        filters = {}
    filters['error_flags'] = 0
    m = loadlevelN(datarootpath, fileprefix=mattype,
                   from_day=from_day, to_day=to_day,
                   variables=variables, verbose=verbose, flat=flat,
                   partialelevationscan=partialelevationscan,
                   dataformat=dataformat,
                   time_instances='time_instances',
                   branch='level1', filters=filters,
                   varshapes=default_varshapes1)
    if 'error_flags' not in m:
        if verbose:
            print('irfpy.ica.io.loadlevel1okall() : error_flags not found while loading ' +
                  mattype)
            print('   This function requires access to error_flags,')
            print('   which should be in the same file that is being loaded.')
            print('   Also verify that the dataformat is correct and you have')
            print('   the corresponding data files.')
            print('   Valid datatypes are "mat" and "h5". You are trying to load "' +
                  dataformat + '"-files.')
        return {}
    conditional_gc_collect()
    return m
[docs]def loadlevel1filter(datarootpath, mattype='proc', from_day=None, to_day='',
                     variables=None, verbose=False, flat=False,
                     partialelevationscan=False, dataformat='mat',
                     time_error_flag=None, decoder_error_flag=None,
                     fmt_error_flag=None, edf_error_flag=None,
                     error_flags=None, mode=None, sw_version=None,
                     cur_pacc=None):
    r"""
    Loads one day or more worth of level1 data analog to
    irfpy.ica.io.loadmat('xyz.mat') but with assembling all hourly files
    corresponding to that day. Only data with times where the filter
    criteria apply are loaded.

    PARAMETERS

    datarootpath (string): local path to the 'root' of the level1 data tree
        This data tree should contain the file(s) in subdirectories
        yyyy/mm/dd/ as it is done in the pipeline.

    mattype (string): filetype to be loaded, one of 'proc' or 'aux'

    from_day (string or datetime.datetime): string is of the form '20150115T1011'

    to_day (string or datetime.datetime): optional, string is of the form
        '20150116T1112'. If given, data to the end of the given interval is
        loaded, otherwise one day.

    variables (list of strings): a list of variable names to load. Default is
        to load all variables. If the loaded variables include
        'time_instances' then these are automatically converted to python
        datetime objects.

    The following optional parameters select the data loaded; only time
    instances where the variable equals the given value are loaded (the
    values shown are typical examples):

        time_error_flag (int) : 0
        decoder_error_flag (int) : 0
        fmt_error_flag (int) : 0
        edf_error_flag (int) : 0
        error_flags (int) : 0
        mode (int) : 0
        sw_version (int) : 6
        cur_pacc (int) : 6

    flat (boolean): If flat is set to True, the load function will, if no
        files were found, also attempt to load files which are at the root
        of the data tree (without the yyyy/mm/dd/ subdirectories).

    partialelevationscan (boolean): Optional parameter, default False.
        If set to False the loading function will only load complete
        elevation scans. If set to True, the nearest time_instances will be
        loaded independent of where an elevation scan starts.

    verbose (boolean): Prints what the function does.

    dataformat (string): One of: 'mat' (default) or 'h5'.

    OUTPUT

    Returns a dictionary with all loaded variables.
    """
    if variables is None:
        variables = []

    filters = {}  # create a filter list
    if error_flags is not None:
        filters['error_flags'] = error_flags
    if time_error_flag is not None:
        filters['time_error_flag'] = time_error_flag
    if decoder_error_flag is not None:
        filters['decoder_error_flag'] = decoder_error_flag
    if edf_error_flag is not None:
        filters['edf_error_flag'] = edf_error_flag
    if fmt_error_flag is not None:
        filters['fmt_error_flag'] = fmt_error_flag
    if mode is not None:
        filters['mode'] = mode
    if sw_version is not None:
        filters['sw_version'] = sw_version
    if cur_pacc is not None:
        filters['cur_pacc'] = cur_pacc

    m = loadlevelN(datarootpath, fileprefix=mattype,
                   from_day=from_day, to_day=to_day,
                   variables=variables, verbose=verbose, flat=flat,
                   partialelevationscan=partialelevationscan,
                   dataformat=dataformat,
                   time_instances='time_instances',
                   branch='level1', filters=filters,
                   varshapes=default_varshapes1)
    conditional_gc_collect()
    return m
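# %% Usage sketch: loadlevel1filter() builds the filter dict from its
# keyword arguments, so several criteria can be combined; only time
# instances matching all of them are loaded. The data path and the chosen
# values are assumptions for the example (the docstring lists typical values):
def _example_loadlevel1filter(icapath='~/icadata'):
    """Load proc data restricted to error-free times from software version 6."""
    return loadlevel1filter(icapath, 'proc', '20150211',
                            variables=['time_instances', 'E'],
                            time_error_flag=0, error_flags=0,
                            sw_version=6)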
# %% Level 0 is still treated specially.
[docs]def loadlevel0(datarootpath, from_day, to_day='', variables=None,
               verbose=False, flat=False, dataformat='mat', **kwargs):
    r"""
    Loads one hour or more worth of level0 data analog to
    scipy.io.loadmat('xyz.mat') but with assembling all hourly files
    corresponding to the time interval specified.

    Consider using the readraw() function instead.

    AUTHOR: Gabriella Stenberg Wieser

    PARAMETERS

    datarootpath (string): local path to the 'root' of the level0 data tree
        This data tree should contain the file(s) in subdirectories
        yyyy/mm/dd/ as it is done in the pipeline.

    from_day (string or datetime.datetime): string is of the form '20150115T1011'

    to_day (string or datetime.datetime): optional, string is of the form
        '20150116T1112'. If given, data to the end of the given interval is
        loaded, otherwise one day.

    variables (list of strings): a list of variable names to load. Default is
        to load all variables. If the loaded variables include
        'time_instances' then these are automatically converted to python
        datetime objects.

    flat (boolean): If flat is set to True, the load function will, if no
        files were found, also attempt to load files which are at the root
        of the data tree (without the yyyy/mm/dd/ subdirectories).

    dataformat (string): One of: 'mat' (default) or 'h5'.

    OUTPUT

    returns a dictionary with all loaded variables

    EXAMPLES::

        matfile=ica.io.loadlevel0(icapath,'20150211')
        matfile=ica.io.loadlevel0(icapath,'20150211',
                                  variables=['iontime','ionspectra'])
        matfile=ica.io.loadlevel0(icapath,'20150211','20150213',
                                  variables=['time_interval','E'])
    """
    # do some sanity checking
    if variables is None:
        variables = []
    if not isinstance(datarootpath, str):
        raise ValueError(
            "irfpy.ica.io.loadlevel0: The parameter datarootpath must be of " +
            "<class 'str'> (a string) but is now " + str(type(datarootpath)))
    if not isinstance(from_day, (str, dt.datetime)):
        raise ValueError(
            "irfpy.ica.io.loadlevel0: The parameter from_day must be of " +
            "<class 'str'> (a string) or of <class 'datetime.datetime'> but is now " +
            str(type(from_day)))
    if not isinstance(to_day, (str, dt.datetime)):
        raise ValueError(
            "irfpy.ica.io.loadlevel0: The parameter to_day must be of " +
            "<class 'str'> (a string) or of <class 'datetime.datetime'> but is now " +
            str(type(to_day)))
    if not isinstance(variables, (list, np.ndarray)):
        raise ValueError(
            "irfpy.ica.io.loadlevel0: The parameter variables must be of " +
            "<class 'list'> (a list) or <class 'numpy.ndarray'> (a numpy array) " +
            "but is now " + str(type(variables)))

    if isinstance(from_day, dt.datetime):
        from_day = icatools.datetime2string(from_day)
    if isinstance(to_day, dt.datetime):
        to_day = icatools.datetime2string(to_day)

    if datarootpath[-1] != os.sep:
        tp = datarootpath + os.sep
    else:
        tp = datarootpath

    checkdirectory(datarootpath)

    # level0 has its own subdirectory and file extension in the data tree
    subdir, extension = icatools.get_data_path_info('level0', dataformat)
    tp = tp + subdir

    mat = defaultdict(list)  # a dictionary with lists as elements
    mattype = 'RPC_ICA_'

    if to_day == '':
        # handle default: only one day
        files = np.array(glob.glob(tp + '*' + os.sep + '*' + os.sep + '*' + os.sep +
                                   mattype + from_day[0:11] + '*' + extension))
        if len(files) == 0:
            # nothing found; assume a flat tree without subdirectories
            if flat:
                files = np.array(glob.glob(tp + mattype + from_day[0:11] +
                                           '*' + extension))
            if len(files) == 0:
                return {}
        files.sort()
        npfiles = np.array(files)
        usethese = np.array([npfiles > ''])
    else:
        # several days: get all candidates
        files = np.array(glob.glob(tp + '*' + os.sep + '*' + os.sep + '*' + os.sep +
                                   mattype + '*' + extension))
        if len(files) == 0:
            # nothing found; assume a flat tree without subdirectories
            # (glob all files here; the time window below selects the interval)
            if flat:
                files = np.array(glob.glob(tp + mattype + '*' + extension))
            if len(files) == 0:
                return {}
        files.sort()
        npfiles = np.array(files)
        filesshort = np.array([os.path.split(f)[1] for f in npfiles])
        # no extension on fromfil because of the sorting order
        fromfil = mattype + from_day[0:8].ljust(8, '0') + 'T' + \
            from_day[9:15].ljust(6, '0')
        tofil = mattype + to_day[0:8].ljust(8, '9') + 'T' + \
            to_day[9:15].ljust(6, '9') + '.' + extension
        usethese = np.logical_and([filesshort >= fromfil],
                                  [filesshort <= tofil])

    # There is a possibility that usethese has shape (1,n) but we want (n,).
    # np.squeeze() must not be used because if the shape is (1,1) then
    # the result would be a scalar.
    if len(usethese.shape) == 2:
        usethese = usethese[0, :]

    if variables == []:
        variablelist = None
    else:
        variablelist = variables.copy()

    for file in npfiles[usethese]:
        if verbose:
            print('   loading: ' + file)
        matfile = _loaddata(file, variable_names=variablelist,
                            datafileextension=extension)
        for key in matfile:
            if '__' in key:
                continue
            if key in _blacklistvars:  # the black list
                continue
            if (variables == []) or (key in variablelist):
                if key not in mat:
                    # a new key not seen before
                    if np.shape(matfile[key]) == ():
                        mat[key].append(matfile[key])  # scalar
                    else:
                        mat[key].append(np.squeeze(matfile[key]))
                else:
                    # something we already have
                    theshape = np.shape(mat[key][0])
                    if len(theshape) == 1:
                        # 1D variables that need to be concatenated
                        if key in _level0shape1vars:
                            mat[key].append(np.squeeze(matfile[key], axis=0))
                    elif len(theshape) == 2:
                        # 2D variables that need to be concatenated
                        if key in _level0shape2vars:
                            mat[key].append(matfile[key])
                    continue

    allmat = {}  # the final result
    # concatenate the lists
    for key in mat.keys():
        theshape = np.shape(mat[key][0])  # take the shape from the first element
        if len(theshape) == 1:
            # 1D variables are concatenated along axis 0
            if key in _level0shape1vars:
                allmat[key] = np.concatenate(mat[key], axis=0)
                continue
        elif len(theshape) == 2:
            # 2D variables are concatenated along axis 1
            if key in _level0shape2vars:
                allmat[key] = np.concatenate(mat[key], axis=1)
                continue
        # if no match with anything, just copy the first element
        allmat[key] = mat[key][0]

    # post processing: change time variables to datetime objects
    for key in allmat:
        if key in _level0datetimevars:
            allmat[key] = icatools.matlab2datetime(allmat[key])

    conditional_gc_collect()
    return allmat
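# %% How the multi-day file selection above works: the from/to limits are
# padded with '0' and '9' respectively, so a plain lexicographic comparison
# of the file names selects every hourly file inside the interval. A
# self-contained sketch of that comparison (the file names are made up):
def _example_filename_window():
    """Show the lexicographic window used to pick hourly level0 files."""
    from_day, to_day = '20150211T06', '20150212'
    fromfil = 'RPC_ICA_' + from_day[0:8].ljust(8, '0') + 'T' + \
        from_day[9:15].ljust(6, '0')
    tofil = 'RPC_ICA_' + to_day[0:8].ljust(8, '9') + 'T' + \
        to_day[9:15].ljust(6, '9') + '.mat'
    names = ['RPC_ICA_20150211T050000.mat',   # before the window
             'RPC_ICA_20150211T060000.mat',   # inside
             'RPC_ICA_20150212T230000.mat',   # inside (to_day padded with '9's)
             'RPC_ICA_20150213T000000.mat']   # after the window
    return [n for n in names if fromfil <= n <= tofil]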
[docs]def checkdirectory(d, branch='level1'):
    """
    Verifies that the given directory points to the root of an ICA data tree.
    """
    if d[-1] != os.sep:
        d = d + os.sep
    pathendswith = os.path.basename(os.path.normpath(d))
    if (len(glob.glob(d + '20*')) > 0) or \
       (pathendswith in ['aux', 'bestc', 'proc', 'mag', 'cops', 'lap',
                         'level1', 'level2', 'level3', branch]):
        raise ValueError(
            "\n" +
            "*********************************************************************\n" +
            "The given path to ICA data files '" + d + "'\npoints to the wrong place: " +
            "Starting from version 3.5.0 of irfpy.ica,\n" +
            "the data path should point to the top of a subdirectory structure\n" +
            "as shown in\n" +
            "https://rosetta-wiki.irf.se/doku.php?id=pipeline_directory_structure\n" +
            "For the given path '" + d + "', \nthe data files were expected to be in:\n" +
            "'" + d + "xxxxx/matlab/20yy/mm/dd/*.mat'\n".replace('/', os.sep) +
            "(with xxxxx one of level0, level1, level2, mag, cops, aux, etc.),\n" +
            "but your data is apparently located in:\n" +
            "'" + d + "20yy/mm/dd/*.mat'\n".replace('/', os.sep) +
            "Your path must not contain the 'xxxxx' or 'matlab' parts.\n" +
            "Please update your local data structure by moving files.\n" +
            "Contact wieser@irf.se if you need support in this matter.\n" +
            "*********************************************************************\n" +
            "\n")
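# %% A sketch of the layout checkdirectory() expects (paths are illustrative):
#
#     ~/icadata/                     <- pass this root to the read functions
#         level0/matlab/2015/02/11/RPC_ICA_20150211T060000.mat
#         level1/matlab/2015/02/11/proc20150211T060000.mat
#         level2/matlab/2015/02/11/bestc20150211T060000.mat
#
def _example_checkdirectory():
    """checkdirectory() accepts the tree root but rejects paths into it."""
    try:
        checkdirectory('/data/icadata/level1')  # ends in 'level1' -> rejected
    except ValueError:
        pass
    checkdirectory('/data/icadata')  # root of the tree -> accepted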
# %% write data to a readxxx compatible structure
[docs]def buildfilename(apath, someday, prefix, postfix, hour):
    """Builds the full path of one hourly data file below 'apath'."""
    if isinstance(someday, dt.datetime):
        someday = icatools.datetime2string(someday)
    return apath + os.sep + someday[:4] + os.sep + someday[4:6] + os.sep + \
        someday[6:8] + os.sep + prefix + someday[:8] + \
        'T{:02d}'.format(hour) + '0000' + postfix
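# %% A minimal sketch of what buildfilename() produces (the arguments are
# illustrative): the yyyy/mm/dd subdirectories and the hourly file name are
# derived from 'someday' and 'hour'.
def _example_buildfilename():
    """E.g. on POSIX this returns
    '/data/level1/matlab/2015/02/11/proc20150211T070000.mat'."""
    return buildfilename('/data/level1/matlab', '20150211',
                         prefix='proc', postfix='.mat', hour=7)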
[docs]def writedailydata(dataroot, data, theday, fileprefix=None, branch=None,
                   time_instances="time_instances", dataformat='mat',
                   keepNaN=False, commentlist=None, dataversion='1.0'):
    if not isinstance(data, dict):
        raise ValueError(
            "irfpy.ica.io.writedailydata: The parameter 'data' must be of <class 'dict'> " +
            "(a dict) but is now " + str(type(data)) + ".")
    if branch is None:
        raise ValueError(
            "irfpy.ica.io.writedailydata: The parameter 'branch' must be of <class 'str'> " +
            "(a string) but is now None. 'branch' is the directory name below 'dataroot' " +
            "where the data is written.")
    if fileprefix is None:
        raise ValueError(
            "irfpy.ica.io.writedailydata: The parameter 'fileprefix' must be of <class 'str'> " +
            "(a string) but is now None. 'fileprefix' is the string at the start of the " +
            "filename to be written, e.g. fileprefix='proc' will write files like " +
            "'proc20990101T010000.mat'")
    if dataformat not in ['mat', 'h5']:
        raise ValueError(
            "irfpy.ica.io.writedailydata: The parameter 'dataformat' must be one of " +
            "'mat' or 'h5'. It determines the filetype that is written.")
    if time_instances not in data:
        raise ValueError(
            "irfpy.ica.io.writedailydata: The parameter 'data' must contain a key named " +
            time_instances + " to allow for proper time handling.")
    if not isinstance(theday, (str, dt.datetime)):
        raise ValueError(
            "irfpy.ica.io.writedailydata: The parameter theday must be of <class 'str'> " +
            "(a string) or of <class 'datetime.datetime'> but is now " + str(type(theday)))
    thetime = data[time_instances]
    if not isinstance(thetime[0], dt.datetime):
        raise ValueError(
            "irfpy.ica.io.writedailydata: the type of data[" + time_instances + "] must be " +
            "<class 'datetime.datetime'> but is now " + str(type(thetime[0])))

    if commentlist is None:
        commentlist = list()
    commentlist.append('0.0: Generated using irfpy.ica.io.writedailydata()')
    commentlist.append(str(dataversion) + ': This version')

    if isinstance(theday, dt.datetime):
        theday = theday.strftime("%Y%m%d")

    # loop over all hours; thetime is sorted already
    yy = int(theday[:4])
    mm = int(theday[4:6])
    dd = int(theday[6:8])
    dic = dict()  # returned: the dict of the last hour written, else empty
    for hh in range(24):
        # make a mask containing only this hour
        thishour = np.array([x.hour == hh and x.year == yy and
                             x.month == mm and x.day == dd for x in thetime])
        # if the mask is not empty make a file:
        if thishour.any():
            # extract the data for this hour
            dic = dict()
            dic[time_instances] = icatools.datetime2matlab(
                ma.masked_array(thetime, mask=~thishour).compressed())
            hasfinitevalues = False
            for key in data.keys():
                if key != time_instances:
                    try:
                        dic[key] = data[key][..., thishour]
                        if not np.isnan(dic[key]).all():
                            hasfinitevalues = True
                    except (TypeError, IndexError):
                        print("Not using key '{}' as it is not a numpy array".format(key))
            if hasfinitevalues or keepNaN:
                dic[fileprefix + '_processed_data_comment'] = commentlist
                dic[fileprefix + '_processed_data_version'] = [dataversion, ]
                if dataformat == 'mat':
                    afilename = buildfilename(dataroot + os.sep + branch + os.sep + 'matlab',
                                              theday, prefix=fileprefix,
                                              postfix='.mat', hour=hh)
                else:
                    afilename = buildfilename(dataroot + os.sep + branch + os.sep + 'hdf5',
                                              theday, prefix=fileprefix,
                                              postfix='.h5', hour=hh)
                os.makedirs(os.path.dirname(afilename), exist_ok=True)
                print('   writing: ' + afilename)
                savemat(afilename, dic)
                gc.collect()
            else:
                print('   *** skipped hour ' + str(hh) + ' (no finite values)')
        else:
            print('   *** skipped hour ' + str(hh) + ' (no data)')
    return dic
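# %% Round-trip sketch, analogous to the test code in __main__ below: data
# written with writedailydata() can be read back with loadlevelN() using the
# same fileprefix and branch. The prefix 'wonder' and branch 'mix' are
# arbitrary example names taken from that test code.
def _example_writedailydata_roundtrip(icapath, xmat):
    """Write one day of data and read it back from the new branch."""
    writedailydata(icapath, xmat, '20150730', fileprefix='wonder',
                   branch='mix', time_instances='time_instances',
                   dataformat='mat')
    return loadlevelN(icapath, fileprefix='wonder',
                      from_day='20150730T00', to_day='20150730T23',
                      branch='mix', time_instances='time_instances',
                      varshapes=default_varshapes1, verbose=True)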
# %%
if __name__ == '__main__':
    # Example content of ~/.irfpyrc:
    # [icadds]
    # dataroot = /home/tinu/ica/processed/
    from irfpy.util.irfpyrc import Rc
    rc = Rc()
    icadatarootpath = rc.get('icadds', 'dataroot')
    print(icadatarootpath)

    expectedversion(proc='4.5', lap='1.1', verbose=True)

    theday = '20160419'

    import irfpy.ica.pipeline as pip
    # pip.updateraw(icadatarootpath, theday, progress=print)
    pip.updateproc(icadatarootpath, theday, progress=print, dataformat='mat')
    pip.updatespecial(icadatarootpath, progress=print)
    gmat = readspecial(icadatarootpath, verbose=True)
    pip.updatelap(icadatarootpath, theday, progress=print)
    pip.updatecops(icadatarootpath, theday, progress=print, private=True)
    pip.updatemag(icadatarootpath, theday, progress=print)
    pip.updatebestc(icadatarootpath, theday, progress=print)

    matlap = readlap(icadatarootpath, theday, verbose=True)
    gmat = readmag(icadatarootpath, theday, variables=['time_instances'],
                   verbose=True)
    gmat = readspecial(icadatarootpath, generation=6, verbose=True)
    gmat = readbestc(icadatarootpath, '20160929T01', '20160929T04', verbose=True)

    pip.updateproc(icadatarootpath, '20150730', progress=print, dataformat='mat')
    pip.updateproc(icadatarootpath, '20160101', progress=print, dataformat='mat')

    gh5 = readproc(icadatarootpath, '20150730T0',
                   variables=['time_instances', 'noise_reduction', 'orig_ionspectra'],
                   verbose=True, dataformat='h5')
    gmat = readproc(icadatarootpath, '20150730T0',
                    variables=['time_instances', 'noise_reduction', 'orig_ionspectra'],
                    verbose=True, dataformat='mat')
    gh5 = readproc(icadatarootpath, '20150730T01', '20150730T01',
                   verbose=True, dataformat='h5')
    gmat = readproc(icadatarootpath, '20150730T01', '20150730T01',
                    verbose=True, dataformat='mat')
    gmat = readproc(icadatarootpath, dt.datetime(2015, 7, 30, 6, 33),
                    dt.datetime(2015, 7, 30, 6, 43), verbose=True)
    print(len(gmat['time_instances']))
    gmat = readproc(icadatarootpath, dt.datetime(2016, 1, 1, 6, 0),
                    dt.datetime(2016, 1, 2, 6, 59),
                    variables=['time_instances'], verbose=True,
                    partialelevationscan=False)
    print(len(gmat['time_instances']))

    spec = readspecial(icadatarootpath, generation=6,
                       variables=['ICAanalyzerconstant', 'ICAhighvoltageoffset',
                                  'ESC_H_volt', 'ESC_L_volt',
                                  'ICAsoftwareversion'], verbose=True)

    # %%
    xmat = readproc(icadatarootpath, '20150729T2330', '20150730T0910',
                    variables=['time_instances', 'noise_reduction', 'orig_ionspectra'],
                    verbose=True, dataformat='mat')
    # %%
    test = writedailydata(icadatarootpath, xmat, '20150730',
                          fileprefix='wonder', branch='mix',
                          time_instances="time_instances", dataformat='mat')
    # %%
    xmatread = loadlevelN(icadatarootpath, fileprefix='wonder',
                          from_day='20150730T00', to_day='20150730T09',
                          branch='mix', time_instances='time_instances',
                          varshapes=default_varshapes1, verbose=True)
    # %%
    bbb = np.array(['',
                    'ica_pipeline_0to1/calibration/IcaRosettaPhysElTable_V02_01.txt',
                    'ica_pipeline_0to1/calibration/IcaRosettaPhysElTable_V03_01.txt',
                    'ica_pipeline_0to1/calibration/IcaRosettaPhysElTable_V04_01.txt',
                    'ica_pipeline_0to1/calibration/IcaRosettaPhysElTable_V05_01.txt',
                    'ica_pipeline_0to1/calibration/IcaRosettaPhysElTable_V06_01.txt',
                    'ica_pipeline_0to1/calibration/IcaRosettaPhysElTable_V07_00.txt',
                    'ica_pipeline_0to1/calibration/IcaRosettaPhysElTable_V08_00.txt',
                    'ica_pipeline_0to1/calibration/IcaRosettaPhysElTable_V09_00.txt',
                    'ica_pipeline_0to1/calibration/IcaRosettaPhysElTable_V10_00.txt'])

    aaa = np.array(['',
                    'ica_pipeline_0to1/calibration/IcaRosettaElTable_V02_00.txt',
                    'ica_pipeline_0to1/calibration/IcaRosettaElTable_V03_00.txt',
                    'ica_pipeline_0to1/calibration/IcaRosettaElTable_V04_00.txt',
                    'ica_pipeline_0to1/calibration/IcaRosettaElTable_V05_00.txt',
                    'ica_pipeline_0to1/calibration/IcaRosettaElTable_V06_00.txt',
                    'ica_pipeline_0to1/calibration/IcaRosettaElTable_V07_01.txt',
                    'ica_pipeline_0to1/calibration/IcaRosettaElTable_V08_00.txt',
                    'ica_pipeline_0to1/calibration/IcaRosettaElTable_V09_00.txt',
                    'ica_pipeline_0to1/calibration/IcaRosettaElTable_V10_00.txt'])

    ccc = np.array(['',
                    'ica_pipeline_0to1/calibration/IcaRosettaEnTable_V02_01.txt',
                    'ica_pipeline_0to1/calibration/IcaRosettaEnTable_V03_01.txt',
                    'ica_pipeline_0to1/calibration/IcaRosettaEnTable_V04_01.txt',
                    'ica_pipeline_0to1/calibration/IcaRosettaEnTable_V05_01.txt',
                    'ica_pipeline_0to1/calibration/IcaRosettaEnTable_V06_01.txt',
                    'ica_pipeline_0to1/calibration/IcaRosettaEnTable_V07_01.txt',
                    'ica_pipeline_0to1/calibration/IcaRosettaEnTable_V08_00.txt',
                    'ica_pipeline_0to1/calibration/IcaRosettaEnTable_V09_00.txt',
                    'ica_pipeline_0to1/calibration/IcaRosettaEnTable_V10_00.txt'])