# -*- coding: utf-8 -*-
"""
Read data from a local copy of the ICA data pipeline.

Author: Martin Wieser

Module: irfpy.ica.io

Functions to transparently load ICA pipeline data from the local filesystem.
All functions return a dictionary containing the loaded variables.

Examples of loading data::

    mat=readraw('~/icadata','20150211','20150213',variables=['time_interval','E'])
    mat=readproc('~/icadata','20150211','20150213',variables=['time_instances','E'])
    mat=readlap('~/icadata','20150211',variables=['lap_time','lap_potential'])

Summary:
    The most modern and environmentally friendly way to access ICA data:

    - irfpy.ica.io.readraw
    - irfpy.ica.io.readproc
    - irfpy.ica.io.readaux
    - irfpy.ica.io.readspecial
    - irfpy.ica.io.readlap
    - irfpy.ica.io.readcops
    - irfpy.ica.io.readmag
    - irfpy.ica.io.readbestc
    - irfpy.ica.io.readflux


    For more advanced ways to access the data, use these low-level functions:

    - irfpy.ica.io.loadlevel0
    - irfpy.ica.io.loadlevel1
    - irfpy.ica.io.loadlevel1oktime
    - irfpy.ica.io.loadlevel1okall
    - irfpy.ica.io.loadlevel1filter
    - irfpy.ica.io.loadlevel2
    - irfpy.ica.io.loadlevel3
    - irfpy.ica.io.loadlevelN

    Read any version of a .mat or .h5 file directly, without the data-type
    conversions performed by the load or read functions:

    - irfpy.ica.io.loadmat
    - irfpy.ica.io.savemat
    - irfpy.ica.io.loadh5
    - irfpy.ica.io.saveh5


"""
#analysis:ignore


import gc
import os
import glob
from collections import defaultdict
import datetime as dt
import warnings
import re
import logging
import h5py
import hdf5storage


import numpy as np
from numpy import ma

logging.basicConfig()
_logger = logging.getLogger('ica.io')
_logger.setLevel(logging.DEBUG)


try:
    if __name__ == '__main__':
        import tools as icatools
    else:
        from . import tools as icatools
except ModuleNotFoundError:
    import irfpy.ica.tools as icatools

try:
    import deepdish
    _HAVE_DEEPDISH = True
except ModuleNotFoundError:
    _HAVE_DEEPDISH = False

try:
    import psutil
    _HAVE_PSUTIL = True
except ModuleNotFoundError:
    _HAVE_PSUTIL = False




# %%

# The most modern and environmentally friendly way to access the ica data:


def readraw(datarootpath, from_day=None, to_day='', variables=None,
            verbose=False, flat=False, dataformat='mat', **kwargs):
    r""" Loads one hour or more worth of level0 data, analogous to
    scipy.io.loadmat('xyz.mat') but assembling all hourly files
    corresponding to the specified time interval.

    AUTHOR: Gabriella Stenberg Wieser

    PARAMETERS

    datarootpath (string): local path to the 'root' of the data tree.
        This data tree should contain the file(s) in subdirectories
        level0/xxxx/yyyy/mm/dd/ as it is done in the pipeline.

    from_day (string or datetime.datetime): string is of the form
        '20150115T1011'

    to_day (string or datetime.datetime): optional, string is of the form
        '20150116T1112'. If given, data to the end of the given interval
        is loaded, otherwise one day.

    variables (list of strings): a list of variable names to load.
        Default is to load all variables. If the loaded variables include
        'time_instances' then these are automatically converted to python
        datetime objects.

    flat (boolean): If set to True and no files were found, the load
        function will also attempt to load files located at the root of
        the data tree (without the yyyy/mm/dd/ subdirectories).

    OUTPUT: returns a dictionary with all loaded variables

    EXAMPLES::

        mat=readraw('~/icadata','20150211')
        mat=readraw('~/icadata','20150211',variables=['iontime','ionspectra'])
        mat=readraw('~/icadata','20150211','20150213',variables=['time_interval','E'])
    """
    if variables is None:
        variables = []
    return loadlevel0(datarootpath, from_day, to_day, variables, verbose,
                      flat, dataformat=dataformat, **kwargs)
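# Illustrative sketch (the data root '~/icadata' and the date are assumptions,
# not shipped defaults): load one day of raw spectra and print the covered
# time range. 'iontime' is expected to arrive as python datetime objects after
# the automatic conversion described above.
def _example_readraw_timerange():
    mat = readraw('~/icadata', '20150211',
                  variables=['iontime', 'ionspectra'])
    if 'iontime' in mat:
        print('first record:', mat['iontime'][0])
        print('last record: ', mat['iontime'][-1])
    return mat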
def readproc(datarootpath, from_day=None, to_day='', variables=None,
             verbose=False, flat=False, includeerrors=False,
             partialelevationscan=False, dataformat='mat', filters=None,
             time_instances='time_instances'):
    r""" Loads one day or more worth of proc*.mat data, analogous to
    scipy.io.loadmat('xyz.mat') but assembling all hourly files
    corresponding to that day.

    PARAMETERS

    datarootpath (string): local path to the 'root' of a data directory
        tree. This data tree should contain the file(s) in subdirectories
        level1/xxxx/yyyy/mm/dd/ as it is done in the pipeline.

    from_day (string or datetime.datetime): string is of the form
        '20150115T1011'

    to_day (string or datetime.datetime): optional, string is of the form
        '20150116T1112'. If given, data to the end of the given interval
        is loaded, otherwise one day.

    variables (list of strings): a list of variable names to load.
        Default is to load all variables. If the loaded variables include
        'time_instances' then these are automatically converted to python
        datetime objects.

    flat (boolean): If set to True and no files were found, the load
        function will also attempt to load files located at the root of
        the data tree (without the yyyy/mm/dd/ subdirectories).

    partialelevationscan (boolean): Optional parameter, default False.
        If set to False the loading function will only load complete
        elevation scans. If set to True, the nearest time_instances will
        be loaded regardless of where an elevation scan starts.

    verbose (boolean): Optional parameter, if set to True the loading
        function will print the names of the files it loads for debugging
        purposes.

    includeerrors (boolean), OUTDATED: Optional parameter, if set to True
        the loading function will also load data with decoder or
        formatting errors. This parameter is outdated. Use the filters
        keyword instead, e.g. filters={'time_error_flag':0}

    dataformat (str): 'mat' or 'h5'. If 'h5', the hdf5 data tree is read;
        if 'mat', the matlab data tree is read.

    OUTPUT

    Returns a dictionary with all loaded variables. The dictionary is
    empty if no matching data could be loaded.

    Use like this::

        mat=readproc('~/icadata','20150211')
        mat=readproc('~/icadata','20150211',variables=['time_instances','E'])
        mat=readproc('~/icadata','20150211','20150213',variables=['time_instances','E'])
    """
    if variables is None:
        variables = []
    if includeerrors:
        if filters is None:
            filters = {}
        filters['time_error_flag'] = 0
    else:
        if filters is None:
            filters = {}
        filters['error_flags'] = 0
    return loadlevelN(datarootpath, fileprefix='proc', from_day=from_day,
                      to_day=to_day, variables=variables, verbose=verbose,
                      flat=flat, partialelevationscan=partialelevationscan,
                      dataformat=dataformat, time_instances=time_instances,
                      branch='level1', filters=filters,
                      varshapes=default_varshapes1)
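# Minimal sketch of the recommended replacement for ``includeerrors``: pass a
# ``filters`` dictionary, as suggested in the docstring above. The data root
# and date are illustrative assumptions.
def _example_readproc_filters():
    # Keep only records whose time_error_flag is 0; note that readproc with
    # the default includeerrors=False also adds an error_flags == 0 filter.
    return readproc('~/icadata', '20150211',
                    variables=['time_instances', 'E'],
                    filters={'time_error_flag': 0})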
def readaux(datarootpath, from_day=None, to_day='', variables=None,
            verbose=False, flat=False, includeerrors=False,
            partialelevationscan=False, dataformat='mat', filters=None,
            time_instances='time_instances'):
    r"""Loads one day or more worth of aux*.mat data, analogous to
    scipy.io.loadmat('xyz.mat') but assembling all hourly files
    corresponding to that day.

    PARAMETERS

    datarootpath (string): local path to the 'root' of a data directory
        tree. This data tree should contain the file(s) in subdirectories
        aux/xxxx/yyyy/mm/dd/ as it is done in the pipeline.

    from_day (string or datetime.datetime): string is of the form
        '20150115T1011'

    to_day (string or datetime.datetime): optional, string is of the form
        '20150116T1112'. If given, data to the end of the given interval
        is loaded, otherwise one day.

    variables (list of strings): a list of variable names to load.
        Default is to load all variables. If the loaded variables include
        'time_instances' then these are automatically converted to python
        datetime objects.

    flat (boolean): If set to True and no files were found, the load
        function will also attempt to load files located at the root of
        the data tree (without the yyyy/mm/dd/ subdirectories).

    partialelevationscan (boolean): Optional parameter, default False.
        If set to False the loading function will only load complete
        elevation scans. If set to True, the nearest time_instances will
        be loaded regardless of where an elevation scan starts.

    verbose (boolean): Optional parameter, if set to True the loading
        function will print the names of the files it loads for debugging
        purposes.

    includeerrors (boolean), OUTDATED: Optional parameter, if set to True
        the loading function will also load data with decoder or
        formatting errors. This parameter is outdated. Use the filters
        keyword instead, e.g. filters={'time_error_flag':0}

    dataformat (str): 'mat' or 'h5'. If 'h5', the hdf5 data tree is read;
        if 'mat', the matlab data tree is read.

    OUTPUT

    Returns a dictionary with all loaded variables. The dictionary is
    empty if no matching data could be loaded. If bs_xxx variables are
    wanted and they are not in the aux files loaded, an automatic fallback
    to xaux files is done.

    Use like this::

        mat=readaux('~/icadata','20150211')
        mat=readaux('~/icadata','20150211',variables=['sp_cso','time_instances'])
        mat=readaux('~/icadata','20150211','20150213')
    """
    if variables is None:
        variables = []
    if includeerrors:
        if filters is None:
            filters = {}
        filters['time_error_flag'] = 0
    else:
        if filters is None:
            filters = {}
        filters['error_flags'] = 0
    mat = loadlevelN(datarootpath, fileprefix='aux', from_day=from_day,
                     to_day=to_day, variables=variables, verbose=verbose,
                     flat=flat, partialelevationscan=partialelevationscan,
                     dataformat=dataformat, time_instances=time_instances,
                     branch='aux', filters=filters,
                     varshapes=default_varshapes1)

    # Now check whether bs_xx variables were requested but could not be
    # loaded. If so, try the xaux files.
    xvariables = list()
    if 'bs_cseq' in variables and 'bs_cseq' not in mat:
        xvariables.append('bs_cseq')
    if 'bs_cso' in variables and 'bs_cso' not in mat:
        xvariables.append('bs_cso')
    if len(xvariables) > 0:  # there were some unfulfilled requests
        mat2 = loadlevelN(datarootpath, fileprefix='xaux', from_day=from_day,
                          to_day=to_day, variables=xvariables,
                          verbose=verbose, flat=flat,
                          partialelevationscan=partialelevationscan,
                          dataformat=dataformat,
                          time_instances='time_instances', branch='xaux',
                          filters=filters, varshapes=default_varshapes1)
        # some variables could be obtained from xaux
        if 'bs_cseq' in mat2:
            mat['bs_cseq'] = mat2['bs_cseq']
        if 'bs_cso' in mat2:
            mat['bs_cso'] = mat2['bs_cso']
    return mat
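# Sketch of the xaux fallback described above: when 'bs_cso' is requested but
# absent from the aux files, readaux transparently retries the xaux branch.
# Path and date are illustrative.
def _example_readaux_bs_fallback():
    mat = readaux('~/icadata', '20150211',
                  variables=['time_instances', 'bs_cso'])
    # 'bs_cso' is present if it was found in either the aux or xaux files
    return 'bs_cso' in mat, mat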
def readspecial(datarootpath, generation=None, variables=None, verbose=False):
    r""" Loads special*.mat data, assembling the variables from all files
    of one generation.

    PARAMETERS

    datarootpath (string): local path to the 'root' of a data directory
        tree. This data tree should contain the file(s) in subdirectories
        special/gen??? as it is done in the pipeline.

    generation (int): generation number of the requested files. If None,
        variables are loaded from the newest generation.

    variables (list of strings): a list of variable names to load.
        Default is to load all variables. The variables are collected from
        .npz and .mat files, in this order of priority.

    verbose (boolean): Optional parameter, if set to True the loading
        function will print the names of the files it loads for debugging
        purposes.

    OUTPUT

    Returns a dictionary with all loaded variables. The dictionary is
    empty if no matching data could be loaded.

    Use like this::

        mat=readspecial('~/icadata')
        mat=readspecial('~/icadata',generation=6,verbose=True)
    """
    if variables is None:
        variables = []
    if not isinstance(datarootpath, str):
        raise ValueError(
            "irfpy.ica.io.readspecial: The parameter datarootpath must be of <class 'str'> "
            + "(a string) but is now " + str(type(datarootpath)))
    if generation is not None:
        if not isinstance(generation, int):
            raise ValueError(
                "irfpy.ica.io.readspecial: The parameter generation must be of <class 'int'> "
                + "(an integer) but is now " + str(type(generation)))
        if generation < 0:
            generation = None
    if not isinstance(variables, (list, np.ndarray)):
        raise ValueError(
            "irfpy.ica.io.readspecial: The parameter variables must be of <class 'list'> "
            + "(a list) or <class 'numpy.ndarray'> (a numpy array) but is now "
            + str(type(variables)))

    thevars = list(variables)
    datarootpath = os.path.expanduser(datarootpath)
    tp = datarootpath + os.sep + 'special'
    if generation is None:
        # find the newest generation
        gens = glob.glob(tp + os.sep + 'gen*')
        gens.sort()
        thegen = gens[-1]
        generation = int(thegen[-3:])
    else:
        thegen = tp + os.sep + 'gen{:03d}'.format(generation)
    if verbose:
        print('** Loading special variables corresponding to GENERATION {:d} **'.format(generation))

    thenpz = glob.glob(thegen + os.sep + '*.npz')
    themat = glob.glob(thegen + os.sep + '*.mat')
    thenpz.sort()
    themat.sort()

    allvars = dict()
    allvars['generation'] = generation
    # first load from all npz files, then from all mat files, in sorted order
    for file in thenpz + themat:
        if verbose:
            print('    reading ' + file)
        if file.endswith('.npz'):
            newvars = dict(np.load(file))
        else:
            newvars = loadmat(file)
        for k in newvars:
            if k not in allvars:
                if k in thevars or len(thevars) == 0:
                    allvars[k] = newvars[k]
                    if verbose:
                        print('      loading ' + k)
            else:
                if verbose:
                    print("!!! Duplicate variable ('" + k +
                          "') found. The duplicate will be ignored.")
    return allvars
def readlap(datarootpath, from_day=None, to_day='', variables=None,
            verbose=False, flat=False, dataformat='mat', filters=None,
            time_instances='asw_time'):
    r"""Loads one day or more worth of lap*.mat data, analogous to
    scipy.io.loadmat('xyz.mat') but assembling all hourly files
    corresponding to that day.

    PARAMETERS

    datarootpath (string): local path to the 'root' of a data directory
        tree. This data tree should contain the file(s) in subdirectories
        lap/xxxx/yyyy/mm/dd/ as it is done in the pipeline.

    from_day (string or datetime.datetime): string is of the form
        '20150115T1011'

    to_day (string or datetime.datetime): optional, string is of the form
        '20150116T1112'. If given, data to the end of the given interval
        is loaded, otherwise one day.

    variables (list of strings): a list of variable names to load.
        Default is to load all variables. If the loaded variables include
        'time_instances' then these are automatically converted to python
        datetime objects.

    flat (boolean): If set to True and no files were found, the load
        function will also attempt to load files located at the root of
        the data tree (without the yyyy/mm/dd/ subdirectories).

    verbose (boolean): Optional parameter, if set to True the loading
        function will print the names of the files it loads for debugging
        purposes.

    dataformat (str): 'mat' or 'h5'. If 'h5', the hdf5 data tree is read;
        if 'mat', the matlab data tree is read.

    OUTPUT

    Returns a dictionary with all loaded variables. The dictionary is
    empty if no matching data could be loaded.

    Use like this::

        mat=readlap('~/icadata','20150211')
        mat=readlap('~/icadata','20150211',variables=['lap_time','lap_potential'])
        mat=readlap('~/icadata','20150211','20150213')
    """
    if variables is None:
        variables = []
    if filters is None:
        filters = dict()
    return loadlevelN(datarootpath, fileprefix='lap', from_day=from_day,
                      to_day=to_day, variables=variables, verbose=verbose,
                      flat=flat, partialelevationscan=True,
                      dataformat=dataformat, time_instances=time_instances,
                      branch='lap', filters=filters,
                      varshapes=default_varshapes1)
def readcops(datarootpath, from_day=None, to_day='', variables=None,
             verbose=False, flat=False, dataformat='mat', filters=None,
             time_instances='cops_time'):
    r"""Loads one day or more worth of cops*.mat data, analogous to
    scipy.io.loadmat('xyz.mat') but assembling all hourly files
    corresponding to that day.

    PARAMETERS

    datarootpath (string): local path to the 'root' of a data directory
        tree. This data tree should contain the file(s) in subdirectories
        cops/xxxx/yyyy/mm/dd/ as it is done in the pipeline.

    from_day (string or datetime.datetime): string is of the form
        '20150115T1011'

    to_day (string or datetime.datetime): optional, string is of the form
        '20150116T1112'. If given, data to the end of the given interval
        is loaded, otherwise one day.

    variables (list of strings): a list of variable names to load.
        Default is to load all variables. If the loaded variables include
        'time_instances' then these are automatically converted to python
        datetime objects.

    flat (boolean): If set to True and no files were found, the load
        function will also attempt to load files located at the root of
        the data tree (without the yyyy/mm/dd/ subdirectories).

    verbose (boolean): Optional parameter, if set to True the loading
        function will print the names of the files it loads for debugging
        purposes.

    dataformat (str): 'mat' or 'h5'. If 'h5', the hdf5 data tree is read;
        if 'mat', the matlab data tree is read.

    OUTPUT

    Returns a dictionary with all loaded variables. The dictionary is
    empty if no matching data could be loaded.

    Use like this::

        mat=readcops('~/icadata','20150211')
        mat=readcops('~/icadata','20150211',variables=['cops_time','cops_pressure'])
        mat=readcops('~/icadata','20150211','20150213')
    """
    if variables is None:
        variables = []
    if filters is None:
        filters = dict()
    return loadlevelN(datarootpath, fileprefix='cops', from_day=from_day,
                      to_day=to_day, variables=variables, verbose=verbose,
                      flat=flat, partialelevationscan=True,
                      dataformat=dataformat, time_instances=time_instances,
                      branch='cops', filters=filters,
                      varshapes=default_varshapes1)
def readmag(datarootpath, from_day=None, to_day='', variables=None,
            verbose=False, flat=False, dataformat='mat', filters=None,
            time_instances='mag_time'):
    r"""Loads one day or more worth of mag*.mat data, analogous to
    scipy.io.loadmat('xyz.mat') but assembling all hourly files
    corresponding to that day.

    PARAMETERS

    datarootpath (string): local path to the 'root' of a data directory
        tree. This data tree should contain the file(s) in subdirectories
        mag/xxxx/yyyy/mm/dd/ as it is done in the pipeline.

    from_day (string or datetime.datetime): string is of the form
        '20150115T1011'

    to_day (string or datetime.datetime): optional, string is of the form
        '20150116T1112'. If given, data to the end of the given interval
        is loaded, otherwise one day.

    variables (list of strings): a list of variable names to load.
        Default is to load all variables. If the loaded variables include
        'time_instances' then these are automatically converted to python
        datetime objects.

    flat (boolean): If set to True and no files were found, the load
        function will also attempt to load files located at the root of
        the data tree (without the yyyy/mm/dd/ subdirectories).

    verbose (boolean): Optional parameter, if set to True the loading
        function will print the names of the files it loads for debugging
        purposes.

    dataformat (str): 'mat' or 'h5'. If 'h5', the hdf5 data tree is read;
        if 'mat', the matlab data tree is read.

    OUTPUT

    Returns a dictionary with all loaded variables. The dictionary is
    empty if no matching data could be loaded.

    Use like this::

        mat=readmag('~/icadata','20150211')
        mat=readmag('~/icadata','20150211',variables=['mag_B_cseq'])
        mat=readmag('~/icadata','20150211','20150213')
    """
    if variables is None:
        variables = []
    if filters is None:
        filters = dict()
    return loadlevelN(datarootpath, fileprefix='mag', from_day=from_day,
                      to_day=to_day, variables=variables, verbose=verbose,
                      flat=flat, partialelevationscan=True,
                      dataformat=dataformat, time_instances=time_instances,
                      branch='mag', filters=filters,
                      varshapes=default_varshapes1)
def readbestc(datarootpath, from_day=None, to_day='', variables=None,
              verbose=False, flat=False, partialelevationscan=False,
              dataformat='mat', filters=None,
              time_instances='time_instances'):
    r"""Loads one day or more worth of bestc*.mat data, analogous to
    scipy.io.loadmat('xyz.mat') but assembling all hourly files
    corresponding to that day.

    PARAMETERS

    datarootpath (string): local path to the 'root' of a data directory
        tree. This data tree should contain the file(s) in subdirectories
        level2/xxxx/yyyy/mm/dd/ as it is done in the pipeline.

    from_day (string or datetime.datetime): string is of the form
        '20150115T1011'

    to_day (string or datetime.datetime): optional, string is of the form
        '20150116T1112'. If given, data to the end of the given interval
        is loaded, otherwise one day.

    variables (list of strings): a list of variable names to load.
        Default is to load all variables. If the loaded variables include
        'time_instances' then these are automatically converted to python
        datetime objects.

    flat (boolean): If set to True and no files were found, the load
        function will also attempt to load files located at the root of
        the data tree (without the yyyy/mm/dd/ subdirectories).

    partialelevationscan (boolean): Optional parameter, default False.
        If set to False the loading function will only load complete
        elevation scans. If set to True, the nearest time_instances will
        be loaded regardless of where an elevation scan starts.

    verbose (boolean): Optional parameter, if set to True the loading
        function will print the names of the files it loads for debugging
        purposes.

    dataformat (str): 'mat' or 'h5'. If 'h5', the hdf5 data tree is read;
        if 'mat', the matlab data tree is read.

    OUTPUT

    Returns a dictionary with all loaded variables. The dictionary is
    empty if no matching data could be loaded.

    Use like this::

        mat=readbestc('~/icadata','20150211')
        mat=readbestc('~/icadata','20150211',variables=['time_instances','h_counts'])
        mat=readbestc('~/icadata','20150211','20150213')
    """
    if variables is None:
        variables = []
    if filters is None:
        filters = dict()
    return loadlevelN(datarootpath, fileprefix='bestc', from_day=from_day,
                      to_day=to_day, variables=variables, verbose=verbose,
                      flat=flat, partialelevationscan=partialelevationscan,
                      dataformat=dataformat, time_instances=time_instances,
                      branch='level2', filters=filters,
                      varshapes=default_varshapes2)
def readflux(datarootpath, from_day=None, to_day='', variables=None,
             verbose=False, flat=False, partialelevationscan=False,
             dataformat='mat', filters=None,
             time_instances='time_instances'):
    r"""Loads one day or more worth of flux*.mat data, analogous to
    scipy.io.loadmat('xyz.mat') but assembling all hourly files
    corresponding to that day.

    PARAMETERS

    datarootpath (string): local path to the 'root' of a data directory
        tree. This data tree should contain the file(s) in subdirectories
        level3/xxxx/yyyy/mm/dd/ as it is done in the pipeline.

    from_day (string or datetime.datetime): string is of the form
        '20150115T1011'

    to_day (string or datetime.datetime): optional, string is of the form
        '20150116T1112'. If given, data to the end of the given interval
        is loaded, otherwise one day.

    variables (list of strings): a list of variable names to load.
        Default is to load all variables. If the loaded variables include
        'time_instances' then these are automatically converted to python
        datetime objects.

    flat (boolean): If set to True and no files were found, the load
        function will also attempt to load files located at the root of
        the data tree (without the yyyy/mm/dd/ subdirectories).

    partialelevationscan (boolean): Optional parameter, default False.
        If set to False the loading function will only load complete
        elevation scans. If set to True, the nearest time_instances will
        be loaded regardless of where an elevation scan starts.

    verbose (boolean): Optional parameter, if set to True the loading
        function will print the names of the files it loads for debugging
        purposes.

    dataformat (str): 'mat' or 'h5'. If 'h5', the hdf5 data tree is read;
        if 'mat', the matlab data tree is read.

    OUTPUT

    Returns a dictionary with all loaded variables. The dictionary is
    empty if no matching data could be loaded.

    Use like this::

        mat=readflux('~/icadata','20150211')
        mat=readflux('~/icadata','20150211',variables=['time_instances','flux'])
        mat=readflux('~/icadata','20150211','20150213')
    """
    if variables is None:
        variables = []
    if filters is None:
        filters = dict()
    return loadlevelN(datarootpath, fileprefix='flux', from_day=from_day,
                      to_day=to_day, variables=variables, verbose=verbose,
                      flat=flat, partialelevationscan=partialelevationscan,
                      dataformat=dataformat, time_instances=time_instances,
                      branch='level3', filters=filters,
                      varshapes=default_varshapes3)
# %%
# Below follow all level 0/1/2/3... variable names that need concatenation
# when several files are loaded. Variables not listed here will only be
# copied (the one from the first file is returned).
# Sorted by shape:

# --- level 3 ------
_level3shape1vars = [
    # following stuff is from proc
]
_level3shape2vars = [
]
_level3shape3vars = [
]
_level3shape4vars = [
    # following stuff is from bestc
]
_level3datetimevars = [
    'time_instances'
]

# --- level 2 ------
_level2shape1vars = [
    # following stuff is from proc
    'time_instances',
    'sw_version',
    'cur_pacc',
    'mode',
    # following stuff is from bestc
    'noise_reduction_corrsum',
    'random_noise_corrsum',
    'dead_time_corrsum',
    'flat_mass_corrsum',
    'flat_sector_corrsum',
    'ghost_corrsum',
    'shadowmask',
    'processing_level'
]
_level2shape2vars = [
    'processing_level'
]
_level2shape3vars = [
    'heavy_ions',
    'light_ions'
]
_level2shape4vars = [
    # following stuff is from bestc
    'best_ionspectra',
    'sigma_ionspectra',
    'ghost_ionspectra'
]
_level2datetimevars = [
    'time_instances'
]

# --- level 1 ------
_level1shape1vars = [
    # following stuff is from proc
    'clean_ionspectra',
    'cur_pacc',
    'cur_pacc_table',
    'elevation_step',
    'mode',
    'is_mass_matrix',
    'massbinwidth',
    'time_error_flag',
    'decoder_error_flag',
    'edf_error_flag',
    'fmt_error_flag',
    'error_flags',
    'size_of_lookup_table',
    'time_instances',
    'sw_version',
    # below is for compatibility with pipeline v2.5 files
    # 'orig_ionspectra',
    # following stuff is from aux
    'ilat',
    'ilon',
    'azimbinwidth',
    'elevbinwidth',
    # following stuff is from special
    # following stuff is from lap
    'lap_time',
    'lap_quality',
    # 'lap_electrondensity',  # removed
    # 'lap_electrontemp',     # removed
    'lap_potential',
    'usc_time',
    'usc_potential',
    'usc_potential_q',
    'usc_quality',
    'usc_source',
    'asw_time',
    'asw_electrondensity',
    'asw_electrondensity_q',
    'asw_photosaturationcurrent',
    'asw_photosaturationcurrent_q',
    'asw_bulkionspeed',
    'asw_bulkionspeed_q',
    'asw_electrontemperature',
    'asw_electrontemperature_q',
    'asw_electrontemperature_xcal',
    'asw_electrontemperature_xcal_q',
    'asw_potential',
    'asw_potential_q',
    'asw_quality',
    'ned_time',
    'ned_electrondensity',
    'ned_electrondensity_q',
    'ned_source',
    'ned_quality',
    # following stuff is from cops
    'cops_time',
    'cops_density',
    'cops_pressure_ng',
    'cops_pressure_ng_flag',
    # following stuff is from mag
    'mag_time',
    'mag_time_level_g',
    'mag_quality',
    'mag_quality_level_g',
    # following stuff was moved from bestc to proc
    'noise_reduction',
    'valven_time'
]
_level1shape2vars = [
    # following stuff is from proc
    'sum_orig_ionspectra',
    'shadow',
    # following stuff is from aux
    'cp_eclip',
    'sa_cseqx',
    'sa_cseqy',
    'sa_cseqz',
    'sa_csox',
    'sa_csoy',
    'sa_csoz',
    'sp_cseq',
    'sp_cso',
    'sv_cseq',
    'sv_cso',
    # following stuff is from mag
    'mag_b_cseq',
    'mag_b_cseq_level_g'
]
_level1shape3vars = [
]
_level1shape4vars = [
    'bs_cseq',
    'bs_cso',
    'orig_ionspectra',
    # following stuff is from bestc
    'clean_ionspectra'
]
_level1datetimevars = [
    'time_instances',
    'time_elev_scan_sta',
    'time_elev_scan_sto',
    'lap_time',
    'usc_time',
    'asw_time',
    'ned_time',
    'cops_time',
    'mag_time',
    'mag_time_level_g',
    'valven_time'
]

# --- level 0 ------
_level0shape1vars = [
    'badhvmask',
    'decoder_error',
    'fifo_fill',
    'fmt_len',
    'fmt_len_error',
    'iontime',
    'mode',
    'mtable',
    'pacc_high_low',
    'pacclevel',
    'promsection',
    'shadowmask',
    'hk_cmdstat',
    'hk_defl_hv_ref',
    'hk_defl_lv_ref',
    'hk_deflhv_mon',
    'hk_deflhv_sw',
    'hk_defllv_mon',
    'hk_defllv_sw',
    'hk_direct_cmd_sw',
    'hk_dpu_temp',
    'hk_entr_lower_mon',
    'hk_entr_ref',
    'hk_entr_sw',
    'hk_entr_upper_mon',
    'hk_fifofill',
    'hk_first_word_cmdret',
    'hk_grid_mon',
    'hk_grid_ref',
    'hk_grid_sw',
    'hk_main28_present',
    'hk_main_sw',
    'hk_mcp_curr_ref',
    'hk_mcp_def_ref',
    'hk_mcp_mon',
    'hk_mcp_present',
    'hk_mcp_sw',
    'hk_mode',
    'hk_newcmd',
    'hk_opto_curr_ref',
    'hk_opto_def_ref',
    'hk_opto_mon',
    'hk_opto_present',
    'hk_opto_sw',
    'hk_pacc_high_ref',
    'hk_pacc_hl',
    'hk_pacc_low_ref',
    'hk_pacc_mode',
    'hk_pacc_mon',
    'hk_pacc_sw',
    'hk_sensor_temp',
    'hk_sid',
    'hk_time',
    'hk_tmfifo_ovrf'
]
_level0shape2vars = [
    'azim',
    'elev',
    'ionspectra',
    'masschannel'
]
_level0datetimevars = [
    'iontime',
    'hk_time'
]

# --- black list ------
_blacklistvars = [
    'cal_orig_ionspectra',
    'time_error_mask',
    'fmt_error_mask',
    'decoder_error_mask'
]

# ---------------------
_force_datatype_float = [
    'orig_ionspectra',
    'sum_orig_ionspectra',
    'clean_ionspectra',
    'light_ions',
    'heavy_ions',
    'noise_reduction'
]
_force_datatype_int = [
    'sw_version',
    'mode',
    'cur_pacc',
    'error_flags',
    'time_error_flag',
    'decoder_error_flag',
    'edf_error_flag',
    'fmt_error_flag',
    'version_list',
    'dead_channles',
    'is_matrix',
    'mass_looku_table',
    'shadow',
    'size_of_lookup_table'
]
_force_datatype_stringlist = [
    'ica_processed_data_comment',
    'aux_processed_data_comment',
    'bestc_processed_data_comment',
    'flux_processed_data_comment',
    'lap_processed_data_comment',
    'cops_processed_data_comment',
    'mag_processed_data_comment',
    'mip_processed_data_comment'
]
_force_datatype_unicodestring = [
    'processing_level'
]

# convenient configuration variables:
default_varshapes1 = {'1d': _level1shape1vars,
                      '2d': _level1shape2vars,
                      '3d': _level1shape3vars,
                      '4d': _level1shape4vars,
                      'datetime': _level1datetimevars,
                      'int32': _force_datatype_int,
                      'float32': _force_datatype_float,
                      'unicodestring': _force_datatype_unicodestring,
                      'stringlist': _force_datatype_stringlist,
                      'blacklist': _blacklistvars}

default_varshapes2 = {'1d': _level2shape1vars,
                      '2d': _level2shape2vars,
                      '3d': _level2shape3vars,
                      '4d': _level2shape4vars,
                      'datetime': _level2datetimevars,
                      'int32': _force_datatype_int,
                      'float32': _force_datatype_float,
                      'unicodestring': _force_datatype_unicodestring,
                      'stringlist': _force_datatype_stringlist,
                      'blacklist': _blacklistvars}

default_varshapes3 = {'1d': _level3shape1vars,
                      '2d': _level3shape2vars,
                      '3d': _level3shape3vars,
                      '4d': _level3shape4vars,
                      'datetime': _level3datetimevars,
                      'int32': _force_datatype_int,
                      'float32': _force_datatype_float,
                      'unicodestring': _force_datatype_unicodestring,
                      'stringlist': _force_datatype_stringlist,
                      'blacklist': _blacklistvars}

# %%
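# Sketch of how the varshapes dictionaries above steer concatenation: a custom
# dictionary can be passed to loadlevelN() to handle additional variables.
# 'my_extra_scalar_series' is a hypothetical variable name; path, branch and
# date are illustrative.
def _example_custom_varshapes():
    myshapes = dict(default_varshapes1)  # start from the level1 defaults
    myshapes['1d'] = _level1shape1vars + ['my_extra_scalar_series']
    return loadlevelN('~/icadata', fileprefix='proc', from_day='20150211',
                      branch='level1', varshapes=myshapes)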
def conditional_gc_collect():
    r""" Calls gc.collect() if less than 30% of the system memory is
    available (i.e. memory usage is above 70%). If psutil is not
    installed, gc.collect() is always called.
    """
    if _HAVE_PSUTIL:
        mem = psutil.virtual_memory()
        if mem.percent > 70.0:
            gc.collect()
    else:
        gc.collect()
# %%
def loadmat(file, variable_names=None):
    r""" Loads the variables listed in "variable_names" from a v7 or v7.3
    mat file into a dictionary. If variable_names is not specified,
    everything is loaded::

        filename = 'test.mat'
        m = irfpy.ica.io.loadmat(filename)
        print(m.keys())
    """
    afile = os.path.expanduser(file)

    # check that the requested keys are actually in the file
    fil = None
    try:
        fil = h5py.File(afile, 'r')
        fk = fil.keys()
        existingvariables = [x for x in variable_names if x in fk]
    except Exception:
        # variable_names is None (load everything) or the file is not a
        # v7.3/HDF5 file: fall back to the requested list as given.
        if variable_names is not None:
            existingvariables = list(variable_names)
        else:
            existingvariables = None
    if fil is not None:
        fil.close()

    try:
        mat = hdf5storage.loadmat(afile, variable_names=existingvariables)
    except KeyboardInterrupt:
        mat = {}
        raise
    except Exception:
        warnings.warn("irfpy.ica.io.loadmat: Unable to read data from "
                      + afile, stacklevel=4)
        mat = {}
    return mat
def loadh5(file, variable_names=None):
    r""" Loads the variables listed in "variable_names" from an h5 file
    into a dictionary. If variable_names is not specified, everything is
    loaded. Requires the optional deepdish package.
    """
    afile = os.path.expanduser(file)
    h5 = dict()
    try:
        if variable_names is not None:
            for vv in variable_names:
                try:
                    data = deepdish.io.load(afile, [os.sep + vv])
                    h5[vv] = data[0]
                except ValueError:
                    pass
                conditional_gc_collect()
        else:
            h5 = deepdish.io.load(afile)
    except KeyboardInterrupt:
        raise
    except Exception:
        warnings.warn("irfpy.ica.io.loadh5: Unable to read data from "
                      + afile, stacklevel=4)
    return h5
def savemat(file, data):
    r""" Save the dictionary data in v7.3 mat format to file, analogous
    to scipy.io.savemat::

        a = np.arange(10)
        b = np.arange(33)
        c = np.arange(22) ** 2
        filename = 'test.mat'
        m = dict()
        m['a'] = a
        m['b'] = b
        m['c'] = c
        irfpy.ica.io.savemat(filename, m)
    """
    afile = os.path.expanduser(str(file))
    hdf5storage.savemat(file_name=afile, mdict=data,
                        store_python_metadata=True,
                        matlab_compatible=True,
                        truncate_existing=True,
                        truncate_invalid_matlab=True)
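# Round-trip sketch for savemat()/loadmat(): write a dictionary to a v7.3
# .mat file and read selected variables back. The filename is illustrative.
def _example_mat_roundtrip():
    out = {'a': np.arange(10), 'b': np.arange(33) ** 2}
    savemat('test.mat', out)
    back = loadmat('test.mat', variable_names=['a', 'b'])
    return back.keys()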
def saveh5(file, data):
    r""" Save the dictionary data in hdf5 format using deepdish."""
    afile = os.path.expanduser(str(file))
    deepdish.io.save(afile, data, compression=('blosc', 5))
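# Equivalent round-trip sketch for the hdf5 pair saveh5()/loadh5(). Both need
# the optional deepdish package, so the availability flag set at import time
# is checked first; the filename is illustrative.
def _example_h5_roundtrip():
    if _HAVE_DEEPDISH:
        saveh5('test.h5', {'a': np.arange(10)})
        return loadh5('test.h5', variable_names=['a'])
    return {}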
# ---------------------
def _loaddata(file, variable_names, datafileextension):
    r""" Loads a data dictionary using the selected format."""
    if datafileextension == 'mat':
        data = loadmat(file, variable_names=variable_names)
    else:
        data = loadh5(file, variable_names=variable_names)
    return data
# ---------------------
def minonedim(a):
    r""" Returns an array with at least one dimension based on a."""
    if len(np.shape(a)) >= 1:
        return a  # is already ok
    return np.array([a])  # we need at least one dimension
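# Behaviour sketch for minonedim(): scalars gain one dimension, arrays pass
# through unchanged.
def _example_minonedim():
    assert minonedim(5.0).shape == (1,)           # scalar -> 1-d array
    assert minonedim(np.arange(3)).shape == (3,)  # already at least 1-d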
def flatstring(version):
    r""" Tries hard to make a string out of the argument."""
    return str(np.array(version).squeeze())
def flatstringlist(list_of_lists):
    r""" Tries hard to make a list of strings out of the argument."""
    strlist = list()
    try:
        if isinstance(list_of_lists, np.ndarray):
            # It was a cell array in matlab. Take it apart:
            list_of_lists_sq = np.squeeze(list_of_lists)
            for c in list_of_lists_sq:
                if type(c) == np.str_:
                    strlist.append(str(c))
                else:
                    strlist.append(str(c[0][0]))
        else:
            for c in list_of_lists:
                strlist.append(str(c))
    except Exception:
        strlist.append('Could not decode comment list for some reason :-( ')
    return strlist
# %%
_WANTVERSIONS = dict()
_HAVEVERSIONS = dict()
_VERBOSEVERSION = False
_ALREADYSHOWNVERSIONS = dict()
def expectedversion(proc=None, aux=None, bestc=None, flux=None,
                    lap=None, cops=None, mag=None, mip=None,
                    verbose=True, **kwargs):
    r""" Defines the data versions that the user would like to have.

    If the existing version is different, a warning is printed at the
    first occurrence, explaining the difference between the required
    version and the present version. If no version is specified then all
    versions are allowed.

    All parameters are strings of the form "xx.y" or floating point
    numbers, e.g. proc='2.3' means that version 2.3 is requested for proc.

    The version found in the data files is printed once. If verbose is
    set to False, this is suppressed.

    Usage::

        expectedversion(proc='2.3',lap='0.3')
    """
    global _WANTVERSIONS
    global _HAVEVERSIONS
    global _VERBOSEVERSION
    global _ALREADYSHOWNVERSIONS
    _WANTVERSIONS = dict()
    _HAVEVERSIONS = dict()
    _VERBOSEVERSION = verbose
    _ALREADYSHOWNVERSIONS = dict()
    if proc is not None:
        _WANTVERSIONS['proc'] = str(proc)
    if aux is not None:
        _WANTVERSIONS['aux'] = str(aux)
    if bestc is not None:
        _WANTVERSIONS['bestc'] = str(bestc)
    if flux is not None:
        _WANTVERSIONS['flux'] = str(flux)
    if lap is not None:
        _WANTVERSIONS['lap'] = str(lap)
    if cops is not None:
        _WANTVERSIONS['cops'] = str(cops)
    if mag is not None:
        _WANTVERSIONS['mag'] = str(mag)
    if mip is not None:
        _WANTVERSIONS['mip'] = str(mip)
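# Sketch of a typical session using expectedversion(): declare the versions
# you developed against before loading, so that differing files trigger the
# warning described above. Versions, path and date are illustrative.
def _example_expectedversion():
    expectedversion(proc='2.3', lap='0.3', verbose=True)
    return readproc('~/icadata', '20150211', variables=['time_instances'])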
def _checkdataversion(mat, mattype, versionkey, commentkey):
    r""" Called internally after loading a mat file to check whether the
    version is according to specifications.
    """
    versionstr = 'no version info available'
    if versionkey in mat:
        # remove also whitespace
        versionstr = ''.join(flatstring(mat[versionkey]).split())
    commentlist = list()
    if commentkey in mat:
        commentlist = flatstringlist(mat[commentkey])
    if _VERBOSEVERSION:
        if mattype + versionstr not in _ALREADYSHOWNVERSIONS:
            print(' +++ BEGIN INFO +++++++++++++++++++++++++++++++++++++++++++++++++++++')
            print(' Loading file of type: ' + mattype)
            print(' The version of this file is : ', versionstr)
            _ALREADYSHOWNVERSIONS[mattype + versionstr] = True
            print(' +++ END INFO +++++++++++++++++++++++++++++++++++++++++++++++++++++++')
    if versionkey == '':
        return
    if versionkey not in mat:
        return
    if mattype not in _HAVEVERSIONS:
        if mattype in _WANTVERSIONS:  # check if there is a requirement
            if _WANTVERSIONS[mattype] < versionstr:
                print()
                print('*** BEGIN WARNING ***************************************************')
                print('You expected an earlier version of the ' + mattype + ' data to be present:')
                print('Desired version was : ' + _WANTVERSIONS[mattype])
                print('But the currently loading version is : ' + versionstr)
                print('Please review the differences between the two versions:')
                if len(commentlist) == 0:
                    print('    No version history available to show')
                for c in commentlist:
                    lval = re.sub(r'[^0123456789\.]', '',
                                  flatstring(c).split(':', maxsplit=1)[0])
                    if float(lval) > float(_WANTVERSIONS[mattype]):
                        print('    ' + flatstring(c))
                print('    ---')
                print('*** END WARNING *****************************************************')
                print()
    else:
        if _HAVEVERSIONS[mattype] != versionstr:
            print()
            print('*** BEGIN WARNING ***************************************************')
            print('You are loading different versions of ' + mattype + ' data at the same time:')
            print('Previously loaded version was ' + _HAVEVERSIONS[mattype])
            print('Currently loading version is ' + versionstr)
            print('This may give unpredictable results.')
            print('*** END WARNING *****************************************************')
            print()
    _HAVEVERSIONS[mattype] = versionstr


def _getversionkey(mattype):
    r""" Returns the name of the version key associated with a certain
    mattype, e.g. mattype=='proc' returns 'ica_processed_data_version'.
    """
    if mattype == 'proc':
        return 'ica_processed_data_version'
    return mattype + '_processed_data_version'  # default


def _getcommentkey(mattype):
    r""" Returns the name of the comment key associated with a certain
    mattype, e.g. mattype=='proc' returns 'ica_processed_data_comment'.
    """
    if mattype == 'proc':
        return 'ica_processed_data_comment'
    return mattype + '_processed_data_comment'  # default


# %%
def _filtermatfile(m, filters, partialelevationscan, verbose=False,
                   time_instances='time_instances', varshapes=None):
    r""" Removes time_instances according to the specifications given in
    the filters dictionary. Manages level1, level2 and level3 variables.
    """
    if not filters:
        return m
    if not m:
        return m
    if time_instances not in m:
        if verbose:
            print("    Warning: '" + time_instances + "' was not present "
                  "in the loaded data. Can not apply filters.")
        return m

    # initialize with True
    oktimes = np.ones((m[time_instances].shape[-1],), dtype=bool)
    for filterkey in filters:
        if filterkey not in m:
            if verbose:
                print("    Warning: The filter key '" + filterkey +
                      "' was not present in the loaded data. "
                      "This filter is therefore ignored.")
        else:
            # we have the filterkey loaded; check that it is an applicable
            # key. Remove the matlab extra dimension for 1D variables prior
            # to making filter calculations.
            if filterkey in varshapes['1d']:
                m[filterkey] = minonedim(np.squeeze(m[filterkey]))
            else:
                m[filterkey] = np.squeeze(m[filterkey])
            # apply filter
            if filterkey in varshapes['1d'] + varshapes['2d'] + \
                    varshapes['3d'] + varshapes['4d']:
                oktimes[m[filterkey] != filters[filterkey]] = False  # deselect
                if verbose > 1:
                    print('    filtering using ' + filterkey)
            else:
                if verbose:
                    print("    Warning: filter key '" + filterkey +
                          "' is not a valid filter, it was therefore ignored.")

    if not partialelevationscan:
        # make sure that if an elevation scan contains an error, the whole
        # elevation scan is removed.
        for block in range(len(oktimes) // 16):
            # walk through the error mask in steps of 16
            if not np.all(oktimes[block * 16:(block + 1) * 16]):
                oktimes[block * 16:(block + 1) * 16] = False

    for key in m.keys():
        # prior to filtering, remove the matlab extra dimension for 1D
        # variables.
        if key in varshapes['1d']:
            m[key] = minonedim(np.squeeze(m[key]))
        # now filter:
        theshape = np.shape(m[key])
        if len(theshape) == 1:    # 1D variables containing time series
            if key in varshapes['1d']:
                m[key] = m[key][oktimes]
        elif len(theshape) == 2:  # 2D variables
            if key in varshapes['2d']:
                m[key] = m[key][:, oktimes]
        elif len(theshape) == 3:  # 3D variables
            if key in varshapes['3d']:
                m[key] = m[key][:, :, oktimes]
        elif len(theshape) == 4:  # 4D variables
            if key in varshapes['4d']:
                m[key] = m[key][:, :, :, oktimes]
    conditional_gc_collect()
    return m


# %% Generic load function
def loadlevelN(datarootpath, fileprefix='bestc', from_day=None, to_day='',
               variables=None, verbose=False, flat=False,
               partialelevationscan=False, dataformat='mat', branch=None,
               time_instances='time_instances', filters=None,
               varshapes=None):
    r""" Loads one day or more worth of pipeline data, analogous to
    scipy.io.loadmat('xyz.mat') but assembling all hourly files
    corresponding to that day.

    PARAMETERS

    datarootpath (string): local path to the 'root' of the data tree.
        This data tree should contain the file(s) in subdirectories
        yyyy/mm/dd/ as it is done in the pipeline.

    fileprefix (string): Data files start with this prefix, e.g. 'bestc'.

    from_day (string or datetime.datetime): string is of the form
        '20150115T1011'

    to_day (string or datetime.datetime): optional, string is of the form
        '20150116T1112'. If given, data to the end of the given interval
        is loaded, otherwise one day.

    variables (list of strings): a list of variable names to load.
        Default is to load all variables. If the loaded variables include
        'time_instances' then these are automatically converted to python
        datetime objects.

    flat (boolean): If set to True and no files were found, the load
        function will also attempt to load files located at the root of
        the data tree (without the yyyy/mm/dd/ subdirectories).

    partialelevationscan (boolean): Optional parameter, default False.
        If set to False the loading function will only load complete
        elevation scans. If set to True, the nearest time_instances will
        be loaded regardless of where an elevation scan starts.

    verbose (boolean): Optional parameter, if set to True the loading
        function will print the names of the files it loads for debugging
        purposes.

    dataformat (string): One of 'mat' (default) or 'h5'.

    time_instances (str): Name of the time_instances variable
        (default: 'time_instances').

    branch (str): Directory below datarootpath where the data is stored,
        e.g. 'level1'.

    filters (dict): Filter dictionary.

    varshapes (dict): data structure describing the dimensionality and the
        special data types of the variables to be read. Use the predefined
        defaults, e.g. default_varshapes1 for level1 variable shapes.

    OUTPUT

    Returns a dictionary with all loaded variables. The dictionary is
    empty if no matching data could be loaded.

    Use like this (mirroring what readbestc() does internally)::

        matfile=irfpy.ica.io.loadlevelN(icapath,'bestc','20150211',
                                        branch='level2',
                                        varshapes=default_varshapes2)
        matfile=irfpy.ica.io.loadlevelN(icapath,'bestc','20150211','20150213',
                                        variables=['time_instances','E'],
                                        branch='level2',
                                        varshapes=default_varshapes2)
    """
    # do some sanity checking
    if variables is None:
        variables = []
    if filters is None:
        filters = {}
    if not isinstance(datarootpath, str):
        raise ValueError(
            "irfpy.ica.io.loadlevelN: The parameter datarootpath must be of <class 'str'> "
            + "(a string) but is now " + str(type(datarootpath)))
    if not isinstance(fileprefix, str):
        raise ValueError(
            "irfpy.ica.io.loadlevelN: The parameter fileprefix must be of <class 'str'> "
            + "(a string) but is now " + str(type(fileprefix)))
    if not isinstance(from_day, (str, dt.datetime)):
        raise ValueError(
            "irfpy.ica.io.loadlevelN: The parameter from_day must be of <class 'str'> "
            + "(a string) or of <class 'datetime.datetime'> but is now "
            + str(type(from_day)))
    if not isinstance(to_day, (str, dt.datetime)):
        raise ValueError(
            "irfpy.ica.io.loadlevelN: The parameter to_day must be of <class 'str'> "
            + "(a string) or of <class 'datetime.datetime'> but is now "
            + str(type(to_day)))
    if not isinstance(variables, (list, np.ndarray)):
        raise ValueError(
            "irfpy.ica.io.loadlevelN: The parameter variables must be of <class 'list'> "
            + "(a list) or <class 'numpy.ndarray'> (a numpy array) but is now "
            + str(type(variables)))
    if varshapes is None:
        raise ValueError(
            "irfpy.ica.io.loadlevelN: The parameter varshapes must be of <class 'dict'> "
            + "(a dictionary) but is now None. Add a varshapes=default_varshapes1, ..2 or ..3 parameter")

    def getvarshapes(varshapes, akey, default=None):
        levelNshape = default
        if akey in varshapes:
            levelNshape = varshapes[akey]
        if not isinstance(levelNshape, list):
            raise ValueError(
                "irfpy.ica.io.loadlevelN: The value of varshapes['{:s}'] must be of <class 'list'> (a list).".format(akey))
        return levelNshape

    levelNshape1vars = getvarshapes(varshapes, '1d')
    levelNshape2vars = getvarshapes(varshapes, '2d')
    levelNshape3vars = getvarshapes(varshapes, '3d')
    levelNshape4vars = getvarshapes(varshapes, '4d')
    levelNdatetimevars = getvarshapes(varshapes, 'datetime')
    levelNint32 = getvarshapes(varshapes, 'int32')
    levelNfloat32 = getvarshapes(varshapes, 'float32')
    levelNstringlist = getvarshapes(varshapes, 'stringlist', default=list())
    levelNunicodestring = getvarshapes(varshapes, 'unicodestring', default=list())
    levelNblacklist = getvarshapes(varshapes, 'blacklist', default=_blacklistvars)

    if branch is None:
        raise ValueError(
            "irfpy.ica.io.loadlevelN: The parameter branch must be of <class 'str'> "
            + "(a string) but is now None.")

    if isinstance(from_day, dt.datetime):
        from_day = icatools.datetime2string(from_day)
    if isinstance(to_day, dt.datetime):
        to_day = icatools.datetime2string(to_day)

    datarootpath = os.path.expanduser(datarootpath)
    if datarootpath[-1] != os.sep:
        tp = datarootpath + os.sep
    else:
        tp = datarootpath
    checkdirectory(datarootpath)
    subdir, extension = icatools.get_data_path_info(branch, dataformat)
    tp = tp + subdir

    mat = defaultdict(list)  # a dictionary with lists as elements

    # If year, month or day of from_day and to_day are identical,
    # limit the search pattern for glob accordingly.
    myyear = '*'
    if len(from_day) >= 4:
        if len(to_day) == 0:
            myyear = from_day[0:4]
        elif from_day[0:4] == to_day[0:4]:
            myyear = from_day[0:4]
    mymonth = '*'
    if len(from_day) >= 6:
        if len(to_day) == 0:
            mymonth = from_day[4:6]
        elif from_day[0:6] == to_day[0:6]:
            mymonth = from_day[4:6]
    myday = '*'
    if len(from_day) >= 8:
        if len(to_day) == 0:
            myday = from_day[6:8]
        elif from_day[0:8] == to_day[0:8]:
            myday = from_day[6:8]

    if to_day == '':  # handle the default of only one day
        files = np.array(glob.glob(tp + myyear + os.sep + mymonth + os.sep +
                                   myday + os.sep + fileprefix +
                                   from_day[0:11] + '*' + extension))
        if len(files) == 0:  # nothing found: assume a flat directory tree
            if flat:
                files = np.array(glob.glob(tp + fileprefix +
                                           from_day[0:11] + '*' + extension))
            if len(files) == 0:  # still nothing found
                return {}
        files.sort()
        npfiles = np.array(files)
        usethese = np.array([npfiles > ''])
    else:  # several days: get all candidates
        files = np.array(glob.glob(tp + myyear + os.sep + mymonth + os.sep +
                                   myday + os.sep + fileprefix + '*' + extension))
        if len(files) == 0:  # nothing found: assume a flat directory tree
            if flat:
                files = np.array(glob.glob(tp + fileprefix + from_day +
                                           '*' + extension))
            if len(files) == 0:  # still nothing found
                return {}
        files.sort()
        npfiles = np.array(files)
        filesshort = np.array([os.path.split(f)[1] for f in npfiles])
        # no extension on fromfil here because of the sorting order
        fromfil = fileprefix + from_day[0:8].ljust(8, '0') + 'T' + \
            from_day[9:11].ljust(6, '0')
        tofil = fileprefix + to_day[0:8].ljust(8, '9') + 'T' + \
            to_day[9:11].ljust(6, '9') + '.' + extension
        usethese = np.logical_and([filesshort >= fromfil],
                                  [filesshort <= tofil])

    versionkey = _getversionkey(fileprefix)
    commentkey = _getcommentkey(fileprefix)

    # usethese may have shape (1, n) but we want (n,). np.squeeze() is not
    # used because a (1, 1) shape would then give a scalar.
    if len(usethese.shape) == 2:
        usethese = usethese[0, :]

    if variables == []:
        variablelist = None
    else:
        variablelist = variables.copy()
        if time_instances not in variablelist:
            variablelist.append(time_instances)
        if len(versionkey) > 0 and versionkey not in variablelist:
            variablelist.append(versionkey)
        if len(commentkey) > 0 and commentkey not in variablelist:
            variablelist.append(commentkey)
        # make sure the required filter values are loaded
        for key in filters.keys():
            if key not in variablelist:
                variablelist.append(key)
        if 'sum_orig_ions' in variablelist:
            if verbose:
                print('    Warning: "sum_orig_ions" is calculated on an '
                      'hourly basis and not according to the "from_day" and '
                      '"to_day" parameters of loadlevel1() or readproc()')

    if time_instances != 'time_instances':
        if verbose:
            print('    Using the variable "' + time_instances +
                  '" to select time intervals')

    for file in npfiles[usethese]:
        if verbose:
            print('    loading: ' + file)
        matfile = _loaddata(file, variable_names=variablelist,
                            datafileextension=extension)
        # Check whether the data version is acceptable
        _checkdataversion(matfile, fileprefix, versionkey, commentkey)
        # Remove values that do not match the filters dictionary. Done here
        # to avoid handling large amounts of unwanted data.
        matfile = _filtermatfile(matfile, filters, partialelevationscan,
                                 verbose=verbose,
                                 time_instances=time_instances,
                                 varshapes=varshapes)
        for key in matfile:
            if '__' in key:
                continue
            if key in levelNblacklist:  # the black list
                continue
            if (variables == []) or (key in variablelist):
                if key not in mat:
                    if verbose:
                        print('    adding key: ' + key)
                    # a new key not seen before
                    if np.shape(matfile[key]) == ():
                        mat[key].append(matfile[key])  # scalar
                    else:
                        # handle 2D vars from matlab that are actually 1D
                        if key in levelNshape1vars:
                            mat[key].append(minonedim(np.squeeze(matfile[key])))
                        else:
                            mat[key].append(np.squeeze(matfile[key]))
                else:
                    # something we already have
                    theshape = np.shape(mat[key][0])
                    if len(theshape) == 1:
                        # 1D variables that need to be concatenated
                        if key in levelNshape1vars:
                            mat[key].append(minonedim(np.squeeze(matfile[key])))
                    elif len(theshape) == 2:
                        # 2D variables that need to be concatenated
                        if key in levelNshape2vars:
                            mat[key].append(matfile[key])
                        elif key in ['sum_orig_ions']:
                            # this one needs to be summed up
                            mat[key][0] = np.add(mat[key][0], matfile[key])
                    elif len(theshape) == 3:
                        # 3D variables that need to be concatenated
                        if key in levelNshape3vars:
                            mat[key].append(matfile[key])
                    elif len(theshape) == 4:
                        # 4D variables that need to be concatenated
                        if key in levelNshape4vars:
                            mat[key].append(matfile[key])
                # force data types where needed. Do this before concatenating
                # to minimize the memory footprint.
                if key in levelNdatetimevars:
                    if mat[key][-1].dtype != 'O':  # object
                        mat[key][-1] = icatools.matlab2datetime(mat[key][-1])
                if key in levelNfloat32:
                    if mat[key][-1].dtype != 'float32':
                        mat[key][-1] = mat[key][-1].astype('float32')
                if key in levelNint32:
                    if mat[key][-1].dtype != 'int32':
                        mat[key][-1] = mat[key][-1].astype('int32')
                if key in levelNstringlist:
                    # this is mainly for the comment field
                    mat[key][-1] = flatstringlist(mat[key][-1])
                elif key.endswith('processed_data_comment'):
                    mat[key][-1] = flatstringlist(mat[key][-1])
                if key in levelNunicodestring:
                    # this is mainly for the processing_level field
                    mat[key][-1] = mat[key][-1].decode()
        conditional_gc_collect()

    # generate masks for the first and the last file to make an exact match
    # of the requested time interval
    tmaskfirst = []
    tmasklast = []
    if partialelevationscan:
        boundary = 1
    else:
        boundary = 16  # default is to load only full elevation scans
    if time_instances in mat.keys():
        # modify the first and the last element of mat[key] to fit the
        # exact time interval
        tmaskfirst = icatools.selecttime(
            mat[time_instances][0], from_day, to_day, mod_boundary=boundary)
        if len(mat[time_instances][:]) > 1:
            tmasklast = icatools.selecttime(
                mat[time_instances][-1], from_day, to_day,
                mod_boundary=boundary)

    allmat = {}  # the final result
    # concatenate the lists
    for key in mat.keys():
        # take the shape from the first element
        theshape = np.shape(mat[key][0])
        if len(theshape) == 1:
            # 1D variables that need to be concatenated
            if key in levelNshape1vars:
                if len(tmaskfirst) > 0:
                    mat[key][0] = mat[key][0][tmaskfirst]
                if len(tmasklast) > 0:
                    mat[key][-1] = mat[key][-1][tmasklast]
                allmat[key] = np.concatenate(mat[key], axis=0)
                continue
        elif len(theshape) == 2:
            # 2D variables that need to be concatenated
            if key in levelNshape2vars:
                if len(tmaskfirst) > 0:
                    mat[key][0] = mat[key][0][:, tmaskfirst]
                if len(tmasklast) > 0:
                    mat[key][-1] = mat[key][-1][:, tmasklast]
                allmat[key] = np.concatenate(mat[key], axis=1)
                continue
        elif len(theshape) == 3:
            # 3D variables that need to be concatenated
            if key in levelNshape3vars:
                if len(tmaskfirst) > 0:
                    mat[key][0] = mat[key][0][:, :, tmaskfirst]
                if len(tmasklast) > 0:
                    mat[key][-1] = mat[key][-1][:, :, tmasklast]
                allmat[key] = np.concatenate(mat[key], axis=2)
                continue
        elif len(theshape) == 4:
            # 4D variables that need to be concatenated
            if key in levelNshape4vars:
                if len(tmaskfirst) > 0:
                    mat[key][0] = mat[key][0][:, :, :, tmaskfirst]
                if len(tmasklast) > 0:
                    mat[key][-1] = mat[key][-1][:, :, :, tmasklast]
                allmat[key] = np.concatenate(mat[key], axis=3)
                continue
        # if no match with anything, just copy the first element
        allmat[key] = mat[key][0]
    conditional_gc_collect()
    return allmat
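# Direct use of the generic loader, mirroring what readbestc() does
# internally; branch and varshapes must always be supplied. Path and dates
# are illustrative.
def _example_loadlevelN_direct():
    return loadlevelN('~/icadata', fileprefix='bestc',
                      from_day='20150211', to_day='20150213',
                      variables=['time_instances', 'best_ionspectra'],
                      branch='level2', varshapes=default_varshapes2)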
# %% Loadlevel2 backward compatibility
def loadlevel2(datarootpath, mattype='bestc', from_day=None, to_day='',
               variables=None, verbose=False, flat=False,
               partialelevationscan=False, dataformat='mat', filters=None):
    r""" Loads one day or more worth of level2 data, analogous to
    scipy.io.loadmat('xyz.mat') but assembling all hourly files
    corresponding to that day.

    PARAMETERS

    datarootpath (string): local path to the 'root' of the level2 data
        tree. This data tree should contain the file(s) in subdirectories
        yyyy/mm/dd/ as it is done in the pipeline.

    mattype (string): one of 'bestc'

    from_day (string or datetime.datetime): string is of the form
        '20150115T1011'

    to_day (string or datetime.datetime): optional, string is of the form
        '20150116T1112'. If given, data to the end of the given interval
        is loaded, otherwise one day.

    variables (list of strings): a list of variable names to load.
        Default is to load all variables. If the loaded variables include
        'time_instances' then these are automatically converted to python
        datetime objects.

    flat (boolean): If set to True and no files were found, the load
        function will also attempt to load files located at the root of
        the data tree (without the yyyy/mm/dd/ subdirectories).

    partialelevationscan (boolean): Optional parameter, default False.
        If set to False the loading function will only load complete
        elevation scans. If set to True, the nearest time_instances will
        be loaded regardless of where an elevation scan starts.

    verbose (boolean): Optional parameter, if set to True the loading
        function will print the names of the files it loads for debugging
        purposes.

    dataformat (string): One of 'mat' (default) or 'h5'.

    OUTPUT

    Returns a dictionary with all loaded variables. The dictionary is
    empty if no matching data could be loaded.

    Use like this::

        matfile=irfpy.ica.io.loadlevel2(icapath,'bestc','20150211')
        matfile=irfpy.ica.io.loadlevel2(icapath,'bestc','20150211',
                                        variables=['time_instances','E'])
        matfile=irfpy.ica.io.loadlevel2(icapath,'bestc','20150211','20150213',
                                        variables=['time_instances','E'])
    """
    return loadlevelN(datarootpath, fileprefix=mattype, from_day=from_day,
                      to_day=to_day, variables=variables, verbose=verbose,
                      flat=flat, partialelevationscan=partialelevationscan,
                      dataformat=dataformat, time_instances='time_instances',
                      branch='level2', filters=filters,
                      varshapes=default_varshapes2)
# %% loadlevel1 for backward compatibility
def loadlevel1(datarootpath, mattype='proc', from_day=None, to_day='',
               variables=None, verbose=False, flat=False,
               partialelevationscan=False, dataformat='mat', filters=None):
    r""" Loads one day or more worth of level1 data, analogous to
    scipy.io.loadmat('xyz.mat') but assembling all hourly files
    corresponding to that day.

    Consider using one of the readproc(), readaux(), read...() functions
    instead.

    PARAMETERS

    datarootpath (string): local path to the 'root' of the level1 data
        tree. This data tree should contain the file(s) in subdirectories
        yyyy/mm/dd/ as it is done in the pipeline.

    mattype (string): one of 'proc', 'special' or 'aux'

    from_day (string or datetime.datetime): string is of the form
        '20150115T1011'

    to_day (string or datetime.datetime): optional, string is of the form
        '20150116T1112'. If given, data to the end of the given interval
        is loaded, otherwise one day.

    variables (list of strings): a list of variable names to load.
        Default is to load all variables. If the loaded variables include
        'time_instances' then these are automatically converted to python
        datetime objects.

    flat (boolean): If set to True and no files were found, the load
        function will also attempt to load files located at the root of
        the data tree (without the yyyy/mm/dd/ subdirectories).

    partialelevationscan (boolean): Optional parameter, default False.
        If set to False the loading function will only load complete
        elevation scans. If set to True, the nearest time_instances will
        be loaded regardless of where an elevation scan starts.

    verbose (boolean): Optional parameter, if set to True the loading
        function will print the names of the files it loads for debugging
        purposes.

    dataformat (string): One of 'mat' (default) or 'h5'.

    OUTPUT

    Returns a dictionary with all loaded variables. The dictionary is
    empty if no matching data could be loaded.

    Use like this::

        icapath = '/home/user/icadata'
        matfile=irfpy.ica.io.loadlevel1(icapath,'proc','20150211')
        matfile=irfpy.ica.io.loadlevel1(icapath,'proc','20150211',
                                        variables=['time_instances','E'])
        matfile=irfpy.ica.io.loadlevel1(icapath,'proc','20150211','20150213',
                                        variables=['time_instances','E'])
        matfile=irfpy.ica.io.loadlevel1(icapath,'aux',theinterval,
                                        variables=['sp_cso','time_instances'])
    """
    return loadlevelN(datarootpath, fileprefix=mattype, from_day=from_day,
                      to_day=to_day, variables=variables, verbose=verbose,
                      flat=flat, partialelevationscan=partialelevationscan,
                      dataformat=dataformat, time_instances='time_instances',
                      branch='level1', filters=filters,
                      varshapes=default_varshapes1)
[docs]def loadlevel1oktime(datarootpath, mattype, from_day, to_day='',
                     variables=None, verbose=False, flat=False,
                     partialelevationscan=False, dataformat='mat',
                     filters=None):
    r"""
    Loads one day or more worth of level1 data analog to
    irfpy.ica.io.loadmat('xyz.mat') but with assembling all hourly files
    corresponding to that day. Only data with times where
    time_error_flag == 0 is loaded.

    Consider using one of the readproc(), readaux(), read...() functions
    instead.

    PARAMETERS

    datarootpath (string): local path to the 'root' of the level1 data tree
        This data tree should contain the file(s) in subdirectories
        yyyy/mm/dd/ as it is done in the pipeline.

    mattype (string): filetype to be loaded, one of 'proc' or 'aux'

    from_day (string or datetime.datetime): string is of the form '20150115T1011'

    to_day (string or datetime.datetime): optional, string is of the form
        '20150116T1112'. If given, data to the end of the given interval is
        loaded, otherwise one day.

    variables (list of strings): a list of variable names to load. Default is
        to load all variables. If the loaded variables include
        'time_instances' then these are automatically converted to python
        datetime objects.

    flat (boolean): If flat is set to True, the load function will, if no
        files were found, also attempt to load files which are at the root
        of the data tree (without the yyyy/mm/dd/ subdirectories).

    partialelevationscan (boolean): Optional parameter, default False.
        If set to False the loading function will only load complete
        elevation scans. If set to True, the nearest time_instances will be
        loaded independent of where an elevation scan starts.

    verbose (boolean): Prints what the function does.

    dataformat (string): One of: 'mat' (default) or 'h5'.

    This function is identical to calling loadlevel1 with a filters
    parameter::

        loadlevel1(datarootpath, ..., filters={'time_error_flag': 0})

    OUTPUT

    Returns a dictionary with all loaded variables.

    Limitation: If this function is used to load an aux file, the aux file
    must be version 1.9 or later. This function can not be used to load
    special files.
    """
    if variables is None:
        variables = []
    if filters is None:
        filters = {}
    filters['time_error_flag'] = 0
    m = loadlevelN(datarootpath, fileprefix=mattype,
                   from_day=from_day, to_day=to_day,
                   variables=variables, verbose=verbose, flat=flat,
                   partialelevationscan=partialelevationscan,
                   dataformat=dataformat,
                   time_instances='time_instances',
                   branch='level1', filters=filters,
                   varshapes=default_varshapes1)
    if 'time_error_flag' not in m:
        if verbose:
            print('irfpy.ica.io.loadlevel1oktime() :')
            print('   This function requires access to the time_error_flag which ' +
                  'should be in both aux and proc.')
            print('   Also verify that the dataformat is correct and you have the ' +
                  'corresponding data files.')
            print('   Valid datatypes are "mat" and "h5". You are trying to load "' +
                  dataformat + '"-files.')
        return {}
    conditional_gc_collect()
    return m
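# %% The docstring above states that loadlevel1oktime() is identical to
# loadlevel1() with filters={'time_error_flag': 0}. A minimal sketch of
# that equivalence (the data path is an assumption for the example):
def _example_oktime_equivalence(icapath='~/icadata'):
    """Both calls should return the same error-free time instances."""
    a = loadlevel1oktime(icapath, 'proc', '20150211',
                         variables=['time_instances'])
    b = loadlevel1(icapath, 'proc', '20150211',
                   variables=['time_instances'],
                   filters={'time_error_flag': 0})
    return a, b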
[docs]def loadlevel1okall(datarootpath, mattype='proc', from_day=None, to_day='',
                    variables=None, verbose=False, flat=False,
                    partialelevationscan=False, dataformat='mat',
                    filters=None):
    r"""
    Loads one day or more worth of level1 data analog to
    irfpy.ica.io.loadmat('xyz.mat') but with assembling all hourly files
    corresponding to that day. Only data with times where error_flags == 0
    is loaded.

    Consider using one of the readproc(), readaux(), read...() functions
    instead.

    PARAMETERS

    datarootpath (string): local path to the 'root' of the level1 data tree
        This data tree should contain the file(s) in subdirectories
        yyyy/mm/dd/ as it is done in the pipeline.

    mattype (string): filetype to be loaded, one of 'proc' or 'aux'

    from_day (string or datetime.datetime): string is of the form '20150115T1011'

    to_day (string or datetime.datetime): optional, string is of the form
        '20150116T1112'. If given, data to the end of the given interval is
        loaded, otherwise one day.

    variables (list of strings): a list of variable names to load. Default is
        to load all variables. If the loaded variables include
        'time_instances' then these are automatically converted to python
        datetime objects.

    flat (boolean): If flat is set to True, the load function will, if no
        files were found, also attempt to load files which are at the root
        of the data tree (without the yyyy/mm/dd/ subdirectories).

    partialelevationscan (boolean): Optional parameter, default False.
        If set to False the loading function will only load complete
        elevation scans. If set to True, the nearest time_instances will be
        loaded independent of where an elevation scan starts.

    verbose (boolean): Prints what the function does.

    dataformat (string): One of: 'mat' (default) or 'h5'.

    This function is identical to calling loadlevel1 with a filters
    parameter::

        loadlevel1(datarootpath, ..., filters={'error_flags': 0})

    OUTPUT

    Returns a dictionary with all loaded variables.
    """
    if variables is None:
        variables = []
    if filters is None:
        filters = {}
    filters['error_flags'] = 0
    m = loadlevelN(datarootpath, fileprefix=mattype,
                   from_day=from_day, to_day=to_day,
                   variables=variables, verbose=verbose, flat=flat,
                   partialelevationscan=partialelevationscan,
                   dataformat=dataformat,
                   time_instances='time_instances',
                   branch='level1', filters=filters,
                   varshapes=default_varshapes1)
    if 'error_flags' not in m:
        if verbose:
            print('irfpy.ica.io.loadlevel1okall() : error_flags not found while loading ' +
                  mattype)
            print('   This function requires access to error_flags,')
            print('   which should be in the same file that is being loaded.')
            print('   Also verify that the dataformat is correct and you have')
            print('   the corresponding data files.')
            print('   Valid datatypes are "mat" and "h5". You are trying to load "' +
                  dataformat + '"-files.')
        return {}
    conditional_gc_collect()
    return m
[docs]def loadlevel1filter(datarootpath, mattype='proc', from_day=None, to_day='',
                     variables=None, verbose=False, flat=False,
                     partialelevationscan=False, dataformat='mat',
                     time_error_flag=None, decoder_error_flag=None,
                     fmt_error_flag=None, edf_error_flag=None,
                     error_flags=None, mode=None, sw_version=None,
                     cur_pacc=None):
    r"""
    Loads one day or more worth of level1 data analog to
    irfpy.ica.io.loadmat('xyz.mat') but with assembling all hourly files
    corresponding to that day. Only data with times where the filter
    criteria apply are loaded.

    PARAMETERS

    datarootpath (string): local path to the 'root' of the level1 data tree
        This data tree should contain the file(s) in subdirectories
        yyyy/mm/dd/ as it is done in the pipeline.

    mattype (string): filetype to be loaded, one of 'proc' or 'aux'

    from_day (string or datetime.datetime): string is of the form '20150115T1011'

    to_day (string or datetime.datetime): optional, string is of the form
        '20150116T1112'. If given, data to the end of the given interval is
        loaded, otherwise one day.

    variables (list of strings): a list of variable names to load. Default is
        to load all variables. If the loaded variables include
        'time_instances' then these are automatically converted to python
        datetime objects.

    The following optional parameters select the data loaded; only time
    instances where the variable equals the given value are loaded (the
    values shown are typical examples):

        time_error_flag (int) : 0
        decoder_error_flag (int) : 0
        fmt_error_flag (int) : 0
        edf_error_flag (int) : 0
        error_flags (int) : 0
        mode (int) : 0
        sw_version (int) : 6
        cur_pacc (int) : 6

    flat (boolean): If flat is set to True, the load function will, if no
        files were found, also attempt to load files which are at the root
        of the data tree (without the yyyy/mm/dd/ subdirectories).

    partialelevationscan (boolean): Optional parameter, default False.
        If set to False the loading function will only load complete
        elevation scans. If set to True, the nearest time_instances will be
        loaded independent of where an elevation scan starts.

    verbose (boolean): Prints what the function does.

    dataformat (string): One of: 'mat' (default) or 'h5'.

    OUTPUT

    Returns a dictionary with all loaded variables.
    """
    if variables is None:
        variables = []

    filters = {}  # create a filter list
    if error_flags is not None:
        filters['error_flags'] = error_flags
    if time_error_flag is not None:
        filters['time_error_flag'] = time_error_flag
    if decoder_error_flag is not None:
        filters['decoder_error_flag'] = decoder_error_flag
    if edf_error_flag is not None:
        filters['edf_error_flag'] = edf_error_flag
    if fmt_error_flag is not None:
        filters['fmt_error_flag'] = fmt_error_flag
    if mode is not None:
        filters['mode'] = mode
    if sw_version is not None:
        filters['sw_version'] = sw_version
    if cur_pacc is not None:
        filters['cur_pacc'] = cur_pacc

    m = loadlevelN(datarootpath, fileprefix=mattype,
                   from_day=from_day, to_day=to_day,
                   variables=variables, verbose=verbose, flat=flat,
                   partialelevationscan=partialelevationscan,
                   dataformat=dataformat,
                   time_instances='time_instances',
                   branch='level1', filters=filters,
                   varshapes=default_varshapes1)
    conditional_gc_collect()
    return m
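# %% Usage sketch: loadlevel1filter() builds the filter dict from its
# keyword arguments, so several criteria can be combined; only time
# instances matching all of them are loaded. The data path and the chosen
# values are assumptions for the example (the docstring lists typical values):
def _example_loadlevel1filter(icapath='~/icadata'):
    """Load proc data restricted to error-free times from software version 6."""
    return loadlevel1filter(icapath, 'proc', '20150211',
                            variables=['time_instances', 'E'],
                            time_error_flag=0, error_flags=0,
                            sw_version=6)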
# %% Level 0 is still treated specially.
[docs]def loadlevel0(datarootpath, from_day, to_day='', variables=None,
               verbose=False, flat=False, dataformat='mat', **kwargs):
    r"""
    Loads one hour or more worth of level0 data analog to
    scipy.io.loadmat('xyz.mat') but with assembling all hourly files
    corresponding to the time interval specified.

    Consider using the readraw() function instead.

    AUTHOR: Gabriella Stenberg Wieser

    PARAMETERS

    datarootpath (string): local path to the 'root' of the level0 data tree
        This data tree should contain the file(s) in subdirectories
        yyyy/mm/dd/ as it is done in the pipeline.

    from_day (string or datetime.datetime): string is of the form '20150115T1011'

    to_day (string or datetime.datetime): optional, string is of the form
        '20150116T1112'. If given, data to the end of the given interval is
        loaded, otherwise one day.

    variables (list of strings): a list of variable names to load. Default is
        to load all variables. If the loaded variables include
        'time_instances' then these are automatically converted to python
        datetime objects.

    flat (boolean): If flat is set to True, the load function will, if no
        files were found, also attempt to load files which are at the root
        of the data tree (without the yyyy/mm/dd/ subdirectories).

    dataformat (string): One of: 'mat' (default) or 'h5'.

    OUTPUT

    returns a dictionary with all loaded variables

    EXAMPLES::

        matfile=ica.io.loadlevel0(icapath,'20150211')
        matfile=ica.io.loadlevel0(icapath,'20150211',
                                  variables=['iontime','ionspectra'])
        matfile=ica.io.loadlevel0(icapath,'20150211','20150213',
                                  variables=['time_interval','E'])
    """
    # do some sanity checking
    if variables is None:
        variables = []
    if not isinstance(datarootpath, str):
        raise ValueError(
            "irfpy.ica.io.loadlevel0: The parameter datarootpath must be of " +
            "<class 'str'> (a string) but is now " + str(type(datarootpath)))
    if not isinstance(from_day, (str, dt.datetime)):
        raise ValueError(
            "irfpy.ica.io.loadlevel0: The parameter from_day must be of " +
            "<class 'str'> (a string) or of <class 'datetime.datetime'> but is now " +
            str(type(from_day)))
    if not isinstance(to_day, (str, dt.datetime)):
        raise ValueError(
            "irfpy.ica.io.loadlevel0: The parameter to_day must be of " +
            "<class 'str'> (a string) or of <class 'datetime.datetime'> but is now " +
            str(type(to_day)))
    if not isinstance(variables, (list, np.ndarray)):
        raise ValueError(
            "irfpy.ica.io.loadlevel0: The parameter variables must be of " +
            "<class 'list'> (a list) or <class 'numpy.ndarray'> (a numpy array) " +
            "but is now " + str(type(variables)))

    if isinstance(from_day, dt.datetime):
        from_day = icatools.datetime2string(from_day)
    if isinstance(to_day, dt.datetime):
        to_day = icatools.datetime2string(to_day)

    if datarootpath[-1] != os.sep:
        tp = datarootpath + os.sep
    else:
        tp = datarootpath

    checkdirectory(datarootpath)

    # level0 has its own subdirectory and file extension in the data tree
    subdir, extension = icatools.get_data_path_info('level0', dataformat)
    tp = tp + subdir

    mat = defaultdict(list)  # a dictionary with lists as elements
    mattype = 'RPC_ICA_'

    if to_day == '':
        # handle default: only one day
        files = np.array(glob.glob(tp + '*' + os.sep + '*' + os.sep + '*' + os.sep +
                                   mattype + from_day[0:11] + '*' + extension))
        if len(files) == 0:
            # nothing found; assume a flat tree without subdirectories
            if flat:
                files = np.array(glob.glob(tp + mattype + from_day[0:11] +
                                           '*' + extension))
            if len(files) == 0:
                return {}
        files.sort()
        npfiles = np.array(files)
        usethese = np.array([npfiles > ''])
    else:
        # several days: get all candidates
        files = np.array(glob.glob(tp + '*' + os.sep + '*' + os.sep + '*' + os.sep +
                                   mattype + '*' + extension))
        if len(files) == 0:
            # nothing found; assume a flat tree without subdirectories
            # (glob all files here; the time window below selects the interval)
            if flat:
                files = np.array(glob.glob(tp + mattype + '*' + extension))
            if len(files) == 0:
                return {}
        files.sort()
        npfiles = np.array(files)
        filesshort = np.array([os.path.split(f)[1] for f in npfiles])
        # no extension on fromfil because of the sorting order
        fromfil = mattype + from_day[0:8].ljust(8, '0') + 'T' + \
            from_day[9:15].ljust(6, '0')
        tofil = mattype + to_day[0:8].ljust(8, '9') + 'T' + \
            to_day[9:15].ljust(6, '9') + '.' + extension
        usethese = np.logical_and([filesshort >= fromfil],
                                  [filesshort <= tofil])

    # There is a possibility that usethese has shape (1,n) but we want (n,).
    # np.squeeze() must not be used because if the shape is (1,1) then
    # the result would be a scalar.
    if len(usethese.shape) == 2:
        usethese = usethese[0, :]

    if variables == []:
        variablelist = None
    else:
        variablelist = variables.copy()

    for file in npfiles[usethese]:
        if verbose:
            print('   loading: ' + file)
        matfile = _loaddata(file, variable_names=variablelist,
                            datafileextension=extension)
        for key in matfile:
            if '__' in key:
                continue
            if key in _blacklistvars:  # the black list
                continue
            if (variables == []) or (key in variablelist):
                if key not in mat:
                    # a new key not seen before
                    if np.shape(matfile[key]) == ():
                        mat[key].append(matfile[key])  # scalar
                    else:
                        mat[key].append(np.squeeze(matfile[key]))
                else:
                    # something we already have
                    theshape = np.shape(mat[key][0])
                    if len(theshape) == 1:
                        # 1D variables that need to be concatenated
                        if key in _level0shape1vars:
                            mat[key].append(np.squeeze(matfile[key], axis=0))
                    elif len(theshape) == 2:
                        # 2D variables that need to be concatenated
                        if key in _level0shape2vars:
                            mat[key].append(matfile[key])
                    continue

    allmat = {}  # the final result
    # concatenate the lists
    for key in mat.keys():
        theshape = np.shape(mat[key][0])  # take the shape from the first element
        if len(theshape) == 1:
            # 1D variables are concatenated along axis 0
            if key in _level0shape1vars:
                allmat[key] = np.concatenate(mat[key], axis=0)
                continue
        elif len(theshape) == 2:
            # 2D variables are concatenated along axis 1
            if key in _level0shape2vars:
                allmat[key] = np.concatenate(mat[key], axis=1)
                continue
        # if no match with anything, just copy the first element
        allmat[key] = mat[key][0]

    # post processing: change time variables to datetime objects
    for key in allmat:
        if key in _level0datetimevars:
            allmat[key] = icatools.matlab2datetime(allmat[key])

    conditional_gc_collect()
    return allmat
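# %% How the multi-day file selection above works: the from/to limits are
# padded with '0' and '9' respectively, so a plain lexicographic comparison
# of the file names selects every hourly file inside the interval. A
# self-contained sketch of that comparison (the file names are made up):
def _example_filename_window():
    """Show the lexicographic window used to pick hourly level0 files."""
    from_day, to_day = '20150211T06', '20150212'
    fromfil = 'RPC_ICA_' + from_day[0:8].ljust(8, '0') + 'T' + \
        from_day[9:15].ljust(6, '0')
    tofil = 'RPC_ICA_' + to_day[0:8].ljust(8, '9') + 'T' + \
        to_day[9:15].ljust(6, '9') + '.mat'
    names = ['RPC_ICA_20150211T050000.mat',   # before the window
             'RPC_ICA_20150211T060000.mat',   # inside
             'RPC_ICA_20150212T230000.mat',   # inside (to_day padded with '9's)
             'RPC_ICA_20150213T000000.mat']   # after the window
    return [n for n in names if fromfil <= n <= tofil]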
[docs]def checkdirectory(d, branch='level1'):
    """
    Verifies that the given directory points to the root of an ICA data tree.
    """
    if d[-1] != os.sep:
        d = d + os.sep
    pathendswith = os.path.basename(os.path.normpath(d))
    if (len(glob.glob(d + '20*')) > 0) or \
       (pathendswith in ['aux', 'bestc', 'proc', 'mag', 'cops', 'lap',
                         'level1', 'level2', 'level3', branch]):
        raise ValueError(
            "\n" +
            "*********************************************************************\n" +
            "The given path to ICA data files '" + d + "'\npoints to the wrong place: " +
            "Starting from version 3.5.0 of irfpy.ica,\n" +
            "the data path should point to the top of a subdirectory structure\n" +
            "as shown in\n" +
            "https://rosetta-wiki.irf.se/doku.php?id=pipeline_directory_structure\n" +
            "For the given path '" + d + "', \nthe data files were expected to be in:\n" +
            "'" + d + "xxxxx/matlab/20yy/mm/dd/*.mat'\n".replace('/', os.sep) +
            "(with xxxxx one of level0, level1, level2, mag, cops, aux, etc.),\n" +
            "but your data is apparently located in:\n" +
            "'" + d + "20yy/mm/dd/*.mat'\n".replace('/', os.sep) +
            "Your path must not contain the 'xxxxx' or 'matlab' parts.\n" +
            "Please update your local data structure by moving files.\n" +
            "Contact wieser@irf.se if you need support in this matter.\n" +
            "*********************************************************************\n" +
            "\n")
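# %% A sketch of the layout checkdirectory() expects (paths are illustrative):
#
#     ~/icadata/                     <- pass this root to the read functions
#         level0/matlab/2015/02/11/RPC_ICA_20150211T060000.mat
#         level1/matlab/2015/02/11/proc20150211T060000.mat
#         level2/matlab/2015/02/11/bestc20150211T060000.mat
#
def _example_checkdirectory():
    """checkdirectory() accepts the tree root but rejects paths into it."""
    try:
        checkdirectory('/data/icadata/level1')  # ends in 'level1' -> rejected
    except ValueError:
        pass
    checkdirectory('/data/icadata')  # root of the tree -> accepted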
# %% write data to a readxxx compatible structure
[docs]def buildfilename(apath, someday, prefix, postfix, hour):
    """Builds the full path of one hourly data file below 'apath'."""
    if isinstance(someday, dt.datetime):
        someday = icatools.datetime2string(someday)
    return apath + os.sep + someday[:4] + os.sep + someday[4:6] + os.sep + \
        someday[6:8] + os.sep + prefix + someday[:8] + \
        'T{:02d}'.format(hour) + '0000' + postfix
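# %% A minimal sketch of what buildfilename() produces (the arguments are
# illustrative): the yyyy/mm/dd subdirectories and the hourly file name are
# derived from 'someday' and 'hour'.
def _example_buildfilename():
    """E.g. on POSIX this returns
    '/data/level1/matlab/2015/02/11/proc20150211T070000.mat'."""
    return buildfilename('/data/level1/matlab', '20150211',
                         prefix='proc', postfix='.mat', hour=7)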
[docs]def writedailydata(dataroot, data, theday, fileprefix=None, branch=None,
                   time_instances="time_instances", dataformat='mat',
                   keepNaN=False, commentlist=None, dataversion='1.0'):
    if not isinstance(data, dict):
        raise ValueError(
            "irfpy.ica.io.writedailydata: The parameter 'data' must be of <class 'dict'> " +
            "(a dict) but is now " + str(type(data)) + ".")
    if branch is None:
        raise ValueError(
            "irfpy.ica.io.writedailydata: The parameter 'branch' must be of <class 'str'> " +
            "(a string) but is now None. 'branch' is the directory name below 'dataroot' " +
            "where the data is written.")
    if fileprefix is None:
        raise ValueError(
            "irfpy.ica.io.writedailydata: The parameter 'fileprefix' must be of <class 'str'> " +
            "(a string) but is now None. 'fileprefix' is the string at the start of the " +
            "filename to be written, e.g. fileprefix='proc' will write files like " +
            "'proc20990101T010000.mat'")
    if dataformat not in ['mat', 'h5']:
        raise ValueError(
            "irfpy.ica.io.writedailydata: The parameter 'dataformat' must be one of " +
            "'mat' or 'h5'. It determines the filetype that is written.")
    if time_instances not in data:
        raise ValueError(
            "irfpy.ica.io.writedailydata: The parameter 'data' must contain a key named " +
            time_instances + " to allow for proper time handling.")
    if not isinstance(theday, (str, dt.datetime)):
        raise ValueError(
            "irfpy.ica.io.writedailydata: The parameter theday must be of <class 'str'> " +
            "(a string) or of <class 'datetime.datetime'> but is now " + str(type(theday)))
    thetime = data[time_instances]
    if not isinstance(thetime[0], dt.datetime):
        raise ValueError(
            "irfpy.ica.io.writedailydata: the type of data[" + time_instances + "] must be " +
            "<class 'datetime.datetime'> but is now " + str(type(thetime[0])))

    if commentlist is None:
        commentlist = list()
    commentlist.append('0.0: Generated using irfpy.ica.io.writedailydata()')
    commentlist.append(str(dataversion) + ': This version')

    if isinstance(theday, dt.datetime):
        theday = theday.strftime("%Y%m%d")

    # loop over all hours; thetime is sorted already
    yy = int(theday[:4])
    mm = int(theday[4:6])
    dd = int(theday[6:8])
    dic = dict()  # returned: the dict of the last hour written, else empty
    for hh in range(24):
        # make a mask containing only this hour
        thishour = np.array([x.hour == hh and x.year == yy and
                             x.month == mm and x.day == dd for x in thetime])
        # if the mask is not empty make a file:
        if thishour.any():
            # extract the data for this hour
            dic = dict()
            dic[time_instances] = icatools.datetime2matlab(
                ma.masked_array(thetime, mask=~thishour).compressed())
            hasfinitevalues = False
            for key in data.keys():
                if key != time_instances:
                    try:
                        dic[key] = data[key][..., thishour]
                        if not np.isnan(dic[key]).all():
                            hasfinitevalues = True
                    except (TypeError, IndexError):
                        print("Not using key '{}' as it is not a numpy array".format(key))
            if hasfinitevalues or keepNaN:
                dic[fileprefix + '_processed_data_comment'] = commentlist
                dic[fileprefix + '_processed_data_version'] = [dataversion, ]
                if dataformat == 'mat':
                    afilename = buildfilename(dataroot + os.sep + branch + os.sep + 'matlab',
                                              theday, prefix=fileprefix,
                                              postfix='.mat', hour=hh)
                else:
                    afilename = buildfilename(dataroot + os.sep + branch + os.sep + 'hdf5',
                                              theday, prefix=fileprefix,
                                              postfix='.h5', hour=hh)
                os.makedirs(os.path.dirname(afilename), exist_ok=True)
                print('   writing: ' + afilename)
                savemat(afilename, dic)
                gc.collect()
            else:
                print('   *** skipped hour ' + str(hh) + ' (no finite values)')
        else:
            print('   *** skipped hour ' + str(hh) + ' (no data)')
    return dic
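# %% Round-trip sketch, analogous to the test code in __main__ below: data
# written with writedailydata() can be read back with loadlevelN() using the
# same fileprefix and branch. The prefix 'wonder' and branch 'mix' are
# arbitrary example names taken from that test code.
def _example_writedailydata_roundtrip(icapath, xmat):
    """Write one day of data and read it back from the new branch."""
    writedailydata(icapath, xmat, '20150730', fileprefix='wonder',
                   branch='mix', time_instances='time_instances',
                   dataformat='mat')
    return loadlevelN(icapath, fileprefix='wonder',
                      from_day='20150730T00', to_day='20150730T23',
                      branch='mix', time_instances='time_instances',
                      varshapes=default_varshapes1, verbose=True)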
# %%
if __name__ == '__main__':
    # Example content of ~/.irfpyrc:
    # [icadds]
    # dataroot = /home/tinu/ica/processed/
    from irfpy.util.irfpyrc import Rc
    rc = Rc()
    icadatarootpath = rc.get('icadds', 'dataroot')
    print(icadatarootpath)

    expectedversion(proc='4.5', lap='1.1', verbose=True)

    theday = '20160419'

    import irfpy.ica.pipeline as pip
    # pip.updateraw(icadatarootpath, theday, progress=print)
    pip.updateproc(icadatarootpath, theday, progress=print, dataformat='mat')
    pip.updatespecial(icadatarootpath, progress=print)
    gmat = readspecial(icadatarootpath, verbose=True)
    pip.updatelap(icadatarootpath, theday, progress=print)
    pip.updatecops(icadatarootpath, theday, progress=print, private=True)
    pip.updatemag(icadatarootpath, theday, progress=print)
    pip.updatebestc(icadatarootpath, theday, progress=print)

    matlap = readlap(icadatarootpath, theday, verbose=True)
    gmat = readmag(icadatarootpath, theday, variables=['time_instances'],
                   verbose=True)
    gmat = readspecial(icadatarootpath, generation=6, verbose=True)
    gmat = readbestc(icadatarootpath, '20160929T01', '20160929T04', verbose=True)

    pip.updateproc(icadatarootpath, '20150730', progress=print, dataformat='mat')
    pip.updateproc(icadatarootpath, '20160101', progress=print, dataformat='mat')

    gh5 = readproc(icadatarootpath, '20150730T0',
                   variables=['time_instances', 'noise_reduction', 'orig_ionspectra'],
                   verbose=True, dataformat='h5')
    gmat = readproc(icadatarootpath, '20150730T0',
                    variables=['time_instances', 'noise_reduction', 'orig_ionspectra'],
                    verbose=True, dataformat='mat')
    gh5 = readproc(icadatarootpath, '20150730T01', '20150730T01',
                   verbose=True, dataformat='h5')
    gmat = readproc(icadatarootpath, '20150730T01', '20150730T01',
                    verbose=True, dataformat='mat')
    gmat = readproc(icadatarootpath, dt.datetime(2015, 7, 30, 6, 33),
                    dt.datetime(2015, 7, 30, 6, 43), verbose=True)
    print(len(gmat['time_instances']))
    gmat = readproc(icadatarootpath, dt.datetime(2016, 1, 1, 6, 0),
                    dt.datetime(2016, 1, 2, 6, 59),
                    variables=['time_instances'], verbose=True,
                    partialelevationscan=False)
    print(len(gmat['time_instances']))

    spec = readspecial(icadatarootpath, generation=6,
                       variables=['ICAanalyzerconstant', 'ICAhighvoltageoffset',
                                  'ESC_H_volt', 'ESC_L_volt',
                                  'ICAsoftwareversion'], verbose=True)

    # %%
    xmat = readproc(icadatarootpath, '20150729T2330', '20150730T0910',
                    variables=['time_instances', 'noise_reduction', 'orig_ionspectra'],
                    verbose=True, dataformat='mat')
    # %%
    test = writedailydata(icadatarootpath, xmat, '20150730',
                          fileprefix='wonder', branch='mix',
                          time_instances="time_instances", dataformat='mat')
    # %%
    xmatread = loadlevelN(icadatarootpath, fileprefix='wonder',
                          from_day='20150730T00', to_day='20150730T09',
                          branch='mix', time_instances='time_instances',
                          varshapes=default_varshapes1, verbose=True)
    # %%
    bbb = np.array(['',
                    'ica_pipeline_0to1/calibration/IcaRosettaPhysElTable_V02_01.txt',
                    'ica_pipeline_0to1/calibration/IcaRosettaPhysElTable_V03_01.txt',
                    'ica_pipeline_0to1/calibration/IcaRosettaPhysElTable_V04_01.txt',
                    'ica_pipeline_0to1/calibration/IcaRosettaPhysElTable_V05_01.txt',
                    'ica_pipeline_0to1/calibration/IcaRosettaPhysElTable_V06_01.txt',
                    'ica_pipeline_0to1/calibration/IcaRosettaPhysElTable_V07_00.txt',
                    'ica_pipeline_0to1/calibration/IcaRosettaPhysElTable_V08_00.txt',
                    'ica_pipeline_0to1/calibration/IcaRosettaPhysElTable_V09_00.txt',
                    'ica_pipeline_0to1/calibration/IcaRosettaPhysElTable_V10_00.txt'])

    aaa = np.array(['',
                    'ica_pipeline_0to1/calibration/IcaRosettaElTable_V02_00.txt',
                    'ica_pipeline_0to1/calibration/IcaRosettaElTable_V03_00.txt',
                    'ica_pipeline_0to1/calibration/IcaRosettaElTable_V04_00.txt',
                    'ica_pipeline_0to1/calibration/IcaRosettaElTable_V05_00.txt',
                    'ica_pipeline_0to1/calibration/IcaRosettaElTable_V06_00.txt',
                    'ica_pipeline_0to1/calibration/IcaRosettaElTable_V07_01.txt',
                    'ica_pipeline_0to1/calibration/IcaRosettaElTable_V08_00.txt',
                    'ica_pipeline_0to1/calibration/IcaRosettaElTable_V09_00.txt',
                    'ica_pipeline_0to1/calibration/IcaRosettaElTable_V10_00.txt'])

    ccc = np.array(['',
                    'ica_pipeline_0to1/calibration/IcaRosettaEnTable_V02_01.txt',
                    'ica_pipeline_0to1/calibration/IcaRosettaEnTable_V03_01.txt',
                    'ica_pipeline_0to1/calibration/IcaRosettaEnTable_V04_01.txt',
                    'ica_pipeline_0to1/calibration/IcaRosettaEnTable_V05_01.txt',
                    'ica_pipeline_0to1/calibration/IcaRosettaEnTable_V06_01.txt',
                    'ica_pipeline_0to1/calibration/IcaRosettaEnTable_V07_01.txt',
                    'ica_pipeline_0to1/calibration/IcaRosettaEnTable_V08_00.txt',
                    'ica_pipeline_0to1/calibration/IcaRosettaEnTable_V09_00.txt',
                    'ica_pipeline_0to1/calibration/IcaRosettaEnTable_V10_00.txt'])