""" Intermediate file handling.
.. codeauthor:: Futaana
It handles the intermediate file.
Here intermediate file means any data file that stores the results
of any calculation or processing.
Example use cases:
- You read a huge data set but need only a part of it in the next processing step.
Therefore, the needed part should be saved into a separate file on the
local hard disk.
- (More will come)
Sample usage:
1. Using :func:`intermfile` function:
.. code-block:: python
from irfpy.util.intermfile import intermfile
def my_function(a, b, c=0, d=0):
return (a + b, c * d)
dat = intermfile('datafile.pickle', my_function, args=[5, 10], kwds={'c': 3, 'd': 7})
2. Using decorator, :func:`interm`:
.. code-block:: python
from irfpy.util.intermfile import interm
@interm('datafile.pickle')
def my_function(a, b, c=0, d=0):
return (a + b, c * d)
my_function(5, 10, c=3, d=7)
In either case, the call always returns (15, 21).
The value is obtained by running my_function() if 'datafile.pickle' is not found,
or obtained from the intermediate file 'datafile.pickle' otherwise.
Which to choose? Function (option 1) is flexible but less intuitive. Decorator (option 2) is easy but less flexible.
.. note::
- Good point of using :func:`interm` decorator is the simple syntax for users.
The call is trivial.
However, the decorator is less flexible, because the meta data (intermediate filename,
version, or expiration time) must be fixed when the function is defined, not when it is called.
- Good point of using :func:`intermfile` function is flexibility.
User can determine the intermediate file name at runtime, indicating that the user can
dump multiple intermediate files from the same function (e.g. using different parameters).
The weak point is that the syntax is less trivial, and the function's arguments or
keywords must be provided in a non-standard way.
.. note::
Developer note:
It is inspired by :mod:`irfpy.util.filepairv`.
Difference is that
- This module is for more generic purpose.
- :mod:`irfpy.util.filepairv` is only for reading and caching file.
Which to recommend to user? As of 2016-11-15, the :mod:`irfpy.util.filepairv` is more tested and
used so that it is more stable. This module may replace the
:mod:`irfpy.util.filepairv` module in the future.
"""
import os
import pickle
import gzip
import bz2
import time
import logging
_logger = logging.getLogger(__name__)
from collections import namedtuple
import numpy as np
def _data_from_function(processing_function, args=None, kwds=None):
if args is None: args = []
if kwds is None: kwds = {}
return processing_function(*args, **kwds)
# Header record pickled at the top of every intermediate file, ahead of the payload.
IntermFileMetaData = namedtuple('IntermFileMetaData', 'version creation_time')
IntermFileMetaData.__doc__ = """ Class to store the intermediate file's meta data.
:param version: The version number
:param creation_time: Creation time
"""
class _IntermFileReader:
""" Represent reading intermediate file.
"""
def __init__(self, intermediate_filename):
if intermediate_filename.endswith('.gz'):
fp = gzip.open(intermediate_filename, 'rb')
elif intermediate_filename.endswith('.bz2'):
fp = bz2.open(intermediate_filename, 'rb')
else:
fp = open(intermediate_filename, 'rb')
self._meta = pickle.load(fp)
self._dat = None # Data is not read in this stage. You should call dat() method.
self._fp = fp
def dat(self):
if self._dat is None:
self._dat = pickle.load(self._fp)
self._fp.close()
return self._dat
def version(self):
return self._meta.version
def is_expired(self, expire):
now = time.time()
pickle_mtime = self._meta.creation_time
return (now - pickle_mtime > expire)
def _pickle_to(intermediate_filename, meta, dat, compresslevel=9):
if intermediate_filename.endswith('.gz'):
fp = gzip.open(intermediate_filename, 'wb', compresslevel=compresslevel)
elif intermediate_filename.endswith('.bz2'):
fp = bz2.open(intermediate_filename, 'wb', compresslevel=compresslevel)
else:
fp = open(intermediate_filename, 'wb')
pickle.dump(meta, fp)
pickle.dump(dat, fp)
fp.close()
def intermfile(intermediate_filename, processing_function, args=None, kwds=None, version=0, refresh=False, expire=np.inf, compresslevel=9):
    """ Process the data, and save to the given file, or read from the file.

    :param intermediate_filename: Intermediate file name.
    :param processing_function: A function that should be called.
    :keyword args: Argument to be given to processing function.
    :type args: None or ``list``
    :keyword kwds: Keyword to be given to processing function.
    :type kwds: None or ``dict``
    :keyword version: Version number of the intermediate data file.  If the version
        number in the data file is different from the given version,
        the data is reprocessed.
    :keyword refresh: If set to *True*, the data is reprocessed regardless of the file.
    :keyword expire: After the given value (in seconds), the intermediate file will be invalid.
    :keyword compresslevel: Gzip / Bzip2 compression level.  It is only valid if the
        intermediate file name ends with ".gz" or ".bz2".
    :returns: The resulting object, either the object read from the ``intermediate_filename``
        or the results of ``processing_function``

    A failure to read or to write the intermediate file is logged as a warning
    and never raised: the data is then obtained from ``processing_function``.
    Exceptions raised by ``processing_function`` itself are propagated.

    The call

    .. code-block:: python

        from irfpy.util.intermfile import intermfile
        dat = intermfile('datafile.pickle', my_function)

    is roughly equivalent to

    .. code-block:: python

        import os
        import pickle
        if os.path.exists('datafile.pickle'):
            with open('datafile.pickle', 'rb') as fp:
                dat = pickle.load(fp)
        else:
            dat = my_function()
            with open('datafile.pickle', 'wb') as fp:
                pickle.dump(dat, fp)

    except that meta data (version and creation time) is stored in front of the data.
    """
    ### Data is read from pickle file.
    if (not refresh) and os.path.exists(intermediate_filename):
        _logger.info('... Loading from the intermediate file: %s', intermediate_filename)
        freader = None
        try:
            freader = _IntermFileReader(intermediate_filename)   # Metadata is read
            file_version = freader.version()
            if file_version != version:
                _logger.info('... Intermediate file ``%s`` has a different version.', intermediate_filename)
                _logger.info('... version in file = {}   version given = {}'.format(file_version, version))
            elif freader.is_expired(expire):
                _logger.info('... Intermediate file ``%s`` has been expired.', intermediate_filename)
            else:
                _logger.info('... Intermediate file is valid.  Read from the file ``%s``.', intermediate_filename)
                dat = freader.dat()
                _logger.info('... Done.')
                return dat    # If successfully loaded, the data will be returned.
        except KeyboardInterrupt:
            _logger.warning('User stopped the reading file.')
            raise
        except Exception as e:
            _logger.warning('')
            _logger.warning('Read from intermediate file ``%s`` failed.', intermediate_filename)
            _logger.warning('Reason: %s', e)
            _logger.warning('Please do not worry.  Re-processing will be soon initiated')
            _logger.warning('')
        finally:
            # Release the handle before the file is rewritten below.  A rejected
            # file would otherwise stay open.  Closing twice is harmless.
            if freader is not None:
                freader._fp.close()

    ### Data is read from original function
    _logger.info('... (Re-)processing data')
    dat = _data_from_function(processing_function, args, kwds)
    meta = IntermFileMetaData(version=version, creation_time=time.time())

    try:
        _logger.info('... Intermediate file ``%s`` is producing.', intermediate_filename)
        _pickle_to(intermediate_filename, meta, dat, compresslevel=compresslevel)
        _logger.info('... Done')
    except Exception as e:
        _logger.warning('')
        _logger.warning('Write to intermediate file ``%s`` failed.', intermediate_filename)
        _logger.warning('Reason: %s', e)
        _logger.warning('Please do not worry.  The intermediate file was not produced, but the data is returned')
        _logger.warning('')
        # Best-effort removal of a possibly half-written file.
        if os.path.exists(intermediate_filename):
            try:
                os.remove(intermediate_filename)
            except OSError as remove_error:
                _logger.warning('Could not remove the incomplete intermediate file ``%s``: %s',
                                intermediate_filename, remove_error)

    return dat
def interm(filename, version=0, expire=np.inf, compresslevel=9):
    """ Decorator version of intermediate file.

    :param filename: The name of the intermediate file.
    :keyword version: Specify the version of the program.  If the given version is different from the version
        in the intermediate file (``filename``), the function is re-run.
    :keyword expire: Expiration time in seconds.
    :keyword compresslevel: For .gz or .bz2, the compression level to be used.
    :returns: A decorator to be applied to the user function.

    The :meth:`interm` is the decoration version of :meth:`intermfile`:
    every call of the decorated function is forwarded to :meth:`intermfile`
    with the parameters given here.

    Assume you have a function ``user_function``, returning some of the data.

    .. code-block:: python

        def user_function(a, b, c=0, d=0):
            return (a + b, c * d)

    This function can be decorated such as

    .. code-block:: python

        from irfpy.util.intermfile import interm

        @interm('user_function.dat', version=1, expire=86400)
        def user_function(a, b, c=0, d=0):
            return (a + b, c * d)

    Then, the ``user_function()`` returns the tuple, but in addition,
    dumps the pickle file of the returned value to ``user_function.dat``
    for the first call.

    .. code-block:: python

        > user_function(1, 5, c=10, d=40)
        # => (6, 400)

    .. code-block:: sh

        % ls
        ... user_function.dat ...

    The second call of the ``user_function`` then relies on the pickle file.

    .. code-block:: python

        > user_function(1, 5, c=10, d=40)
        # => (6, 400)

    Indeed, this tuple is read from the pickle file.
    It means that changing the argument/keyword will not be reflected in the results
    as long as ``user_function.dat`` file exists.

    .. code-block:: python

        > user_function(2, 7, c=1, d=4)
        # => (6, 400)

    You may expect to get ``(9, 4)`` for this call, however, it is not.
    It is because, again, the data is read from the intermediate file.
    """
    def _interm(func):
        import functools

        @functools.wraps(func)
        def wrapper(*args, **kwds):
            # Delegate to intermfile() so that the decorator and the function
            # form share one implementation and cannot drift apart.
            return intermfile(filename, func, args=args, kwds=kwds,
                              version=version, expire=expire,
                              compresslevel=compresslevel)

        return wrapper

    return _interm