Source code for irfpy.ica.baseline

# -*- coding: utf-8 -*-
"""
Created on Wed Dec 30 16:05:47 2015

@author: M Wieser

LICENCE
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>
"""


import numpy as np
from scipy.sparse.linalg import spsolve
from scipy.linalg import cholesky
# from scikits.sparse.cholmod import cholesky
from scipy import sparse

from scipy.stats import norm
import matplotlib.pyplot as pl


def als(y, lam=1e6, p=0.1, itermax=10):
    r"""
    Implements an Asymmetric Least Squares Smoothing
    baseline correction algorithm (P. Eilers, H. Boelens 2005)

    Baseline Correction with Asymmetric Least Squares Smoothing
    based on https://github.com/vicngtor/BaySpecPlots

    Baseline Correction with Asymmetric Least Squares Smoothing
    Paul H. C. Eilers and Hans F.M. Boelens
    October 21, 2005

    Description from the original documentation:

    Most baseline problems in instrumental methods are characterized by a
    smooth baseline and a superimposed signal that carries the analytical
    information: a series of peaks that are either all positive or all
    negative. We combine a smoother with asymmetric weighting of deviations
    from the (smooth) trend to get an effective baseline estimator. It is
    easy to use, fast and keeps the analytical peak signal intact. No prior
    information about peak shapes or baseline (polynomial) is needed by the
    method. The performance is illustrated by simulation and applications
    to real data.

    Inputs:
        y:
            input data (i.e. chromatogram or spectrum)
        lam:
            parameter that can be adjusted by user. The larger lambda is,
            the smoother the resulting background, z
        p:
            weighting of deviations. 0.5 = symmetric, <0.5: negative
            deviations are stronger suppressed
        itermax:
            number of iterations to perform

    Output:
        the fitted background vector
    """
    L = len(y)
    # D = sparse.csc_matrix(np.diff(np.eye(L), 2))
    D = sparse.eye(L, format='csc')
    D = D[1:] - D[:-1]  # numpy.diff( ,2) does not work with a sparse matrix. This is a workaround.
    D = D[1:] - D[:-1]
    D = D.T
    w = np.ones(L)
    for i in range(itermax):
        W = sparse.diags(w, 0, shape=(L, L))
        Z = W + lam * D.dot(D.T)
        z = spsolve(Z, w * y)
        w = p * (y > z) + (1 - p) * (y < z)
    return z
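
# Minimal usage sketch for als(), added for illustration and not part of the
# original module; the test signal and parameter values are arbitrary
# assumptions. A small p pulls the fit towards the lower envelope of the data.
def _als_example():
    x = np.arange(1000)
    y = np.exp(-0.5 * ((x - 500) / 20.0) ** 2) + 1e-3 * x  # peak on a linear drift
    baseline = als(y, lam=1e6, p=0.01)
    return y - baseline  # baseline-corrected signal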

def arpls(y, lam=1e4, ratio=0.05, itermax=100):
    r"""
    Baseline correction using asymmetrically reweighted penalized least
    squares smoothing.

    Sung-June Baek, Aaron Park, Young-Jin Ahn and Jaebum Choo,
    Analyst, 2015, 140, 250 (2015)

    Abstract

    Baseline correction methods based on penalized least squares are
    successfully applied to various spectral analyses. The methods change
    the weights iteratively by estimating a baseline. If a signal is below
    a previously fitted baseline, large weight is given. On the other hand,
    no weight or small weight is given when a signal is above a fitted
    baseline as it could be assumed to be a part of the peak. As noise is
    distributed above the baseline as well as below the baseline, however,
    it is desirable to give the same or similar weights in either case.
    For the purpose, we propose a new weighting scheme based on the
    generalized logistic function. The proposed method estimates the noise
    level iteratively and adjusts the weights correspondingly. According to
    the experimental results with simulated spectra and measured Raman
    spectra, the proposed method outperforms the existing methods for
    baseline correction and peak height estimation.

    Inputs:
        y:
            input data (i.e. chromatogram or spectrum)
        lam:
            parameter that can be adjusted by user. The larger lambda is,
            the smoother the resulting background, z
        ratio:
            weighting of deviations: 0 < ratio < 1, smaller values allow
            less negative values
        itermax:
            number of iterations to perform

    Output:
        the fitted background vector
    """
    N = len(y)
    # D = sparse.csc_matrix(np.diff(np.eye(N), 2))
    D = sparse.eye(N, format='csc')
    D = D[1:] - D[:-1]  # numpy.diff( ,2) does not work with a sparse matrix. This is a workaround.
    D = D[1:] - D[:-1]
    H = lam * D.T * D
    w = np.ones(N)
    for i in range(itermax):
        W = sparse.diags(w, 0, shape=(N, N))
        WH = sparse.csc_matrix(W + H)
        C = sparse.csc_matrix(cholesky(WH.todense()))
        z = spsolve(C, spsolve(C.T, w * y))
        d = y - z
        dn = d[d < 0]
        m = np.mean(dn)
        s = np.std(dn)
        wt = 1. / (1 + np.exp(2 * (d - (2 * s - m)) / s))
        if np.linalg.norm(w - wt) / np.linalg.norm(w) < ratio:
            break
        w = wt
    return z
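
# Usage sketch for arpls(), added for illustration (not part of the original
# module; signal and parameter values are arbitrary assumptions). Unlike
# als(), no asymmetry parameter is needed: the logistic weighting adapts to
# the estimated noise level.
def _arpls_example():
    x = np.arange(1000)
    y = np.exp(-0.5 * ((x - 300) / 15.0) ** 2) + 0.2 * np.sin(np.pi * x / 999.0)
    baseline = arpls(y, lam=1e5, ratio=0.05)
    return y - baseline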

def WhittakerSmooth(x, w, lam, differences=1):
    '''
    Penalized least squares algorithm for background fitting

    input
        x:
            input data (i.e. chromatogram or spectrum)
        w:
            binary masks (value of the mask is zero if a point belongs to
            peaks and one otherwise)
        lam:
            parameter that can be adjusted by user. The larger lambda is,
            the smoother the resulting background
        differences:
            integer indicating the order of the difference of penalties

    output
        the fitted background vector
    '''
    X = np.matrix(x)
    m = X.size
    # D = csc_matrix(np.diff(np.eye(m), differences))
    D = sparse.eye(m, format='csc')
    for i in range(differences):
        D = D[1:] - D[:-1]  # numpy.diff() does not work with a sparse matrix. This is a workaround.
    W = sparse.diags(w, 0, shape=(m, m))
    A = sparse.csc_matrix(W + (lam * D.T * D))
    B = sparse.csc_matrix(W * X.T)
    background = spsolve(A, B)
    return np.array(background)
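
# Sketch showing WhittakerSmooth() used as a plain smoother, added for
# illustration (not part of the original module). With all weights equal to
# one, no points are masked as peaks and the whole trace is smoothed;
# airpls() below supplies the peak-masking weights iteratively.
def _whittaker_example():
    y = np.random.default_rng(0).normal(size=500).cumsum()  # arbitrary noisy trace
    w = np.ones(y.shape[0])  # uniform weights: treat no point as a peak
    return WhittakerSmooth(y, w, lam=100, differences=2)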

def airpls(x, lam=100, porder=1, itermax=100):
    '''
    airpls.py Copyright 2014 Renato Lombardo - renato.lombardo@unipa.it

    Baseline correction using adaptive iteratively reweighted penalized
    least squares.

    This program is a translation in python of the R source code of airPLS
    version 2.0 by Yizeng Liang and Zhang Zhimin -
    https://code.google.com/p/airpls

    Reference:
    Z.-M. Zhang, S. Chen, and Y.-Z. Liang, Baseline correction using
    adaptive iteratively reweighted penalized least squares. Analyst 135
    (5), 1138-1146 (2010).

    Description from the original documentation:

    Baseline drift always blurs or even swamps signals and deteriorates
    analytical results, particularly in multivariate analysis. It is
    necessary to correct baseline drift to perform further data analysis.
    Simple or modified polynomial fitting has been found to be effective to
    some extent. However, this method requires user intervention and is
    prone to variability, especially in low signal-to-noise ratio
    environments. The proposed adaptive iteratively reweighted Penalized
    Least Squares (airPLS) algorithm doesn't require any user intervention
    or prior information, such as detected peaks. It iteratively changes
    the weights of the sum of squared errors (SSE) between the fitted
    baseline and the original signals, and the weights are obtained
    adaptively from the difference between the previously fitted baseline
    and the original signals. This baseline estimator is general, fast and
    flexible in fitting baseline.

    Adaptive iteratively reweighted penalized least squares for baseline
    fitting

    input
        x:
            input data (i.e. chromatogram or spectrum)
        lam:
            parameter that can be adjusted by user. The larger lambda is,
            the smoother the resulting background, z
        porder:
            integer indicating the order of the difference of penalties

    output
        the fitted background vector
    '''
    m = x.shape[0]
    w = np.ones(m)
    for i in range(1, itermax + 1):
        z = WhittakerSmooth(x, w, lam, porder)
        d = x - z
        dssn = np.abs(d[d < 0].sum())
        if (dssn < 0.001 * (abs(x)).sum() or i == itermax):
            if (i == itermax):
                print('airpls: max iteration reached!')
            break
        w[d >= 0] = 0  # d >= 0 means that this point is part of a peak,
        # so its weight is set to 0 in order to ignore it
        w[d < 0] = np.exp(i * np.abs(d[d < 0]) / dssn)
        w[0] = np.exp(i * (d[d < 0]).max() / dssn)
        w[-1] = w[0]
    return z
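
# Usage sketch for airpls(), added for illustration (not part of the original
# module; signal and parameter values are arbitrary assumptions). porder=2
# penalizes the second difference, which tolerates a linear baseline drift.
def _airpls_example():
    x = np.arange(1000)
    y = np.exp(-0.5 * ((x - 750) / 5.0) ** 2) + 5e-4 * x + 0.2
    baseline = airpls(y, lam=100, porder=2)
    return y - baseline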

if __name__ == '__main__':
    '''
    Example usage and testing
    '''
    print('Testing...')

    x = np.arange(0, 1000, 1)
    g1 = norm(loc=100, scale=10.0)  # generate three gaussians as a signal
    g2 = norm(loc=300, scale=30.0)
    g3 = norm(loc=750, scale=5.0)
    signal = g1.pdf(x) + g2.pdf(x) + g3.pdf(x)
    baseline1 = 5e-4 * x + 0.2  # linear baseline
    baseline2 = 0.2 * np.sin(np.pi * x / x.max())  # sinusoidal baseline
    noise = np.random.poisson(100 * baseline2, size=x.shape[0]) / 800

    print('Generating simulated experiment')
    y1 = signal + baseline1 + noise
    y2 = signal + baseline2 + noise

    print('Removing baselines')
    c1 = y1 - airpls(y1)  # corrected values
    c2 = y1 - arpls(y1, 1e7, 0.1)  # with baseline removed
    g1 = y2 - airpls(y2)  # corrected values
    g2 = y2 - arpls(y2, 1e7, 0.01)  # with baseline removed

    print('Plotting results')
    fig, ax = pl.subplots(nrows=2, ncols=1)
    ax[0].plot(x, y1, '-k')
    ax[0].plot(x, c1, '-r')
    ax[0].plot(x, c2, '-b')
    ax[0].set_title('Linear baseline')
    ax[1].plot(x, y2, '-k')
    ax[1].plot(x, g1, '-r')
    ax[1].plot(x, g2, '-b')
    ax[1].set_title('Sinusoidal baseline')
    pl.show()
    print('Done!')