"""
The Chain module is the class for performing calculations using wax carbon
chain-length concentration/abundance data imported as a 2D array-like object.
"""
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from composition_stats import clr, closure, multiplicative_replacement
import scipy.stats
# import warnings
# from ..utils import validate_data
[docs]
class Chain:
"""
Represents leaf wax carbon chain-length concentration/abundance data
imported as a 2-D, array-like object (i.e., list, array) with rows
representing unique samples and columns representing unique data types
(carbon chain-length number).
Parameters
----------
input_data : 2-D array-like
User leaf wax chain-length concentration/abundance data.
Attributes
----------
input_data : 2-D array-like
User leaf wax chain-length concentration/abundance data.
Examples
--------
.. jupyter-execute::
from leafwaxtools import Chain
"""
def __init__(self, input_data):
self.data = input_data
if self.data.ndim != 2:
raise TypeError("'input_data' must be 2-dimensional")
[docs]
def total_conc(self, zero_total=0, calculate_log=False):
"""
Calculates the total concentration of each sample (rows).
Parameters
----------
zero_total : int, optional
Return value if the sum of all columns in a row = 0. The default
is 0.
calculate_log : bool, optional
Returns log (base e) of the sum of each row instead of just the
sum. The default is False.
Raises
------
ValueError
Raises an error when 'calculate_log' is neither True nor False.
Returns
-------
total_conc : numpy.ndarray
1-D Numpy array of total leaf wax concentrations for each sample
(row).
"""
total_conc = np.zeros(len(self.data[:,0]))
for row in range(0, len(self.data[:,0])):
total_conc[row] = np.nansum(self.data[row,:])
if total_conc[row] == 0:
total_conc[row] = zero_total
if calculate_log is True:
total_conc = np.log(total_conc)
elif calculate_log is False:
total_conc = total_conc
else:
raise ValueError("'calculate_log' must either be True or False (default)")
return total_conc
[docs]
def relative_abd(self, calculate_percent=False):
"""
Calculates the relative abundance (fraction out of 1 or percentage) of
each leaf wax carbon chain-length (columns) for each sample (rows).
Parameters
----------
calculate_percent : bool, optional
Calculate each chain-length relative abundance as a percentage
instead of a fraction of 1. The default is False.
Raises
------
ValueError
Raises an error when 'calculate_percent' is neither True nor False.
Returns
-------
rel_abd : numpy.ndarray
2-D Numpy array of leaf wax chain-length relative abundances
(columns) for each sample (row).
"""
rel_abd = np.zeros(np.shape(self.data))
for row in range(0, len(self.data[:,0])):
for col in range(0, len(self.data[0,:])):
rel_abd[row,col] = self.data[row,col]/np.sum(self.data[row,:])
if calculate_percent is True:
for row in range(0, len(self.data[:,0])):
for col in range(0, len(self.data[0,:])):
rel_abd[row,col] = rel_abd[row,col]*100
elif calculate_percent is False:
rel_abd = rel_abd
else:
raise ValueError("'calculate_percent' must either be True or False (default)")
return rel_abd
[docs]
def acl(self, chain_lengths):
"""
Calculates the Average Chain-Length (ACL; Bray & Evans, 1961; Bush &
McInerney, 2013) of each sample (rows).
References:
Bray, E. E., & Evans, E. D. (1961). Distribution of n-paraffins as a
clue to recognition of source beds. Geochimica et Cosmochimica Acta,
22(1), 2-15. https://doi.org/10.1016/0016-7037(61)90069-2
Bush, R. T., & McInerney, F. A. (2013). Leaf wax n-alkane
distributions in and across modern plants: Implications for
paleoecology and chemotaxonomy. Geochimica et Cosmochimica Acta, 117,
161-179. https://doi.org/10.1016/j.gca.2013.04.016
Parameters
----------
chain_lengths : list
List of integers or floats representing the carbon chain-length
number of each column.
Raises
------
TypeError
Raises an error if 'chain_lengths' is not a list.
ValueError
Raises an error if 'chain_lengths' is an empty list.
Returns
-------
acl : numpy.ndarray
1-D Numpy array of ACL values for each sample (row).
"""
if type(chain_lengths) is not list:
raise TypeError(
"'chain_lengths' must be a list() type containing integers or floats; Example: [22, 24, 26, 28]"
)
if len(chain_lengths) < 1:
raise ValueError(
"'chain_lengths' is currently an empty list. Please make sure 'chain_lengths' contains at least 1 integer or float."
)
# Add check if len(chain_lengths) != # of data columns
acl_numer = np.zeros(len(self.data[:,0]))
acl = np.zeros(len(self.data[:,0]))
for row in range(0, len(self.data[:,0])):
for col in range(0, len(self.data[0,:])):
acl_numer[row] += self.data[row,col] * chain_lengths[col]
acl[row] = acl_numer[row]/np.sum(self.data[row,:])
return acl
[docs]
def cpi(self, chain_lengths, even_over_odd=True):
"""
Calculates the Carbon Preference Index (CPI; Marzi et al., 1993) of
each sample (rows).
References:
Marzi, R., Torkelson, B. E., & Olson, R. K. (1993). A revised carbon
preference index. Organic Geochemistry, 20(8), 1303-1306.
https://doi.org/10.1016/0146-6380(93)90016-5
Parameters
----------
chain_lengths : list
List of integers or floats representing the carbon chain-length
number of each column.
even_over_odd : bool, optional
Calculates the CPI of even-chain over odd-chain leaf waxes (use
case for n-alkanoic acids). Change to False to calculate the CPI
of odd-chain over even-chain waxes (use case for n-alkanes). The
default is True.
Raises
------
TypeError
Raises an error if 'chain_lengths' is not a list.
ValueError
Raises an error if 'chain_lengths' is an empty list or if
'even_over_odd' is neither True nor False.
Returns
-------
cpi : numpy.ndarray
1-D Numpy array of CPI values for each sample (row).
"""
if type(chain_lengths) is not list:
raise TypeError(
"'chain_lengths' must be a list() type containing integers or floats; Example: [22, 23, 24, 25, 26]"
)
if len(chain_lengths) < 1:
raise ValueError(
"'chain_lengths' is currently an empty list. Please make sure 'chain_lengths' contains at least 1 integer or float."
)
'''
EKT: use warnings to flag if even over odd order is wrong
'''
chain_lengths_even = [num for num in chain_lengths if num % 2 == 0]
chain_lengths_odd = [num for num in chain_lengths if num % 2 == 1]
data = pd.DataFrame(data=self.data, columns=(map(str, chain_lengths)))
data_even = np.array(data.filter(items=(map(str, chain_lengths_even))))
data_odd = np.array(data.filter(items=(map(str, chain_lengths_odd))))
cpi = np.zeros(len(self.data[:,0]))
if even_over_odd is True:
for row in range(0, len(self.data[:,0])):
cpi[row] = (np.nansum(data_even[row,0:-1]) + np.nansum(data_even[row,1:])) / (2 * np.nansum(data_odd[row,:]))
elif even_over_odd is False:
for row in range(0, len(self.data[:,0])):
cpi[row] = (np.nansum(data_odd[row,0:-1]) + np.nansum(data_odd[row,1:])) / (2 * np.nansum(data_even[row,:]))
else:
raise ValueError("'even_over_odd' must be True (default) or False")
return cpi
[docs]
def corr_rvals(self, minimum_obs=2):
"""
Calculates the Pearson correlation r-values between each leaf wax
chain-length (columns). To be extended with other correlation methods
(Spearman, Kendall Tau) in a future version.
Parameters
----------
minimum_obs : int, optional
Minimum number of observations (samples/rows) required to return a
Pearson r-value. The default is 2.
Returns
-------
r_vals : numpy.ndarray
2-D Numpy array of Pearson correlation r-values between each leaf
wax chain-length (column) with all values in the major diagonal
equal to 1.
"""
r_vals = np.zeros((len(self.data[0,:]), len(self.data[0,:])))
for row in range(0, len(r_vals[:,0])):
for col in range(0, len(r_vals[0,:])):
x_corr = np.array(self.data[:,row])
y_corr = np.array(self.data[:,col])
if (len(x_corr) >= minimum_obs) and (len(y_corr) >= minimum_obs):
r_vals[row,col] = scipy.stats.pearsonr(x_corr, y_corr)[0]
else:
r_vals[row,col] = np.nan
return r_vals
[docs]
def corr_pvals(self, minimum_obs=2):
"""
Calculates the Pearson correlation p-values between each leaf wax
chain-length (columns). To be extended with other correlation methods
(Spearman, Kendall Tau) in a future version.
Parameters
----------
minimum_obs : int, optional
Minimum number of observations (samples/rows) required to return a
Pearson r-value. The default is 2.
Returns
-------
p_vals : numpy.ndarray
2-D Numpy array of Pearson correlation p-values between each leaf
wax chain-length (column).
"""
p_vals = np.zeros((len(self.data[0,:]), len(self.data[0,:])))
for row in range(0, len(p_vals[:,0])):
for col in range(0, len(p_vals[0,:])):
x_corr = np.array(self.data[:,row])
y_corr = np.array(self.data[:,col])
if (len(x_corr) >= minimum_obs) and (len(y_corr) >= minimum_obs):
p_vals[row,col] = scipy.stats.pearsonr(x_corr, y_corr)[1]
else:
p_vals[row,col] = np.nan
return p_vals
[docs]
def pca(self, chain_lengths, use_clr=True):
"""
Performs a Principal Component Analysis (PCA) on the leaf wax
chain-length data with the centered log-ratio transform (clr;
Aitchison, 1982) applied to the input compositional data (Gloor et al.
2017).
References:
Aitchison, J. (1982). The statistical analysis of compositional data.
Journal of the Royal Statistical Society: Series B (Methodological),
44(2), 139-160. https://doi.org/10.1111/j.2517-6161.1982.tb01195.x
Gloor, G. B., Macklaim, J. M., Pawlowsky-Glahn, V., & Egozcue, J. J.
(2017). Microbiome datasets are compositional: and this is not
optional. Frontiers in microbiology, 8, 2224.
https://doi.org/10.3389/fmicb.2017.02224
Parameters
----------
chain_lengths : list
List of integers or floats representing the carbon chain-length
number of each column..
use_clr : bool, optional
Calculates the clr of the leaf wax chain-length abundance data,
replacing 0 values with 1/N where N is the number of chain-lengths
(columns). The default is True.
Raises
------
TypeError
Raises an error if 'chain_lengths' is not a list.
ValueError
Raises an error if 'chain_lengths' is an empty list or if
'use_clr' is neither True nor False.
Returns
-------
pca_dict : dict
Dictionary of PCA results including the PC scores for each factor
loading (chain-lengths/columns) and sample (rows). For more
details on all of the returns in pca_dict['pca'], please see the
documentation for the scikit-learn PCA function (sklearn.PCA())
https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html
"""
if type(chain_lengths) is not list:
raise TypeError(
"'chain_lengths' must be a list() type containing integers or floats; Example: [22, 23, 24, 25, 26]"
)
if len(chain_lengths) < 1:
raise ValueError(
"'chain_lengths' is currently an empty list. Please make sure 'chain_lengths' contains at least 1 integer or float."
)
'''
# deal with missing data
for row in range(0, len(self.data[:,0]):
for col i range(0, len(self.data[0,:]):
if self.data[row,col] == np.nan:
self.data[row,col] = 0
if np.sum(self.data[row,:]) == 0:
'''
if use_clr is True:
wax_relabd = closure(multiplicative_replacement(self.data))
wax_clr = clr(wax_relabd)
wax_data = pd.DataFrame(data=wax_clr, columns=chain_lengths)
elif use_clr is False:
wax_data = pd.DataFrame(data=self.relative_abd(), columns=chain_lengths)
else:
raise ValueError("'use_clr' must be True or False (default)")
wax_scaler = StandardScaler()
wax_scaler.fit(wax_data)
wax_data_scaled = wax_scaler.transform(wax_data)
wax_pca = PCA(n_components=len(chain_lengths))
wax_pca.fit_transform(wax_data_scaled)
# wax_PC_scores = pd.DataFrame(
# wax_pca.fit_transform(wax_data_scaled),
# columns=chain_lengths
# )
# wax_loadings = pd.DataFrame(
# wax_pca.components_.T,
# columns=chain_lengths,
# index=wax_data.columns
# )
wax_ldings = wax_pca.components_
wax_features = wax_data.columns
wax_pc_values = np.arange(wax_pca.n_components_) + 1
pca_dict = {
"pca": wax_pca,
"pc_values": wax_pc_values,
"features": wax_features,
"loadings": wax_ldings
}
for i in range(0, len(chain_lengths)):
wax_pc = wax_pca.fit_transform(wax_data_scaled)[:,i]
wax_scale_pc = 1.0 / (wax_pc.max() - wax_pc.min())
wax_pc_score = wax_pc * wax_scale_pc
# pca_dict.update({f"wax_pc{i+1}": wax_pc})
# pca_dict.update({f"wax_scale_pc{i+1}": wax_scale_pc})
pca_dict.update({f"wax_pc{i+1}_score": wax_pc_score})
return pca_dict