Source code for Thermobar.noise_averaging

import numpy as np
import matplotlib.pyplot as plt
from functools import partial
import inspect
import warnings as w
import numbers
import pandas as pd
from Thermobar.core import *
from matplotlib.patches import Ellipse
import matplotlib.transforms as transforms

# This function is taken from the matplotlib gallery example
# "Plot a confidence ellipse of a two-dimensional dataset".
def matplotlib_confidence_ellipse(x, y, ax, n_std=3.0, facecolor='none', **kwargs):
    """
    Create a plot of the covariance confidence ellipse of *x* and *y*.

    Parameters
    ----------
    x, y : array-like, shape (n, )
        Input data. Anything accepted by ``np.asarray`` (lists, tuples,
        numpy arrays, pandas Series).

    ax : matplotlib.axes.Axes
        The axes object to draw the ellipse into.

    n_std : float
        The number of standard deviations to determine the ellipse's radiuses.

    facecolor : color
        Fill colour of the ellipse ('none' draws the outline only).

    **kwargs
        Forwarded to `~matplotlib.patches.Ellipse`

    Returns
    -------
    matplotlib.patches.Ellipse

    Raises
    ------
    ValueError
        If *x* and *y* do not contain the same number of elements.
    """
    # Coerce first so plain Python sequences work; they have no ``.size``.
    x = np.asarray(x)
    y = np.asarray(y)
    if x.size != y.size:
        raise ValueError("x and y must be the same size")

    cov = np.cov(x, y)
    pearson = cov[0, 1] / np.sqrt(cov[0, 0] * cov[1, 1])

    # Using a special case to obtain the eigenvalues of this
    # two-dimensional dataset: for the normalised (correlation) matrix the
    # eigenvalues are 1 + pearson and 1 - pearson.
    ell_radius_x = np.sqrt(1 + pearson)
    ell_radius_y = np.sqrt(1 - pearson)
    ellipse = Ellipse((0, 0), width=ell_radius_x * 2, height=ell_radius_y * 2,
                      facecolor=facecolor, **kwargs)

    # Standard deviation of each axis (square root of the variance),
    # multiplied by the requested number of standard deviations.
    scale_x = np.sqrt(cov[0, 0]) * n_std
    mean_x = np.mean(x)
    scale_y = np.sqrt(cov[1, 1]) * n_std
    mean_y = np.mean(y)

    # The unit ellipse is rotated by 45 degrees, stretched to the data
    # spread and finally moved onto the data mean.
    transf = transforms.Affine2D() \
        .rotate_deg(45) \
        .scale(scale_x, scale_y) \
        .translate(mean_x, mean_y)

    ellipse.set_transform(transf + ax.transData)
    return ax.add_patch(ellipse)
def av_noise_samples_series(calc, sampleID):
    '''
    Calculate the mean, median, standard deviation, maximum and minimum of
    the values in "calc", grouped by the values in "sampleID". Both inputs
    are pandas Series.

    Parameters
    -------
    calc: pandas.Series
        Values you want to average.

    sampleID: pandas.Series
        Identifier for each entry of calc (e.g., the Sample_ID_Cpx column).
        Entries of calc sharing the same identifier are averaged together.

    Returns
    -------
    pandas.DataFrame
        One row per unique identifier (in order of first appearance) with
        headings "Sample", "# averaged", "Mean_calc", "Median_calc",
        "St_dev_calc", "St_dev_calc_from_percentiles", "Max_calc", "Min_calc".
        "# averaged" is the number of rows in each group.
        "St_dev_calc_from_percentiles" is half the distance between the
        84th and 16th percentiles.

    Raises
    -------
    TypeError
        If calc is not a pandas Series.
    '''
    if not isinstance(calc, pd.Series):
        raise TypeError('calc must be a pandas Series')

    N = sampleID.unique()
    n_groups = len(N)
    counts = np.zeros(n_groups, dtype=int)
    Av_mean = np.zeros(n_groups, dtype=float)
    Av_median = np.zeros(n_groups, dtype=float)
    Max = np.zeros(n_groups, dtype=float)
    Min = np.zeros(n_groups, dtype=float)
    Std = np.zeros(n_groups, dtype=float)
    IQR_Std = np.zeros(n_groups, dtype=float)

    for i, sam in enumerate(N):
        # Select the group once and reuse it for every statistic.
        var = calc[sampleID == sam].to_numpy(dtype=float)
        # Counted per group; a single scalar would be broadcast to all rows.
        counts[i] = len(var)
        Av_mean[i] = np.nanmean(var)
        Av_median[i] = np.nanmedian(var)
        Std[i] = np.nanstd(var)
        Min[i] = np.nanmin(var)
        Max[i] = np.nanmax(var)
        # NaN-aware, to agree with the other statistics above.
        IQR_Std[i] = 0.5 * np.abs(
            np.nanpercentile(var, 84) - np.nanpercentile(var, 16))

    Err_out = pd.DataFrame(data={'Sample': N,
                                 '# averaged': counts,
                                 'Mean_calc': Av_mean,
                                 'Median_calc': Av_median,
                                 'St_dev_calc': Std,
                                 'St_dev_calc_from_percentiles': IQR_Std,
                                 'Max_calc': Max,
                                 'Min_calc': Min})
    return Err_out
def av_noise_samples_df(dataframe, calc_heading, ID_heading):
    '''
    Calculate the mean, median, standard deviation, maximum and minimum of
    the column "calc_heading" of a dataframe, grouping rows by the values in
    the column "ID_heading".

    Parameters
    -------
    dataframe: pandas.DataFrame
        Dataframe of inputs you want to average. Must contain the column
        headings "calc_heading" and "ID_heading".

    calc_heading: str
        Column heading for the thing you want to average (e.g, P_kbar_calc)

    ID_heading: str
        Column heading for the thing you want to average by (e.g., Sample_ID)

    Returns
    -------
    pandas.DataFrame
        One row per unique ID (in order of first appearance) with headings
        "Sample", "Mean_calc", "Median_calc", "St_dev_calc",
        "St_dev_calc_from_percentiles", "Max_calc", "Min_calc".
        "St_dev_calc_from_percentiles" is half the distance between the
        84th and 16th percentiles.

    Raises
    -------
    ValueError
        If "calc_heading" does not select exactly one column.
    '''
    calc = dataframe[calc_heading]
    sampleID = dataframe[ID_heading]
    if not isinstance(calc, pd.Series):
        raise ValueError('calc_heading must select exactly one column')

    N = sampleID.unique()
    n_groups = len(N)
    Av_mean = np.zeros(n_groups, dtype=float)
    Av_median = np.zeros(n_groups, dtype=float)
    Max = np.zeros(n_groups, dtype=float)
    Min = np.zeros(n_groups, dtype=float)
    Std = np.zeros(n_groups, dtype=float)
    IQR_Std = np.zeros(n_groups, dtype=float)

    # Group on the actual ID values; the loop counter is only the output row.
    for i, sam in enumerate(N):
        var = calc[sampleID == sam].to_numpy(dtype=float)
        Av_mean[i] = np.nanmean(var)
        Av_median[i] = np.nanmedian(var)
        Std[i] = np.nanstd(var)
        Min[i] = np.nanmin(var)
        Max[i] = np.nanmax(var)
        # NaN-aware, to agree with the other statistics above.
        IQR_Std[i] = 0.5 * np.abs(
            np.nanpercentile(var, 84) - np.nanpercentile(var, 16))

    Err_out = pd.DataFrame(data={'Sample': N,
                                 'Mean_calc': Av_mean,
                                 'Median_calc': Av_median,
                                 'St_dev_calc': Std,
                                 'St_dev_calc_from_percentiles': IQR_Std,
                                 'Max_calc': Max,
                                 'Min_calc': Min})
    return Err_out
def turn_series_into_error(*, elx='Cpx', variable, variable_err):
    '''
    Build a dataframe of per-sample errors in which only one oxide has a
    non-zero error.

    Parameters
    -------
    elx: str
        Phase suffix used in the column headings (e.g. 'Cpx', 'Liq').

    variable: str
        Oxide name without the phase suffix (e.g. 'Na2O').

    variable_err: pandas.Series or array-like with a length
        One error value per sample. Values are used in the order given;
        any index on a Series is ignored.

    Returns
    -------
    pandas.DataFrame
        One row per entry of variable_err, with columns
        "SiO2_{elx}_Err", "TiO2_{elx}_Err", "Al2O3_{elx}_Err",
        "FeOt_{elx}_Err", "MnO_{elx}_Err", "MgO_{elx}_Err", "CaO_{elx}_Err",
        "Na2O_{elx}_Err", "K2O_{elx}_Err" and "Cr2O3_{elx}_Err".
        All columns are zero except "{variable}_{elx}_Err". If that heading
        is not one of the columns listed, every column stays zero.
    '''
    oxides = ('SiO2', 'TiO2', 'Al2O3', 'FeOt', 'MnO',
              'MgO', 'CaO', 'Na2O', 'K2O', 'Cr2O3')
    cols = ['{}_{}_Err'.format(oxide, elx) for oxide in oxides]

    n_samples = len(variable_err)
    Error = pd.DataFrame(data=0, columns=cols, index=range(n_samples))

    target = variable + '_' + elx + '_Err'
    if target in cols:
        # Assign positionally: a Series with a non-default index would
        # otherwise be aligned on labels and silently turn into NaN.
        Error[target] = np.asarray(variable_err)

    return Error
def add_noise_sample_1phase(phase_comp, phase_err=None,
                            phase_err_type="Abs", variable=None, variable_err=None,
                            variable_err_type=None, duplicates=10,
                            noise_percent=None, err_dist="normal", positive=True,
                            filter_q=None, append=False):
    '''
    Generate N duplicates containing random noise from the compositions in
    the dataframe specified by phase_comp.

    Parameters
    -------
    phase_comp: pandas.DataFrame
        Phase compositions, with headings ending in _Liq for liquids, _Cpx
        for clinopyroxenes etc. The phase is detected from these suffixes
        (options: Cpx, Plag, Opx, Sp, Kspar, Amp, Liq, Ol).

    Options for adding different types of error:

    1) If you want to specify an error for >1 variable:

        phase_err: pandas.DataFrame
            Errors for each oxide, with headings such as SiO2_Liq_Err or
            SiO2_Cpx_Err, one row per row of phase_comp.

        phase_err_type: "Abs" (default) or "Perc"
            Whether the entries of phase_err are absolute or percentage
            errors. Only used when err_dist="normal".

    2) If you want to specify error for a single variable:

        variable: str
            Name of the column to add error to, without the phase suffix
            (e.g. "Na2O"), or "P_kbar" / "T_K".

        variable_err: flt, int, pandas.Series or numpy.ndarray
            How much error to add. A Series/array gives one error per
            sample and is only supported with err_dist="normal".

        variable_err_type: "Abs" or "Perc"
            Whether variable_err is an absolute or percentage error.
            Required whenever variable_err is given.

    3) If you want to add a fixed percent of noise to all variables:

        noise_percent: flt, int
            Noise, in percent, added to every column.

    duplicates: int (Default: 10)
        Number of synthetic samples generated per input row. E.g. 7 input
        rows with duplicates=1000 return 7000 compositions.

    err_dist: "normal" (default) or "uniform"
        Normally distributed noise with 1 sigma = entered value, or noise
        uniformly distributed between - and + the entered value.

    positive: True (default) or False
        If True, negative values in numeric columns are replaced with zero
        (this can produce a non-normal error distribution).

    filter_q: str
        Query string passed to DataFrame.query, e.g. "SiO2_Liq>60"; only
        matching rows are used.

    append: False (default) or True
        If True, the (filtered) input rows are placed in front of the
        synthetic rows in the returned dataframe.

    Returns
    -------
    pandas.DataFrame
        Synthetic compositions, ordered so that all duplicates of the first
        input row come first, then those of the second row, etc. The columns
        "Sample_ID_{phase}" and "Sample_ID_{phase}_Num" identify the input
        row each synthetic row was generated from. When phase_err (or a
        per-sample variable_err) is used with err_dist="normal", only the
        standard oxide columns for the phase are returned (plus P_kbar or
        T_K when named by variable). Remaining NaN are replaced with zero.

    Raises
    -------
    ValueError
        For an invalid phase_err_type, err_dist or variable_err_type, when
        no phase can be detected, or when the supplied options do not
        describe any noise.
    Exception
        For mutually exclusive options, or when phase_err and the data do
        not have the same number of rows.
    '''
    if phase_err_type not in ['Abs', 'Perc']:
        raise ValueError("Invalid value for phase_err_type. Please choose 'Abs' or 'Perc'.")
    if err_dist not in ['normal', 'uniform']:
        raise ValueError("Invalid value for err_dist. Please choose 'normal' or 'uniform'.")

    if variable is not None and noise_percent is not None:
        raise Exception('noise_percent is an arguement on its own '
                        'it adds noise to all variables. Either specify variable or '
                        'noise_percent not both')
    if variable_err is not None and noise_percent is not None:
        raise Exception('noise_percent adds noise to all variables '
                        'while variable_err adds noise to a single variable. '
                        'Specify only one of these arguements')
    if phase_err is not None and noise_percent is not None:
        raise Exception('You have entered both a dataframe of noise and '
                        'specified a percent noise. Select only 1 of these options')
    if (variable is not None and variable_err is not None
            and variable_err_type not in ['Abs', 'Perc']):
        raise ValueError("variable_err_type must be 'Abs' or 'Perc' when "
                         "variable_err is specified.")

    duplicates = int(duplicates)

    if filter_q is not None:
        Sample_c = phase_comp.query(filter_q).copy()
    else:
        Sample_c = phase_comp.copy()

    # Work out which phase the data is for. If several suffixes are
    # present, the last match in this list wins.
    elx = None
    for option in ["Cpx", "Plag", "Opx", "Sp", "Kspar", "Amp", "Liq", "Ol"]:
        if any(Sample_c.columns.str.contains(f"_{option}")):
            elx = option
    if elx is None:
        raise ValueError('Could not detect a phase: no column heading contains '
                         '_Cpx, _Plag, _Opx, _Sp, _Kspar, _Amp, _Liq or _Ol')

    id_col = 'Sample_ID_{}'.format(elx)
    num_col = 'Sample_ID_{}_Num'.format(elx)

    if id_col not in Sample_c.columns:
        Sample_c[id_col] = 'No Name Entered'

    if len(Sample_c[id_col].unique()) != len(Sample_c):
        w.warn('Non unique sample names. We have appended the index onto all sample names to save issues with averaging later')
        # Built positionally so it also works when the index is not 0..n-1
        # (e.g. after filter_q).
        Sample_c[id_col] = ['{}_{}'.format(name, idx)
                            for name, idx in zip(Sample_c[id_col], Sample_c.index)]

    # A Series/array of errors is handled as a phase_err dataframe below.
    per_sample_err = (variable is not None
                      and isinstance(variable_err, (pd.Series, np.ndarray)))

    mynoisedDataframe = None

    if phase_err is None or err_dist == "uniform":
        Sample_c[num_col] = Sample_c.index

        # Repeat each row `duplicates` times (Cpx1, Cpx1, ..., Cpx2, Cpx2, ...).
        # Positional take keeps each column's dtype, which the negative-value
        # clipping below relies on.
        row_positions = np.repeat(np.arange(len(Sample_c)), duplicates)
        Dup_Sample = Sample_c.iloc[row_positions].reset_index(drop=True)

        # Sample identifiers are set aside so no noise is added to them.
        Sample_name_num = Dup_Sample[num_col]
        Sample_name_str = Dup_Sample[id_col]
        Dup_Sample = Dup_Sample.drop(columns=[num_col, id_col])

        if variable is not None and variable_err is not None and not per_sample_err:
            if variable == "P_kbar" or variable == "T_K":
                target = variable
            else:
                target = '{}_{}'.format(variable, elx)
            n_rows = Dup_Sample.shape[0]

            if variable_err_type == "Abs":
                spread = variable_err
            else:
                spread = Dup_Sample[target] * (variable_err / 100)

            if err_dist == "normal":
                Noise = np.random.normal(0, spread, n_rows)
            else:
                Noise = np.random.uniform(- spread, + spread, n_rows)

            mynoisedDataframe = Dup_Sample.copy()
            mynoisedDataframe[target] = mynoisedDataframe[target] + Noise

        if noise_percent is not None:
            if err_dist == "uniform":
                noise = np.random.uniform(- noise_percent / 100,
                                          + noise_percent / 100, Dup_Sample.shape)
            else:
                noise = np.random.normal(0, noise_percent / 100, Dup_Sample.shape)
            mynoisedDataframe = Dup_Sample + Dup_Sample * noise

        if phase_err is not None and err_dist == "uniform":
            # Errors are combined with the data by position, so the columns
            # of phase_err must be in the same order as the data columns.
            # NOTE(review): phase_err_type is not consulted on this path.
            Dup_Noise = pd.DataFrame(
                np.repeat(phase_err.values, duplicates, axis=0),
                columns=phase_err.columns)
            # Argument order (1, -1) kept so seeded runs give the same draws.
            noise = np.random.uniform(1, -1, Dup_Noise.shape)
            mynoisedDataframe = (Dup_Noise * noise).to_numpy() + Dup_Sample

    if per_sample_err:
        phase_err = turn_series_into_error(elx=elx, variable=variable,
                                           variable_err=variable_err)
        phase_err_type = variable_err_type

    if phase_err is not None and err_dist == "normal":
        # Users enter 2 dataframes: 1 of measurements, 1 of 1 sigma errors.
        Data = Sample_c.drop(id_col, axis=1)

        if phase_err_type == "Abs":
            Err = phase_err
        else:
            Err_perc = phase_err.copy()
            # Strip '_Err' so the headings line up with the data headings.
            Err_perc.columns = Err_perc.columns.str.replace('_Err', '')
            if id_col in Err_perc.columns:
                Err_perc = Err_perc.drop(id_col, axis=1)
            Err = Data * (Err_perc / 100)
            # adding _Err back in
            Err.columns = [str(col) + '_Err' for col in Err.columns]

        if len(Err) != len(Data):
            raise Exception('Your data and error input data frames arent the same length')

        n_rows = len(Data)
        core = ['SiO2', 'TiO2', 'Al2O3', 'FeOt', 'MnO',
                'MgO', 'CaO', 'Na2O', 'K2O', 'Cr2O3']
        extras = [name for name in ('P_kbar', 'T_K')
                  if isinstance(variable, str) and variable == name]
        optional = [sp for sp in ('NiO', 'F', 'Cl', 'P2O5', 'H2O')
                    if '{}_{}'.format(sp, elx) in Data.columns]

        # Every possible output starts as a full-length array of zeros, so
        # species absent from the data are returned as zero.
        noised = {key: np.zeros(duplicates * n_rows, dtype=float)
                  for key in core + ['P_kbar', 'T_K', 'NiO', 'F', 'Cl', 'P2O5', 'H2O']}

        # (output key, data column, error column), in the order the random
        # draws are made for each sample.
        draw_plan = [(sp, '{}_{}'.format(sp, elx), '{}_{}_Err'.format(sp, elx))
                     for sp in core]
        draw_plan += [(name, name, name + '_Err') for name in extras]
        draw_plan += [(sp, '{}_{}'.format(sp, elx), '{}_{}_Err'.format(sp, elx))
                      for sp in optional]

        locs = {}
        scales = {}
        for key, data_col, err_col in draw_plan:
            locs[key] = Data[data_col].to_numpy()
            if key in optional and err_col not in Err.columns:
                # No error supplied for an optional species: no noise added.
                scales[key] = np.zeros(n_rows, dtype=float)
            else:
                scales[key] = Err[err_col].to_numpy()

        for i in range(n_rows):
            rows = slice(i * duplicates, (i + 1) * duplicates)
            for key, _, _ in draw_plan:
                noised[key][rows] = np.random.normal(
                    loc=locs[key][i], scale=scales[key][i], size=duplicates)

        phase_extras = {'Ol': ['NiO'], 'Sp': ['NiO'],
                        'Amp': ['F', 'Cl'], 'Liq': ['P2O5', 'H2O']}
        mynoisedDataframe = pd.DataFrame(
            data={'{}_{}'.format(sp, elx): noised[sp]
                  for sp in core + phase_extras.get(elx, [])})

        if elx == "Liq":
            # df_ideal_liq is not defined in this function; presumably it
            # comes from the `from Thermobar.core import *` at the top of
            # the file - TODO confirm.
            mynoisedDataframe = mynoisedDataframe.reindex(
                df_ideal_liq.columns, axis=1).fillna(0)
            mynoisedDataframe = mynoisedDataframe.apply(
                pd.to_numeric, errors='coerce').fillna(0)

        # Return the noised values of the variable that was asked for.
        for name in extras:
            mynoisedDataframe[name] = noised[name]

        Sample_name_num = np.repeat(np.arange(n_rows, dtype=float), duplicates)
        Sample_name_str = np.repeat(
            Sample_c[id_col].to_numpy(dtype=object), duplicates)

    if mynoisedDataframe is None:
        raise ValueError('No noise could be generated from the supplied options. '
                         'Provide phase_err, noise_percent, or variable together '
                         'with variable_err (a per-sample variable_err requires '
                         'err_dist="normal").')

    mynoisedDataframe[num_col] = Sample_name_num
    mynoisedDataframe[id_col] = Sample_name_str

    if positive is True:
        numeric_cols = mynoisedDataframe.select_dtypes(include='number').columns
        mynoisedDataframe[numeric_cols] = mynoisedDataframe[numeric_cols].clip(lower=0)
        print('All negative numbers replaced with zeros. '
              'If you wish to keep these, set positive=False')

    mynoisedDataframe = mynoisedDataframe.fillna(0)

    if append is True:
        return pd.concat([Sample_c, mynoisedDataframe], axis=0)
    return mynoisedDataframe
def calculate_bootstrap_mixes(endmember1, endmember2, num_samples, self_mixing=False):
    '''
    Specify 2 end-members, generates synthetic liquids from mixing between
    these end-members.

    Parameters
    -------
    endmember1: pandas.DataFrame
        Liquid compositions for end-member 1, with column headings
        SiO2_Liq etc.

    endmember2: pandas.DataFrame
        Liquid compositions for end-member 2, with column headings
        SiO2_Liq etc.

    num_samples: int (or integer-valued float)
        Number of synthetic liquids to generate. Each end-member is randomly
        sampled with replacement num_samples times, so end-members with
        fewer rows than num_samples are up-sampled and those with more rows
        are down-sampled.

    self_mixing: None, False, True, "Partial"
        If None or False, every mix is between one liquid from end-member 1
        and one liquid from end-member 2.
        If True, both liquids of each mix are drawn from the pooled set of
        all entered liquids, so mixes within an end-member also occur.
        If "Partial", num_samples mixes of each of the two kinds above are
        generated and num_samples rows are then randomly drawn (with
        replacement) from their combination.

    Returns
    -------
    pandas.DataFrame
        Synthetic liquids with column headings "SiO2_Liq" etc. Each mix is
        f * liquid1 + (1 - f) * liquid2 with f drawn uniformly between 0
        and 1. NaN values are replaced with zero.

    Raises
    -------
    ValueError
        If self_mixing is not one of None, False, True or "Partial".
    '''
    is_partial = isinstance(self_mixing, str) and self_mixing == "Partial"
    if not (is_partial or self_mixing is True
            or self_mixing is False or self_mixing is None):
        raise ValueError('self_mixing must be None, False, True or "Partial"')

    Elements = ['SiO2_Liq', 'TiO2_Liq', 'Al2O3_Liq', 'FeOt_Liq', 'MnO_Liq',
                'MgO_Liq', 'CaO_Liq', 'Na2O_Liq', 'K2O_Liq', 'Cr2O3_Liq',
                'P2O5_Liq', 'H2O_Liq']
    num_samples = int(num_samples)

    # One mixing fraction per synthetic liquid; the (num_samples, 1) shape
    # broadcasts across all element columns.
    f = np.random.uniform(0, 1, (num_samples, 1))

    def _resample(frame):
        # num_samples rows drawn with replacement, as a plain array.
        return frame[Elements].sample(n=num_samples, replace=True).to_numpy()

    if is_partial:
        # Mixing within and between end-members: both liquids come from the
        # pooled set.
        pooled = pd.concat([endmember1, endmember2], ignore_index=True)
        pooled_first = _resample(pooled)
        pooled_second = _resample(pooled)
        combined_model_self = pooled_first * f + pooled_second * (1 - f)

        # Mixing strictly between the two end-members.
        cross_first = _resample(endmember1)
        cross_second = _resample(endmember2)
        combined_model = cross_first * f + cross_second * (1 - f)

        myDataframe = pd.concat([pd.DataFrame(combined_model, columns=Elements),
                                 pd.DataFrame(combined_model_self, columns=Elements)])
        myDataframe = myDataframe.sample(n=num_samples, replace=True)
    else:
        if self_mixing is True:
            pooled = pd.concat([endmember1, endmember2], ignore_index=True)
            first = _resample(pooled)
            second = _resample(pooled)
        else:
            first = _resample(endmember1)
            second = _resample(endmember2)
        combined_model = first * f + second * (1 - f)
        myDataframe = pd.DataFrame(combined_model, columns=Elements)

    myDataframe = myDataframe.fillna(0)
    return myDataframe