1

Name:Jose Haro

I collaborated with:Cinthya Montoya

2

In [6]:

import numpy as np import seaborn as sns import pandas as pd from scipy.stats.stats import pearsonr import matplotlib.pyplot as plt import scipy.stats as stats

3

In [7]:

#1 #Pearson's: The x and y data must be Gaussian, roughly linear, and homoscedastic #Spearman's: The x and y data must be ranked, but violates the Gaussian shape that is needed in order to perform Pearson's. However, the data remains to be homoscedastic #Mutual Information: The x and y data is not monotonic.

4

In [8]:

#2 river=pd.read_csv("nenana.csv", sep="\t") #prriver=pearsonr(river["Ice Breakup Day of Year"], river["Year"]) reg=stats.linregress(river["Year"], river["Ice Breakup Day of Year"]) m=reg.slope b=reg.intercept x=river["Year"] x_plot=np.linspace(1920,2020,100) y_plot=m*x_plot+b sns.lmplot("Year", "Ice Breakup Day of Year", data=river, fit_reg=True) plt.plot(x_plot,y_plot) print("The slope is:",m) print("The y-intercept is:", b)

5

The slope is: -0.08441029301937356
The y-intercept is: 289.7854275359135

In [9]:

#Confidence Interval m=-0.08441029301937356 #observed slope n=len(river) sims=np.zeros(10000) for i in range(10000): a=river.sample(n, replace=True) reg=stats.linregress(a["Year"], a["Ice Breakup Day of Year"]) slope=reg.slope sims[i]=slope sims.sort() M_lower = sims[49] M_upper = sims[9949] CI_low = m*2 - M_upper CI_high = m*2 - M_lower p=sns.distplot(sims) p.axvline(CI_low, color="green") p.axvline(CI_high, color="red") p.axvline(m) print("The 99th % Confidence Interval lies between", CI_low, "and", CI_high)

6

The 99th % Confidence Interval lies between -0.14290532643298176 and -0.03016814688305683

In [10]:

#3 #Comparing One-group to a Refernce value: Monte Carlo simulations using big box model #Comparing Two Groups: Two Box with re-centering, two-box with re-sampling for CI. #Comparing Proportions: Box-Model with two populations for conditions (i.e: 100*[Survived] + 50*[Dead]) #Correlation: A np.shuffle is needed to destroy the relationship between x and y in order to simulate the Pearson's/Spearman's correlation for the null hypothesis. #Paired Analysis:2 box-bootstrap method (re-centering) #ANOVA: Big-Box with re-centering; to find a difference before we begin to calculate our p-value #Multi-Factor ANOVA: 2/3-box with re-centering

7

In [11]:

#4 #In lab #10, we calculated power through changes that we created in a phantom world. While in Halsey et al's journal article power is also being measured, the differences in power values are found through changes made in the sample size of the study rather than changes made in the phantom world. It would be also important to note that measurement used for effect sizes are drastically different between Lab #10 and the journal article, as relative risk is used in the lab while the measurement in p-values was used in the journal. #Similarities between the two include the interpretation of the p-value in relation to the power. The result of having a low power with a p-value means that the study, on its own, does not have (quite literally!) the power to support the p-value of a negative association, which is usually our alternative hypothesis. Another similarity can be drawn from the inclusion of effect size from both the lab and the journal article.

8

In [12]:

#5 #Figure 4 demonstrates that differences between populations measured through m*mol^-1, different sample sizes, and differences in p-value all affect the value of power at any given value of difference between the populations.

9

In [13]:

#6a plant_eph = [34,51,20,47,22,19,24,18,33,45,36,42,30,25,44,33,47,35,51,15,24,13,25,32,49,28,37,17,13,32,24,18,22] plant_perm = [31,40,43,37,23,45,53,46,44,25,47,34,41,31,37,34,42,37,14,29,18,15,14,28] np_plant_eph = np.array(plant_eph) np_plant_perm = np.array(plant_perm) #Effect Size to be used: Difference of medians between ephemeral and wetland groups def med_diff(data1, data2): return (np.median(data1) - np.median(data2)) obs = med_diff(np_plant_eph, np_plant_perm)

10

In [14]:

#Method used to determine Power: increasing the sample size to ten times the amount, that is increasing the length of a dataset to ten times the length. n1 = len(np_plant_eph) n2 = len(np_plant_perm) total = n1 + 10*n2

11

In [15]:

population = np.concatenate([np_plant_eph, np_plant_perm]) sims = 1000 zeros = np.zeros(sims) for i in range(sims): sim_group1 = np.random.choice(population,n1) sim_group2 = np.random.choice(population,10*n2) diffmeds = (np.median(sim_group2) - np.median(sim_group1)) zeros[i]=diffmeds #Plotting q = sns.distplot(zeros, kde = False) q.set(title = "Null Distribution of Difference of Medians", xlabel = 'Difference of Medians', ylabel = 'Count') q.axvline(obs, color = 'green') q.axvline(diffmeds, color = 'red') #P-value phantom_pvalue = np.sum(sum(zeros>= diffmeds) + sum(zeros<=-1*diffmeds))/sims pvalue = np.sum(sum(zeros>=obs) + sum(zeros<=-1*obs))/sims print('p-value of phantom world is', phantom_pvalue) print('p-value is', pvalue)

12

p-value of phantom world is 0.832
p-value is 1.943

In [16]:

#Power Analysis a1 = 0.05 a2 = 0.01 power5 = np.sum(zeros < a1)/sims power1 = np.sum(zeros < a2)/sims print("For sample size *2:") print(" The power of this study with alpha = 0.05 is",power5,".") print(" The power of this study with alpha = 0.01 is",power1,".")

13

For sample size *2:
The power of this study with alpha = 0.05 is 0.562 .
The power of this study with alpha = 0.01 is 0.562 .

In [26]:

def med_diff(data1, data2): return (np.median(data1) - np.median(data2)) obs = med_diff(np_plant_eph, np_plant_perm) #NHST for Power Analysis def NHST_Power(data1, data2): population = np.concatenate([data1, data2]) obs = med_diff(data1, data2) np_data1 = np.array(data1) np_data2 = np.array(data2) n1 = len(data1) n2 = len(data2) sims = 1000 # Make number of bootstraps a variable, so we can easily run a small number of bootstraps for testing zeros = np.zeros(sims) # Create array of zeros to hold a calculated statistic for each simulation based on the null hypothesis for i in range(sims): sim_group1 = np.random.choice(population,n1) sim_group2 = np.random.choice(population,n2) diffmeds = (np.median(sim_group1) - np.median(sim_group2)) zeros[i] = diffmeds pvalue = np.sum(sum(zeros>= np.abs(obs)) + sum(zeros<= -1*(np.abs(obs))))/sims return pvalue NHST_Power(np_plant_eph, np_plant_perm)

14

0.238

In [33]:

#Same Effect Size n1 = len(np_plant_eph) n2 = len(np_plant_perm) sims1 = 1000 zeros1 = np.zeros(sims1) for i in range(sims1): box1 = np.random.choice(np_plant_eph,n1) box2 = np.random.choice(np_plant_perm, n2) zeros1[i] = (NHST_Power(box1, box2))

15

In [34]:

sns.distplot(zeros1, kde = False)

16

<matplotlib.axes._subplots.AxesSubplot at 0x7fe379c8bac8>

In [35]:

#Calculating Power a1 = 0.05 a2 = 0.01 power5 = np.sum(zeros1 < a1)/sims1 power1 = np.sum(zeros1 < a2)/sims1 print("For sample size ") print(" The power of this study with alpha = 0.05 is",power5,".") print(" The power of this study with alpha = 0.01 is",power1,".")

17

For sample size *2:
The power of this study with alpha = 0.05 is 0.205 .
The power of this study with alpha = 0.01 is 0.05 .

In [13]:

#6b #Based on the unchanging power values between both levels of alpha,

18

In [14]:

#6c #Altering the MADAM values would affect the overall data, which would then alter the power value of the this study at the cost of changing the actual data sets of the study. Changing the effect size by increasing could have also affected the power level of the study, as well as increasing the sample size.

19

In [15]:

#7 #Because when we performed the NHST between the plant species, we had only found it while only considering the difference of the medians without taking into deep consideration of the MADAM values for both the permanent and wetland species. However, when taking into consideration the difference of MADAM values between the two groups, a 1.0 unit change can alter the power level of the study which then can further detect the number of cases that demonstrated a significant difference which would contribute to the significant p-value. Additionally, the MADAM values that were used in this homework problem are different from the actual MADAM values of Homework 4, the difference between the two is that the MADAM values of Homework #9 are lesser than the ones present in Homework #4. This is to say that the in-group variation in Homework #4 had decreased the power value of the study; by lowering the MADAM values to 3 and 2 the power of the study had increased.

20

In [16]:

#8 plant_eph = [34,51,20,47,22,19,24,18,33,45,36,42,30,25,44,33,47,35,51,15,24,13,25,32,49,28,37,17,13,32,24,18,22] plant_perm = [31,40,43,37,23,45,53,46,44,25,47,34,41,31,37,34,42,37,14,29,18,15,14,28] np_plant_eph = np.array(plant_eph) np_plant_perm = np.array(plant_perm) #Effect Size to be used: Difference of medians between ephemeral and wetland groups def med_diff(data1, data2): return (np.median(data1) - np.median(data2)) obs = med_diff(np_plant_eph, np_plant_perm)

21

In [44]:

#Method used to determine Power: increasing the sample size to ten times the amount, that is increasing the length of a dataset to ten times the length. n1 = 7*(len(np_plant_eph)) n2 = 7*(len(np_plant_perm)) sims1 = 1000 zeros1 = np.zeros(sims1) for i in range(sims1): box1 = np.random.choice(np_plant_eph,n1) box2 = np.random.choice(np_plant_perm, n2) zeros1[i] = (NHST_Power(box1, box2)) a1 = 0.05 a2 = 0.01 power5 = np.sum(zeros1 < a1)/sims1 power1 = np.sum(zeros1 < a2)/sims1 print("For sample size ") print(" The power of this study with alpha = 0.05 is",power5,".") print(" The power of this study with alpha = 0.01 is",power1,".")

22

For sample size
The power of this study with alpha = 0.05 is 0.838 .
The power of this study with alpha = 0.01 is 0.681 .
231

In [45]:

print(n1) print(n2)

23

231
168

In [ ]:

#In order to provide a power of at least 80% for an alpha value of 0.05, the sample size needed to have been multiplied by 7 times (n = 231 for the ephemeral wetland group, and n = 168 for the permanent wetland groups). In order to have both alpha values at least 80%, a factor of 11 would have been needed.

24