################################################################################ # R Script for selection gradient analysis # If you have questions, contact D. Johnson at johnson@nceas.ucsb.edu # Updated: 06/18/2012 # The file is set up such that users can run the following code by copying and # pasting it into R. Sections of code that are typed in ALL CAPITALS need to # be replaced with the appropriate information (e.g., trait names, file # locations, names of grouping variables, etc.) supplied by the user. ################################################################################ # Required Packages # ################################################################################ # This program uses the "MASS" package for R, therefore this package needs to # be installed prior to using the script. To install, type the following at the # command prompt upon opening R. install.packages("MASS", dependencies=TRUE) # and then select an appropriate mirror for download (e.g., USA(CA1)). # Once the MASS package has been installed, load it to continue: library(MASS) ################################################################################ # INPUT FILE # ################################################################################ # The script expects a single file as data input. The default is to have these # files in comma delimited format (.csv), although appropriate changes to the # script can be made to accomodate other formats (e.g., tab delimited data) # In the file, rows index individuals (samples), columns index traits. # Note that the file should contain a header row. # There should also be a column that distinguishes the before selection and # after selection samples. # The file may also contain a column to distinguish groups for separate analyses ################################################################################ # BEGIN USER INPUT SECTION # ################################################################################ # Set the file pathway to the appropriate folder where the data file is saved # For example, for a comma-separated file in a subfolder called MV selection # on a removable disk(in this case the I drive), the path is: data_set_path<-"I:/MV selection/EXAMPLE.DATA.FILE.csv" ######################## Reading in Datafile ################################## # Code is expecting comma delimited data, for tab delimited change to sep=" " data.file<-read.table(data_set_path, header=TRUE, sep=",") #OPTIONAL: if the file contains data for multiple groups that need to be #analyzed separately(e.g., different populations, cohorts, times, etc.), #specify the group of interest here: data.subset=subset(data.file, data.file$SELECTION.GROUP=="GROUP1") # OR # #if the data in "data.file" are for a single population of interest data.subset=data.file # specify before and after groups: before.sel.samp=subset(data.subset, data.subset$SAMPLE.GROUP=="BEFORE") after.sel.samp=subset(data.subset, data.subset$SAMPLE.GROUP=="AFTER") #specify traits to include in the analysis (names come after the $ symbol # i.e., before.sel.samp$TRAITNAME) # for brevity, this example analyzes two traits. If needed, include more traits # after TRAIT.2, separating the list with commas, e.g., before.sel.samp$TRAIT.1, # before.sel.samp$TRAIT.2, before.sel.samp$TRAIT.3, etc. bef.sel.traits=cbind(before.sel.samp$TRAIT.1, before.sel.samp$TRAIT.2) aft.sel.traits=cbind(after.sel.samp$TRAIT.1, after.sel.samp$TRAIT.2) no.traits=dim(bef.sel.traits)[2] #visualizing trait relationships: pairs(bef.sel.traits) # produces a scatterplot matrix of the raw data # scale(traits) standardizes trait values by subtracting the mean then dividing #by the sample standard deviation so pairs(scale(bef.sel.traits)) # plots the standardized data # checking for multicollinearity: P=cov(scale(bef.sel.traits), use="pairwise.complete.obs") # produces the phenotypic covariance matrix in the before selection sample. # Here data are standardized P kappa(P, exact=T) # the r function 'kappa' gives matrix condition number. # greater than 10-20 is a roungh benchmark for problematic collinearity # large numbers indicate greater problems. Extremely large numbers indicate # that the matrix is almost uninvertable and the resulting calculations will be # extremely variable and unreliable # if kappa(P) is sufficiently low, proceed. If values are high, one or more # variables may need to be dropped # Calculating (linear) selection differentials (removes missing values from # calculations) MN=function(x){mean(x, na.rm=T)} SD=function(x){sd(x, na.rm=T)} bef.sel.means=apply(bef.sel.traits, 2, MN) bef.sel.sds=apply(bef.sel.traits, 2, SD) aft.sel.means=apply(aft.sel.traits, 2, MN) sel.differentials=(aft.sel.means-bef.sel.means)/bef.sel.sds # This calculates standardized selection differentials sel.differentials # To calculate selection gradients, multiply the inverse of the P matrix # by the vector of selection differentials: sel.gradients=ginv(P)%*%sel.differentials sel.gradients # NONLINEAR DIFFERENTIALS AND GRADIENTS P.aft=cov(scale(aft.sel.traits, center=bef.sel.means, scale=bef.sel.sds), use="pairwise.complete.obs") ssT= as.matrix(sel.differentials)%*%t(as.matrix(sel.differentials)) NL.differentials=(P.aft-P+ssT) NL.differentials # calculating nonlinear selection gradients: NL.gradients=ginv(P)%*%NL.differentials%*%ginv(P) NL.gradients # resampling procedure to estimate variability in differentials and gradients resamp.differentials=NULL resamp.gradients=NULL resamp.NL.diff=NULL resamp.NL.grad=NULL NL.index=NULL # indicates whether nonlinear selection is derived from # changes in variances, in which case the indicies to specify rows and columns #would be the same: 1,1 or 2,2 etc. OR changes in covariances, in which case #the indicies would be different: 1,2 or 2,4 etc. for(h in 1:no.traits){ col.2=which(1:no.traits>=h) col.1=rep(h,length(col.2)) part.index=cbind(col.1, col.2) NL.index=rbind(NL.index, part.index) } for(i in 1:1000){ samp.bef.sel=bef.sel.traits[sample(seq(1:length(bef.sel.traits[,1])), length(bef.sel.traits[,1]), replace=T),] samp.aft.sel=aft.sel.traits[sample(seq(1:length(aft.sel.traits[,1])), length(aft.sel.traits[,1]), replace=T),] samp.bef.sel.means=apply(samp.bef.sel, 2, MN) samp.bef.sel.sds=apply(samp.bef.sel, 2, SD) samp.aft.sel.means=apply(samp.aft.sel, 2, MN) Df=(samp.aft.sel.means-samp.bef.sel.means)/samp.bef.sel.sds resamp.differentials=rbind(resamp.differentials, Df) samp.P=cov(scale(samp.bef.sel),use="pairwise.complete.obs") resamp.gradients=rbind(resamp.gradients, t(ginv(P)%*%Df)) samp.P.aft=cov(scale(samp.aft.sel, center=samp.bef.sel.means, scale=samp.bef.sel.sds), use="pairwise.complete.obs") samp.ssT= as.matrix(Df)%*%t(as.matrix(Df)) samp.C= (samp.P.aft-samp.P+samp.ssT) NL.diff.values=NULL for(j in 1:dim(NL.index)[1]){ NL.diff.values=c(NL.diff.values,samp.C[NL.index[j,1],NL.index[j,2]] ) } resamp.NL.diff=rbind(resamp.NL.diff, NL.diff.values ) samp.NL.grad=ginv(samp.P)%*%samp.C%*%ginv(samp.P) NL.grad.values=NULL for(k in 1:dim(NL.index)[1]){ NL.grad.values=c(NL.grad.values,samp.NL.grad[NL.index[k,1],NL.index[k,2]] ) } resamp.NL.grad=rbind(resamp.NL.grad,NL.grad.values) } # Estimates of variability in selection measures: apply(resamp.differentials, 2, SD) apply(resamp.gradients, 2, SD) # values for nonlinear measures are elements of a matrix. See the values in # "NL.index" to find corresponding row and column locators apply(resamp.NL.diff, 2, SD) apply(resamp.NL.grad, 2, SD) NL.index