################################################################################ # R Script for visualizing selection surfaces estimated from cross-sectional # data. If you have questions, contact D. Johnson at johnson@nceas.ucsb.edu # Updated: 06/18/2012 # The file is set up such that users can run the following code by copying and # pasting it into R. Sections of code that are typed in ALL CAPITALS need to # be replaced with the appropriate information (e.g., trait names, file # locations, names of grouping variables, etc.) supplied by the user. ################################################################################ # Required Packages # ################################################################################ # This program uses the "mgcv" package for R, therefore this package needs to # be installed prior to using the script. To install, type the following at the # command prompt upon opening R. install.packages("mgcv", dependencies=TRUE) # and then select an appropriate mirror for download (e.g., USA(CA1)). # Once the mgcv package has been installed, load it to continue: library(mgcv) ################################################################################ # INPUT FILE # ################################################################################ # The script expects a single file as data input. The default is to have these # files in comma delimited format (.csv), although appropriate # changes to the script can be made to accomodate other formats (tab delimited # data, etc.) # In the file, rows index individuals (samples), columns index traits. # Note that the file should contain a header row # There should also be a column that distinguishes the before selection and # after selection samples. # The file may also contain a column to distinguish groups for separate analyses ################################################################################ # BEGIN USER INPUT SECTION # ################################################################################ # Set the file pathway to the appropriate folder where the data file is saved # For example, for a comma-separated file in a subfolder called MV selection # on a removable disk(in this case the I drive), the path is: data_set_path<-"I:/MV selection/EXAMPLE.DATA.FILE.csv" ######################## Reading in Datafile ################################## # Code is expecting comma delimited data, for tab delimited change to sep=" " data.file<-read.table(data_set_path, header=TRUE, sep=",") #OPTIONAL: if the file contains data for multiple groups that need to be #analyzed separately(e.g., different populations, cohorts, times, etc.), #specify the group of interest here: data.subset=subset(data.file, data.file$SELECTION.GROUP=="GROUP1") # OR # #if the data in "data.file" are for a single population of interest data.subset=data.file ############################################################################### # visualizing total selection on single traits (analagous to selection # differentials) # specify before and after groups: before.sel.samp=subset(data.subset, data.subset$SAMPLE.GROUP=="BEFORE") after.sel.samp=subset(data.subset, data.subset$SAMPLE.GROUP=="AFTER") #specify traits to include in the analysis (names come after the $ symbol # i.e., before.sel.samp$TRAITNAME) # for brevity, this example analyzes two traits. If needed, include more traits # after TRAIT.2, separating the list with commas, e.g., before.sel.samp$TRAIT.1, # before.sel.samp$TRAIT.2, before.sel.samp$TRAIT.3, etc. bef.sel.traits=cbind(before.sel.samp$TRAIT.1, before.sel.samp$TRAIT.2) aft.sel.traits=cbind(after.sel.samp$TRAIT.1, after.sel.samp$TRAIT.2) MN=function(x){mean(x, na.rm=T)} SD=function(x){sd(x, na.rm=T)} bef.sel.means=apply(bef.sel.traits, 2, MN) bef.sel.sds=apply(bef.sel.traits, 2, SD) groups=c(rep(0,length(before.sel.samp[,1])), rep(1, length(after.sel.samp[,1]))) pooled.data=rbind(scale(bef.sel.traits), scale(aft.sel.traits, center=bef.sel.means, scale=bef.sel.sds)) reg.data=as.data.frame(cbind(groups, pooled.data)) names(reg.data)<-c("groups", "TRAIT.1", "TRAIT.2") # CAN ADD MORE TRAITS AFTER # TRAIT 2, if necessary Uz=gam(groups~s(TRAIT.1), data=reg.data, family=binomial) # Uz describes conditional probability of being in the 'after selection' # sample, given that an individual was sampled at all. Uz as a function of # phenotypic value. When gam is used to estimate the function, Uz is # represented by a thin plate regression spline (the default in mgcv package) summary(Uz) # PLOTTING INDIVIDUAL SELECTION SURFACE: T1=seq(min(reg.data$TRAIT.1),max(reg.data$TRAIT.1),(max(reg.data$TRAIT.1) -min(reg.data$TRAIT.1))/150) # Creates a vector of phenotype values over which to plot the selection surface # Alternatively use #T1=sequence(LOWER LIMIT, UPPER LIMIT, INCREMENT)#and substitute desired values N.before= dim(bef.sel.traits)[1]# before selection sample size N.after= dim(aft.sel.traits)[1] # after selection sample size pred.Uz=predict(Uz,list(TRAIT.1=T1), se.fit=TRUE) pred.Uz.upper=pred.Uz$fit+2*pred.Uz$se.fit pred.Uz.lower=pred.Uz$fit-2*pred.Uz$se.fit ISS=function(x){ (N.before/N.after)*exp(x) } # plots ISS plus two standard errors plot(T1, ISS(pred.Uz$fit), type="l", lwd=2, ylab=("Relative Survival"), xlab=("TRAIT.1")) lines(T1, ISS(pred.Uz.upper), lty=2) lines(T1, ISS(pred.Uz.lower), lty=2) # DIRECT EFFECTS OF SELECTION # Here, Uz is estimated as a function of two traits. If interactions among the # traits are present, then two trait, three dimensional representations are best # (and are detailed below). First we consider the case with no interaction # when one is interested in plotting estimates of the direct relationships # between trait values and relative survival. T1=seq(min(reg.data$TRAIT.1),max(reg.data$TRAIT.1),(max(reg.data$TRAIT.1) -min(reg.data$TRAIT.1))/150) # values of the variable of interest to plot data for T2=rep(0, length(T1)) # holds second variable constant, in this case at a # value of 0 - the mean for standardized traits Uz=gam(groups~s(TRAIT.1)+s(TRAIT.2), data=reg.data, family=binomial) pred.Uz=predict(Uz,list(TRAIT.1=T1, TRAIT.2=T2), se.fit=TRUE) pred.Uz.upper=pred.Uz$fit+2*pred.Uz$se.fit pred.Uz.lower=pred.Uz$fit-2*pred.Uz$se.fit ISS=function(x){ (N.before/N.after)*exp(x) } # plots ISS plus two standard errors for trait 1 plot(T1, ISS(pred.Uz$fit), type="l", lwd=2, xlab=("TRAIT.1"), ylab=("Relative Survival")) lines(T1, ISS(pred.Uz.upper), lty=2) lines(T1, ISS(pred.Uz.lower), lty=2) # to plot the ISS plus two standard errors for trait 2, we hold the first trait # constant (i.e., trait1=const.val in the following) and predict relative # survival values for trait two (i.e., trait two=T1 in the following): T1=seq(min(reg.data$TRAIT.2),max(reg.data$TRAIT.2),(max(reg.data$TRAIT.2) -min(reg.data$TRAIT.2))/150) T2=rep(0, length(T1)) pred.Uz=predict(Uz,list(TRAIT.1=T2, TRAIT.2=T1), se.fit=TRUE) pred.Uz.upper=pred.Uz$fit+2*pred.Uz$se.fit pred.Uz.lower=pred.Uz$fit-2*pred.Uz$se.fit plot(T1, ISS(pred.Uz$fit), type="l", lwd=2, xlab=("TRAIT.2"), ylab=("Relative Survival")) lines(T1, ISS(pred.Uz.upper), lty=2) lines(T1, ISS(pred.Uz.lower), lty=2) # VISUALIZING DIRECT SELECTION ON TWO TRAITS TOGETHER # this is especially valuable when interactions are present T1=seq(min(reg.data$TRAIT.1),max(reg.data$TRAIT.1),(max(reg.data$TRAIT.1) -min(reg.data$TRAIT.1))/150) T2=seq(min(reg.data$TRAIT.2),max(reg.data$TRAIT.2),(max(reg.data$TRAIT.2) -min(reg.data$TRAIT.2))/150) Uz=gam(groups~s(TRAIT.1)+s(TRAIT.2), data=reg.data, family=binomial) EF=function(x,y){ (N.before/N.after)*exp(predict(Uz,list(TRAIT.1=x, TRAIT.2=y))) } zvec= outer(T1, T2, EF) persp(T1, T2, zvec, theta=30, phi=30, shade=0.5, ticktype="detailed", zlab="Relative survival", xlab="TRAIT.1", ylab="TRAIT.2")