# Use MW methods for real pseudo-proxies from long GCM simulations # Code adapted from MW: R_fig14.txt # Jan 2010: Differs from R_pseudo.txt in the eiv files imported, # flexibility of baseline, and calculation of skill scores # 1 year inconsistency in baselines for GKSS vs CSM fixed (acknowledgments to McShane and Wyner) my_pc_reg <- function(y, gpc, ppc, gpc_out, ppc_out, num_grid_pc, num_proxy_pc){ k = num_grid_pc p = num_proxy_pc tmpg <- data.frame(gpc[,1:k]) if(k==1){colnames(tmpg) <- "PC1"} yreg <- lm(y~., data=tmpg) tmpghat <- tmpg[1:dim(ppc_out)[1],] if(k==1){tmpghat <- data.frame(PC1=tmpghat)} greg <- list(NULL) tmpx <- data.frame(ppc[,1:p]) if(p==1){colnames(tmpx) <- "PC1"} for(i in 1:k){ greg[[i]] <- lm(tmpg[,i]~., data=tmpx) tmpghat[,i] <- predict(greg[[i]], newdata=ppc_out) } yhat <- predict(yreg, newdata=tmpghat) } # OPTIONS: Set in calling file # use either gkss or csm #src="GKSS" or src="CSM" # 59 or 104 pseudo-proxies #num="59" or num="104" # baseline for plotting # strplotbase=1900; endplotbase=1980; #read in files: nhmean, pseudo-proxies and modern period field if (src == "GKSS") { # data runs from 1001 AD to 1990 AD nhrecon <- read.table("EIV/data/pseudo/nh_gkss", nrows=990) pseudo.proxy <- read.table(paste("EIV/data/pseudo/gkssproxynorm_",num,"red",sep = ""), nrows=990) modern <- read.table("EIV/data/pseudo/grid_gkss", nrows=125) stmod=856 endmod=980 stpr=1 x=(1001:1990) nhrecon2=nhrecon[1:980,]-mean(nhrecon[(strplotbase-1000):(endplotbase-1000),]) x2=x[1:980] nh_eiv <- read.table(paste("EIV/output/eiv_hyb_gkss_",num,sep = ""), nrows=980) nh_eiv_nohy <- read.table(paste("EIV/output/eiv_nohyb_gkss_",num,sep = ""), nrows=980) nh_eiv[,2]=nh_eiv[,2]-mean(nh_eiv[(strplotbase-1000):(endplotbase-1000),2]) nh_eiv_nohy[,2]=nh_eiv_nohy[,2]-mean(nh_eiv_nohy[(strplotbase-1000):(endplotbase-1000),2]) } else { #CSM # data runs from 850 AD to 1980 AD nhrecon <- read.table("EIV/data/pseudo/nh_csm",nrows=1131) pseudo.proxy <- read.table(paste("EIV/data/pseudo/csmproxynorm_",num,"red",sep = ""), nrows=1131) modern <- read.table("EIV/data/pseudo/grid_csm", nrows=125) stmod=1007 endmod=1131 stpr=152 x=(850:1980) nhrecon2=nhrecon[152:1131,]-mean(nhrecon[(strplotbase-849):(endplotbase-849),]) x2=x[152:1131] nh_eiv <- read.table(paste("EIV/output/eiv_hyb_csm_",num,sep = ""), nrows=1131) nh_eiv_nohy <- read.table(paste("EIV/output/eiv_nohyb_csm_",num,sep = ""), nrows=1131) nh_eiv[1:980,2]=nh_eiv[152:1131,2]-mean(nh_eiv[(strplotbase-849):(endplotbase-849),2]) nh_eiv_nohy[1:980,2]=nh_eiv_nohy[152:1131,2]-mean(nh_eiv_nohy[(strplotbase-849):(endplotbase-849),2]) } # Parameters: numcv_lars <- 10 K_lars <- 5 numgridPC <- c(1,5,10,20) numproxyPC <- c(1,5,10,20) # Set proxies years=1001:1980 proxy=pseudo.proxy[stpr:endmod,1:as.numeric(num)] rownames(proxy) <- years temp=nhrecon[stmod:endmod,1] grid=modern[1:125,] rownames(grid) <- 1856:1980 # Do MW analysis: # Run proxy pca: proxy_pc <- prcomp(proxy, center=TRUE, scale=TRUE) grid_pc <- prcomp(grid, center=TRUE, scale=TRUE) # Build insample dataframe: my_years=rownames(grid) yy_in <- temp names(yy_in) <- my_years proxy_in <- data.frame( proxy[my_years,] ) gpc_in <- data.frame( grid_pc$x[my_years,] ) ppc_in <- data.frame( proxy_pc$x[my_years,] ) # Build out of sample dataframe: proxy_out <- data.frame( proxy ) ppc_out <- data.frame( proxy_pc$x ) # FIT VARIOUS MODELS: library(MASS) library(lars) myfrac <- seq(0, 1, length=201) yhat <- list(NULL) # Intercept: mylm <- lm(yy_in~1) yhat[[1]] <- predict(mylm, data.frame(ppc_out)) # OLS_PC1: mylm <- lm(yy_in~., data=data.frame(x=ppc_in[,1])) yhat[[2]] <- predict(mylm, data.frame(x=ppc_out[,1])) # OLS_PC4: mylm <- lm(yy_in~., data=data.frame(ppc_in[,1:4])) yhat[[3]] <- predict(mylm, data.frame(ppc_out[,1:4])) # OLS_PC10: mylm <- lm(yy_in~., data=data.frame(ppc_in[,1:10])) yhat[[4]] <- predict(mylm, data.frame(ppc_out[,1:10])) # OLS_PC20: mylm <- lm(yy_in~., data=data.frame(ppc_in[,1:20])) yhat[[5]] <- predict(mylm, data.frame(ppc_out[,1:20])) # LASSO_PROXY95 # Cross-validate: cv.mean <- matrix(NA,length(myfrac),numcv_lars) cv.se <- matrix(NA,length(myfrac),numcv_lars) for(j in 1:numcv_lars){ tmp <- cv.lars(as.matrix(proxy_in), yy_in, K=K_lars, fraction=myfrac, type="lasso", use.Gram=FALSE, plot.it=FALSE) cv.mean[,j] <- tmp$cv cv.se[,j] <- tmp$cv.error } # Create Lasso Model tmp <- rowMeans(cv.mean) tmp2 <- (1:length(myfrac))[tmp==min(tmp)] cv.frac <- myfrac[tmp2] full_lasso <- lars(as.matrix(proxy_in), yy_in, type="lasso", use.Gram=FALSE) lasso_yhat <- predict(full_lasso, as.matrix(proxy_out), type="fit", s=cv.frac, mode="fraction") yhat[[6]] <- lasso_yhat$fit # LASSO_PC95 # Cross-validate: cv.mean <- matrix(NA,length(myfrac),numcv_lars) cv.se <- matrix(NA,length(myfrac),numcv_lars) for(j in 1:numcv_lars){ tmp <- cv.lars(as.matrix(ppc_in), yy_in, K=K_lars, fraction=myfrac, type="lasso", use.Gram=FALSE, plot.it=FALSE) cv.mean[,j] <- tmp$cv cv.se[,j] <- tmp$cv.error } # Create Lasso Model tmp <- rowMeans(cv.mean) tmp2 <- (1:length(myfrac))[tmp==min(tmp)] cv.frac <- myfrac[tmp2] full_lasso <- lars(as.matrix(ppc_in), yy_in, type="lasso", use.Gram=FALSE) lasso_yhat <- predict(full_lasso, as.matrix(ppc_out), type="fit", s=cv.frac, mode="fraction") yhat[[7]] <- lasso_yhat$fit # stepAIC_Proxy95: f0 <- lm(yy_in~1, data=proxy_in) f1 <- lm(yy_in~., data=proxy_in) f <- stepAIC(f0, scope=list(lower=f0, upper=f1), trace=FALSE) yhat[[8]] <- predict(f, data.frame(proxy_out)) # stepAIC_PC95: f0 <- lm(yy_in~1, data=ppc_in) f1 <- lm(yy_in~., data=ppc_in) f <- stepAIC(f0, scope=list(lower=f0, upper=f1), trace=FALSE) yhat[[9]] <- predict(f, data.frame(ppc_out)) # stepBIC_Proxy95: f0 <- lm(yy_in~1, data=proxy_in) f1 <- lm(yy_in~., data=proxy_in) f <- stepAIC(f0, scope=list(lower=f0, upper=f1), trace=FALSE, k=log(dim(proxy_in)[2])) yhat[[10]] <- predict(f, data.frame(proxy_out)) # stepBIC_PC95: f0 <- lm(yy_in~1, data=ppc_in) f1 <- lm(yy_in~., data=ppc_in) f <- stepAIC(f0, scope=list(lower=f0, upper=f1), trace=FALSE, k=log(dim(ppc_in)[2])) yhat[[11]] <- predict(f, data.frame(ppc_out)) cnt <- 12 for(aa in numgridPC){ for(bb in numproxyPC){ yhat[[cnt]] <- my_pc_reg(yy_in, gpc_in, ppc_in, gpc_out, ppc_out, aa, bb) cnt <- cnt+1 } } yhat2 <- yhat yhat_sm <- yhat tmp1 <- as.numeric(rownames(ppc_out)) tmpin <- as.numeric(names(yy_in)) tmpout <- setdiff(tmp1, tmpin) vv <- mean(c( max(tmpout), min(tmpin) )) #pdf(paste("pseudo_",src,"_",num,"eiv.pdf",sep=""), width=8, height=5) m <- min(yy_in, min(unlist(yhat2))) M <- max(yy_in, max(unlist(yhat2))) plot( as.numeric(rownames(ppc_out)), yhat[[1]], type="n", xlab="Year", ylab="Temperature Anomaly (C)", ylim=c(-1.,0.5), col="grey", main=paste(panel,"Pseudo-proxy test (N=",num,"): ",src,sep=""), cex.main=1.2) abline(v=vv) for(i in 2:length(yhat)){ yhat2[[i]] = yhat[[i]]-mean(unlist(yhat[[i]][(strplotbase-1000):(endplotbase-1000)])) # lines( as.numeric(rownames(ppc_out)), yhat2[[i]], col="grey" ) tmplo <- loess(yhat2[[i]]~as.numeric(rownames(ppc_out)), span=.0833) tmpy <- predict(tmplo, as.numeric(rownames(ppc_out)), se=TRUE) yhat_sm[[i]]= tmpy$fit lines(as.numeric(rownames(ppc_out)), tmpy$fit, col="grey", lwd=2) } # Add OLS PC1: # i <- 2 ## lines( as.numeric(rownames(ppc_out)), yhat2[[i]], col=2) # lines(as.numeric(rownames(ppc_out)), yhat_sm[[i]], col=2, lwd=2) # Add OLS PC4: i <- 3 # lines( as.numeric(rownames(ppc_out)), yhat2[[i]], col=5) lines(as.numeric(rownames(ppc_out)), yhat_sm[[i]], col=5, lwd=2) # Add OLS PC10: i <- 4 # lines( as.numeric(rownames(ppc_out)), yhat2[[i]], col=3) lines(as.numeric(rownames(ppc_out)), yhat_sm[[i]], col=3, lwd=2) # Add OLS G5 P5: i <- 17 # lines( as.numeric(rownames(ppc_out)), yhat2[[i]], col=4) lines(as.numeric(rownames(ppc_out)), yhat_sm[[i]], col=4, lwd=2) # Add Lasso i <- 6 # lines( as.numeric(rownames(ppc_out)), yhat2[[i]], col=2) lines(as.numeric(rownames(ppc_out)), yhat_sm[[i]], col=2, lwd=2) i <- 7 # lines( as.numeric(rownames(ppc_out)), yhat2[[i]], col=2) lines(as.numeric(rownames(ppc_out)), yhat_sm[[i]], col=2, lwd=2) # add eiv reconstructions (hybrid + non-hybrid) # lines( x2,nh_eiv[1:980,2], col=1, lwd=1) tmplo <- loess(nh_eiv[1:980,2]~x2, span=.0833) tmpy <- predict(tmplo, x2, se=TRUE) nh_eiv_sm=tmpy$fit lines(x2, tmpy$fit, col=6, lwd=2) # lines( x2,nh_eiv_nohy[1:980,2], col=1, lwd=1) tmplo <- loess(nh_eiv_nohy[1:980,2]~x2, span=.0833) tmpy <- predict(tmplo, x2, se=TRUE) nh_eiv_nohy_sm=tmpy$fit lines(x2, tmpy$fit, col=7, lwd=2) # Add actual NH temps: # lines( x2,nhrecon2, col=1, lwd=1) tmplo <- loess(nhrecon2~x2, span=.0833) tmpy <- predict(tmplo, x2, se=TRUE) nhrecon_sm=tmpy$fit lines(x2, tmpy$fit, col=1, lwd=2) legend(1250,0.5,legend=c("NH Mean","OLS PC4","OLS PC10","OLS G5 PC5"),lwd=2,col=c(1,5,3,4)) legend(1550,0.5,legend=c("Lasso","EIV","EIV (hybrid)"),lwd=2,col=c(2,7,6)) #dev.off() ## Produce skill scores for various methods (over 1001-1855) ## r^2, RMSE, RE and CE for annual and smoothed data. rmse=matrix(NA,length(yhat)+2,2) r2=matrix(NA,length(yhat)+2,2) RE=matrix(NA,length(yhat)+2,2) CE=matrix(NA,length(yhat)+2,2) selyr=1:855 selyr_cal=856:980 ## baseline all reconstructions/temps to have the same (zero) mean over the calibration interval nhrecon2 = nhrecon2-mean(nhrecon2[selyr_cal]) nh_eiv[1:980,2]=nh_eiv[1:980,2]-mean(nh_eiv[selyr_cal,2]) nh_eiv_sm=nh_eiv_sm-mean(nh_eiv_sm[selyr_cal]) nh_eiv_nohy[1:980,2]=nh_eiv_nohy[1:980,2]-mean(nh_eiv_nohy[selyr_cal,2]) nh_eiv_nohy_sm=nh_eiv_nohy_sm-mean(nh_eiv_nohy_sm[selyr_cal]) for(i in 2:length(yhat)){ ## baseline all reconstructions to have zero mean over the calibration interval yhat2[[i]] = yhat[[i]]-mean(unlist(yhat[[i]][selyr_cal])) yhat_sm[[i]] = yhat_sm[[i]]-mean(unlist(yhat_sm[[i]][selyr_cal])) # RMSE rmse[i,1] = sqrt(mean((yhat2[[i]][selyr]-nhrecon2[selyr])^2)) rmse[i,2] = sqrt(mean((yhat_sm[[i]][selyr]-nhrecon_sm[selyr])^2)) # RE RE[i,1] = 1 - rmse[i,1]^2/mean((nhrecon2[selyr]-mean(nhrecon2[selyr_cal]))^2) RE[i,2] = 1 - rmse[i,2]^2/mean((nhrecon_sm[selyr]-mean(nhrecon_sm[selyr_cal]))^2) # CE CE[i,1] = 1 - rmse[i,1]^2/mean((nhrecon2[selyr]-mean(nhrecon2[selyr]))^2) CE[i,2] = 1 - rmse[i,2]^2/mean((nhrecon_sm[selyr]-mean(nhrecon_sm[selyr]))^2) # r^2 r2[i,1] = cor(yhat2[[i]][selyr],nhrecon2[selyr])^2 r2[i,2] = cor(yhat_sm[[i]][selyr],nhrecon_sm[selyr])^2 } # add eiv stats j=length(yhat)+1 # RMSE rmse[j,1] = sqrt(mean((nh_eiv_nohy[selyr,2]-nhrecon2[selyr])^2)) rmse[j,2] = sqrt(mean((nh_eiv_nohy_sm[selyr]-nhrecon_sm[selyr])^2)) # RE RE[j,1] = 1 - rmse[j,1]^2/mean((nhrecon2[selyr]-mean(nhrecon2[selyr_cal]))^2) RE[j,2] = 1 - rmse[j,2]^2/mean((nhrecon_sm[selyr]-mean(nhrecon_sm[selyr_cal]))^2) # CE CE[j,1] = 1 - rmse[j,1]^2/mean((nhrecon2[selyr]-mean(nhrecon2[selyr]))^2) CE[j,2] = 1 - rmse[j,2]^2/mean((nhrecon_sm[selyr]-mean(nhrecon_sm[selyr]))^2) # r^2 r2[j,1] = cor(nh_eiv_nohy[selyr,2],nhrecon2[selyr])^2 r2[j,2] = cor(nh_eiv_nohy_sm[selyr],nhrecon_sm[selyr])^2 j=length(yhat)+2 # RMSE rmse[j,1] = sqrt(mean((nh_eiv[selyr,2]-nhrecon2[selyr])^2)) rmse[j,2] = sqrt(mean((nh_eiv_sm[selyr]-nhrecon_sm[selyr])^2)) # RE RE[j,1] = 1 - rmse[j,1]^2/mean((nhrecon2[selyr]-mean(nhrecon2[selyr_cal]))^2) RE[j,2] = 1 - rmse[j,2]^2/mean((nhrecon_sm[selyr]-mean(nhrecon_sm[selyr_cal]))^2) # CE CE[j,1] = 1 - rmse[j,1]^2/mean((nhrecon2[selyr]-mean(nhrecon2[selyr]))^2) CE[j,2] = 1 - rmse[j,2]^2/mean((nhrecon_sm[selyr]-mean(nhrecon_sm[selyr]))^2) # r^2 r2[j,1] = cor(nh_eiv[selyr,2],nhrecon2[selyr])^2 r2[j,2] = cor(nh_eiv_sm[selyr],nhrecon_sm[selyr])^2 # print out select results: print(paste(" Pseudo-proxy results for ",src," using ",num," records:",sep="")) rowhead=c(" OLS PC1 :"," OLS PC4 :"," OLS PC10 :"," OLS G5 P5:"," Lasso Pr :"," Lasso PC :"," EIV :"," EIV (hyb):") print(" RMSE RMSE(SM) RE RE(SM) CE CE(SM) r2 r2(SM)") sel=c(2,3,4,17,6,7,28,29) for (i in 1:8){ j=sel[i] print(paste(rowhead[i], round(rmse[j,1],3),round(rmse[j,2],3),round(RE[j,1],3),round(RE[j,2],3),round(CE[j,1],3),round(CE[j,2],3) ,round(r2[j,1],3),round(r2[j,2],3))) }