# Use MW methods for real pseudo-proxies from long GCM simulations
# Code adapted from MW: R_fig14.txt

# Jan 2010: Differs from R_pseudo.txt in the eiv files imported,
# flexibility of baseline, and calculation of skill scores 
# 1 year inconsistency in baselines for GKSS vs CSM fixed (acknowledgments to McShane and Wyner) 

my_pc_reg <- function(y, gpc, ppc, gpc_out, ppc_out, num_grid_pc, num_proxy_pc){

  k = num_grid_pc
  p = num_proxy_pc
  tmpg <- data.frame(gpc[,1:k])
  if(k==1){colnames(tmpg) <- "PC1"}
  yreg <- lm(y~., data=tmpg)

  tmpghat <- tmpg[1:dim(ppc_out)[1],]
  if(k==1){tmpghat <- data.frame(PC1=tmpghat)}
  greg <- list(NULL)
  tmpx <- data.frame(ppc[,1:p])
  if(p==1){colnames(tmpx) <- "PC1"}
  for(i in 1:k){
    greg[[i]] <- lm(tmpg[,i]~.,  data=tmpx)   
    tmpghat[,i] <- predict(greg[[i]], newdata=ppc_out)
  }
  yhat <- predict(yreg, newdata=tmpghat)
}


# OPTIONS:  Set in calling file
# use either gkss or csm
#src="GKSS"  or src="CSM"

# 59 or 104 pseudo-proxies
#num="59" or num="104"

# baseline for plotting
# strplotbase=1900; endplotbase=1980;

#read in files: nhmean, pseudo-proxies and modern period field
if (src == "GKSS") {
# data runs from 1001 AD to 1990 AD
nhrecon <- read.table("EIV/data/pseudo/nh_gkss", nrows=990)
pseudo.proxy <- read.table(paste("EIV/data/pseudo/gkssproxynorm_",num,"red",sep = ""), nrows=990)
modern <- read.table("EIV/data/pseudo/grid_gkss", nrows=125)
stmod=856
endmod=980
stpr=1
  x=(1001:1990)
  nhrecon2=nhrecon[1:980,]-mean(nhrecon[(strplotbase-1000):(endplotbase-1000),])
  x2=x[1:980]
nh_eiv <- read.table(paste("EIV/output/eiv_hyb_gkss_",num,sep = ""),  nrows=980)
nh_eiv_nohy <- read.table(paste("EIV/output/eiv_nohyb_gkss_",num,sep = ""), nrows=980)
nh_eiv[,2]=nh_eiv[,2]-mean(nh_eiv[(strplotbase-1000):(endplotbase-1000),2])
nh_eiv_nohy[,2]=nh_eiv_nohy[,2]-mean(nh_eiv_nohy[(strplotbase-1000):(endplotbase-1000),2])

} else {  #CSM

# data runs from 850 AD to 1980 AD
nhrecon <- read.table("EIV/data/pseudo/nh_csm",nrows=1131)
pseudo.proxy <- read.table(paste("EIV/data/pseudo/csmproxynorm_",num,"red",sep = ""), nrows=1131)
modern <- read.table("EIV/data/pseudo/grid_csm",  nrows=125)
stmod=1007
endmod=1131
stpr=152
   x=(850:1980)
  nhrecon2=nhrecon[152:1131,]-mean(nhrecon[(strplotbase-849):(endplotbase-849),])
  x2=x[152:1131]
nh_eiv <- read.table(paste("EIV/output/eiv_hyb_csm_",num,sep = ""), nrows=1131)
nh_eiv_nohy <- read.table(paste("EIV/output/eiv_nohyb_csm_",num,sep = ""), nrows=1131)
nh_eiv[1:980,2]=nh_eiv[152:1131,2]-mean(nh_eiv[(strplotbase-849):(endplotbase-849),2])
nh_eiv_nohy[1:980,2]=nh_eiv_nohy[152:1131,2]-mean(nh_eiv_nohy[(strplotbase-849):(endplotbase-849),2])
}


# Parameters:
numcv_lars <- 10
K_lars <- 5
numgridPC <- c(1,5,10,20)
numproxyPC <- c(1,5,10,20)

# Set proxies 

years=1001:1980
proxy=pseudo.proxy[stpr:endmod,1:as.numeric(num)]
rownames(proxy) <- years
temp=nhrecon[stmod:endmod,1]

grid=modern[1:125,]
rownames(grid) <- 1856:1980

# Do MW analysis:

# Run proxy pca:
proxy_pc <- prcomp(proxy, center=TRUE, scale=TRUE)
grid_pc <- prcomp(grid, center=TRUE, scale=TRUE)

# Build insample dataframe:
my_years=rownames(grid)
yy_in <- temp
names(yy_in) <- my_years
proxy_in <- data.frame( proxy[my_years,] )
gpc_in <- data.frame( grid_pc$x[my_years,] )
ppc_in <- data.frame( proxy_pc$x[my_years,] )

# Build out of sample dataframe:
proxy_out <- data.frame( proxy )
ppc_out <- data.frame( proxy_pc$x )

# FIT VARIOUS MODELS:
library(MASS)
library(lars)
myfrac <- seq(0, 1, length=201)
yhat <- list(NULL)
  # Intercept:
  mylm <- lm(yy_in~1)
  yhat[[1]] <- predict(mylm, data.frame(ppc_out))

  # OLS_PC1:
  mylm <- lm(yy_in~., data=data.frame(x=ppc_in[,1]))
  yhat[[2]] <- predict(mylm, data.frame(x=ppc_out[,1]))

  # OLS_PC4:
  mylm <- lm(yy_in~., data=data.frame(ppc_in[,1:4]))
  yhat[[3]] <- predict(mylm, data.frame(ppc_out[,1:4]))

  # OLS_PC10:
  mylm <- lm(yy_in~., data=data.frame(ppc_in[,1:10]))
  yhat[[4]] <- predict(mylm, data.frame(ppc_out[,1:10]))

  # OLS_PC20:
  mylm <- lm(yy_in~., data=data.frame(ppc_in[,1:20]))
  yhat[[5]] <- predict(mylm, data.frame(ppc_out[,1:20]))

  # LASSO_PROXY95
  # Cross-validate:
  cv.mean <- matrix(NA,length(myfrac),numcv_lars)
  cv.se <- matrix(NA,length(myfrac),numcv_lars)
  for(j in 1:numcv_lars){
    tmp <- cv.lars(as.matrix(proxy_in), yy_in, K=K_lars, fraction=myfrac, type="lasso", use.Gram=FALSE, plot.it=FALSE)
    cv.mean[,j] <- tmp$cv
    cv.se[,j] <- tmp$cv.error
  }
  # Create Lasso Model
  tmp <- rowMeans(cv.mean)
  tmp2 <- (1:length(myfrac))[tmp==min(tmp)]
  cv.frac <- myfrac[tmp2]
  full_lasso <- lars(as.matrix(proxy_in), yy_in, type="lasso", use.Gram=FALSE)
  lasso_yhat <- predict(full_lasso, as.matrix(proxy_out), type="fit", s=cv.frac, mode="fraction")
  yhat[[6]] <- lasso_yhat$fit

  # LASSO_PC95
  # Cross-validate:
  cv.mean <- matrix(NA,length(myfrac),numcv_lars)
  cv.se <- matrix(NA,length(myfrac),numcv_lars)
  for(j in 1:numcv_lars){
    tmp <- cv.lars(as.matrix(ppc_in), yy_in, K=K_lars, fraction=myfrac, type="lasso", use.Gram=FALSE, plot.it=FALSE)
    cv.mean[,j] <- tmp$cv
    cv.se[,j] <- tmp$cv.error
  }
  # Create Lasso Model
  tmp <- rowMeans(cv.mean)
  tmp2 <- (1:length(myfrac))[tmp==min(tmp)]
  cv.frac <- myfrac[tmp2]
  full_lasso <- lars(as.matrix(ppc_in), yy_in, type="lasso", use.Gram=FALSE)
  lasso_yhat <- predict(full_lasso, as.matrix(ppc_out), type="fit", s=cv.frac, mode="fraction")
  yhat[[7]] <- lasso_yhat$fit

  # stepAIC_Proxy95:
  f0 <- lm(yy_in~1, data=proxy_in)
  f1 <- lm(yy_in~., data=proxy_in)
  f <- stepAIC(f0, scope=list(lower=f0, upper=f1), trace=FALSE)
  yhat[[8]] <- predict(f, data.frame(proxy_out))

  # stepAIC_PC95:
  f0 <- lm(yy_in~1, data=ppc_in)
  f1 <- lm(yy_in~., data=ppc_in)
  f <- stepAIC(f0, scope=list(lower=f0, upper=f1), trace=FALSE)
  yhat[[9]] <- predict(f, data.frame(ppc_out))

  # stepBIC_Proxy95:
  f0 <- lm(yy_in~1, data=proxy_in)
  f1 <- lm(yy_in~., data=proxy_in)
  f <- stepAIC(f0, scope=list(lower=f0, upper=f1), trace=FALSE, k=log(dim(proxy_in)[2]))
  yhat[[10]] <- predict(f, data.frame(proxy_out))

  # stepBIC_PC95:
  f0 <- lm(yy_in~1, data=ppc_in)
  f1 <- lm(yy_in~., data=ppc_in)
  f <- stepAIC(f0, scope=list(lower=f0, upper=f1), trace=FALSE, k=log(dim(ppc_in)[2]))
  yhat[[11]] <- predict(f, data.frame(ppc_out))

  cnt <- 12
  for(aa in numgridPC){
    for(bb in numproxyPC){
      yhat[[cnt]] <- my_pc_reg(yy_in, gpc_in, ppc_in, gpc_out, ppc_out, aa, bb)
      cnt <- cnt+1
    }
  }

yhat2 <- yhat
yhat_sm <- yhat

tmp1 <- as.numeric(rownames(ppc_out))
tmpin <- as.numeric(names(yy_in))
tmpout <- setdiff(tmp1, tmpin)
vv <- mean(c( max(tmpout), min(tmpin) ))

#pdf(paste("pseudo_",src,"_",num,"eiv.pdf",sep=""), width=8, height=5)

m <- min(yy_in, min(unlist(yhat2)))
M <- max(yy_in, max(unlist(yhat2)))
plot( as.numeric(rownames(ppc_out)), yhat[[1]], type="n", xlab="Year", ylab="Temperature Anomaly (C)", ylim=c(-1.,0.5), col="grey", main=paste(panel,"Pseudo-proxy test (N=",num,"): ",src,sep=""), cex.main=1.2)
abline(v=vv)

for(i in 2:length(yhat)){
  yhat2[[i]] = yhat[[i]]-mean(unlist(yhat[[i]][(strplotbase-1000):(endplotbase-1000)]))
#  lines( as.numeric(rownames(ppc_out)), yhat2[[i]], col="grey" )
  tmplo <- loess(yhat2[[i]]~as.numeric(rownames(ppc_out)), span=.0833)
  tmpy <- predict(tmplo, as.numeric(rownames(ppc_out)), se=TRUE) 
  yhat_sm[[i]]= tmpy$fit
  lines(as.numeric(rownames(ppc_out)), tmpy$fit, col="grey", lwd=2)
}
  # Add OLS PC1:
 # i <- 2
 ## lines( as.numeric(rownames(ppc_out)), yhat2[[i]], col=2)
 # lines(as.numeric(rownames(ppc_out)), yhat_sm[[i]], col=2, lwd=2)

  # Add OLS PC4:
  i <- 3
 # lines( as.numeric(rownames(ppc_out)), yhat2[[i]], col=5)
  lines(as.numeric(rownames(ppc_out)), yhat_sm[[i]], col=5, lwd=2)

  # Add OLS PC10:
  i <- 4
 # lines( as.numeric(rownames(ppc_out)), yhat2[[i]], col=3)
  lines(as.numeric(rownames(ppc_out)), yhat_sm[[i]], col=3, lwd=2)

  # Add OLS G5 P5:
  i <- 17
 # lines( as.numeric(rownames(ppc_out)), yhat2[[i]], col=4)
  lines(as.numeric(rownames(ppc_out)), yhat_sm[[i]], col=4, lwd=2)

  # Add Lasso
  i <- 6
 # lines( as.numeric(rownames(ppc_out)), yhat2[[i]], col=2)
  lines(as.numeric(rownames(ppc_out)), yhat_sm[[i]], col=2, lwd=2)
  i <- 7
 # lines( as.numeric(rownames(ppc_out)), yhat2[[i]], col=2)
  lines(as.numeric(rownames(ppc_out)), yhat_sm[[i]], col=2, lwd=2)

# add eiv reconstructions (hybrid + non-hybrid)

 # lines( x2,nh_eiv[1:980,2], col=1, lwd=1)
  tmplo <- loess(nh_eiv[1:980,2]~x2, span=.0833)
   tmpy <- predict(tmplo, x2, se=TRUE) 
   nh_eiv_sm=tmpy$fit
  lines(x2, tmpy$fit, col=6, lwd=2)
 # lines( x2,nh_eiv_nohy[1:980,2], col=1, lwd=1)
  tmplo <- loess(nh_eiv_nohy[1:980,2]~x2, span=.0833)
   tmpy <- predict(tmplo, x2, se=TRUE) 
   nh_eiv_nohy_sm=tmpy$fit
   lines(x2, tmpy$fit, col=7, lwd=2)

# Add actual NH temps:

 # lines( x2,nhrecon2, col=1, lwd=1)

  tmplo <- loess(nhrecon2~x2, span=.0833)
  tmpy <- predict(tmplo, x2, se=TRUE) 
  nhrecon_sm=tmpy$fit
  lines(x2, tmpy$fit, col=1, lwd=2)

legend(1250,0.5,legend=c("NH Mean","OLS PC4","OLS PC10","OLS G5 PC5"),lwd=2,col=c(1,5,3,4)) 
legend(1550,0.5,legend=c("Lasso","EIV","EIV (hybrid)"),lwd=2,col=c(2,7,6)) 

#dev.off()

## Produce skill scores for various methods (over 1001-1855)
## r^2, RMSE, RE and CE for annual and smoothed data.

rmse=matrix(NA,length(yhat)+2,2)
r2=matrix(NA,length(yhat)+2,2)
RE=matrix(NA,length(yhat)+2,2) 
CE=matrix(NA,length(yhat)+2,2) 
selyr=1:855
selyr_cal=856:980

## baseline all reconstructions/temps to have the same (zero) mean over the calibration interval
nhrecon2 = nhrecon2-mean(nhrecon2[selyr_cal])

nh_eiv[1:980,2]=nh_eiv[1:980,2]-mean(nh_eiv[selyr_cal,2])
nh_eiv_sm=nh_eiv_sm-mean(nh_eiv_sm[selyr_cal])
nh_eiv_nohy[1:980,2]=nh_eiv_nohy[1:980,2]-mean(nh_eiv_nohy[selyr_cal,2])
nh_eiv_nohy_sm=nh_eiv_nohy_sm-mean(nh_eiv_nohy_sm[selyr_cal])

for(i in 2:length(yhat)){

## baseline all reconstructions to have zero mean over the calibration interval
  yhat2[[i]] = yhat[[i]]-mean(unlist(yhat[[i]][selyr_cal]))
  yhat_sm[[i]] = yhat_sm[[i]]-mean(unlist(yhat_sm[[i]][selyr_cal]))

# RMSE
  rmse[i,1] = sqrt(mean((yhat2[[i]][selyr]-nhrecon2[selyr])^2))
  rmse[i,2] = sqrt(mean((yhat_sm[[i]][selyr]-nhrecon_sm[selyr])^2))
# RE
  RE[i,1] = 1 - rmse[i,1]^2/mean((nhrecon2[selyr]-mean(nhrecon2[selyr_cal]))^2)
  RE[i,2] = 1 - rmse[i,2]^2/mean((nhrecon_sm[selyr]-mean(nhrecon_sm[selyr_cal]))^2)
# CE
  CE[i,1] = 1 - rmse[i,1]^2/mean((nhrecon2[selyr]-mean(nhrecon2[selyr]))^2) 
  CE[i,2] = 1 - rmse[i,2]^2/mean((nhrecon_sm[selyr]-mean(nhrecon_sm[selyr]))^2) 
# r^2
  r2[i,1] = cor(yhat2[[i]][selyr],nhrecon2[selyr])^2
  r2[i,2] = cor(yhat_sm[[i]][selyr],nhrecon_sm[selyr])^2
}

# add eiv stats
j=length(yhat)+1

# RMSE
  rmse[j,1] = sqrt(mean((nh_eiv_nohy[selyr,2]-nhrecon2[selyr])^2))
  rmse[j,2] = sqrt(mean((nh_eiv_nohy_sm[selyr]-nhrecon_sm[selyr])^2))
# RE
  RE[j,1] = 1 - rmse[j,1]^2/mean((nhrecon2[selyr]-mean(nhrecon2[selyr_cal]))^2)
  RE[j,2] = 1 - rmse[j,2]^2/mean((nhrecon_sm[selyr]-mean(nhrecon_sm[selyr_cal]))^2)
# CE
  CE[j,1] = 1 - rmse[j,1]^2/mean((nhrecon2[selyr]-mean(nhrecon2[selyr]))^2) 
  CE[j,2] = 1 - rmse[j,2]^2/mean((nhrecon_sm[selyr]-mean(nhrecon_sm[selyr]))^2) 
# r^2
  r2[j,1] = cor(nh_eiv_nohy[selyr,2],nhrecon2[selyr])^2
  r2[j,2] = cor(nh_eiv_nohy_sm[selyr],nhrecon_sm[selyr])^2

j=length(yhat)+2

# RMSE
  rmse[j,1] = sqrt(mean((nh_eiv[selyr,2]-nhrecon2[selyr])^2))
  rmse[j,2] = sqrt(mean((nh_eiv_sm[selyr]-nhrecon_sm[selyr])^2))
# RE
  RE[j,1] = 1 - rmse[j,1]^2/mean((nhrecon2[selyr]-mean(nhrecon2[selyr_cal]))^2)
  RE[j,2] = 1 - rmse[j,2]^2/mean((nhrecon_sm[selyr]-mean(nhrecon_sm[selyr_cal]))^2)
# CE
  CE[j,1] = 1 - rmse[j,1]^2/mean((nhrecon2[selyr]-mean(nhrecon2[selyr]))^2) 
  CE[j,2] = 1 - rmse[j,2]^2/mean((nhrecon_sm[selyr]-mean(nhrecon_sm[selyr]))^2) 
# r^2
  r2[j,1] = cor(nh_eiv[selyr,2],nhrecon2[selyr])^2
  r2[j,2] = cor(nh_eiv_sm[selyr],nhrecon_sm[selyr])^2


# print out select results:
print(paste(" Pseudo-proxy results for ",src," using ",num," records:",sep=""))
rowhead=c(" OLS PC1  :"," OLS PC4  :"," OLS PC10 :"," OLS G5 P5:"," Lasso Pr :"," Lasso PC :"," EIV      :"," EIV (hyb):")
print("          RMSE RMSE(SM)  RE  RE(SM)  CE CE(SM) r2  r2(SM)")
sel=c(2,3,4,17,6,7,28,29)
for (i in 1:8){
j=sel[i]
print(paste(rowhead[i], round(rmse[j,1],3),round(rmse[j,2],3),round(RE[j,1],3),round(RE[j,2],3),round(CE[j,1],3),round(CE[j,2],3) ,round(r2[j,1],3),round(r2[j,2],3)))
}