library(RCurl)
library(XML)
library(reshape)


i <- 0
g <- 1
allpolls <- NULL
while(i < 1){
    if(length(xmlTreeParse(paste("http://elections.huffingtonpost.com/pollster/api/polls.xml?page=", g, sep=""))$doc$children$polls)<10)
        i <- 1
    newset <- lapply(xmlTreeParse(paste("http://elections.huffingtonpost.com/pollster/api/polls.xml?page=", g, sep=""))$doc$children$polls, function(x) xmlToList(x))
    allpolls <- c(allpolls, newset)
    print(g)
    g <- g+1
}


save(allpolls, file="allpolls.rdata")


load("allpolls.rdata")

apcnom <- sapply(allpolls, function(x) x$questions["name",])
uniquecontests <- unique(unlist(apcnom))

RvO <- uniquecontests[grepl("Romney vs. Obama", uniquecontests) & grepl("2012", uniquecontests)]

relevant <- sapply(apcnom, function(x) sum(RvO %in% x))

anyrel <- allpolls[relevant>0]

names(anyrel) <- sapply(anyrel, function(x) paste("poll", x[[1]], sep=""))

nm <- unlist(sapply(anyrel, function(x) names(unlist(x))))
nom <- unique(unlist(sapply(anyrel, function(x) names(unlist(x)))))

nqs <- nom[!grepl("question", nom)]

trNA <- function(x){
    out <- try(x)
    if(is.null(x))
        out <- "NA"
    out
}

ul <- function(x){
  d <- sapply(x, function(q) !is.null(q))
  y<-rep(NA, length(x))
  y[d] <- unlist(x[d])
  y
}
    

anyrel2 <- anyrel
anyrel2$poll14667b <- anyrel$poll14667
anyrel2$poll14667$questions <- anyrel$poll14667$questions[,1:2]
anyrel2$poll14667b$questions <- anyrel$poll14667$questions[,3:4]

# All polls in all states for Obama/Romney

df1 <- data.frame(id=sapply(anyrel2, function(x) try(x$id)),
                  pollster=sapply(anyrel2, function(x) try(x$pollster)),
                  start_date=sapply(anyrel2, function(x) try(x$start_date)),
                  end_date=sapply(anyrel2, function(x) try(x$end_date)),
                  method=sapply(anyrel2, function(x) try(x$method)),
                  last_updated=sapply(anyrel2, function(x) try(x$last_updated)),
                  house=ul(sapply(anyrel2, function(x) trNA(x$survey_houses$survey_house.name))),
                  name=ul(sapply(anyrel2, function(x) trNA(x$questions[,x$questions["topic",]=="2012-president"]["name"]))),
                  topic=rep("2012-president", length(anyrel2)),
                  state=ul(sapply(anyrel, function(x) trNA(x$questions[,x$questions["topic",]=="2012-president"]["state"]))),
                  subpop=sapply(anyrel, function(x) trNA(x$questions[,x$questions["topic",]=="2012-president"]$subpopulations[,1]$name)),
                  observations=as.numeric(as.character(ul(sapply(anyrel, function(x) try(x$questions[,x$questions["topic",]=="2012-president"]$subpopulations[,1]$observations))))),
                  moe=ul(sapply(anyrel, function(x) try(x$questions[,x$questions["topic",]=="2012-president"]$subpopulations[,1]$margin_of_error))),
                  obama=as.numeric(as.character(ul(sapply(anyrel, function(x) try(x$questions[,x$questions["topic",]=="2012-president"]$subpopulations[,1]$responses[,x$questions[,x$questions["topic",]=="2012-president"]$subpopulations[,1]$responses["choice",]=="Obama"]$value))))),
                  romney=as.numeric(as.character(ul(sapply(anyrel, function(x) try(x$questions[,x$questions["topic",]=="2012-president"]$subpopulations[,1]$responses[,x$questions[,x$questions["topic",]=="2012-president"]$subpopulations[,1]$responses["choice",]=="Romney"]$value)))))
                  )

df1$ProbOb <- df1$obama/(df1$obama+df1$romney)


votes2012set <- readHTMLTable("http://www.archives.gov/federal-register/electoral-college/2012/popular-vote.html")[[2]]

votes2012 <- data.frame(State=as.character(votes2012set[,1]), Obama=as.numeric(as.character(votes2012set[,2])), Romney=as.numeric(as.character(votes2012set[,3])))

levels(votes2012$State)<-c(levels(votes2012$State), "NY", "US")

votes2012$State[33] <- "NY"
votes2012$State[52] <- "US"

votes2012$ProbOb <- votes2012[,2]/(votes2012[,2]+votes2012[,3])


votes2008set <- readHTMLTable("http://www.archives.gov/federal-register/electoral-college/2008/popular-vote.html")[[2]]

votes2008 <- data.frame(State=as.character(votes2008set[,1]), Obama=as.numeric(as.character(votes2008set[,2])), Romney=as.numeric(as.character(votes2008set[,3])))

levels(votes2008$State)<-c(levels(votes2008$State), "US")

votes2008$State[52] <- "US"

votes2008$ProbOb <- votes2008[,2]/(votes2008[,2]+votes2008[,3])

votes2008$TotalMP <- (votes2008[,2]+votes2008[,3])

votes2004set <- readHTMLTable("http://www.archives.gov/federal-register/electoral-college/2004/popular_vote.html")[[2]]
votes2004 <- data.frame(State=as.character(votes2004set[,1]), Bush=as.numeric(gsub(",", "", as.character(votes2004set[,2]))), Kerry=as.numeric(gsub(",", "", as.character(votes2004set[,3]))))

levels(votes2004$State)<-c(levels(votes2004$State), "MS", "OH", "US")

votes2004$State[25] <- "MS"
votes2004$State[36] <- "OH"
votes2004$State[52] <- "US"

votes2004$ProbDem04 <- votes2004[,3]/(votes2004[,2]+votes2004[,3])


votes2000set <- readHTMLTable("http://www.archives.gov/federal-register/electoral-college/2000/popular_vote.html")[[2]]
votes2000 <- data.frame(State=as.character(votes2000set[,1]), Bush1=as.numeric(gsub(",", "", as.character(votes2000set[,2]))), Gore=as.numeric(gsub(",", "", as.character(votes2000set[,3]))))

levels(votes2000$State)<-c(levels(votes2000$State), "CA", "US")

votes2000$State[5] <- "CA"
votes2000$State[52] <- "US"


votes2000$ProbDem00 <- votes2000[,3]/(votes2000[,2]+votes2000[,3])

votes1996set <- read.csv("http://joshpasek.com/wp-content/uploads/2014/06/Election1996.csv")

votes1996 <- data.frame(State=votes1996set$State, Clinton=votes1996set$Clinton.1, Dole=votes1996set$Clinton.1)
votes1996$ProbDem96 <- votes1996[,2]/(votes1996[,2]+votes1996[,3])

statevotes <- list(votes1996, votes2000, votes2004, votes2008)

priorelect <- merge_recurse(statevotes, by="State")

priorvote <- with(priorelect, data.frame(State=State, Prior=(ProbDem96+ProbDem00+ProbDem04+ProbOb)/4))


linzerset <- read.csv("http://joshpasek.com/wp-content/uploads/2014/06/Linzer-state-forecasts.csv")
jackmanset <- read.csv("http://joshpasek.com/wp-content/uploads/2014/06/jackman-2012.csv")

addsetsp <- read.csv("AdditionalNationalComparisons.csv")

addsets <- addsetsp[rowSums(addsetsp[,-1], na.rm=TRUE)>0,]

addsets$Daysout <-  as.numeric(as.Date("11/06/2012", format="%m/%d/%y")-as.Date(addsets$Date, format="%m/%d/%y"))

addsets$Fivethirtyeightabserr <- addsets$Fivethirtyeight/100-votes2012$ProbOb[votes2012$State=="US"]
addsets$Pollyvoteabserr <- addsets$Pollyvote/100-votes2012$ProbOb[votes2012$State=="US"]
addsets$RCPabserr <- addsets$RCP/100-votes2012$ProbOb[votes2012$State=="US"]


jackmanset$ProbOb <- jackmanset$obama/(jackmanset$obama+jackmanset$romney)
jackmanset$Daysout <- as.numeric(as.Date("2012-11-06")-as.Date(jackmanset$date))

linzerset$Daysout <- as.numeric(as.Date("2012-11-06")-as.Date(linzerset$date, format="%m/%d/%Y"))

df1$Daysout <- as.numeric(as.Date("2012-11-06"))-as.numeric(as.Date(df1$end_date))

finals <- data.frame(state=votes2012$State, actual=votes2012$ProbOb)

numvotes <- data.frame(state=votes2008$State, totvotes=votes2008$TotalMP)

df2 <- merge(df1, finals, all.x=TRUE)

df2$Error <- df2$ProbOb-df2$actual

longlinza <- lapply(2:51, function(x) data.frame(state=colnames(linzerset)[x], ProbOb=linzerset[,x]/100, Daysout=linzerset$Daysout))

longlinz <- merge_recurse(longlinza)

jackmanfull <- merge(jackmanset, finals, all.x=TRUE)
jackmanfull$Error <- jackmanfull$ProbOb-jackmanfull$actual


linzerplus1 <- merge(longlinz, finals, all.x=TRUE)

linzerplus2 <- merge(linzerplus1, numvotes, all.x=TRUE)

linzerplus1$Error <- linzerplus1$ProbOb-linzerplus1$actual

library(weights)
linzerUS <- t(sapply(unique(linzerplus2$Daysout), function(d) c(state="US", Daysout=d, ProbOb=wtd.mean(linzerplus2$ProbOb[linzerplus2$Daysout==d], linzerplus2$totvotes[linzerplus2$Daysout==d]), actual=finals$actual[52], Error=wtd.mean(linzerplus2$ProbOb[linzerplus2$Daysout==d], linzerplus2$totvotes[linzerplus2$Daysout==d])-finals$actual[52])))


linzerplus <- as.data.frame(rbind(linzerplus1, linzerUS))
for(i in 2:length(linzerplus))
  linzerplus[,i] <- as.numeric(as.character(linzerplus[,i]))


plot(df2$Error~log(df2$Daysout))
lines(linzerplus$Error~log(linzerplus$Daysout), col="red", type="p")


## Ohio, Virginia, Florida as Examples

stateplot <- function(statename, ylim=c(0,8), xlim=c(0, 75), main=paste("Predicting Election Results From", statename)){
  statepolls <- df2[df2$state==statename,]
  natpolls <- df2[df2$state=="US",]
  statehistory <- priorvote[priorvote$State==statename,2]
  stateseparate <- as.data.frame(t(sapply(1:75, function(d) c(Daysout=d, AvgErr=mean(abs(statepolls$Error[(d)<statepolls$Daysout & (d+7)>statepolls$Daysout]), na.rm=TRUE)))))
  staterolling <- as.data.frame(t(sapply(1:75, function(d) c(Daysout=d, AvgErr=mean(statepolls$Error[(d)<statepolls$Daysout & (d+7)>statepolls$Daysout], na.rm=TRUE)))))
  staterollingphone <- as.data.frame(t(sapply(1:75, function(d) c(Daysout=d, AvgErr=mean(statepolls$Error[(d)<statepolls$Daysout & (d+7)>statepolls$Daysout & statepolls$method=="Phone"], na.rm=TRUE)))))
  if(statename=="US"){
    forecasting <- as.data.frame(t(sapply(1:75, function(d) c(Daysout=d, AvgErr=(.1968+.0408*.34+.59*mean(statepolls$ProbOb[(d)<statepolls$Daysout & (d+7)>statepolls$Daysout], na.rm=TRUE)-statepolls$actual[1])))))
  }
  desartrolling <- as.data.frame(t(sapply(1:40, function(d) c(Daysout=d, AvgErr=(-.13906+.364*statehistory+.75*mean(statepolls$ProbOb[(d)<statepolls$Daysout & 45>statepolls$Daysout], na.rm=TRUE)+.163*mean(natpolls$ProbOb[(d)<natpolls$Daysout & 45>natpolls$Daysout], na.rm=TRUE)-statepolls$actual[1])))))
  linzerstate <- linzerplus[linzerplus$state==statename,]
  jackmanstate <- jackmanfull[jackmanfull$state==statename,]
  
  plot(abs(100*statepolls$Error[statepolls$Daysout<75])~statepolls$Daysout[statepolls$Daysout<75], xlab="Days From Election", ylab="Absolute Error", ylim=ylim, xlim=xlim, main=main)
  lines(abs(100*stateseparate$AvgErr)~stateseparate$Daysout, lty=2, lwd=3)
  lines(abs(100*staterolling$AvgErr)~staterolling$Daysout, lwd=3)
  lines(abs(100*desartrolling$AvgErr)~desartrolling$Daysout, col="purple", lwd=3)
  lines(abs(100*linzerstate$Error)~linzerstate$Daysout, col="red", lwd=3)
  lines(abs(100*jackmanstate$Error[order(jackmanstate$Daysout)])~jackmanstate$Daysout[order(jackmanstate$Daysout)], col="blue", lwd=3)
  if(statename=="US"){
      lines(abs(100*forecasting$AvgErr)~forecasting$Daysout, col="dark green", lwd=3)
      lines(abs(100*addsets$Fivethirtyeightabserr)~addsets$Daysout, col="orange", lwd=3)
      lines(abs(100*addsets$Pollyvoteabserr)~addsets$Daysout, col="pink", lwd=3)
      lines(abs(100*addsets$RCPabserr)~addsets$Daysout, col="brown", lwd=3)
  }
}

stateplotnonabs <- function(statename, ylim=c(-8,8), xlim=c(0, 75), main=paste("Predicting Election Results From", statename)){
  statepolls <- df2[df2$state==statename,]
  natpolls <- df2[df2$state=="US",]
  statehistory <- priorvote[priorvote$State==statename,2]
  stateseparate <- as.data.frame(t(sapply(1:75, function(d) c(Daysout=d, AvgErr=mean(abs(statepolls$Error[(d)<statepolls$Daysout & (d+7)>statepolls$Daysout]), na.rm=TRUE)))))
  staterolling <- as.data.frame(t(sapply(1:75, function(d) c(Daysout=d, AvgErr=mean(statepolls$Error[(d)<statepolls$Daysout & (d+7)>statepolls$Daysout], na.rm=TRUE)))))
  staterollingphone <- as.data.frame(t(sapply(1:75, function(d) c(Daysout=d, AvgErr=mean(statepolls$Error[(d)<statepolls$Daysout & (d+7)>statepolls$Daysout & statepolls$method=="Phone"], na.rm=TRUE)))))
  if(statename=="US"){
    forecasting <- as.data.frame(t(sapply(1:75, function(d) c(Daysout=d, AvgErr=(.1968+.0408*.34+.59*mean(statepolls$ProbOb[(d)<statepolls$Daysout & (d+7)>statepolls$Daysout], na.rm=TRUE)-statepolls$actual[1])))))
  }
  desartrolling <- as.data.frame(t(sapply(1:40, function(d) c(Daysout=d, AvgErr=(-.13906+.364*statehistory+.75*mean(statepolls$ProbOb[(d)<statepolls$Daysout & 45>statepolls$Daysout], na.rm=TRUE)+.163*mean(natpolls$ProbOb[(d)<natpolls$Daysout & 45>natpolls$Daysout], na.rm=TRUE)-statepolls$actual[1])))))
  linzerstate <- linzerplus[linzerplus$state==statename,]
  jackmanstate <- jackmanfull[jackmanfull$state==statename,]
  
  plot(-(100*statepolls$Error[statepolls$Daysout<75])~statepolls$Daysout[statepolls$Daysout<75], xlab="Days From Election", ylab="Actual Obama Share - Predicted Obama Share", ylim=ylim, xlim=xlim, main=main)
  lines((100*stateseparate$AvgErr)~stateseparate$Daysout, lty=2, lwd=3)
  lines(-(100*staterolling$AvgErr)~staterolling$Daysout, lwd=3)
  lines(-(100*desartrolling$AvgErr)~desartrolling$Daysout, col="purple", lwd=3)
  lines(-(100*linzerstate$Error)~linzerstate$Daysout, col="red", lwd=3)
  lines(-(100*jackmanstate$Error[order(jackmanstate$Daysout)])~jackmanstate$Daysout[order(jackmanstate$Daysout)], col="blue", lwd=3)
  if(statename=="US"){
      lines(-(100*forecasting$AvgErr)~forecasting$Daysout, col="dark green", lwd=3)
      lines(-(100*addsets$Fivethirtyeightabserr)~addsets$Daysout, col="orange", lwd=3)
      lines(-(100*addsets$Pollyvoteabserr)~addsets$Daysout, col="pink", lwd=3)
      lines(-(100*addsets$RCPabserr)~addsets$Daysout, col="brown", lwd=3)
  }
  abline(h=0, lty=3, col="dark gray")
}

#stateplot("OH", ylim=c(0,5))

#par(mfrow=c(2,1))
pdf("NatAccuracyChecks.pdf", width=8.5, height=6)
par(mfrow=c(1,1))
sapply(rev(names(sort(table(df2$state))))[1], stateplot)
legend(x="topright",
       legend=c("Average Survey Error", "Aggregated Survey Error", "RealClearPolitics", "Erikson & Wlezien", "DeSart & Holbrook", "Jackman (Pollster)", "Graefe et al. (PollyVote)", "Silver (FiveThirtyEight)", "Linzer (Votamatic)"),
       col=c("black", "black", "brown", "dark green", "purple", "blue", "pink", "orange", "red"), lwd=2, lty=c(2,1,1,1,1,1,1,1,1))
dev.off()

pdf("NatAccuracyChecksNoAbs.pdf", width=8.5, height=8)
par(mfrow=c(1,1))
sapply(rev(names(sort(table(df2$state))))[1], stateplotnonabs)
legend(x="bottomright",
       legend=c("Absolute Average Survey Error", "Aggregated Survey Error", "RealClearPolitics", "Erikson & Wlezien", "DeSart & Holbrook", "Jackman (Pollster)", "Graefe et al. (PollyVote)", "Silver (FiveThirtyEight)", "Linzer (Votamatic)"),
       col=c("black", "black", "brown", "dark green", "purple", "blue", "pink", "orange", "red"), lwd=2, lty=c(2,1,1,1,1,1,1,1,1))
dev.off()

pdf("AccuracyChecks.pdf", width=8.5, height=11)
par(mfrow=c(4,2))
sapply(rev(names(sort(table(df2$state))))[2:length(table(df2$state))], stateplot)
dev.off()

pdf("AccuracyChecksNoAbs.pdf", width=8.5, height=11)
par(mfrow=c(4,2))
sapply(rev(names(sort(table(df2$state))))[2:length(table(df2$state))], stateplotnonabs)
dev.off()


matrix(c(unique(as.character(linzerplus$state)), unique(as.character(jackmanfull$state)), "", "", "", "", unique(as.character(df2$state)), "", "", "", "", ""), nrow=3, ncol=51, byrow=TRUE)


## Within State MSEs ##

msestate <- function(statename, range=7){
  statepolls <- df2[df2$state==statename,]
  natpolls <- df2[df2$state=="US",]
  statehistory <- priorvote[priorvote$State==statename,2]
  stateseparate <- as.data.frame(t(sapply(1:75, function(d) c(Daysout=d, AvgErr=mean(abs(statepolls$Error[(d)<statepolls$Daysout & (d+range)>statepolls$Daysout]), na.rm=TRUE)))))
  staterolling <- as.data.frame(t(sapply(1:75, function(d) c(Daysout=d, AvgErr=mean(statepolls$Error[(d)<statepolls$Daysout & (d+range)>statepolls$Daysout], na.rm=TRUE)))))
  staterollingphone <- as.data.frame(t(sapply(1:75, function(d) c(Daysout=d, AvgErr=mean(statepolls$Error[(d)<statepolls$Daysout & (d+range)>statepolls$Daysout & statepolls$method=="Phone"], na.rm=TRUE)))))
  linzerstate <- linzerplus[linzerplus$state==statename,]
  jackmanstate <- jackmanfull[jackmanfull$state==statename,]
  forecasting <- rcp <- pv <- ft8 <- data.frame(avgerr=rep(NA, 75), AvgErr=rep(NA, 75), Daysout=1:75)
  if(statename=="US"){
    forecasting <- as.data.frame(t(sapply(1:75, function(d) c(Daysout=d, AvgErr=(.1968+.0408*.34+.59*mean(statepolls$ProbOb[(d)<statepolls$Daysout & (d+7)>statepolls$Daysout], na.rm=TRUE)-statepolls$actual[1])))))
    rcp <- with(addsets, data.frame(avgerr=RCPabserr, Daysout=Daysout))
    pv <- with(addsets, data.frame(avgerr=Pollyvoteabserr, Daysout=Daysout))
    ft8 <- with(addsets, data.frame(avgerr=Fivethirtyeightabserr, Daysout=Daysout))
  }
  else
      forecasting <- data.frame(Daysout=1:75, AvgErr=rep(NA, 75))
  desartrolling <- as.data.frame(t(sapply(1:75, function(d) c(Daysout=d, AvgErr=(-.13906+.364*statehistory+.75*mean(statepolls$ProbOb[(d)<statepolls$Daysout & 45>statepolls$Daysout], na.rm=TRUE)+.163*mean(natpolls$ProbOb[(d)<natpolls$Daysout & 45>natpolls$Daysout], na.rm=TRUE)-statepolls$actual[1])))))
  statemse <- as.data.frame(t(sapply(1:75, function(d) c(Daysout=d, SurveyMSE=eval(mean((statepolls$Error[(d)<statepolls$Daysout & (d+range)>statepolls$Daysout])^2, na.rm=TRUE)), SurveyAvgMSE=eval(mean(statepolls$Error[(d)<statepolls$Daysout & (d+range)>statepolls$Daysout], na.rm=TRUE)^2), RCP=min(eval((rcp$avgerr[rcp$Daysout==d])^2), "NA"), Forecasting=min(eval((forecasting$AvgErr[forecasting$Daysout==d])^2), "NA"), Desart=eval(min(desartrolling$AvgErr[desartrolling$Daysout==d]^2, "NA")), Jackman=min(eval((jackmanstate$Error[jackmanstate$Daysout==d])^2), "NA"), Pollyvote=min(eval((pv$avgerr[pv$Daysout==d])^2), "NA"), Linzer=min(eval((linzerstate$Error[linzerstate$Daysout==d])^2), "NA"), Fivethirtyeight=min(eval((ft8$avgerr[ft8$Daysout==d])^2), "NA")))))
  npolls <- sapply(1:75, function(d) length(statepolls$Error[(d)<statepolls$Daysout & (d+range)>statepolls$Daysout]))
  for(i in length(statemse))
    statemse[,i] <- as.numeric(as.character(statemse[,i]))
  statemse
}


ohcompare <- msestate("OH")

allcompare <- lapply(rev(names(sort(table(df2$state)))), function(x) msestate(x))

normstate <- sapply(allcompare, function(x) sapply(x, function(g) mean(as.numeric(as.character(g)), na.rm=TRUE)))
colnames(normstate) <- names(allcompare) <- rev(names(sort(table(df2$state))))

nat <- allcompare[[1]]
acm <- merge_recurse(allcompare[2:length(allcompare)])


acmf <- merge_recurse(allcompare)

finalcompare <- acmf[acmf$Daysout==1,]


propdif <- function(x, y){
    x <- as.numeric(as.character(x))
    y <- as.numeric(as.character(y))
    comparative <- mean(x[!is.na(x) & !is.na(y)], na.rm=TRUE)/mean(y[!is.na(x) & !is.na(y)], na.rm=TRUE)
    comparative
}

sumdif <- function(x, y){
    x <- as.numeric(as.character(x))
    y <- as.numeric(as.character(y))
    comparative <- sum(x<y, na.rm=TRUE)
    comparative
}

# Final Pre-Election Errors
sumdifset <- t(sapply(finalcompare[,-1], function(x) sapply(finalcompare[,-1], function(y) sumdif(x, y))))
lt <- t(apply(sumdifset, 1, function(x) c(separate=x[1], aggregation=sum(x[c(2:3)]), predictive=sum(x[4:5]), hybrid=sum(x[c(6:9)]), track=sum(x[c(1:3, 6)]), forecast=sum(x[c(4:5, 7:9)]), total=sum(x[1:9]))))
gt <- t(apply(sumdifset, 2, function(x) c(separate=x[1], aggregation=sum(x[c(2:3)]), predictive=sum(x[4:5]), hybrid=sum(x[c(6:9)]), track=sum(x[c(1:3, 6)]), forecast=sum(x[c(4:5, 7:9)]), total=sum(x[1:9]))))
tot <- gt+lt

write.csv(tot, "Tables/FinalNs.csv")

lt/(gt+lt)

write.csv(lt/(gt+lt), "Tables/FinalPropWinners.csv")

minp <- 2*(1-pbinom(lt, (gt+lt), .5))
maxp <- 2*(1-pbinom(gt, (gt+lt), .5))

finp <- minp
finp[maxp<minp] <- maxp[maxp<minp]

write.csv(finp, "Tables/FinalPVals.csv")

# Running Pre-Election Errors
allsumdifset <- t(sapply(acmf[,-1], function(x) sapply(acmf[,-1], function(y) sumdif(x, y))))
alllt <- t(apply(allsumdifset, 1, function(x) c(separate=x[1], aggregation=sum(x[c(2:3)]), predictive=sum(x[4:5]), hybrid=sum(x[c(6:9)]), track=sum(x[c(1:3, 6)]), forecast=sum(x[c(4:5, 7:9)]), total=sum(x[1:9]))))
allgt <- t(apply(allsumdifset, 2, function(x) c(separate=x[1], aggregation=sum(x[c(2:3)]), predictive=sum(x[4:5]), hybrid=sum(x[c(6:9)]), track=sum(x[c(1:3, 6)]), forecast=sum(x[c(4:5, 7:9)]), total=sum(x[1:9]))))
alltot <- allgt+alllt

write.csv(alltot, "Tables/AllNs.csv")

100*alllt/(allgt+alllt)

write.csv(100*alllt/(allgt+alllt), "Tables/AllPropWinners.csv")

allminp <- 2*(1-pbinom(alllt, (allgt+alllt), .5))
allmaxp <- 2*(1-pbinom(allgt, (allgt+alllt), .5))

allfinp <- allminp
allfinp[allmaxp<allminp] <- allmaxp[allmaxp<allminp]

write.csv(allfinp, "Tables/AllPVals.csv")


# Events (-3 for survey coverage)
# oct 3 - 31 days
# oct 11 - 23 days
# oct 16 - 18 days
# oct 22 - 15 days
# sept 4-6 - 62 days
# aug 27-30 - 68 days

coefset <- function(pollavg, Daysout){
    d1 <- Daysout>15
    d2 <- Daysout>18
    d3 <- Daysout>23
    d4 <- Daysout>31
    d5 <- Daysout>62
    d6 <- Daysout>68
    100*coef(lm(pollavg~d1+d2+d3+d4+d5+d6))[-1]
}


regstate <- function(statename, range=7){
  statepolls <- df2[df2$state==statename,]
  natpolls <- df2[df2$state=="US",]
  statehistory <- priorvote[priorvote$State==statename,2]
  stateseparatecoef <- try(coefset(statepolls$ProbOb[!is.na(statepolls$ProbOb) & statepolls$Daysout<76], statepolls$Daysout[!is.na(statepolls$ProbOb) & statepolls$Daysout<76]))
  if(class(stateseparatecoef)=="try-error")
      stateseparatecoef <- rep(NA, 6)
  staterolling <- as.data.frame(t(sapply(1:75, function(d) c(Daysout=d, Est=mean(statepolls$ProbOb[(d)<statepolls$Daysout & (d+range)>statepolls$Daysout], na.rm=TRUE)))))
  staterollingcoef <- try(coefset(staterolling$Est, staterolling$Daysout))
  if(class(staterollingcoef)=="try-error")
      staterollingcoef <- rep(NA, 6)
  desartrolling <- as.data.frame(t(sapply(1:75, function(d) c(Daysout=d, Est=(-.13906+.364*statehistory+.75*mean(statepolls$ProbOb[(d)<statepolls$Daysout & 45>statepolls$Daysout], na.rm=TRUE)+.163*mean(natpolls$ProbOb[(d)<natpolls$Daysout & 45>natpolls$Daysout], na.rm=TRUE))))))
  desartrollingcoef <- try(coefset(desartrolling$Est, desartrolling$Daysout))
  if(class(desartrollingcoef)=="try-error")
      desartrollingcoef <- rep(NA, 6)
  linzerstate <- linzerplus[linzerplus$state==statename,]
  linzerstatecoef <- try(coefset(linzerstate$ProbOb[linzerstate$Daysout<76], linzerstate$Daysout[linzerstate$Daysout<76]))
      if(class(linzerstatecoef)=="try-error")
      linzerstatecoef <- rep(NA, 6)
  jackmanstate <- jackmanfull[jackmanfull$state==statename,]
  jackmanstatecoef <- try(coefset(jackmanstate$ProbOb[jackmanstate$Daysout<76], jackmanstate$Daysout[jackmanstate$Daysout<76]))
    if(class(jackmanstatecoef)=="try-error")
      jackmanstatecoef <- rep(NA, 6)
  forecastingcoef <- rcpcoef <- pvcoef <- ft8coef <- rep(NA, 6)
  if(statename=="US"){
    forecasting <- as.data.frame(t(sapply(1:75, function(d) c(Daysout=d, Est=(.1968+.0408*.34+.59*mean(statepolls$ProbOb[(d)<statepolls$Daysout & (d+7)>statepolls$Daysout], na.rm=TRUE))))))
    forecastingcoef <- coefset(forecasting$Est, forecasting$Daysout)
    rcp <- with(addsets, data.frame(Est=RCP/100, Daysout=Daysout))
    rcpcoef <- coefset(rcp$Est[rcp$Daysout<76], rcp$Daysout[rcp$Daysout<76])
    pv <- with(addsets, data.frame(Est=Pollyvote/100, Daysout=Daysout))
    pvcoef <- coefset(pv$Est[pv$Daysout<76], pv$Daysout[pv$Daysout<76])
    ft8 <- with(addsets, data.frame(Est=Fivethirtyeight/100, Daysout=Daysout))
    ft8coef <- coefset(ft8$Est[ft8$Daysout<76], ft8$Daysout[ft8$Daysout<76])
  }
  ls <- data.frame(stateseparatecoef, staterollingcoef, rcpcoef, forecastingcoef, desartrollingcoef, jackmanstatecoef, pvcoef, linzerstatecoef, ft8coef)
  ls
}

for(i in rev(names(sort(table(df2$state))))){
    print(i)
    regstate(i)
}


allcomparecoefs <- lapply(rev(names(sort(table(df2$state)))), function(x) regstate(x))

accs <- abs(merge_recurse(allcomparecoefs))

coefsumdifset <- t(sapply(accs, function(x) sapply(accs, function(y) sumdif(x, y))))
coeflt <- t(apply(coefsumdifset, 1, function(x) c(separate=x[1], aggregation=sum(x[c(2:3)]), predictive=sum(x[4:5]), hybrid=sum(x[c(6:9)]), track=sum(x[c(1:3, 6)]), forecast=sum(x[c(4:5, 7:9)]), total=sum(x[1:9]))))
coefgt <- t(apply(coefsumdifset, 2, function(x) c(separate=x[1], aggregation=sum(x[c(2:3)]), predictive=sum(x[4:5]), hybrid=sum(x[c(6:9)]), track=sum(x[c(1:3, 6)]), forecast=sum(x[c(4:5, 7:9)]), total=sum(x[1:9]))))
coeftot <- coefgt+coeflt

write.csv(coeftot, "Tables/coefNs.csv")

100*coefgt/(coefgt+coeflt)

write.csv(100*coefgt/(coefgt+coeflt), "Tables/coefPropMoreVariable.csv")

coefminp <- 2*(1-pbinom(coeflt, (coefgt+coeflt), .5))
coefmaxp <- 2*(1-pbinom(coefgt, (coefgt+coeflt), .5))

coeffinp <- coefminp
coeffinp[coefmaxp<coefminp] <- coefmaxp[coefmaxp<coefminp]

write.csv(coeffinp, "Tables/coefPVals.csv")


stateests <- function(statename, range=7){
  statepolls <- df2[df2$state==statename,]
  natpolls <- df2[df2$state=="US",]
  statehistory <- priorvote[priorvote$State==statename,2]
  stateseparate <- data.frame(Est=statepolls$ProbOb[statepolls$Daysout<76], Daysout=statepolls$Daysout[statepolls$Daysout<76])
  staterolling <- as.data.frame(t(sapply(1:75, function(d) c(Daysout=d, Est=mean(statepolls$ProbOb[(d)<statepolls$Daysout & (d+range)>statepolls$Daysout], na.rm=TRUE)))))
  desartrolling <- as.data.frame(t(sapply(1:75, function(d) c(Daysout=d, Est=(-.13906+.364*statehistory+.75*mean(statepolls$ProbOb[(d)<statepolls$Daysout & 45>statepolls$Daysout], na.rm=TRUE)+.163*mean(natpolls$ProbOb[(d)<natpolls$Daysout & 45>natpolls$Daysout], na.rm=TRUE))))))
  linzerstate <- linzerplus[linzerplus$state==statename,]
  linzerstate$Est <- linzerstate$ProbOb
  jackmanstate <- jackmanfull[jackmanfull$state==statename,]
  jackmanstate$Est <- jackmanstate$ProbOb
  rcp <- pv <- ft8 <- forecasting <- data.frame(Daysout=1:75, Est=rep(NA, 75))
  if(statename=="US"){
    forecasting <- as.data.frame(t(sapply(1:75, function(d) c(Daysout=d, Est=(.1968+.0408*.34+.59*mean(statepolls$ProbOb[(d)<statepolls$Daysout & (d+7)>statepolls$Daysout], na.rm=TRUE))))))
    rcp <- with(addsets, data.frame(Est=RCP/100, Daysout=Daysout))
    pv <- with(addsets, data.frame(Est=Pollyvote/100, Daysout=Daysout))
    ft8 <- with(addsets, data.frame(Est=Fivethirtyeight/100, Daysout=Daysout))
  }
  ls <- list(stateseparate, staterolling, rcp, forecasting, desartrolling, jackmanstate, pv, linzerstate, ft8)
  names(ls) <- c("Separate", "RollingAvg", "RCP", "Erikson", "Desart", "Jackman", "Pollyvote", "Linzer", "FiveThirtyEight")
  ls
}

combosetgt <- function(ls){
    sapply(ls, function(x) sapply(ls, function(y) as.numeric(sd(x$Est[x$Daysout %in% unique(y$Daysout)], na.rm=TRUE)>sd(y$Est[y$Daysout %in% unique(x$Daysout)], na.rm=TRUE))))
}

combosetgtwtd <- function(ls){
    sapply(ls, function(x) sapply(ls, function(y) as.numeric(wtd.var(x$Est[x$Daysout %in% unique(y$Daysout)], 1/table(x$Daysout)[x$Daysout[x$Daysout %in% unique(y$Daysout)]], normwt=TRUE, na.rm=TRUE)>wtd.var(y$Est[y$Daysout %in% unique(x$Daysout)], 1/table(y$Daysout)[y$Daysout[y$Daysout %in% unique(x$Daysout)]], normwt=TRUE, na.rm=TRUE))))
}

combosetlt <- function(ls){
    sapply(ls, function(x) sapply(ls, function(y) as.numeric(sd(x$Est[x$Daysout %in% unique(y$Daysout)], na.rm=TRUE)<sd(y$Est[y$Daysout %in% unique(x$Daysout)], na.rm=TRUE))))
}

combosetltwtd <- function(ls){
    sapply(ls, function(x) sapply(ls, function(y) as.numeric(wtd.var(x$Est[x$Daysout %in% unique(y$Daysout)], 1/table(x$Daysout)[x$Daysout[x$Daysout %in% unique(y$Daysout)]], normwt=TRUE, na.rm=TRUE)<wtd.var(y$Est[y$Daysout %in% unique(x$Daysout)], 1/table(y$Daysout)[y$Daysout[y$Daysout %in% unique(x$Daysout)]], normwt=TRUE, na.rm=TRUE))))
}

allsdcomparegt <- lapply(rev(names(sort(table(df2$state)))), function(x) combosetgtwtd(stateests(x)))
allsdcomparelt <- lapply(rev(names(sort(table(df2$state)))), function(x) combosetltwtd(stateests(x)))

sdmergegt <- allsdcomparegt[[1]]==1 & !is.na(allsdcomparegt[[1]])
sdmergelt <- allsdcomparelt[[1]]==1 & !is.na(allsdcomparelt[[1]])
for(i in 2:length(allsdcomparegt)){
    sdmergegt <- sdmergegt+(allsdcomparegt[[i]]==1 & !is.na(allsdcomparegt[[i]]))
    sdmergelt <- sdmergelt+(allsdcomparelt[[i]]==1 & !is.na(allsdcomparelt[[i]]))
}

gtsets <- cbind(sep=sdmergegt[1,], aggregate=colSums(sdmergegt[2:3,]), predictive=colSums(sdmergegt[4:5,]), hybrid=colSums(sdmergegt[6:9,]), total=colSums(sdmergegt))

ltsets <- cbind(sep=sdmergelt[1,], aggregate=colSums(sdmergelt[2:3,]), predictive=colSums(sdmergelt[4:5,]), hybrid=colSums(sdmergelt[6:9,]), total=colSums(sdmergelt))

sdmergetot <- gtsets/(gtsets+ltsets)

sdmergen <- (gtsets+ltsets)

write.csv(sdmergetot, "Tables/SDCompare.csv")
write.csv(sdmergen, "Tables/SDCompareNs.csv")

sdminp <- 2*(1-pbinom(ltsets, (gtsets+ltsets), .5))
sdmaxp <- 2*(1-pbinom(gtsets, (gtsets+ltsets), .5))

sdfinp <- sdminp
sdfinp[sdmaxp<sdminp] <- sdmaxp[sdmaxp<sdminp]

write.csv(sdfinp, "Tables/SDComparePVals.csv")