# The codes are for cross validatios of corporate exits. Smooth baseline hazard model.

# Data sets.
# Note that GVKEY = company ids,  tindex =1 means the 1st quarter of 1979.

#for sharcnet
variables=read.table("/home/tbae/voldata/truc_com_cov.txt",header=TRUE)
variables=variables[c(-2,-3)]
exitdata=read.table("/home/tbae/voldata/exits.txt",header=TRUE)
macroecono=read.table("/home/tbae/voldata/macroeco.txt",header=TRUE)

loc=which((variables[,16] <= 31) | (variables[,16] >= 113))
variables=variables[-loc,]

variables[,14]=variables[,14]*100  # for scaling to %

ids=variables[,1] # company ids for all firms

#selection of 8000 random sample

gvkeys=as.numeric(levels(factor(ids)))
sampled = sample(gvkeys,8000)
sampledloc=numeric()

for (i in sampled){
                   sampledloc=c(sampledloc,which(ids == i))
}

sampled_variables=variables[sampledloc,]    # to be used for estimation
outsample_variables=variables[-sampledloc,]  # to be used for prediction


#exit data for sampled and non sampled firms

sampled_exit_loc=numeric()

for (i in sampled){
                  sampled_exit_loc=c(sampled_exit_loc,which(exitdata[,1] == i))
}

sample_exit=exitdata[sampled_exit_loc,]
outsample_exit=exitdata[-sampled_exit_loc,]


# list of sampled variables by time points 
bysurv_sampled=by(sampled_variables,sampled_variables[,16],function(y){y})

# list of sampled exit data by reason of exit
ybank_sampled=sample_exit[sample_exit[,4]==2 | sample_exit[,4]==3,]
ymerg_sampled=sample_exit[sample_exit[,4]==1,]
bybanked_sampled=by(ybank_sampled,ybank_sampled[,5],function(y){y})
bymerged_sampled=by(ymerg_sampled,ymerg_sampled[,5],function(y){y})


# list of out sample variables by time points 
bysurv_outsample=by(outsample_variables,outsample_variables[,16],function(y){y})

# list of outsample exit data by reason of exit
ybank_outsample=outsample_exit[outsample_exit[,4]==2 | outsample_exit[,4]==3,]
ymerg_outsample=outsample_exit[outsample_exit[,4]==1,]
bybanked_outsample=by(ybank_outsample,ybank_outsample[,5],function(y){y})
bymerged_outsample=by(ymerg_outsample,ymerg_outsample[,5],function(y){y})



col_sums=function(matx){
        if (is.vector(matx)){
           matx=t(matrix(matx))
        }
        
        if (dim(matx)[1]==0){
           colsums=0
        }else{
        
            if (dim(matx)[1]==1){
                colsums=matx
            }else{
                colsums=colSums(matx)
            }
        }
        colsums
}    




#estimation of baseline hazards given regression parameters and frailty parameters
cor_baseline=function(bysurv,bybanked,bymerged,z,mle,arpara,ini,k,max.itr=1000){
        q=k-32
        phi1=arpara[1]
        theta1=arpara[2]
        phi2=arpara[3]
        theta2=arpara[4]
        sigma1=arpara[5]
        sigma2=arpara[6]
        rho=arpara[7]
        
        covmat=matrix(c(sigma1^2,rho*sigma1*sigma2,rho*sigma1*sigma2,sigma2^2),2,2)
        invcovmat=solve(covmat)
        
        phimat=matrix(c(phi1,0,0,phi2),2,2)
        thetavec=c(theta1,theta2)
        
        h=as.vector(ini)
        
        dist=1
        j=0
        while (dist > 1.0e-3){
            j=j+1
            if (j ==max.itr){ stop("max itr meets")}    
            
            firstderiv=numeric()
            seconderiv=matrix(0,ncol=(2*q),nrow=(2*q))
           
           for (r in 33:k){
            
            locmacro=which(z[,12]==(r-1)) #macroeconomic variables
            macroeco=z[locmacro,]
        
            locbank=which(as.numeric(names(bybanked))==r)
            locmerg=which(as.numeric(names(bymerged))==r)
        
            dj1=ifelse(length(locbank)==0,0,dim(bybanked[[locbank]])[1])
            dj2=ifelse(length(locmerg)==0,0,dim(bymerged[[locmerg]])[1])
                    
        #survivors
            locsurv=which(as.numeric(names(bysurv))==(r-1))
            survivorfin=bysurv[[locsurv]]
                    
            covresult=covariates(survivorfin, macroeco, mle)
            matsurv1=covresult[[1]]
            matsurv2=covresult[[2]]
            linear01=covresult[[3]]
            linear02=covresult[[4]]
               
            h01=exp(h[2*(r-33)+1]+linear01)
            h02=exp(h[2*(r-32)]+linear02)
            hdot=h01+h02
        
            lamda01=h01/(1+hdot)
            lamda02=h02/(1+hdot)
            
            if (r==33){
               gammaprime=-t(matrix(h[(2*(r-33)+1):(2*(r-32))]-thetavec))%*%invcovmat-t(matrix(h[(2*(r-32)+1):(2*(r-31))]-h[(2*(r-33)+1):(2*(r-32))]-phimat%*%matrix(thetavec-h[(2*(r-33)+1):(2*(r-32))])))%*%invcovmat%*%(phimat-diag(c(1,1)))
            }else{
                  if (r==k){
                      gammaprime=-t(matrix(h[(2*(r-33)+1):(2*(r-32))]-h[(2*(r-34)+1):(2*(r-33))]-phimat%*%matrix(thetavec-h[(2*(r-34)+1):(2*(r-33))])))%*%invcovmat
                  }else{
                        gammaprime=-t(matrix(h[(2*(r-33)+1):(2*(r-32))]-h[(2*(r-34)+1):(2*(r-33))]-phimat%*%matrix(thetavec-h[(2*(r-34)+1):(2*(r-33))])))%*%invcovmat-t(matrix(h[(2*(r-32)+1):(2*(r-31))]-h[(2*(r-33)+1):(2*(r-32))]-phimat%*%matrix(thetavec-h[(2*(r-33)+1):(2*(r-32))])))%*%invcovmat%*%(phimat-diag(c(1,1)))
                  }
            }    
                      
            firstderiv=c(firstderiv,gammaprime+c(dj1-sum(lamda01),dj2-sum(lamda02)))
            
            if (r==k){
                 gammadouble=-invcovmat
            }else{
                 gammadouble=-invcovmat-t(phimat-diag(c(1,1)))%*%invcovmat%*%(phimat-diag(c(1,1)))
            }
            
            seconderiv[(2*(r-33)+1):(2*(r-32)),(2*(r-33)+1):(2*(r-32))]=gammadouble+matrix(c(-sum(lamda01-lamda01^2),sum(lamda01*lamda02),sum(lamda02*lamda01),-sum(lamda02-lamda02^2)),2,2)
            
                # serial correlation part
                if (r < k){
            	      seconderiv[(2*(r-33)+1):(2*(r-32)),(2*(r-32)+1):(2*(r-31))]=-t(phimat-diag(c(1,1)))%*%invcovmat
            	      seconderiv[(2*(r-32)+1):(2*(r-31)),(2*(r-33)+1):(2*(r-32))]=-t(phimat-diag(c(1,1)))%*%invcovmat
	        }	          
           }
            
            
          #first derivative
          newh=matrix(h)-qr.solve(seconderiv)%*%matrix(firstderiv)
          dist=sqrt(sum((newh-matrix(h))^2))
          
          h=t(newh)
          }
          
          list(series=h,score=firstderiv,fisher=-seconderiv)
         
}
   
    
paras=function(inibase,para,sigma,tind){
        nbase=tind-32
        base=matrix(inibase,2,nbase)
        lng=dim(base)[2]
        phi=c(para[1],para[3])
        theta=c(para[2],para[4])
        
        firstterm=t(matrix(base[,1]-theta))%*%solve(sigma)%*%matrix(base[,1]-theta)
        secondterm=0
        for (i in 2:lng){
           secondterm=secondterm+t(matrix(base[,i]-base[,i-1]-diag(phi)%*%(theta-base[,i-1])))%*%solve(sigma)%*%matrix(base[,i]-base[,i-1]-diag(phi)%*%(theta-base[,i-1]))
        }
        firstterm+secondterm
}

  
    
    

#regression parameter estimation given baseline and frailty parameters
reg_para_frailty=function(x,y1,y2,z,regpara,baseline,arpara,tind){
                u=length(regpara)
    
                phi1=arpara[1]
                phi2=arpara[3]
                theta1=arpara[2]
                theta2=arpara[4]
                sigma1=arpara[5]
                sigma2=arpara[6]
                rho=arpara[7]
                
                phimat=matrix(c(phi1,0,0,phi2),ncol=2)
                covmat=rbind(c(sigma1^2,rho*sigma1*sigma2),c(rho*sigma1*sigma2,sigma2^2))
                thetavec=c(theta1,theta2)
                
                invcovmat=qr.solve(covmat)
                
                nbase=tind-32
                newbase=matrix(baseline,2,nbase)
                                
                distance = 1
                
        while (distance > 1.0e-3){
                          
                Fisher=matrix(0,ncol=u,nrow=u)
                score=rep(0,u)
                loglikelihood=0  
                
                for (kk in 33:tind){
                    base=newbase[,kk-32]  
                    locmacro=which(z[,12]==(kk-1)) #macroeconomic variables
                    macroeco=z[locmacro,]
                    
                   #survivors
                    locsurv=which(as.numeric(names(x))==(kk-1))
                    survivorfin=x[[locsurv]]
                    l=dim(survivorfin)[1]
                    
                   #exited
                    locbank=which(as.numeric(names(y1))== kk)
                    locmerg=which(as.numeric(names(y2))== kk)
                    dj1=ifelse(length(locbank)==0,0,dim(y1[[locbank]])[1])
                    dj2=ifelse(length(locmerg)==0,0,dim(y2[[locmerg]])[1])
                    dj=dj1+dj2

                    if (dj > 0){
                               bankfin=ifelse(dj1 > 0,y1[[locbank]],numeric())
                               mergfin=ifelse(dj2 > 0,y2[[locmerg]],numeric())
                               exitcode=c(bankfin[[1]],mergfin[[1]])
                               survcode=survivorfin[,1]
                               loc=numeric()
                               for (key in exitcode){
                                                    loc=c(loc, which(survcode==key))
                                }
                               exitcov=survivorfin[loc,]
                               nonexitcov=survivorfin[-loc,]
                               survivorfin=rbind(exitcov,nonexitcov)
                    }     


                    #survivors        
	            ressurv=covariates(survivorfin, macroeco, regpara)
        	    matsurv1=ressurv[[1]]
               	    matsurv2=ressurv[[2]]
                    linear01=ressurv[[3]]
                    linear02=ressurv[[4]]


                    
                    if (dj >0){
                                matexit1=matsurv1[1:dj,]
                                matexit2=matsurv2[1:dj,]
                                linear11=linear01[1:dj]
                                linear12=linear02[1:dj]

                                if (dj1==0){
                                        loglike1=sum(linear12[1:dj]+base[2])
                                        score1=col_sums(matexit2[1:dj,])
                                }
                                if (dj2==0){
                                        loglike1=sum(linear11[1:dj1]+base[1])
                                        score1=col_sums(matexit1[1:dj1,])
                                }
                                if (dj1 >0 & dj2 > 0){
                                        loglike1=sum(linear11[1:dj1]+base[1])+sum(linear12[(dj1+1):dj]+base[2])
                                        score1=col_sums(matexit1[1:dj1,])+col_sums(matexit2[(dj1+1):dj,])
                                }
                    }else{
                         loglike1=0
                         score1=rep(0,u)
                    }
                


                h01=exp(base[1]+linear01)
                h02=exp(base[2]+linear02)
                hdot=h01+h02
        
                loglike2=sum(log(1+hdot))
                lamda01=h01/(1+hdot)
                lamda02=h02/(1+hdot)
                
                
		#lincommat=lamda01*as.data.frame(matsurv1)+lamda02*as.data.frame(matsurv2) 
                #SS1=as.vector(colSums(lincommat))
                
                #SS2= matrix(rowSums(apply(sqrt(lamda01)*as.data.frame(matsurv1), MARGIN=1,function(y){y%*%t(y)})),nrow=u)+
                #    + matrix(rowSums(apply(sqrt(lamda02)*as.data.frame(matsurv2), MARGIN=1,function(y){y%*%t(y)})),nrow=u)+
                #    - matrix(rowSums(apply(as.data.frame(lincommat), MARGIN=1,function(y){y%*%t(y)})),nrow=u)
                

                SS1=rep(0,u)
        	SS2=matrix(0,ncol=u,nrow=u)
                for (w in 1:l){
            	  ss1=lamda01[w]*matsurv1[w,]+lamda02[w]*matsurv2[w,]
            	  ss2=lamda01[w]*matrix(matsurv1[w,])%*%t(matrix(matsurv1[w,]))+lamda02[w]*matrix(matsurv2[w,])%*%t(matrix(matsurv2[w,]))
           	  SS1=SS1+ss1
           	  SS2=SS2+ss2-matrix(ss1)%*%t(matrix(ss1))
        	}  


                #smoothness penalty
                if (kk==33){
                            penalty=log(2*pi)+0.5*log(det(covmat))+0.5*(base-thetavec)%*%invcovmat%*%matrix((base-thetavec))
                }else{
                      penalty=log(2*pi)+0.5*log(det(covmat))+0.5*(base-newbase[,kk-33]-t(phimat%*%matrix((thetavec-newbase[,kk-33]))))%*%invcovmat%*%(matrix(base-newbase[,kk-33])-phimat%*%matrix((thetavec-newbase[,kk-33])))
                }
        
                loglikelihood=loglikelihood+loglike1-loglike2-penalty
                score=score+score1-SS1
                Fisher=Fisher+SS2
                snderiv=-Fisher
        } #looping from kk=33 to nbase
        
        newone= matrix(regpara) - qr.solve(snderiv)%*%matrix(score)
        distance=sqrt(sum( (t(newone)-regpara)^2) )
        
        regpara=t(newone)
      }
      
      list(f=loglikelihood,g=score,H=Fisher,regress=regpara)
 }
      

itr_newton_frailty=function(x,y1,y2,z,mle,arpara,tind){
           nbase=tind-32
           ini_reg=mle
           ini_arpara=arpara
           ini_base=as.vector(rbind(rep(arpara[2],nbase),rep(arpara[4],nbase)))
           
           howfar=1
           while(howfar >= 1.0e-4){
                phi1=ini_arpara[1]
                phi2=ini_arpara[3]
                theta1=ini_arpara[2]
                theta2=ini_arpara[4]
                sigma1=ini_arpara[5]
                sigma2=ini_arpara[6]
                rho=ini_arpara[7]
    
                covmat=rbind(c(sigma1^2,rho*sigma1*sigma2),c(rho*sigma1*sigma2,sigma2^2))
                invcovmat=qr.solve(covmat)
                
                out.baseline=cor_baseline(x,y1,y2,z,ini_reg,ini_arpara,ini_base,tind)
                base_est=out.baseline$series
                arpara_est=c(optim(par=ini_arpara[1:4],paras,inibase=base_est,sigma=covmat,tind=tind)[[1]],ini_arpara[5:7])
                out.regress=reg_para_frailty(x,y1,y2,z,ini_reg,base_est,arpara_est,tind)
                regpara_est=out.regress$regress
                
                howfar=sqrt(sum((regpara_est-ini_reg)^2))
                ini_reg=regpara_est
                ini_base=base_est
                ini_arpara=arpara_est
           }
           
           list(base_fisher_diag=diag(out.baseline$fisher),reg_loglikelihood=out.regress$f,reg_score=out.regress$g,reg_fisher=out.regress$H,baseline_est=matrix(base_est,2,nbase),regress_est=regpara_est,arpara_est=arpara_est)
}





#sigma1=0.5, sigma2=0.5
covariates=function(x,z,para){
            dj=dim(x)[1]
            v11=x[,3]#QR 1
            v12=x[,4] #CR 2
            v13=x[,5] #CL 3 
            v14=x[,6] #IBIT 4
            v15=x[,7] #TL 5
            v16=x[,8] #SALE 6
            v17=x[,9] #WC 7
            v18=x[,10] #NI 8
            v19=x[,11] #RE 9
            v20=x[,12] #DTD 10 
            v21=x[,13] #AGE 11 
            v22=x[,14] #RTN 12
            v23=x[,15] #STD 13
            v24=(x[,2]==1)#ind1 14
            v25=(x[,2]==2)#ind2 15
            v26=(x[,2]==3) #ind3 16
            v27=(x[,2]==4) #ind4 17
                                 
            v28=rep(z[,1],dj) #indexrtn 18
            v29=rep(z[,2],dj) #30yrbond 19
            v30=rep(z[,3],dj) #10yrnote 20
            v31=rep(z[,4],dj) #5yrnote 21
            v32=rep(z[,5],dj) #1yrnote 22
            v33=rep(z[,6],dj) #90daybill 23
            v34=rep(z[,7],dj) #30daybill 24
            v35=rep(z[,8],dj) #CPI 25
            v36=rep(z[,9],dj) #ex inx 26
            v37=rep(z[,10],dj) #unemp 27
            v38=rep(z[,11],dj) #gdpgrt 28
            
             
            
            liquid = 0.7*v11-0.09*v12+0.4*v17
            profit =  2.8*v14-1.6*v18-0.04*v19
            debts  = 0.58*v13+0.49*v15
            DTD = v20
            AGE = v21
            RTN = v22
            SD  = v23
            
            Ind1=v24
            Ind2=v25
            
            vwrtn = v28
            longbond = v29
            midbonds = 0.3*v30-0.5*v31
            shortbonds = 0.44*v33-0.4*v34 
              
            ex = v36
            unem = v37
            gdp = v38  
            
            mat1=cbind(liquid,profit,debts,DTD,AGE,RTN,SD,longbond,midbonds,unem,   matrix(0,ncol=8,nrow=dj))
            mat2=cbind(matrix(0,ncol=10,nrow=dj),                                  liquid,profit,AGE,RTN,SD,Ind1,Ind2,unem)
             

             
             linear1=rowSums(t(matrix(rep(para,dj),ncol=dj))*mat1)
             linear2=rowSums(t(matrix(rep(para,dj),ncol=dj))*mat2)
                                                                                                                                                                                                                                                                                
             list(mat1,mat2,linear1,linear2)
}

  
mle=c(-0.84958815,-0.79395884,1.06760549,-0.51133536,0.26737205,-0.43734638,9.93639798,0.05927794,1.04982973,0.29241875,+
       0.46367106,-0.84166321,0.07405145,1.07406696,-11.49174515,-0.1615976,-0.22951649,-0.48894765)		

arpara=c(1.0,-8,0.5,-2,0.5,0.5,0)

out.newton=itr_newton_frailty(bysurv_sampled,bybanked_sampled,bymerged_sampled,macroecono,mle,arpara,112)



# in sample tests

revert_pred_insample=function(x,y1,y2,z,mle,base,threshold,k){
        u=length(mle)
       
        h=as.numeric(base)
                
        locmacro=which(z[,12]==(k-1))#macroeconomic variables
        macroeco=z[locmacro,]
        
        #survivors
        locsurv=which(as.numeric(names(x))==(k-1))
        survivorfin=x[[locsurv]]
        l=dim(survivorfin)[1]
        
        #exited
        locbank=which(as.numeric(names(y1))==k)
        locmerg=which(as.numeric(names(y2))==k)
        
            if (length(locbank)==0){
              dj1=0
              banked=0000
            }else{
              banked=y1[[locbank]][,1]
              dj1=length(banked)
            }
            
            if (length(locmerg)==0){
              dj2=0
              merged=0000
            }else{
              merged=y2[[locmerg]][,1]
              dj2=length(merged)
            }
        
                
        #survivors
        covresult=covariates(survivorfin, macroeco, mle)
        matsurv1=covresult[[1]]
        matsurv2=covresult[[2]]
        linear01=covresult[[3]]
        linear02=covresult[[4]]
            
        h01=exp(h[1]+linear01)
        h02=exp(h[2]+linear02)
        hdot=h01+h02
        
        lamda01=h01/(1+h01+h02)
        lamda02=h02/(1+h01+h02)
        
        #first term of taylor approximation
        sup01=max(lamda01)
        sup02=max(lamda02)
        
        result1=c(k, l, dj1, sum(lamda01), dj2, sum(lamda02))
        
        #shannon's entrophy
        shannon1=0
        mj1=0
        Vj1=0
        for (w in 1:l){
            if (any(banked==survivorfin[w,1])){
                shannon1=shannon1+log(lamda01[w])
            }else{
                if (any(merged==survivorfin[w,1])){
                    shannon1=shannon1+log(lamda02[w])
                    }else{
                    shannon1=shannon1+log(1-lamda01[w]-lamda02[w])
                    }
            }
        mij1=lamda01[w]*log(lamda01[w])+lamda02[w]*log(lamda02[w])+(1-lamda01[w]-lamda02[w])*log(1-lamda01[w]-lamda02[w])
        
        mj1=mj1+mij1
        Vj1=Vj1+lamda01[w]*(log(lamda01[w]))^2+lamda02[w]*(log(lamda02[w]))^2+(1-lamda01[w]-lamda02[w])*(log(1-lamda01[w]-lamda02[w]))^2-(mij1)^2
        }
        
        zscore=(shannon1-mj1)/sqrt(Vj1)
         
        #simulation
        simnum=5000
        highcom=which(lamda01+lamda02 >= threshold)
        highlamda01=lamda01[highcom]
        highlamda02=lamda02[highcom]
        lowsumlamda01=sum(lamda01)-sum(highlamda01)
        lowsumlamda02=sum(lamda02)-sum(highlamda02)
        
        nhighcom=length(highcom)
        if (nhighcom==0){
            sumcol1=rep(0,simnum)
            sumcol2=rep(0,simnum)
        }else{
            
            resultmat1=matrix(0,nrow=nhighcom,ncol=simnum)
            resultmat2=matrix(0,nrow=nhighcom,ncol=simnum)
            Pmat1=matrix(rep(highlamda01,simnum),ncol=simnum)
            Pmat2=matrix(rep(highlamda02,simnum),ncol=simnum)
            Rmat=matrix(runif(nhighcom*simnum),ncol=simnum)
            resultmat1[which(Rmat <= Pmat1)]=1
            resultmat2[which(Rmat > Pmat1  & Rmat <= (Pmat1+Pmat2))]=1
            sumcol1=colSums(resultmat1)
            sumcol2=colSums(resultmat2)
        }
                
        poisim1=rpois(simnum,lowsumlamda01)
        poisim2=rpois(simnum,lowsumlamda02)
                        
        lower1=quantile((poisim1+sumcol1),0.025)
        lower2=quantile((poisim2+sumcol2),0.025) 
        upper1=quantile((poisim1+sumcol1),0.975)
        upper2=quantile((poisim2+sumcol2),0.975) 
                    
        predint=c(lower1,upper1,lower2,upper2)
        list(result1=result1, zscore=zscore, predint=predint)    
}


regress=out.newton$regress_est
baseline=t(out.newton$baseline_est)


fitted=matrix(0,nrow=6,ncol=80)
entropy=rep(0,80)
predint=matrix(0,nrow=4,ncol=80)
for (w in 1:80){
    out.pred=revert_pred_insample(bysurv_outsample,bybanked_outsample,bymerged_outsample,macroecono,regress,baseline[w,],0.15,32+w);
    fitted[,w]=matrix(out.pred$result1)
    entropy[w]=out.pred$zscore
    predint[,w]=matrix(out.pred$predint)
}


matrix(regress)
matrix(baseline)
matrix(entropy)