# The codes are for cross validation of corporate credit rating migration under basic orded logit model with 3 categories. 

# Data sets.
# Note that GVKEY = company ids,  tindex =1 means the 1st quarter of 1979.



#variables=read.table("/home/tbae/voldata/rating_com_cov.txt",header=TRUE)
#variables=variables[-2]
#macroecono=read.table("/home/tbae/voldata/macroeco.txt",header=TRUE)

variables=read.table("F:/Merged/rating_com_cov.txt",header=TRUE)
variables=variables[-2]
macroecono=read.table("F:/Merged/macroeco.txt",header=TRUE)


variables[,16]=100*variables[,16]


loc=which(variables[,2]==0 | variables[,3]==0)
variables=variables[-loc,]


loc11=which(variables[,2]==8) # state 8 is an abosrbing state
variables=variables[-loc11,]


locdelmac=which(macroecono[,12] <= 27 | macroecono[,12] >= 112)
macroecono=macroecono[-locdelmac,]

locdel=which(variables[,18] <= 27 | variables[,18] >= 112)
variables=variables[-locdel,]

#truncation 1percentile and 99percentile

for (j in 5:17){
               loc1=which(variables[,j] <= quantile(variables[,j],0.01))
               loc2=which(variables[,j] >= quantile(variables[,j],0.99))
               variables[loc1,j]= quantile(variables[,j],0.01)
               variables[loc2,j]= quantile(variables[,j],0.99)
}


loc12=which(variables[,2] <= 3)
variables[loc12,2]=1

loc13=which((variables[,2] > 3) & (variables[,2] <= 6))
variables[loc13,2]=2

loc14=which(variables[,2] == 7)
variables[loc14,2]=3

loc22=which(variables[,3] <= 3)
variables[loc22,3]=1

loc23=which((variables[,3] > 3) & (variables[,3] <= 6))
variables[loc23,3]=2

loc24=which(variables[,3] == 7)
variables[loc24,3]=3

loc25=which(variables[,3] == 8)
variables[loc25,3]=4


#for sharcnet
#variables=read.table("/home/tbae/voldata/ratingcov.txt",header=TRUE)
#variables=variables[-2]
#macroecono=read.table("/home/tbae/voldata/macroeco.txt",header=TRUE)

#Random Sampling of 1300 firms 
ids=variables[,1]
gvkeys=as.numeric(levels(factor(ids)))
sampled = sample(gvkeys,1300)
sampledloc=numeric()

for (i in sampled){
                   sampledloc=c(sampledloc,which(ids == i))
}

variables_est=variables[sampledloc,]    # to be used for estimation
variables_pred=variables[-sampledloc,]  # to be used for prediction



byratings_est=by(variables_est,variables_est[,c(3,2)],function(y){y})
byratings_pred=by(variables_pred,variables_pred[,c(3,2)],function(y){y})




#we consider rating data available from December 1985. That is, qtr=28 
#rating==0 means no rating. consider rating 1 to 8.

row_sums=function(matx){
        if (is.vector(matx)){
              rowsums=sum(matx)
        }else{
          
          if (dim(matx)[1]==0){
              rowsums=0
          }else{
        
             if (dim(matx)[1]==1){
                 rowsums=sum(matx)
             }else{
                  rowsums=rowSums(matx)
            }
          }
        }
        rowsums
}    

Gcdf=function(x){
        cumul=rep(0,length(x))
        loc1=which(x > 500)
        loc2=which(x <= 500)
        cumul[loc1]=1
        cumul[loc2]=exp(x[loc2])/(1+exp(x[loc2]))
        cumul
        exp(x)/(1+exp(x))
}

Gprime=function(y){
        Gcdf(y)*(1-Gcdf(y))
}

Gtwoprime=function(y){
        Gcdf(y)*(1-Gcdf(y))*(1-2*Gcdf(y))
}
        
    

        
loglikelihood=function(x,z,para,tind){
          #c=(c1,c2,c3,...,c7) gamma=c(gamma1,...,gamma7) for identifiability set c1=0, and sigma1=1 
          u=length(para)
          threshold=c(0,para[1], sum(para[1:2]))
          gamma=c(para[3],sum(para[3:4]), sum(para[3:5]))       
          sigma=c(1,para[6:7])          
	  beta=c(para[8:u])
          
            
          u1=length(threshold)
          u2=length(gamma)
          u3=length(sigma)
          u4=length(beta)         
                    
          loglikelihood=0  
            
     #regression parameter estiamtion

            for (tt in 29:tind){
           
                locmacro=which(z[,12]==(tt-1)) #macroeconomic variables
                macroeco=z[locmacro,]
            
                for (kk in 1:12){
                     locsurv=which(x[[kk]][,18]==(tt-1))
                     dkl=length(locsurv)
 
                     if (dkl==0){
                         next
                     }
                 
                    covariates=x[[kk]][locsurv,]
                    states=x[[kk]][1,2]
                    nextstates=x[[kk]][1,3] # moved transitions at tt
                 
                    linearmat=const_covariates(covariates,macroeco,beta)
                    
                    
                    if(nextstates < 4){
                                     ESkl=1/sigma[states]*(threshold[nextstates]-gamma[states] - linearmat[[2]])
                                     
                    }
                    
                    if  (nextstates > 1){
                                     ESkl1=1/sigma[states]*(threshold[nextstates-1]-gamma[states] - linearmat[[2]])
                                     
                    }
                    
                    if (nextstates == 4){
                                        phi_kl= 1-Gcdf(ESkl1)
                                        ithloglike=-log(1+exp(ESkl1))
                                        }else{
                                              if (nextstates == 1){
                                                                  phi_kl= Gcdf(ESkl)
                                                                  ithloglike=ESkl - log(1+exp(ESkl))
                                                                  }else{
                                                                        phi_kl=Gcdf(ESkl)-Gcdf(ESkl1)
                                                                        ithloglike=log(exp(ESkl)-exp(ESkl1)) -log(1+exp(ESkl))-log(1+exp(ESkl1))
                                                                        }
                                              }

                 
                    loglikelihood=loglikelihood+sum(ithloglike)
                    
          }
           
       }
        print(loglikelihood)
       -loglikelihood
}

const_covariates=function(x,z,newregress){
            dj=dim(x)[1]
            v11=x[,5]#QR 2
            v12=x[,6] #CR 3
            v13=x[,7] #CL 4 
            v14=x[,8] #IBIT 5
            v15=x[,9] #TL 6
            v16=x[,10] #SALE 7
            v17=x[,11] #WC 8
            v18=x[,12] #NI 9
            v19=x[,13] #RE 10
            v20=x[,14] #DTD 11 
            v21=x[,15] #AGE 12 
            v22=x[,16] #RTN 13
            v23=x[,17] #STD 14
            v24=(x[,4]==1)#ind1 15
            v25=(x[,4]==2)#ind2 16
            v26=(x[,4]==3) #ind3 17
            v27=(x[,4]==4) #ind4 18
                                 
            v28=rep(z[,1],dj) #indexrtn 19
            v29=rep(z[,2],dj) #30yrbond 20
            v30=rep(z[,3],dj) #10yrnote 21
            v31=rep(z[,4],dj) #5yrnote 22
            v32=rep(z[,5],dj) #1yrnote 23
            v33=rep(z[,6],dj) #90daybill 24
            v34=rep(z[,7],dj) #30daybill 25
            v35=rep(z[,8],dj) #CPI 26
            v36=rep(z[,9],dj) #ex inx 27
            v37=rep(z[,10],dj) #unemp 28
            v38=rep(z[,11],dj) #gdpgrt 29
            

            liquid = -0.13*v12+1.56*v17
            profit = v14
            debt = -1.47*v13+0.37*v15
            RTN = v22 
            DTD = v20
            STD = v23 
            IND2 = v25
            IND3 = v26
            B90 = v33
            CPI =v35

            mat=cbind(liquid,profit,debt,RTN,DTD,STD,IND2,IND3,B90,CPI)
            
            
             if (dj==1){
                        mat=as.matrix(mat)
             }
             
            linear=rowSums(t(matrix(rep(newregress,dj),ncol=dj))*mat)
            list(mat,linear)
}
    

starting=c(14.7363,7.5547,-2.0214,10.4773,8.0618,1.1889,0.7890,+ 		
           -1.1221,-7.1481,0.8916,-0.4433,-0.4627,27.2009,-0.2516,0.4539,-0.1165,-0.0880) 


result=optim(par=starting,fn=loglikelihood, method="L-BFGS-B", control=list(trace=6,maxit=500),low=c(rep(0.01,2),-Inf,rep(0.01,2),rep(0.01,2),rep(-Inf,10)), x=byratings_est,z=macroecono,tind=112)
result


rating_insample=function(x,z,para,k){
              
          u=length(para) 
          threshold=c(0,para[1], sum(para[1:2]))
          gamma=c(para[3],sum(para[3:4]), sum(para[3:5]))       
          sigma=c(1,para[6:7])          
          beta=c(para[8:u])

      
          locmacro=which(z[,12]==(k-1))#macroeconomic variables
          macroeco=z[locmacro,]

          transition = vector("list", 12) 
	  transition.moved = vector("list", 12) 
          shannon=0
          shannon.move=0
          mj1=0
          vj1=0
          mj1moved=0
          vj1moved=0
                       
          
          for (kk in 1:12){
                     locsurv=which(x[[kk]][,18]==(k-1))
                     dkl=length(locsurv)
 
                     if (dkl==0){
                         next
                     }
                 
                    covariates=x[[kk]][locsurv,]
                    
                    linearmat=const_covariates(covariates,macroeco,beta) 
                    states=x[[kk]][1,2]
                    
                    pred_prob=numeric()

                    for (jj in 1:4){
                                   if(jj < 4){
                                     ESkl=1/sigma[states]*(threshold[jj]-gamma[states] - linearmat[[2]])
                                   }
                    
                                   if  (jj > 1){
                                     ESkl1=1/sigma[states]*(threshold[jj-1]-gamma[states] - linearmat[[2]])
                                   }  
                               
                                   
                                   if (jj == 4){
                                        phi_kl= 1-Gcdf(ESkl1)
                                        }else{
                                              if (jj == 1){
                                                          phi_kl= Gcdf(ESkl)
                                                          }else{
                                                                phi_kl=Gcdf(ESkl)-Gcdf(ESkl1)
                                                                }
                                        }
                                    pred_prob=cbind(pred_prob,phi_kl)
                                    
                                   
                    }
                    
                    currentstates=rep(states,dkl)        
                    nextstates=x[[kk]][locsurv,3] # vector of next states at k
                    transition[[kk]]=cbind(pred_prob,currentstates,nextstates)
                    moved.trans=cbind(pred_prob[,states],row_sums(pred_prob[,-states]),!(currentstates==nextstates))
                    transition.moved[[kk]]=moved.trans
                                    
                    
                    #shannon's entrophy
                      for (w in 1:dkl){
                                  if (nextstates[w]==1){
                                          shannon=shannon+log(pred_prob[w,1])
                                  }
                                  if (nextstates[w]==2){
                                          shannon=shannon+log(pred_prob[w,2])
                                  }
                                  if (nextstates[w]==3){
                                          shannon=shannon+log(pred_prob[w,3])
                                  }
                                  if (nextstates[w]==4){
                                          shannon=shannon+log(pred_prob[w,4])
                                  } 
                           

                                  mij1=pred_prob[w,1]*log(pred_prob[w,1])+pred_prob[w,2]*log(pred_prob[w,2])+pred_prob[w,3]*log(pred_prob[w,3])+pred_prob[w,4]*log(pred_prob[w,4])
                                  vij1=pred_prob[w,1]*log(pred_prob[w,1])^2+pred_prob[w,2]*log(pred_prob[w,2])^2+pred_prob[w,3]*log(pred_prob[w,3])^2+pred_prob[w,4]*log(pred_prob[w,4])^2-mij1^2              
                       
                                  mj1=mj1+sum(mij1)
	                          vj1=vj1+sum(vij1) 
                       }
                       

                     #change or not entropy
                     #  for (w in 1:dkl){
                     #             if (moved.trans[w,3]){
                     #                     shannon.jth=log(moved.trans[w,2])
                     #             }else{
                     #                     shannon.jth=log(moved.trans[w,1])
                     #              }
                     # 
                     #            shannon.move = shannon.move+shannon.jth
                     #             
                     #             mij1moved=moved.trans[w,1]*log(moved.trans[w,1])+moved.trans[w,2]*log(moved.trans[w,2])
                     #             vij1moved=moved.trans[w,1]*log(moved.trans[w,1])^2+moved.trans[w,2]*log(moved.trans[w,2])^2-mij1moved^2   
                     #             mj1moved=mj1moved+sum(mij1moved)
                     #             vj1moved=vj1moved+sum(vij1moved)
                     #  }
                       
                                          
                    
        } 
    
        zscore=(shannon-mj1)/sqrt(vj1)
        #zscore.moved=(shannon.move-mj1moved)/sqrt(vj1moved)
         
         list(transition=transition, transition.moved=transition.moved,score=zscore)
        #list(transition=transition, transition.moved=transition.moved,score=zscore,zscore.moved=zscore.moved)
              
}

#mle=starting
mle=result$par

entropy=numeric()
for (tt in 29:112){
              results=rating_insample(x=byratings_pred,z=macroecono,para=mle,tt)
              entropy=c(entropy,results$zscore)
}