R code to implement a Bayesian approach for detecting spatial clusters

This is a R code I wrote to replicate a study by Xu and Bao (2004, "Detecting Spatial Clusters: a Bayesian Approach with application to identifying disability patterns in Mississippi and Alabama"). Thanks to everyone who helped me with this, in particular, Xu and Bao, the authors of the original paper, and among others, Crystal and Jing Zhang.
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!Code Begins here!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
#################################################
# Replicate Xu and Bao(2004)
# Disability rate among the elderly(65 or over)
# 149 Counties in Alabama and Mississippi
# 1990 Census data
#################################################
library(poLCA)
library(TeachingDemos)
setwd("C:/Hongwei/Methods/Bayesian Analysis/Spatial/NHGIS_1990 Census/")
################################
# Read in spatial weight matrix
################################
sp = read.table("ALMS_cntywt.csv",header=T,sep=",")
sp = sp[order(sp$countyid,sp$NID),]
spwt = as.matrix(cbind(sp$countyid,sp$NID))
################################
# Read in disability rate data
################################
dat = read.table("ALMS90_DisbNb.csv",header=T,sep=",")
names(dat)
cntyid = dat$countyid # County ID
pop = dat$pop65p # Total elderly population in each county
y = dat$pop65p_dis # Total disability elderly in each county
disrate = dat$disrate # Disability rate
cnty.k = dat$kdis-1 # Pre-defined types of county
n.cnty = length(cntyid) # Total number of counties in the study area

##################################
# Gibbs sampling for the posterior
##################################
##########################################################
# Step 1: Set initial values for p0, p1, p2, x (page 276)
# p0 < p1 < p2
# xi = 0 if yi==1; xi = 1 if yi==2; xi = 2 if yi==3
##########################################################
# Initial values of parameters
p0.0 = .1; p1.0 = .2; p2.0 = .3
x.0 = cnty.k
b = .5
# Initialize vectors to store posteriors of parameters
p0.post = p0.0; p1.post = p1.0; p2.post = p2.0;
x.post = x.0        
#######################
# Step 2. Gibbs sampling
#######################
M = 100000 # Number of iteration
for(i in 2:M){
    ##########################################
    # Conditional on y and x, sample p0,p1,p2
    ##########################################
    # Parameters for Beta distribution
    #b0.L = sum(y[x.0==0]+1); b0.U = sum(pop[x.0==0]-y[x.0==0]+1)
    #b1.L = sum(y[x.0==1]+1); b1.U = sum(pop[x.0==1]-y[x.0==1]+1)
    #b2.L = sum(y[x.0==2]+1); b2.U = sum(pop[x.0==2]-y[x.0==2]+1)
    #print(x.0)
    b0.L = sum(y[x.0==0])+1; b0.U = sum(pop[x.0==0]-y[x.0==0])+1
    b1.L = sum(y[x.0==1])+1; b1.U = sum(pop[x.0==1]-y[x.0==1])+1
    b2.L = sum(y[x.0==2])+1; b2.U = sum(pop[x.0==2]-y[x.0==2])+1
    # Draw p0,p1,p2
    p0.0 = rbeta(1,b0.L,b0.U)
    p1.0 = rbeta(1,b1.L,b1.U)
    p2.0 = rbeta(1,b2.L,b2.U)
    # Store sampled p0,p1,p2
    p0.post = c(p0.post,p0.0)
    p1.post = c(p1.post,p1.0)
    p2.post = c(p2.post,p2.0)
    #########################################################################
    # Conditional on p0,p1,p2 and y, sample x
    # There are several functions to do a random draw from multinomial dist
    # rmultinom from library(stats)
    # e.g. rmulti(c(.1,.3,.6)) from library(poLCA)
    #########################################################################
    # Update x_i for each county sequentially
    # This is another layer of Gibbs sampling
    for(k in 1:n.cnty){
        # list of neighbors of ith county; county ID = 1,2,…,149
        nb = spwt[spwt[,1]==k,2]
        temp = x.0[nb] # x.0 for county i’s neighbors
        # Sum up number of different types of neighbors for county i
        u0 = sum(as.numeric(temp==0)) # A indicator vector of x.0==0
        u1 = sum(as.numeric(temp==1)) # A indicator vector of x.0==1
        u2 = sum(as.numeric(temp==2)) # A indicator vector of x.0==2
        #########################
        # q0,q1,q2 on Log scale
        #########################
        q0.log = (2*b*u0)+y[k]*log(p0.0)+(pop[k]-y[k])*log(1-p0.0)
        q1.log = (2*b*u1)+y[k]*log(p1.0)+(pop[k]-y[k])*log(1-p1.0)
        q2.log = (2*b*u2)+y[k]*log(p2.0)+(pop[k]-y[k])*log(1-p2.0)
        #print(c(q0.log,q1.log,q2.log))
        q0.q=1/(1+exp(q1.log-q0.log)+exp(q2.log-q0.log))
        q1.q=1/(1+exp(q0.log-q1.log)+exp(q2.log-q1.log))
        q2.q=1/(1+exp(q0.log-q2.log)+exp(q1.log-q2.log))
        x.0[k] = rmulti(c(q0.q,q1.q,q2.q))-1
    }
    x.post = cbind(x.post,x.0)
}
burned = M/2 # Number of burned-in iterations
##################################################
# The 95% Highest Posterior Density Interval (HPD)
##################################################
emp.hpd(p0.post,conf=.95)
emp.hpd(p1.post,conf=.95)
emp.hpd(p2.post,conf=.95)
emp.hpd(p0.post[(burned+1):M],conf=.95)
emp.hpd(p1.post[(burned+1):M],conf=.95)
emp.hpd(p2.post[(burned+1):M],conf=.95)
summary(p0.post)
summary(p1.post)
summary(p2.post)
apply(x.post[,(burned+1):M],1,mean)
# Trace plots
par(mfrow=c(2,2))
plot(seq(1,M,1),p0.post,type="l")
plot(seq(1,M,1),p1.post,type="l")
plot(seq(1,M,1),p2.post,type="l")

# Export X.post
write.table(as.matrix(cbind(x.post[,M],apply(x.post[,(burned+1):M],1,mean))),file="x_post.csv",sep=",")

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!Code Ends here!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

About Hongwei Xu

I'm a social demographer, a single-child, a husband, and a father.
This entry was posted in Research. Bookmark the permalink.

发表评论

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / 更改 )

Twitter picture

You are commenting using your Twitter account. Log Out / 更改 )

Facebook photo

You are commenting using your Facebook account. Log Out / 更改 )

Google+ photo

You are commenting using your Google+ account. Log Out / 更改 )

Connecting to %s