TangoS.index<-function(dat, geo, lam, Nrep){ # # ----------------------------------------------------------------- # An example of execution # # Nrep<-99 # lam<-c(0.1, (1:20)*5) # geo <-scan("d:/demo/gallblad.geo",list(id=0,y=0,x=0)) # dat <-scan("d:/demo/gallblad.dat",list(id=0,d=0,e=0)) # out<-TangoS.index(dat, geo, lam, Nrep) # # --------------------------------------------------------------------- # # ARGUEMENTS # # Nrep : The number of Monte Carlo replications, e.g., 999, 9999 # geo$id : a vector of region id # geo$x : a vector of x-coordinate of region centroid, e.g., longitude # geo$y : a vector of y-coordinate of region centroid, e.g., latitude # dat$id : (the same id as geo$id) # dat$d : a vector of observed number of cases # dat$e : a vector of expected number of cases # lam : a vector of lambda values used for the profile p-values # # VALUES # p.value : Monte Calro simulated (adjusted) p-vallue # lamstar : optimal value of lambda # id : (the same id as geo$id) # stdcont : a vector of standardized Ui # smr : a vector of standardized mortality (incidence) ratio # # ---------------------------------------------------------------------------- # Explanations of major variables used in the program # # NOTE !! # dc : distance matrix calculated from geo$x and geo$y # ( Depending on the user's situation, this should be modified !) # # nr : the number of regions # nn : the number of cases # kkk: the number of lambda values # pxx: sorted p-values due to Monte Carlo replications under # the null # ss : per cent contribution to C # topid : region id sorted in the descending order of ss # (e.g., top[1] denotes the most likely location of cluster, if any.) # topstd : standardized per cent contribution sorted in the # descending order of ss # pres : The resultant adjusted p-value based on Monte Carlo hypothesis testing # # ---------------------------------------------------------------------------- nloop<-Nrep par(mfrow=c(1,1)) start<-date() nr<-length(geo$id) qdat<-rep(0,nr) p<-rep(0,nr) w<-matrix(0,nr,nr) nn<-sum(dat$d) kvec<-lam kkk<- length(kvec) dc<-matrix(0,nr,nr) regid<-geo$id frq<-matrix(0,nloop,nr) pval<-matrix(0,nloop,kkk) pxx<-rep(0,nloop) pcc<-pxx for (i in 1:nr){ for (j in i:nr){ dij<-sqrt(((geo$x[i]-geo$x[j])*90.15)**2 + ((geo$y[i]-geo$y[j])*110.9)**2) # for Tokyo area # dij<-sqrt(((geo$x[i]-geo$x[j]))**2 + ((geo$y[i]-geo$y[j]))**2) dc[j,i]<- dij dc[i,j]<- dij } } # # Calculation of (1) null distribution of test statistic Pmin # qdat<- dat$d poo<-dat$e p<-poo/sum(poo) pp<-matrix(p) w<-diag(p)-pp%*%t(pp) py<-p vv<-py/sum(py) cvv<-cumsum(vv) for (ijk in 1:nloop) { r1<-hist(runif(nn),breaks=c(0,cvv),plot=F) frq[ijk,]<- frq[ijk,] + r1$count } frq<-frq/nn ori<- qdat qdat<-qdat/nn # # Figure 1: Locations of nr regions and population size # symbols(geo$x,geo$y,circles=sqrt(p),inch=0.50) text(geo$x,geo$y,regid,adj= -0.5, col=8) # for (k in 1:kkk){ rad<- kvec[k] ac <- exp( -4 * (dc/rad)^2 ) hh <- ac%*%w hh2<- hh%*%hh av <- sum(diag( hh )) av2<- sum(diag( hh2 )) av3<- sum(diag( hh2%*%hh )) skew1<- 2*sqrt(2)*av3/ (av2)^1.5 df1<- 8/skew1/skew1 eg1<-av vg1<-2*av2 for (ijk in 1:nloop){ q<-frq[ijk,] gt<-(q-p)%*%ac%*%(q-p)*nn stat1<-(gt-eg1)/sqrt(vg1) aprox<-df1+sqrt(2*df1)*stat1 pval[ijk,k]<- 1-pchisq(aprox, df1) } } for (ijk in 1:nloop){ pcc[ijk]<-min( pval[ijk,] ) } pxx<- sort( pcc ) # # Calculation of (2) test statistic Pmin and its adjusted p-value # of observed data # ptan<-rep(0,kkk) gtt<-rep(0,kkk) jjj<-1:kkk ep<-p*nn po<-ppois(ori,ep) pa<-(po-0.90)/0.10 p1<-ifelse(po > 0.9 & ori>0, pa^3, 0) # # Figure 2: Observed number of cases and the significance of O/E ratio # symbols(geo$x,geo$y,circle=sqrt(p1),inch=0.2) text(geo$x, geo$y, ori, col=8) #par(cex=0.5) #par(mar=c(4,3,3,5)) #par(mfrow=c(1,2)) #par(mgp=c(1.5,0.5,0)) # for (k in 1:kkk) { ti<-date() rad<- kvec[k] ac <- exp( -4 * (dc/rad)^2 ) hh <- ac%*%w hh2<- hh%*%hh av <- sum(diag( hh )) av2<- sum(diag( hh2 )) av3<- sum(diag( hh2%*%hh )) skew1<- 2*sqrt(2)*av3/ (av2)^1.5 df1<- 8/skew1/skew1 eg1<-av vg1<-2*av2 gt<-(qdat-p)%*%ac%*%(qdat-p)*nn gtt[k]<-gt stat1<-(gt-eg1)/sqrt(vg1) aprox<-df1+sqrt(2*df1)*stat1 ptan[k]<- 1-pchisq(aprox, df1) } # # Figure 3: the profile p-value for the values of lambda # p22<-min( ptan ) at<-c(pxx,p22) pres<-length(at[at<=p22])/(length(pxx)+1) for (i in 1:kkk){ if ( abs(p22 - ptan[i] )< 0.0000001 ){pind<-i} } #pind<-jjj[ptan==min(ptan)] plot(kvec,ptan,pch=1,type="b",xlab="cluster size ( lambda values ) ", ylab=" Unadjusted p-values") abline(v=kvec[pind]) text((max(kvec)+2*min(kvec))/3,(3*max(ptan)+min(ptan))/4, "Adjusted p-value = ",col=8) text((1.5*max(kvec)+min(kvec))/2.5,(3*max(ptan)+min(ptan))/4, pres,col=8) # # Figure 4: Percent contribution Ui # rad<- kvec[pind] ac <- exp( -4 * (dc/rad)^2 ) ss<-ac%*%(qdat-p)*nn ss<-(qdat-p)*ss ss2<-ss ss<-ss/sum(ss)*100 plot(regid,ss,pch=1,xlab=" region ID ", ylab=" Percent contribution ") text(regid+2,ss+1,regid,col=8) topid<-regid[sort.list(-ss)] # # Standardized per cent contribution z<- (ss-mean(ss)) %*% (1/sqrt(var(ss))) topstd<-z[sort.list(-ss)] smr<-ori/ep topsmr<-smr[sort.list(-ss)] end<- date() list(p.value=pres, lamstar= rad, id=topid, stdcont=topstd, smr=topsmr) }