Simulation and analysis with R and LDSC

Last updated on Feb 25, 2025 4 min read

My dissertation research always needs to do comparison with the GCTA-GREML and LDSC.Here is an introduction of R analysis with LDSC and GCTA

GCTA

GCTA is a wonderful software designed for the dealing with the GWAS data. On the other hand, it does not always fit in the common statistical simulation pipeline, particualy for the software R. This post is used to document how to do statistical simulation with R and GCTA based on a give genotype file.

What we need:

phenotype file Y1, Y2
Genotype file geno.bed

First, we generate the GRM file based on the genotype file. This function only need be estimated once

MakeGRM = function(ref.bed, keep.snp, keep.ind, output.dir = "./temp/"){

  grm = paste0(output.dir, "geno")

  fwrite(data.frame(keep.snp), file = paste0(output.dir,"snplist.txt"), quote = F, sep = "\t", col.names=T, na = "NA")

  system(paste0("plink --bfile ", ref.bed, " --extract ", paste0(output.dir, "snplist.txt"), " --make-grm-bin --out ", grm ) )
}

In the second step, we need to write out the phenotype file based on the FID and IID

WritePheno = function( Y1, Y2, FID, IID, pheno.file  ){

  pheno = data.frame(FID = FID, IID = IID,
                     Y1 = Y1, Y2 = Y2)
  
  fwrite(pheno, file = pheno.file,
         quote = F, sep = "\t", col.names=T, na = "NA")
}

Then we implement the GCTA,

SimuGCTA = function(geno.file, pheno.file, output.dir){

  output.file = paste0(output.dir, "GCTA.result")

  system(paste0("./gcta64 --reml-bivar --grm ",
                geno.file," --pheno ", pheno.file,
                " --thread-num 20 --out ", output.file ))
}

When the simulated errors have no covariance, we could specify –reml-bivar-nocove for fair comparison. To test for the non-zero genetic covariance, ** –reml-bivar-lrt-rg 0 ** could be used.

SimuGCTA.test = function(geno.file, pheno.file, output.dir){

  output.file = paste0(output.dir, "GCTA.result")
  
  system(paste0("./gcta64 --reml-bivar-nocove --reml-maxit 100 --grm ", 
                geno.file," --pheno ",
                pheno.file," --reml-bivar-lrt-rg 0 --out ",output.file))
}

Read GCTA output file

read.GCTA = function(output.dir, LRT = T){
  
   filename = paste0(output.dir, "GCTA.result.hsq")
    if(file.exists(filename)){
      hsq = data.table::fread(filename, fill = T)
      Est = as.vector(t( hsq[1:3, 2:3])) %>% 
    setNames(c("Heri.1","Heri.1.Se","Heri.2","Heri.2.Se", "GeCv","GeCv.Se" ))
      if(LRT){
      pval = as.numeric(sapply(strsplit(as.character(hsq[hsq$Source == "Pval",2]), "\\("), "[[", 1))
      }
    }
   return(list(Est = Est, pval = pval))
}

LDSC

First, we generate the ld score based on the genotype file. This function only need be estimated once

module load ldsc source activate ldsc-1.0.1 ldsc.py –bfile geno –ld-wind-cm 1 –out chr1_select ldsc.py –bfile ../../Cric_HF_RealData/rawdata/cric.filtered.maf0.01 –extract ./temp/snplist.txt –ld-wind-kb 1000 –out ./temp/geno

Implement the LDSC regression

SimuLDSC = function(Z1, N1, Z2, N2, Nc=0, weight=T, CSTR = T,
                           info = NA, LDSCORE, corenum = 1, output.dir){
  
  if(length(Z1) != length(Z2) ){stop("Z scores should have the same length.")}

  if(!is.na(info)){
    print("update info")
    Z1.data = data.frame(SNP = info$SNP , Z = Z1, N =  N1,
                         A1 =  "A", A2 = "G", stringsAsFactors = F)
    colnames(Z1.data) = c("SNP",  "Z", "N", "A1", "A2")
    fwrite(Z1.data, file = paste0(output.dir, "Z1.sumstats") , quote = F, sep = "\t", col.names=T, na = "NA" )
    Z2.data = data.frame(SNP = info$SNP , Z = Z2, N =  N2,
                         A1 = "A",  A2 = "G", stringsAsFactors = F)
    colnames(Z2.data) = c("SNP", "Z", "N", "A1", "A2")
    fwrite(Z2.data, file = paste0(output.dir, "Z2.sumstats"), quote = F, sep = "\t", col.names=T, na = "NA" )
  }
  if(weight == T){
    name = paste0(output.dir, "LDSC")
    system(paste0("ldsc.py --rg ",paste0(J, "Z1.sumstats,"), paste0(J, "Z2.sumstats"),
                  " --ref-ld ", LDSCORE," --w-ld ",LDSCORE," --intercept-h2 1,1 --out ",name))
  }else{

    stop()

  } 
  }

read the LDSC output

read.LDSC = function(output.dir){
  H = fread(paste0(output.dir, "LDSC.log"), fill = T)
  result.all = rep(NA, 8)
  result.gecr = unlist(H[(stringr::str_detect(unlist(H), "Genetic Correlation:"))]) %>% str_split(, pattern = " ") %>% unlist()
  result.heri = unlist(H[(stringr::str_detect(unlist(H), "Total Observed scale h2:"))]) %>%  str_split(, pattern = " ") %>% unlist()
  result.gecov = unlist(H[(stringr::str_detect(unlist(H), "Total Observed scale gencov:"))]) %>% str_split(, pattern = " ") %>% unlist()
  result.P = unlist(H[(stringr::str_detect(unlist(H), "P:"))]) %>%
    str_split(, pattern = " ") %>% unlist()

  if(!is.null(result.gecr)){
    result =  as.numeric(unlist(regmatches(result.gecr,gregexpr("-?\\ *[0-9]+\\.?[0-9]*(?:[Ee]\\ *-?\\ *[0-9]+)?",result.gecr, perl=TRUE))))
    result.all[1] = result[1]
    result.all[2] = result[2]
  }

  if(!is.null(result.heri)){
    result =  as.numeric(unlist(regmatches(result.heri,
                                           gregexpr("-?\\ *[0-9]+\\.?[0-9]*(?:[Ee]\\ *-?\\ *[0-9]+)?",
                                                    result.heri, perl=TRUE))))
    result.all[3] = result[2]
    result.all[4] = result[3]
    result.all[5] = result[5]
    result.all[6] = result[6]
  }

  if(!is.null(result.gecov)){

    result =  as.numeric(unlist(regmatches(result.gecov,
                                           gregexpr("-?\\ *[0-9]+\\.?[0-9]*(?:[Ee]\\ *-?\\ *[0-9]+)?",
                                                    result.gecov, perl=TRUE))))
    result.all[7] = result[1]
    result.all[8] = result[2]
    zscore = result[1]/result[2]
    pvalue = 2* (1 - pnorm(abs(zscore)))
  }else{
    pvalue = NA
  }
  return( c(GeCv.pv = pvalue, GeCr.pv = as.numeric(result.P[2])) )
}

Simulation and analysis with R and LDSC

GCTA

LDSC

Jianqiao Wang

Assistant Professor