Add files via upload

2025-11-30 09:45:43 -08:00 · 2023-11-20 11:32:21 -08:00 · 2023-11-20 11:32:21 -08:00 · 3480239892
commit 3480239892
parent f0781880cf
6 changed files with 320 additions and 0 deletions
--- a/GWAS_PRS/scripts/step01_plink_QC.sh
+++ b/GWAS_PRS/scripts/step01_plink_QC.sh
@ -1 +1,59 @@
+#files
+cd ~/Desktop/handson_plink/GWAS/
+
+plink_dir=~/Desktop/handson_plink
+#cd genotypes/
+#unrelated_hg38_auto_maf_0.01_500k_snp.pvar
+#unrelated_hg38_auto_maf_0.01_500k_snp.psam
+#unrelated_hg38_auto_maf_0.01_500k_snp.pgen
+#cd phenotypes/
+#phenofile.csv
+#cd covariates/
+#covariate.csv
+
+
+
+########################################################################
+#QC and filtering
+########################################################################
+mkdir QC/
+
+#frequency
+$plink_dir/plink2 --pfile genotypes/unrelated_hg38_auto_maf_0.01_500k_snp \
+--freq \
+--nonfounders \
+--out QC/unrelated_hg38_auto_maf_0.01_500k_snp
+
+
+#MAF filtering  (--maf 0.05)
+$plink_dir/plink2 --pfile genotypes/unrelated_hg38_auto_maf_0.01_500k_snp --min-af 0.05 \
+--make-pgen \
+--out genotypes/unrelated_hg38_auto_maf_0.05_500k_snp
+
+
+#call rate and missingness
+$plink_dir/plink2 --pfile genotypes/unrelated_hg38_auto_maf_0.05_500k_snp --geno 0.1 \
+--mind 0.1 \
+--make-pgen \
+--out genotypes/unrelated_hg38_auto_maf_0.05_500k_snp_filtered
+
+
+#LD pruning - cretae list of independent snps to include in PCs analysis 
+$plink_dir/plink2 --pfile genotypes/unrelated_hg38_auto_maf_0.05_500k_snp_filtered --indep-pairwise 50 5 0.2  \
+--out QC/unrelated_hg38_auto_maf_0.05_500k_snp_filtered
+
+#recode genotypes  with indep SNPs only 
+$plink_dir/plink2 --pfile genotypes/unrelated_hg38_auto_maf_0.05_500k_snp_filtered \
+--extract QC/unrelated_hg38_auto_maf_0.05_500k_snp_filtered.prune.in \
+--make-pgen \
+--out genotypes/unrelated_hg38_auto_maf_0.05_500k_filtered_indep_snp
+
+#pca with indep snps only
+$plink_dir/plink2 --pfile genotypes/unrelated_hg38_auto_maf_0.05_500k_filtered_indep_snp  --nonfounders    --pca           \
+  --read-freq  QC/unrelated_hg38_auto_maf_0.01_500k_snp.afreq \
+  --out QC/unrelated_hg38_auto_maf_0.05_500k_filtered_indep_snp
+
+########################################################################
+
+#run step02_covariate_PCs.R

--- a/GWAS_PRS/scripts/step02_covariate_PCs.R
+++ b/GWAS_PRS/scripts/step02_covariate_PCs.R
@ -0,0 +1,34 @@
+#in Rstudio
+
+########################################################################
+#plot PCs and creating a second covariate file including PCS
+########################################################################
+
+setwd("~/Desktop/handson_plink/GWAS/")
+cov <- read.csv("covariates/covariates.csv", header=T,comment.char="", check.names=F)
+pc <- read.table("QC/unrelated_hg38_auto_maf_0.05_500k_filtered_indep_snp.eigenvec",header=T,  comment.char="", check.names=F)
+colnames(pc)[1] <- "IID"
+EUR <- read.csv("../PRS/EUR_input_for_PRS/EUR_indivs.csv", header=T,comment.char="", check.names=F)
+
+updated_pc <- merge(cov, pc, by="IID")
+updated_pc_EUR <- updated_pc[which(updated_pc$IID %in% EUR$IID),]
+
+if (!dir.exists("R_plots")){
+dir.create("R_plots")
+} else {
+    print("R_plots already exists!")
+}
+
+
+pdf("R_plots/Pop_stratification.pdf")
+plot(updated_pc$PC1, updated_pc$PC2, xlab="PC1", ylab="PC2")
+points(updated_pc_EUR$PC1, updated_pc_EUR$PC2, pch=19, col="forestgreen")
+dev.off()
+
+write.csv(updated_pc, "covariates/covariates_with_10PCs.csv",  quote=F, row.names=F)
+
+########################################################################
+
+#open and run step03_GWAS.sh
+
+########################################################################
--- a/GWAS_PRS/scripts/step03_plink_GWAS.sh
+++ b/GWAS_PRS/scripts/step03_plink_GWAS.sh
@ -0,0 +1,39 @@
+########################################################################
+#GWAS 
+########################################################################
+plink_dir=~/Desktop/handson_plink
+mkdir plink_results/
+#linear model --glm or --linear
+#no covariates
+$plink_dir/plink2  --pfile genotypes/unrelated_hg38_auto_maf_0.05_500k_snp_filtered  \
+--glm omit-ref  allow-no-covars  \
+--pheno  phenotypes/phenofile.csv  --pheno-name BMI  \
+--out plink_results/no.covar
+
+#sex and age as covariate
+$plink_dir/plink2  --pfile genotypes/unrelated_hg38_auto_maf_0.05_500k_snp_filtered  \
+--covar    covariates/covariates.csv    --covar-variance-standardize   \
+--glm omit-ref  hide-covar  \
+--pheno  phenotypes/phenofile.csv  --pheno-name BMI  \
+--out plink_results/sex.age.adjusted
+
+
+#PCs, sex and age as covariate
+$plink_dir/plink2 --pfile genotypes/unrelated_hg38_auto_maf_0.05_500k_snp_filtered  \
+--covar    covariates/covariates_with_10PCs.csv    --covar-variance-standardize   \
+--glm omit-ref  hide-covar  \
+--pheno  phenotypes/phenofile.csv  --pheno-name BMI  \
+--out plink_results/sex.age.PCs.adjusted
+
+
+
+
+#logistic model --glm or --logistic
+$plink_dir/plink2 --1 --pfile genotypes/unrelated_hg38_auto_maf_0.05_500k_snp_filtered  \
+--covar    covariates/covariates_with_10PCs.csv    --covar-variance-standardize   \
+--glm omit-ref cols=+beta,+orbeta,+ci hide-covar  \
+--pheno  phenotypes/phenofile.csv  --pheno-name obesity --ci 0.95  \
+--out plink_results/sex.age.PCs.adj
+
+########################################################################
+#open and run in Rstudio step04_GWAS_visualization.R
--- a/GWAS_PRS/scripts/step04_GWAS_visualization.R
+++ b/GWAS_PRS/scripts/step04_GWAS_visualization.R
@ -0,0 +1,58 @@
+
+##################
+#Visualization in R
+##################
+#manhattan plot
+install.packages("qqman")
+library(qqman)
+
+setwd("~/Desktop/handson_plink/GWAS/")
+
+if (!dir.exists("R_plots")){
+dir.create("R_plots")
+} else {
+    print("R_plots already exists!")
+}
+
+gwas_obesity <-  read.table("plink_results/sex.age.PCs.adj.obesity.glm.logistic.hybrid", header=T, comment.char="", check.names=F)
+
+pdf("R_plots/obesity.pdf")
+manhattan(gwas_obesity, chr="#CHROM", bp="POS", snp="ID", p="P" , annotatePval = 0.0001, main="Manhattan plot - sex, age, PCs adjusted obesity")
+
+#qqplot
+qq(gwas_obesity$P, main="quantile-quantile plot - sex, age, PCs adjusted obesity")
+dev.off()
+
+
+gwas_BMI_no_cov <-  read.table("plink_results/no.covar.BMI.glm.linear", header=T,comment.char="", check.names=F)
+
+gwas_BMI_sex_age_adj <-  read.table("plink_results/sex.age.adjusted.BMI.glm.linear", header=T,comment.char="", check.names=F)
+
+gwas_BMI_sex_age_PCs_adj <-  read.table("plink_results/sex.age.PCs.adjusted.BMI.glm.linear", header=T,comment.char="", check.names=F)
+
+
+pdf("R_plots/BMI.pdf")
+manhattan(gwas_BMI_no_cov, chr="#CHROM", bp="POS", snp="ID", p="P" , annotatePval = 0.0001, main="Manhattan plot - no covar adjusted BMI")
+
+manhattan(gwas_BMI_sex_age_adj, chr="#CHROM", bp="POS", snp="ID", p="P" , annotatePval = 0.0001, main="Manhattan plot - sex, age adjusted BMI")
+
+manhattan(gwas_BMI_sex_age_PCs_adj, chr="#CHROM", bp="POS", snp="ID", p="P" , annotatePval = 0.0001, main="Manhattan plot - sex, age, PCs adjusted BMI")
+
+#qqplot
+qq(gwas_BMI_no_cov$P, main="quantile-quantile plot - no covar adjusted BMI")
+qq(gwas_BMI_sex_age_adj$P, main="quantile-quantile plot - sex, age adjusted BMI")
+qq(gwas_BMI_sex_age_PCs_adj$P, main="quantile-quantile plot - sex, age, PCs adjusted BMI")
+
+dev.off()
+#include
+#pvals <- read.table("AMH_all_FVG_rs_pvalue", header=T)
+
+#observed <- sort(pvals$pgc)
+#lobs <- -(log10(observed))
+
+#expected <- c(1:length(observed)) 
+#lexp <- -(log10(expected / (length(expected)+1)))
+
+
+########################################################################
+#open and run step05_Plink_PRS.sh
--- a/GWAS_PRS/scripts/step05_plink_PRS.sh
+++ b/GWAS_PRS/scripts/step05_plink_PRS.sh
@ -0,0 +1,62 @@
+
+########################################################################
+#PRS
+########################################################################
+PRSdir=~/Desktop/handson_plink/PRS
+GWASdir=~/Desktop/handson_plink/GWAS
+
+cd ~/Desktop/handson_plink/PRS
+mkdir PRS_results/
+plink_dir=~/Desktop/handson_plink
+#input files are in EUR_input_for_PRS/
+#EUR summary stats from large GWAS meta analysis 
+#Meta-analysis_Locke_et_al+UKBiobank_2018_UPDATED.txt
+
+#EUR list of indivs + covariate and pheno files
+#EUR_input_for_PRS/EUR_BMI_phenofile.csv
+#EUR_input_for_PRS/EUR_covariates.csv
+#EUR_input_for_PRS/EUR_indivs.csv
+
+
+#create geno files for european only as input for the PRS - we used maf 0.01 since we did not filter for call rate and missingness
+$plink_dir/plink2 \
+--pfile $GWASdir/genotypes/unrelated_hg38_auto_maf_0.01_500k_snp  \
+--keep  EUR_input_for_PRS/EUR_indivs.csv  \
+--make-bed \
+--out EUR_input_for_PRS/EUR_unrelated_hg38_auto_maf_0.01_500k_snp_filtered
+
+#clumping the summary stats from large GWAS meta analysis to get indep SNPs based on pvalue overlapping our genotypes
+$plink_dir/plink2  \
+    --bfile EUR_input_for_PRS/EUR_unrelated_hg38_auto_maf_0.01_500k_snp_filtered \
+    --clump-p1 1 \
+    --clump-r2 0.1 \
+    --clump-kb 250 \
+    --clump EUR_input_for_PRS/Meta-analysis_Locke_et_al+UKBiobank_2018_UPDATED.txt \
+    --clump-snp-field SNP \
+    --clump-field P \
+    --out EUR_input_for_PRS/EUR_clumped
+    
+#valid independent snps    
+awk 'NR!=1{print $3}' EUR_input_for_PRS/EUR_clumped.clumps >  EUR_input_for_PRS/EUR.valid.snp
+    
+    
+#SNP and P from large GWAS meta analysis
+awk '{print $3,$9}' EUR_input_for_PRS/Meta-analysis_Locke_et_al+UKBiobank_2018_UPDATED.txt > EUR_input_for_PRS/Meta-analysis_Locke_et_al+UKBiobank_2018_UPDATED.SNP.pvalue
+
+#create threshold of SNPs to create scores for different subsets of SNPs
+echo "0.001 0 0.001" > EUR_input_for_PRS/range_list 
+echo "0.05 0 0.05" >> EUR_input_for_PRS/range_list
+echo "0.1 0 0.1" >> EUR_input_for_PRS/range_list
+echo "0.2 0 0.2" >> EUR_input_for_PRS/range_list
+echo "0.3 0 0.3" >> EUR_input_for_PRS/range_list
+echo "0.4 0 0.4" >> EUR_input_for_PRS/range_list
+echo "0.5 0 0.5" >> EUR_input_for_PRS/range_list
+
+#estimating the score for each range of P using the valid snps (for the score needs snp, allele and BETA)
+$plink_dir/plink2  \
+    --bfile EUR_input_for_PRS/EUR_unrelated_hg38_auto_maf_0.01_500k_snp_filtered \
+    --score EUR_input_for_PRS/Meta-analysis_Locke_et_al+UKBiobank_2018_UPDATED.txt 3 4 7 header \
+    --q-score-range EUR_input_for_PRS/range_list EUR_input_for_PRS/Meta-analysis_Locke_et_al+UKBiobank_2018_UPDATED.SNP.pvalue \
+    --extract EUR_input_for_PRS/EUR.valid.snp \
+    --out EUR_input_for_PRS/EUR
+########################################################################
--- a/GWAS_PRS/scripts/step06_best_PRS_visualization.R
+++ b/GWAS_PRS/scripts/step06_best_PRS_visualization.R
@ -0,0 +1,69 @@
+####################################################
+#Best PRS model and Visualization in R
+####################################################
+
+setwd("~/Desktop/handson_plink/PRS/")
+
+p.threshold <- c(0.001,0.05,0.1,0.2,0.3,0.4,0.5)
+# Read in the phenotype file 
+phenotype <- read.csv("EUR_input_for_PRS/EUR_BMI_phenofile.csv", header=T, check.names=F, comment.char="")
+# Read in the PCs
+#pcs <- read.table("EUR.eigenvec", header=F)
+# The default output from plink does not include a header
+# To make things simple, we will add the appropriate headers
+# (1:6 because there are 6 PCs)
+#colnames(pcs) <- c("FID", "IID", paste0("PC",1:6)) 
+# Read in the covariates (here, it is sex)
+covariate <- read.csv("EUR_input_for_PRS/EUR_covariates.csv", header=T,check.names=F, comment.char="")
+# Now merge the files
+pheno <- merge(phenotype, covariate, by=c("IID"))
+
+#pheno <- merge(merge(phenotype, covariate, by=c("FID", "IID")), pcs, by=c("FID","IID"))
+# We can then calculate the null model (model with PRS) using a linear regression 
+# (as height is quantitative)
+null.model <- lm(BMI~., data=pheno[,-1])
+# And the R2 of the null model is 
+null.r2 <- summary(null.model)$r.squared
+prs.result <- NULL
+for(i in p.threshold){
+    # Go through each p-value threshold
+    prs <- read.table(paste0("EUR_input_for_PRS/EUR.",i,".sscore"), header=T, check.names=F, comment.char="")
+    # Merge the prs with the phenotype matrix
+    # We only want the FID, IID and PRS from the PRS file, therefore we only select the 
+    # relevant columns
+    pheno.prs <- merge(pheno, prs[,c("IID", "SCORE1_AVG")], by= "IID")
+    # Now perform a linear regression on Height with PRS and the covariates
+    # ignoring the FID and IID from our model
+    model <- lm(BMI~., data=pheno.prs[,-1])
+    # model R2 is obtained as 
+    model.r2 <- summary(model)$r.squared
+    # R2 of PRS is simply calculated as the model R2 minus the null R2
+    prs.r2 <- model.r2-null.r2
+    # We can also obtain the coeffcient and p-value of association of PRS as follow
+    prs.coef <- summary(model)$coeff["SCORE1_AVG",]
+    prs.beta <- as.numeric(prs.coef[1])
+    prs.se <- as.numeric(prs.coef[2])
+    prs.p <- as.numeric(prs.coef[4])
+    # We can then store the results
+    prs.result <- rbind(prs.result, data.frame(Threshold=i, R2=prs.r2, P=prs.p, BETA=prs.beta,SE=prs.se))
+}
+# Best result is:
+prs.result[which.max(prs.result$R2),]
+write.csv(prs.result, file="PRS_results/EUR_BMI_sex_age_adjusted_PRS_fit.csv", quote=F, row.names=F)
+
+best_p <- prs.result[which.max(prs.result$R2),1]
+best_prs <- read.table(paste0("EUR_input_for_PRS/EUR.",best_p,".sscore"), header=T,check.names=F, comment.char="")
+# Rename the sex
+covariate$sex <- as.factor(covariate$sex)
+levels(covariate$sex) <- c("Male", "Female")
+# Merge the files
+dat <- merge(merge(best_prs, phenotype, by="IID"), covariate, by="IID")
+
+# Start plotting
+pdf("PRS_results/EUR_BMI_sex_age_adjusted_PRS_fit.pdf")
+plot(x=dat$SCORE1_AVG, y=dat$BMI, col="white",
+    xlab="Polygenic Score", ylab="BMI")
+with(subset(dat, sex=="Male"), points(x=SCORE1_AVG, y=BMI, col="red", pch=19))
+with(subset(dat, sex=="Female"), points(x=SCORE1_AVG, y=BMI, col="blue", pch=19))
+dev.off()
+####################################################