diff --git a/GWAS_PRS/scripts/step01_plink_QC.sh b/GWAS_PRS/scripts/step01_plink_QC.sh index 8b13789..920e9a8 100644 --- a/GWAS_PRS/scripts/step01_plink_QC.sh +++ b/GWAS_PRS/scripts/step01_plink_QC.sh @@ -1 +1,59 @@ +#files +cd ~/Desktop/handson_plink/GWAS/ + +plink_dir=~/Desktop/handson_plink +#cd genotypes/ +#unrelated_hg38_auto_maf_0.01_500k_snp.pvar +#unrelated_hg38_auto_maf_0.01_500k_snp.psam +#unrelated_hg38_auto_maf_0.01_500k_snp.pgen +#cd phenotypes/ +#phenofile.csv +#cd covariates/ +#covariate.csv + + + +######################################################################## +#QC and filtering +######################################################################## +mkdir QC/ + +#frequency +$plink_dir/plink2 --pfile genotypes/unrelated_hg38_auto_maf_0.01_500k_snp \ +--freq \ +--nonfounders \ +--out QC/unrelated_hg38_auto_maf_0.01_500k_snp + + +#MAF filtering (--maf 0.05) +$plink_dir/plink2 --pfile genotypes/unrelated_hg38_auto_maf_0.01_500k_snp --min-af 0.05 \ +--make-pgen \ +--out genotypes/unrelated_hg38_auto_maf_0.05_500k_snp + + +#call rate and missingness +$plink_dir/plink2 --pfile genotypes/unrelated_hg38_auto_maf_0.05_500k_snp --geno 0.1 \ +--mind 0.1 \ +--make-pgen \ +--out genotypes/unrelated_hg38_auto_maf_0.05_500k_snp_filtered + + +#LD pruning - cretae list of independent snps to include in PCs analysis +$plink_dir/plink2 --pfile genotypes/unrelated_hg38_auto_maf_0.05_500k_snp_filtered --indep-pairwise 50 5 0.2 \ +--out QC/unrelated_hg38_auto_maf_0.05_500k_snp_filtered + +#recode genotypes with indep SNPs only +$plink_dir/plink2 --pfile genotypes/unrelated_hg38_auto_maf_0.05_500k_snp_filtered \ +--extract QC/unrelated_hg38_auto_maf_0.05_500k_snp_filtered.prune.in \ +--make-pgen \ +--out genotypes/unrelated_hg38_auto_maf_0.05_500k_filtered_indep_snp + +#pca with indep snps only +$plink_dir/plink2 --pfile genotypes/unrelated_hg38_auto_maf_0.05_500k_filtered_indep_snp --nonfounders --pca \ + --read-freq QC/unrelated_hg38_auto_maf_0.01_500k_snp.afreq \ + --out QC/unrelated_hg38_auto_maf_0.05_500k_filtered_indep_snp + +######################################################################## + +#run step02_covariate_PCs.R diff --git a/GWAS_PRS/scripts/step02_covariate_PCs.R b/GWAS_PRS/scripts/step02_covariate_PCs.R new file mode 100644 index 0000000..f0b00c7 --- /dev/null +++ b/GWAS_PRS/scripts/step02_covariate_PCs.R @@ -0,0 +1,34 @@ +#in Rstudio + +######################################################################## +#plot PCs and creating a second covariate file including PCS +######################################################################## + +setwd("~/Desktop/handson_plink/GWAS/") +cov <- read.csv("covariates/covariates.csv", header=T,comment.char="", check.names=F) +pc <- read.table("QC/unrelated_hg38_auto_maf_0.05_500k_filtered_indep_snp.eigenvec",header=T, comment.char="", check.names=F) +colnames(pc)[1] <- "IID" +EUR <- read.csv("../PRS/EUR_input_for_PRS/EUR_indivs.csv", header=T,comment.char="", check.names=F) + +updated_pc <- merge(cov, pc, by="IID") +updated_pc_EUR <- updated_pc[which(updated_pc$IID %in% EUR$IID),] + +if (!dir.exists("R_plots")){ +dir.create("R_plots") +} else { + print("R_plots already exists!") +} + + +pdf("R_plots/Pop_stratification.pdf") +plot(updated_pc$PC1, updated_pc$PC2, xlab="PC1", ylab="PC2") +points(updated_pc_EUR$PC1, updated_pc_EUR$PC2, pch=19, col="forestgreen") +dev.off() + +write.csv(updated_pc, "covariates/covariates_with_10PCs.csv", quote=F, row.names=F) + +######################################################################## + +#open and run step03_GWAS.sh + +######################################################################## diff --git a/GWAS_PRS/scripts/step03_plink_GWAS.sh b/GWAS_PRS/scripts/step03_plink_GWAS.sh new file mode 100644 index 0000000..034d080 --- /dev/null +++ b/GWAS_PRS/scripts/step03_plink_GWAS.sh @@ -0,0 +1,39 @@ +######################################################################## +#GWAS +######################################################################## +plink_dir=~/Desktop/handson_plink +mkdir plink_results/ +#linear model --glm or --linear +#no covariates +$plink_dir/plink2 --pfile genotypes/unrelated_hg38_auto_maf_0.05_500k_snp_filtered \ +--glm omit-ref allow-no-covars \ +--pheno phenotypes/phenofile.csv --pheno-name BMI \ +--out plink_results/no.covar + +#sex and age as covariate +$plink_dir/plink2 --pfile genotypes/unrelated_hg38_auto_maf_0.05_500k_snp_filtered \ +--covar covariates/covariates.csv --covar-variance-standardize \ +--glm omit-ref hide-covar \ +--pheno phenotypes/phenofile.csv --pheno-name BMI \ +--out plink_results/sex.age.adjusted + + +#PCs, sex and age as covariate +$plink_dir/plink2 --pfile genotypes/unrelated_hg38_auto_maf_0.05_500k_snp_filtered \ +--covar covariates/covariates_with_10PCs.csv --covar-variance-standardize \ +--glm omit-ref hide-covar \ +--pheno phenotypes/phenofile.csv --pheno-name BMI \ +--out plink_results/sex.age.PCs.adjusted + + + + +#logistic model --glm or --logistic +$plink_dir/plink2 --1 --pfile genotypes/unrelated_hg38_auto_maf_0.05_500k_snp_filtered \ +--covar covariates/covariates_with_10PCs.csv --covar-variance-standardize \ +--glm omit-ref cols=+beta,+orbeta,+ci hide-covar \ +--pheno phenotypes/phenofile.csv --pheno-name obesity --ci 0.95 \ +--out plink_results/sex.age.PCs.adj + +######################################################################## +#open and run in Rstudio step04_GWAS_visualization.R \ No newline at end of file diff --git a/GWAS_PRS/scripts/step04_GWAS_visualization.R b/GWAS_PRS/scripts/step04_GWAS_visualization.R new file mode 100644 index 0000000..a9d6e40 --- /dev/null +++ b/GWAS_PRS/scripts/step04_GWAS_visualization.R @@ -0,0 +1,58 @@ + +################## +#Visualization in R +################## +#manhattan plot +install.packages("qqman") +library(qqman) + +setwd("~/Desktop/handson_plink/GWAS/") + +if (!dir.exists("R_plots")){ +dir.create("R_plots") +} else { + print("R_plots already exists!") +} + +gwas_obesity <- read.table("plink_results/sex.age.PCs.adj.obesity.glm.logistic.hybrid", header=T, comment.char="", check.names=F) + +pdf("R_plots/obesity.pdf") +manhattan(gwas_obesity, chr="#CHROM", bp="POS", snp="ID", p="P" , annotatePval = 0.0001, main="Manhattan plot - sex, age, PCs adjusted obesity") + +#qqplot +qq(gwas_obesity$P, main="quantile-quantile plot - sex, age, PCs adjusted obesity") +dev.off() + + +gwas_BMI_no_cov <- read.table("plink_results/no.covar.BMI.glm.linear", header=T,comment.char="", check.names=F) + +gwas_BMI_sex_age_adj <- read.table("plink_results/sex.age.adjusted.BMI.glm.linear", header=T,comment.char="", check.names=F) + +gwas_BMI_sex_age_PCs_adj <- read.table("plink_results/sex.age.PCs.adjusted.BMI.glm.linear", header=T,comment.char="", check.names=F) + + +pdf("R_plots/BMI.pdf") +manhattan(gwas_BMI_no_cov, chr="#CHROM", bp="POS", snp="ID", p="P" , annotatePval = 0.0001, main="Manhattan plot - no covar adjusted BMI") + +manhattan(gwas_BMI_sex_age_adj, chr="#CHROM", bp="POS", snp="ID", p="P" , annotatePval = 0.0001, main="Manhattan plot - sex, age adjusted BMI") + +manhattan(gwas_BMI_sex_age_PCs_adj, chr="#CHROM", bp="POS", snp="ID", p="P" , annotatePval = 0.0001, main="Manhattan plot - sex, age, PCs adjusted BMI") + +#qqplot +qq(gwas_BMI_no_cov$P, main="quantile-quantile plot - no covar adjusted BMI") +qq(gwas_BMI_sex_age_adj$P, main="quantile-quantile plot - sex, age adjusted BMI") +qq(gwas_BMI_sex_age_PCs_adj$P, main="quantile-quantile plot - sex, age, PCs adjusted BMI") + +dev.off() +#include +#pvals <- read.table("AMH_all_FVG_rs_pvalue", header=T) + +#observed <- sort(pvals$pgc) +#lobs <- -(log10(observed)) + +#expected <- c(1:length(observed)) +#lexp <- -(log10(expected / (length(expected)+1))) + + +######################################################################## +#open and run step05_Plink_PRS.sh diff --git a/GWAS_PRS/scripts/step05_plink_PRS.sh b/GWAS_PRS/scripts/step05_plink_PRS.sh new file mode 100644 index 0000000..c5a1d73 --- /dev/null +++ b/GWAS_PRS/scripts/step05_plink_PRS.sh @@ -0,0 +1,62 @@ + +######################################################################## +#PRS +######################################################################## +PRSdir=~/Desktop/handson_plink/PRS +GWASdir=~/Desktop/handson_plink/GWAS + +cd ~/Desktop/handson_plink/PRS +mkdir PRS_results/ +plink_dir=~/Desktop/handson_plink +#input files are in EUR_input_for_PRS/ +#EUR summary stats from large GWAS meta analysis +#Meta-analysis_Locke_et_al+UKBiobank_2018_UPDATED.txt + +#EUR list of indivs + covariate and pheno files +#EUR_input_for_PRS/EUR_BMI_phenofile.csv +#EUR_input_for_PRS/EUR_covariates.csv +#EUR_input_for_PRS/EUR_indivs.csv + + +#create geno files for european only as input for the PRS - we used maf 0.01 since we did not filter for call rate and missingness +$plink_dir/plink2 \ +--pfile $GWASdir/genotypes/unrelated_hg38_auto_maf_0.01_500k_snp \ +--keep EUR_input_for_PRS/EUR_indivs.csv \ +--make-bed \ +--out EUR_input_for_PRS/EUR_unrelated_hg38_auto_maf_0.01_500k_snp_filtered + +#clumping the summary stats from large GWAS meta analysis to get indep SNPs based on pvalue overlapping our genotypes +$plink_dir/plink2 \ + --bfile EUR_input_for_PRS/EUR_unrelated_hg38_auto_maf_0.01_500k_snp_filtered \ + --clump-p1 1 \ + --clump-r2 0.1 \ + --clump-kb 250 \ + --clump EUR_input_for_PRS/Meta-analysis_Locke_et_al+UKBiobank_2018_UPDATED.txt \ + --clump-snp-field SNP \ + --clump-field P \ + --out EUR_input_for_PRS/EUR_clumped + +#valid independent snps +awk 'NR!=1{print $3}' EUR_input_for_PRS/EUR_clumped.clumps > EUR_input_for_PRS/EUR.valid.snp + + +#SNP and P from large GWAS meta analysis +awk '{print $3,$9}' EUR_input_for_PRS/Meta-analysis_Locke_et_al+UKBiobank_2018_UPDATED.txt > EUR_input_for_PRS/Meta-analysis_Locke_et_al+UKBiobank_2018_UPDATED.SNP.pvalue + +#create threshold of SNPs to create scores for different subsets of SNPs +echo "0.001 0 0.001" > EUR_input_for_PRS/range_list +echo "0.05 0 0.05" >> EUR_input_for_PRS/range_list +echo "0.1 0 0.1" >> EUR_input_for_PRS/range_list +echo "0.2 0 0.2" >> EUR_input_for_PRS/range_list +echo "0.3 0 0.3" >> EUR_input_for_PRS/range_list +echo "0.4 0 0.4" >> EUR_input_for_PRS/range_list +echo "0.5 0 0.5" >> EUR_input_for_PRS/range_list + +#estimating the score for each range of P using the valid snps (for the score needs snp, allele and BETA) +$plink_dir/plink2 \ + --bfile EUR_input_for_PRS/EUR_unrelated_hg38_auto_maf_0.01_500k_snp_filtered \ + --score EUR_input_for_PRS/Meta-analysis_Locke_et_al+UKBiobank_2018_UPDATED.txt 3 4 7 header \ + --q-score-range EUR_input_for_PRS/range_list EUR_input_for_PRS/Meta-analysis_Locke_et_al+UKBiobank_2018_UPDATED.SNP.pvalue \ + --extract EUR_input_for_PRS/EUR.valid.snp \ + --out EUR_input_for_PRS/EUR +######################################################################## diff --git a/GWAS_PRS/scripts/step06_best_PRS_visualization.R b/GWAS_PRS/scripts/step06_best_PRS_visualization.R new file mode 100644 index 0000000..634c4e1 --- /dev/null +++ b/GWAS_PRS/scripts/step06_best_PRS_visualization.R @@ -0,0 +1,69 @@ +#################################################### +#Best PRS model and Visualization in R +#################################################### + +setwd("~/Desktop/handson_plink/PRS/") + +p.threshold <- c(0.001,0.05,0.1,0.2,0.3,0.4,0.5) +# Read in the phenotype file +phenotype <- read.csv("EUR_input_for_PRS/EUR_BMI_phenofile.csv", header=T, check.names=F, comment.char="") +# Read in the PCs +#pcs <- read.table("EUR.eigenvec", header=F) +# The default output from plink does not include a header +# To make things simple, we will add the appropriate headers +# (1:6 because there are 6 PCs) +#colnames(pcs) <- c("FID", "IID", paste0("PC",1:6)) +# Read in the covariates (here, it is sex) +covariate <- read.csv("EUR_input_for_PRS/EUR_covariates.csv", header=T,check.names=F, comment.char="") +# Now merge the files +pheno <- merge(phenotype, covariate, by=c("IID")) + +#pheno <- merge(merge(phenotype, covariate, by=c("FID", "IID")), pcs, by=c("FID","IID")) +# We can then calculate the null model (model with PRS) using a linear regression +# (as height is quantitative) +null.model <- lm(BMI~., data=pheno[,-1]) +# And the R2 of the null model is +null.r2 <- summary(null.model)$r.squared +prs.result <- NULL +for(i in p.threshold){ + # Go through each p-value threshold + prs <- read.table(paste0("EUR_input_for_PRS/EUR.",i,".sscore"), header=T, check.names=F, comment.char="") + # Merge the prs with the phenotype matrix + # We only want the FID, IID and PRS from the PRS file, therefore we only select the + # relevant columns + pheno.prs <- merge(pheno, prs[,c("IID", "SCORE1_AVG")], by= "IID") + # Now perform a linear regression on Height with PRS and the covariates + # ignoring the FID and IID from our model + model <- lm(BMI~., data=pheno.prs[,-1]) + # model R2 is obtained as + model.r2 <- summary(model)$r.squared + # R2 of PRS is simply calculated as the model R2 minus the null R2 + prs.r2 <- model.r2-null.r2 + # We can also obtain the coeffcient and p-value of association of PRS as follow + prs.coef <- summary(model)$coeff["SCORE1_AVG",] + prs.beta <- as.numeric(prs.coef[1]) + prs.se <- as.numeric(prs.coef[2]) + prs.p <- as.numeric(prs.coef[4]) + # We can then store the results + prs.result <- rbind(prs.result, data.frame(Threshold=i, R2=prs.r2, P=prs.p, BETA=prs.beta,SE=prs.se)) +} +# Best result is: +prs.result[which.max(prs.result$R2),] +write.csv(prs.result, file="PRS_results/EUR_BMI_sex_age_adjusted_PRS_fit.csv", quote=F, row.names=F) + +best_p <- prs.result[which.max(prs.result$R2),1] +best_prs <- read.table(paste0("EUR_input_for_PRS/EUR.",best_p,".sscore"), header=T,check.names=F, comment.char="") +# Rename the sex +covariate$sex <- as.factor(covariate$sex) +levels(covariate$sex) <- c("Male", "Female") +# Merge the files +dat <- merge(merge(best_prs, phenotype, by="IID"), covariate, by="IID") + +# Start plotting +pdf("PRS_results/EUR_BMI_sex_age_adjusted_PRS_fit.pdf") +plot(x=dat$SCORE1_AVG, y=dat$BMI, col="white", + xlab="Polygenic Score", ylab="BMI") +with(subset(dat, sex=="Male"), points(x=SCORE1_AVG, y=BMI, col="red", pch=19)) +with(subset(dat, sex=="Female"), points(x=SCORE1_AVG, y=BMI, col="blue", pch=19)) +dev.off() +####################################################