mirror of
https://github.com/gladstone-institutes/Bioinformatics-Workshops.git
synced 2025-11-30 09:45:43 -08:00
Add files via upload
This commit is contained in:
parent
f0781880cf
commit
3480239892
6 changed files with 320 additions and 0 deletions
|
|
@ -1 +1,59 @@
|
|||
#files
|
||||
cd ~/Desktop/handson_plink/GWAS/
|
||||
|
||||
plink_dir=~/Desktop/handson_plink
|
||||
#cd genotypes/
|
||||
#unrelated_hg38_auto_maf_0.01_500k_snp.pvar
|
||||
#unrelated_hg38_auto_maf_0.01_500k_snp.psam
|
||||
#unrelated_hg38_auto_maf_0.01_500k_snp.pgen
|
||||
#cd phenotypes/
|
||||
#phenofile.csv
|
||||
#cd covariates/
|
||||
#covariate.csv
|
||||
|
||||
|
||||
|
||||
########################################################################
|
||||
#QC and filtering
|
||||
########################################################################
|
||||
mkdir QC/
|
||||
|
||||
#frequency
|
||||
$plink_dir/plink2 --pfile genotypes/unrelated_hg38_auto_maf_0.01_500k_snp \
|
||||
--freq \
|
||||
--nonfounders \
|
||||
--out QC/unrelated_hg38_auto_maf_0.01_500k_snp
|
||||
|
||||
|
||||
#MAF filtering (--maf 0.05)
|
||||
$plink_dir/plink2 --pfile genotypes/unrelated_hg38_auto_maf_0.01_500k_snp --min-af 0.05 \
|
||||
--make-pgen \
|
||||
--out genotypes/unrelated_hg38_auto_maf_0.05_500k_snp
|
||||
|
||||
|
||||
#call rate and missingness
|
||||
$plink_dir/plink2 --pfile genotypes/unrelated_hg38_auto_maf_0.05_500k_snp --geno 0.1 \
|
||||
--mind 0.1 \
|
||||
--make-pgen \
|
||||
--out genotypes/unrelated_hg38_auto_maf_0.05_500k_snp_filtered
|
||||
|
||||
|
||||
#LD pruning - cretae list of independent snps to include in PCs analysis
|
||||
$plink_dir/plink2 --pfile genotypes/unrelated_hg38_auto_maf_0.05_500k_snp_filtered --indep-pairwise 50 5 0.2 \
|
||||
--out QC/unrelated_hg38_auto_maf_0.05_500k_snp_filtered
|
||||
|
||||
#recode genotypes with indep SNPs only
|
||||
$plink_dir/plink2 --pfile genotypes/unrelated_hg38_auto_maf_0.05_500k_snp_filtered \
|
||||
--extract QC/unrelated_hg38_auto_maf_0.05_500k_snp_filtered.prune.in \
|
||||
--make-pgen \
|
||||
--out genotypes/unrelated_hg38_auto_maf_0.05_500k_filtered_indep_snp
|
||||
|
||||
#pca with indep snps only
|
||||
$plink_dir/plink2 --pfile genotypes/unrelated_hg38_auto_maf_0.05_500k_filtered_indep_snp --nonfounders --pca \
|
||||
--read-freq QC/unrelated_hg38_auto_maf_0.01_500k_snp.afreq \
|
||||
--out QC/unrelated_hg38_auto_maf_0.05_500k_filtered_indep_snp
|
||||
|
||||
########################################################################
|
||||
|
||||
#run step02_covariate_PCs.R
|
||||
|
||||
|
|
|
|||
34
GWAS_PRS/scripts/step02_covariate_PCs.R
Normal file
34
GWAS_PRS/scripts/step02_covariate_PCs.R
Normal file
|
|
@ -0,0 +1,34 @@
|
|||
#in Rstudio
|
||||
|
||||
########################################################################
|
||||
#plot PCs and creating a second covariate file including PCS
|
||||
########################################################################
|
||||
|
||||
setwd("~/Desktop/handson_plink/GWAS/")
|
||||
cov <- read.csv("covariates/covariates.csv", header=T,comment.char="", check.names=F)
|
||||
pc <- read.table("QC/unrelated_hg38_auto_maf_0.05_500k_filtered_indep_snp.eigenvec",header=T, comment.char="", check.names=F)
|
||||
colnames(pc)[1] <- "IID"
|
||||
EUR <- read.csv("../PRS/EUR_input_for_PRS/EUR_indivs.csv", header=T,comment.char="", check.names=F)
|
||||
|
||||
updated_pc <- merge(cov, pc, by="IID")
|
||||
updated_pc_EUR <- updated_pc[which(updated_pc$IID %in% EUR$IID),]
|
||||
|
||||
if (!dir.exists("R_plots")){
|
||||
dir.create("R_plots")
|
||||
} else {
|
||||
print("R_plots already exists!")
|
||||
}
|
||||
|
||||
|
||||
pdf("R_plots/Pop_stratification.pdf")
|
||||
plot(updated_pc$PC1, updated_pc$PC2, xlab="PC1", ylab="PC2")
|
||||
points(updated_pc_EUR$PC1, updated_pc_EUR$PC2, pch=19, col="forestgreen")
|
||||
dev.off()
|
||||
|
||||
write.csv(updated_pc, "covariates/covariates_with_10PCs.csv", quote=F, row.names=F)
|
||||
|
||||
########################################################################
|
||||
|
||||
#open and run step03_GWAS.sh
|
||||
|
||||
########################################################################
|
||||
39
GWAS_PRS/scripts/step03_plink_GWAS.sh
Normal file
39
GWAS_PRS/scripts/step03_plink_GWAS.sh
Normal file
|
|
@ -0,0 +1,39 @@
|
|||
########################################################################
|
||||
#GWAS
|
||||
########################################################################
|
||||
plink_dir=~/Desktop/handson_plink
|
||||
mkdir plink_results/
|
||||
#linear model --glm or --linear
|
||||
#no covariates
|
||||
$plink_dir/plink2 --pfile genotypes/unrelated_hg38_auto_maf_0.05_500k_snp_filtered \
|
||||
--glm omit-ref allow-no-covars \
|
||||
--pheno phenotypes/phenofile.csv --pheno-name BMI \
|
||||
--out plink_results/no.covar
|
||||
|
||||
#sex and age as covariate
|
||||
$plink_dir/plink2 --pfile genotypes/unrelated_hg38_auto_maf_0.05_500k_snp_filtered \
|
||||
--covar covariates/covariates.csv --covar-variance-standardize \
|
||||
--glm omit-ref hide-covar \
|
||||
--pheno phenotypes/phenofile.csv --pheno-name BMI \
|
||||
--out plink_results/sex.age.adjusted
|
||||
|
||||
|
||||
#PCs, sex and age as covariate
|
||||
$plink_dir/plink2 --pfile genotypes/unrelated_hg38_auto_maf_0.05_500k_snp_filtered \
|
||||
--covar covariates/covariates_with_10PCs.csv --covar-variance-standardize \
|
||||
--glm omit-ref hide-covar \
|
||||
--pheno phenotypes/phenofile.csv --pheno-name BMI \
|
||||
--out plink_results/sex.age.PCs.adjusted
|
||||
|
||||
|
||||
|
||||
|
||||
#logistic model --glm or --logistic
|
||||
$plink_dir/plink2 --1 --pfile genotypes/unrelated_hg38_auto_maf_0.05_500k_snp_filtered \
|
||||
--covar covariates/covariates_with_10PCs.csv --covar-variance-standardize \
|
||||
--glm omit-ref cols=+beta,+orbeta,+ci hide-covar \
|
||||
--pheno phenotypes/phenofile.csv --pheno-name obesity --ci 0.95 \
|
||||
--out plink_results/sex.age.PCs.adj
|
||||
|
||||
########################################################################
|
||||
#open and run in Rstudio step04_GWAS_visualization.R
|
||||
58
GWAS_PRS/scripts/step04_GWAS_visualization.R
Normal file
58
GWAS_PRS/scripts/step04_GWAS_visualization.R
Normal file
|
|
@ -0,0 +1,58 @@
|
|||
|
||||
##################
|
||||
#Visualization in R
|
||||
##################
|
||||
#manhattan plot
|
||||
install.packages("qqman")
|
||||
library(qqman)
|
||||
|
||||
setwd("~/Desktop/handson_plink/GWAS/")
|
||||
|
||||
if (!dir.exists("R_plots")){
|
||||
dir.create("R_plots")
|
||||
} else {
|
||||
print("R_plots already exists!")
|
||||
}
|
||||
|
||||
gwas_obesity <- read.table("plink_results/sex.age.PCs.adj.obesity.glm.logistic.hybrid", header=T, comment.char="", check.names=F)
|
||||
|
||||
pdf("R_plots/obesity.pdf")
|
||||
manhattan(gwas_obesity, chr="#CHROM", bp="POS", snp="ID", p="P" , annotatePval = 0.0001, main="Manhattan plot - sex, age, PCs adjusted obesity")
|
||||
|
||||
#qqplot
|
||||
qq(gwas_obesity$P, main="quantile-quantile plot - sex, age, PCs adjusted obesity")
|
||||
dev.off()
|
||||
|
||||
|
||||
gwas_BMI_no_cov <- read.table("plink_results/no.covar.BMI.glm.linear", header=T,comment.char="", check.names=F)
|
||||
|
||||
gwas_BMI_sex_age_adj <- read.table("plink_results/sex.age.adjusted.BMI.glm.linear", header=T,comment.char="", check.names=F)
|
||||
|
||||
gwas_BMI_sex_age_PCs_adj <- read.table("plink_results/sex.age.PCs.adjusted.BMI.glm.linear", header=T,comment.char="", check.names=F)
|
||||
|
||||
|
||||
pdf("R_plots/BMI.pdf")
|
||||
manhattan(gwas_BMI_no_cov, chr="#CHROM", bp="POS", snp="ID", p="P" , annotatePval = 0.0001, main="Manhattan plot - no covar adjusted BMI")
|
||||
|
||||
manhattan(gwas_BMI_sex_age_adj, chr="#CHROM", bp="POS", snp="ID", p="P" , annotatePval = 0.0001, main="Manhattan plot - sex, age adjusted BMI")
|
||||
|
||||
manhattan(gwas_BMI_sex_age_PCs_adj, chr="#CHROM", bp="POS", snp="ID", p="P" , annotatePval = 0.0001, main="Manhattan plot - sex, age, PCs adjusted BMI")
|
||||
|
||||
#qqplot
|
||||
qq(gwas_BMI_no_cov$P, main="quantile-quantile plot - no covar adjusted BMI")
|
||||
qq(gwas_BMI_sex_age_adj$P, main="quantile-quantile plot - sex, age adjusted BMI")
|
||||
qq(gwas_BMI_sex_age_PCs_adj$P, main="quantile-quantile plot - sex, age, PCs adjusted BMI")
|
||||
|
||||
dev.off()
|
||||
#include
|
||||
#pvals <- read.table("AMH_all_FVG_rs_pvalue", header=T)
|
||||
|
||||
#observed <- sort(pvals$pgc)
|
||||
#lobs <- -(log10(observed))
|
||||
|
||||
#expected <- c(1:length(observed))
|
||||
#lexp <- -(log10(expected / (length(expected)+1)))
|
||||
|
||||
|
||||
########################################################################
|
||||
#open and run step05_Plink_PRS.sh
|
||||
62
GWAS_PRS/scripts/step05_plink_PRS.sh
Normal file
62
GWAS_PRS/scripts/step05_plink_PRS.sh
Normal file
|
|
@ -0,0 +1,62 @@
|
|||
|
||||
########################################################################
|
||||
#PRS
|
||||
########################################################################
|
||||
PRSdir=~/Desktop/handson_plink/PRS
|
||||
GWASdir=~/Desktop/handson_plink/GWAS
|
||||
|
||||
cd ~/Desktop/handson_plink/PRS
|
||||
mkdir PRS_results/
|
||||
plink_dir=~/Desktop/handson_plink
|
||||
#input files are in EUR_input_for_PRS/
|
||||
#EUR summary stats from large GWAS meta analysis
|
||||
#Meta-analysis_Locke_et_al+UKBiobank_2018_UPDATED.txt
|
||||
|
||||
#EUR list of indivs + covariate and pheno files
|
||||
#EUR_input_for_PRS/EUR_BMI_phenofile.csv
|
||||
#EUR_input_for_PRS/EUR_covariates.csv
|
||||
#EUR_input_for_PRS/EUR_indivs.csv
|
||||
|
||||
|
||||
#create geno files for european only as input for the PRS - we used maf 0.01 since we did not filter for call rate and missingness
|
||||
$plink_dir/plink2 \
|
||||
--pfile $GWASdir/genotypes/unrelated_hg38_auto_maf_0.01_500k_snp \
|
||||
--keep EUR_input_for_PRS/EUR_indivs.csv \
|
||||
--make-bed \
|
||||
--out EUR_input_for_PRS/EUR_unrelated_hg38_auto_maf_0.01_500k_snp_filtered
|
||||
|
||||
#clumping the summary stats from large GWAS meta analysis to get indep SNPs based on pvalue overlapping our genotypes
|
||||
$plink_dir/plink2 \
|
||||
--bfile EUR_input_for_PRS/EUR_unrelated_hg38_auto_maf_0.01_500k_snp_filtered \
|
||||
--clump-p1 1 \
|
||||
--clump-r2 0.1 \
|
||||
--clump-kb 250 \
|
||||
--clump EUR_input_for_PRS/Meta-analysis_Locke_et_al+UKBiobank_2018_UPDATED.txt \
|
||||
--clump-snp-field SNP \
|
||||
--clump-field P \
|
||||
--out EUR_input_for_PRS/EUR_clumped
|
||||
|
||||
#valid independent snps
|
||||
awk 'NR!=1{print $3}' EUR_input_for_PRS/EUR_clumped.clumps > EUR_input_for_PRS/EUR.valid.snp
|
||||
|
||||
|
||||
#SNP and P from large GWAS meta analysis
|
||||
awk '{print $3,$9}' EUR_input_for_PRS/Meta-analysis_Locke_et_al+UKBiobank_2018_UPDATED.txt > EUR_input_for_PRS/Meta-analysis_Locke_et_al+UKBiobank_2018_UPDATED.SNP.pvalue
|
||||
|
||||
#create threshold of SNPs to create scores for different subsets of SNPs
|
||||
echo "0.001 0 0.001" > EUR_input_for_PRS/range_list
|
||||
echo "0.05 0 0.05" >> EUR_input_for_PRS/range_list
|
||||
echo "0.1 0 0.1" >> EUR_input_for_PRS/range_list
|
||||
echo "0.2 0 0.2" >> EUR_input_for_PRS/range_list
|
||||
echo "0.3 0 0.3" >> EUR_input_for_PRS/range_list
|
||||
echo "0.4 0 0.4" >> EUR_input_for_PRS/range_list
|
||||
echo "0.5 0 0.5" >> EUR_input_for_PRS/range_list
|
||||
|
||||
#estimating the score for each range of P using the valid snps (for the score needs snp, allele and BETA)
|
||||
$plink_dir/plink2 \
|
||||
--bfile EUR_input_for_PRS/EUR_unrelated_hg38_auto_maf_0.01_500k_snp_filtered \
|
||||
--score EUR_input_for_PRS/Meta-analysis_Locke_et_al+UKBiobank_2018_UPDATED.txt 3 4 7 header \
|
||||
--q-score-range EUR_input_for_PRS/range_list EUR_input_for_PRS/Meta-analysis_Locke_et_al+UKBiobank_2018_UPDATED.SNP.pvalue \
|
||||
--extract EUR_input_for_PRS/EUR.valid.snp \
|
||||
--out EUR_input_for_PRS/EUR
|
||||
########################################################################
|
||||
69
GWAS_PRS/scripts/step06_best_PRS_visualization.R
Normal file
69
GWAS_PRS/scripts/step06_best_PRS_visualization.R
Normal file
|
|
@ -0,0 +1,69 @@
|
|||
####################################################
|
||||
#Best PRS model and Visualization in R
|
||||
####################################################
|
||||
|
||||
setwd("~/Desktop/handson_plink/PRS/")
|
||||
|
||||
p.threshold <- c(0.001,0.05,0.1,0.2,0.3,0.4,0.5)
|
||||
# Read in the phenotype file
|
||||
phenotype <- read.csv("EUR_input_for_PRS/EUR_BMI_phenofile.csv", header=T, check.names=F, comment.char="")
|
||||
# Read in the PCs
|
||||
#pcs <- read.table("EUR.eigenvec", header=F)
|
||||
# The default output from plink does not include a header
|
||||
# To make things simple, we will add the appropriate headers
|
||||
# (1:6 because there are 6 PCs)
|
||||
#colnames(pcs) <- c("FID", "IID", paste0("PC",1:6))
|
||||
# Read in the covariates (here, it is sex)
|
||||
covariate <- read.csv("EUR_input_for_PRS/EUR_covariates.csv", header=T,check.names=F, comment.char="")
|
||||
# Now merge the files
|
||||
pheno <- merge(phenotype, covariate, by=c("IID"))
|
||||
|
||||
#pheno <- merge(merge(phenotype, covariate, by=c("FID", "IID")), pcs, by=c("FID","IID"))
|
||||
# We can then calculate the null model (model with PRS) using a linear regression
|
||||
# (as height is quantitative)
|
||||
null.model <- lm(BMI~., data=pheno[,-1])
|
||||
# And the R2 of the null model is
|
||||
null.r2 <- summary(null.model)$r.squared
|
||||
prs.result <- NULL
|
||||
for(i in p.threshold){
|
||||
# Go through each p-value threshold
|
||||
prs <- read.table(paste0("EUR_input_for_PRS/EUR.",i,".sscore"), header=T, check.names=F, comment.char="")
|
||||
# Merge the prs with the phenotype matrix
|
||||
# We only want the FID, IID and PRS from the PRS file, therefore we only select the
|
||||
# relevant columns
|
||||
pheno.prs <- merge(pheno, prs[,c("IID", "SCORE1_AVG")], by= "IID")
|
||||
# Now perform a linear regression on Height with PRS and the covariates
|
||||
# ignoring the FID and IID from our model
|
||||
model <- lm(BMI~., data=pheno.prs[,-1])
|
||||
# model R2 is obtained as
|
||||
model.r2 <- summary(model)$r.squared
|
||||
# R2 of PRS is simply calculated as the model R2 minus the null R2
|
||||
prs.r2 <- model.r2-null.r2
|
||||
# We can also obtain the coeffcient and p-value of association of PRS as follow
|
||||
prs.coef <- summary(model)$coeff["SCORE1_AVG",]
|
||||
prs.beta <- as.numeric(prs.coef[1])
|
||||
prs.se <- as.numeric(prs.coef[2])
|
||||
prs.p <- as.numeric(prs.coef[4])
|
||||
# We can then store the results
|
||||
prs.result <- rbind(prs.result, data.frame(Threshold=i, R2=prs.r2, P=prs.p, BETA=prs.beta,SE=prs.se))
|
||||
}
|
||||
# Best result is:
|
||||
prs.result[which.max(prs.result$R2),]
|
||||
write.csv(prs.result, file="PRS_results/EUR_BMI_sex_age_adjusted_PRS_fit.csv", quote=F, row.names=F)
|
||||
|
||||
best_p <- prs.result[which.max(prs.result$R2),1]
|
||||
best_prs <- read.table(paste0("EUR_input_for_PRS/EUR.",best_p,".sscore"), header=T,check.names=F, comment.char="")
|
||||
# Rename the sex
|
||||
covariate$sex <- as.factor(covariate$sex)
|
||||
levels(covariate$sex) <- c("Male", "Female")
|
||||
# Merge the files
|
||||
dat <- merge(merge(best_prs, phenotype, by="IID"), covariate, by="IID")
|
||||
|
||||
# Start plotting
|
||||
pdf("PRS_results/EUR_BMI_sex_age_adjusted_PRS_fit.pdf")
|
||||
plot(x=dat$SCORE1_AVG, y=dat$BMI, col="white",
|
||||
xlab="Polygenic Score", ylab="BMI")
|
||||
with(subset(dat, sex=="Male"), points(x=SCORE1_AVG, y=BMI, col="red", pch=19))
|
||||
with(subset(dat, sex=="Female"), points(x=SCORE1_AVG, y=BMI, col="blue", pch=19))
|
||||
dev.off()
|
||||
####################################################
|
||||
Loading…
Add table
Add a link
Reference in a new issue