Add files via upload

This commit is contained in:
MichelaTr 2023-11-20 11:32:21 -08:00 committed by GitHub
parent f0781880cf
commit 3480239892
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 320 additions and 0 deletions

View file

@ -1 +1,59 @@
#files
cd ~/Desktop/handson_plink/GWAS/
plink_dir=~/Desktop/handson_plink
#cd genotypes/
#unrelated_hg38_auto_maf_0.01_500k_snp.pvar
#unrelated_hg38_auto_maf_0.01_500k_snp.psam
#unrelated_hg38_auto_maf_0.01_500k_snp.pgen
#cd phenotypes/
#phenofile.csv
#cd covariates/
#covariate.csv
########################################################################
#QC and filtering
########################################################################
mkdir QC/
#frequency
$plink_dir/plink2 --pfile genotypes/unrelated_hg38_auto_maf_0.01_500k_snp \
--freq \
--nonfounders \
--out QC/unrelated_hg38_auto_maf_0.01_500k_snp
#MAF filtering (--maf 0.05)
$plink_dir/plink2 --pfile genotypes/unrelated_hg38_auto_maf_0.01_500k_snp --min-af 0.05 \
--make-pgen \
--out genotypes/unrelated_hg38_auto_maf_0.05_500k_snp
#call rate and missingness
$plink_dir/plink2 --pfile genotypes/unrelated_hg38_auto_maf_0.05_500k_snp --geno 0.1 \
--mind 0.1 \
--make-pgen \
--out genotypes/unrelated_hg38_auto_maf_0.05_500k_snp_filtered
#LD pruning - cretae list of independent snps to include in PCs analysis
$plink_dir/plink2 --pfile genotypes/unrelated_hg38_auto_maf_0.05_500k_snp_filtered --indep-pairwise 50 5 0.2 \
--out QC/unrelated_hg38_auto_maf_0.05_500k_snp_filtered
#recode genotypes with indep SNPs only
$plink_dir/plink2 --pfile genotypes/unrelated_hg38_auto_maf_0.05_500k_snp_filtered \
--extract QC/unrelated_hg38_auto_maf_0.05_500k_snp_filtered.prune.in \
--make-pgen \
--out genotypes/unrelated_hg38_auto_maf_0.05_500k_filtered_indep_snp
#pca with indep snps only
$plink_dir/plink2 --pfile genotypes/unrelated_hg38_auto_maf_0.05_500k_filtered_indep_snp --nonfounders --pca \
--read-freq QC/unrelated_hg38_auto_maf_0.01_500k_snp.afreq \
--out QC/unrelated_hg38_auto_maf_0.05_500k_filtered_indep_snp
########################################################################
#run step02_covariate_PCs.R

View file

@ -0,0 +1,34 @@
#in Rstudio
########################################################################
#plot PCs and creating a second covariate file including PCS
########################################################################
setwd("~/Desktop/handson_plink/GWAS/")
cov <- read.csv("covariates/covariates.csv", header=T,comment.char="", check.names=F)
pc <- read.table("QC/unrelated_hg38_auto_maf_0.05_500k_filtered_indep_snp.eigenvec",header=T, comment.char="", check.names=F)
colnames(pc)[1] <- "IID"
EUR <- read.csv("../PRS/EUR_input_for_PRS/EUR_indivs.csv", header=T,comment.char="", check.names=F)
updated_pc <- merge(cov, pc, by="IID")
updated_pc_EUR <- updated_pc[which(updated_pc$IID %in% EUR$IID),]
if (!dir.exists("R_plots")){
dir.create("R_plots")
} else {
print("R_plots already exists!")
}
pdf("R_plots/Pop_stratification.pdf")
plot(updated_pc$PC1, updated_pc$PC2, xlab="PC1", ylab="PC2")
points(updated_pc_EUR$PC1, updated_pc_EUR$PC2, pch=19, col="forestgreen")
dev.off()
write.csv(updated_pc, "covariates/covariates_with_10PCs.csv", quote=F, row.names=F)
########################################################################
#open and run step03_GWAS.sh
########################################################################

View file

@ -0,0 +1,39 @@
########################################################################
#GWAS
########################################################################
plink_dir=~/Desktop/handson_plink
mkdir plink_results/
#linear model --glm or --linear
#no covariates
$plink_dir/plink2 --pfile genotypes/unrelated_hg38_auto_maf_0.05_500k_snp_filtered \
--glm omit-ref allow-no-covars \
--pheno phenotypes/phenofile.csv --pheno-name BMI \
--out plink_results/no.covar
#sex and age as covariate
$plink_dir/plink2 --pfile genotypes/unrelated_hg38_auto_maf_0.05_500k_snp_filtered \
--covar covariates/covariates.csv --covar-variance-standardize \
--glm omit-ref hide-covar \
--pheno phenotypes/phenofile.csv --pheno-name BMI \
--out plink_results/sex.age.adjusted
#PCs, sex and age as covariate
$plink_dir/plink2 --pfile genotypes/unrelated_hg38_auto_maf_0.05_500k_snp_filtered \
--covar covariates/covariates_with_10PCs.csv --covar-variance-standardize \
--glm omit-ref hide-covar \
--pheno phenotypes/phenofile.csv --pheno-name BMI \
--out plink_results/sex.age.PCs.adjusted
#logistic model --glm or --logistic
$plink_dir/plink2 --1 --pfile genotypes/unrelated_hg38_auto_maf_0.05_500k_snp_filtered \
--covar covariates/covariates_with_10PCs.csv --covar-variance-standardize \
--glm omit-ref cols=+beta,+orbeta,+ci hide-covar \
--pheno phenotypes/phenofile.csv --pheno-name obesity --ci 0.95 \
--out plink_results/sex.age.PCs.adj
########################################################################
#open and run in Rstudio step04_GWAS_visualization.R

View file

@ -0,0 +1,58 @@
##################
#Visualization in R
##################
#manhattan plot
install.packages("qqman")
library(qqman)
setwd("~/Desktop/handson_plink/GWAS/")
if (!dir.exists("R_plots")){
dir.create("R_plots")
} else {
print("R_plots already exists!")
}
gwas_obesity <- read.table("plink_results/sex.age.PCs.adj.obesity.glm.logistic.hybrid", header=T, comment.char="", check.names=F)
pdf("R_plots/obesity.pdf")
manhattan(gwas_obesity, chr="#CHROM", bp="POS", snp="ID", p="P" , annotatePval = 0.0001, main="Manhattan plot - sex, age, PCs adjusted obesity")
#qqplot
qq(gwas_obesity$P, main="quantile-quantile plot - sex, age, PCs adjusted obesity")
dev.off()
gwas_BMI_no_cov <- read.table("plink_results/no.covar.BMI.glm.linear", header=T,comment.char="", check.names=F)
gwas_BMI_sex_age_adj <- read.table("plink_results/sex.age.adjusted.BMI.glm.linear", header=T,comment.char="", check.names=F)
gwas_BMI_sex_age_PCs_adj <- read.table("plink_results/sex.age.PCs.adjusted.BMI.glm.linear", header=T,comment.char="", check.names=F)
pdf("R_plots/BMI.pdf")
manhattan(gwas_BMI_no_cov, chr="#CHROM", bp="POS", snp="ID", p="P" , annotatePval = 0.0001, main="Manhattan plot - no covar adjusted BMI")
manhattan(gwas_BMI_sex_age_adj, chr="#CHROM", bp="POS", snp="ID", p="P" , annotatePval = 0.0001, main="Manhattan plot - sex, age adjusted BMI")
manhattan(gwas_BMI_sex_age_PCs_adj, chr="#CHROM", bp="POS", snp="ID", p="P" , annotatePval = 0.0001, main="Manhattan plot - sex, age, PCs adjusted BMI")
#qqplot
qq(gwas_BMI_no_cov$P, main="quantile-quantile plot - no covar adjusted BMI")
qq(gwas_BMI_sex_age_adj$P, main="quantile-quantile plot - sex, age adjusted BMI")
qq(gwas_BMI_sex_age_PCs_adj$P, main="quantile-quantile plot - sex, age, PCs adjusted BMI")
dev.off()
#include
#pvals <- read.table("AMH_all_FVG_rs_pvalue", header=T)
#observed <- sort(pvals$pgc)
#lobs <- -(log10(observed))
#expected <- c(1:length(observed))
#lexp <- -(log10(expected / (length(expected)+1)))
########################################################################
#open and run step05_Plink_PRS.sh

View file

@ -0,0 +1,62 @@
########################################################################
#PRS
########################################################################
PRSdir=~/Desktop/handson_plink/PRS
GWASdir=~/Desktop/handson_plink/GWAS
cd ~/Desktop/handson_plink/PRS
mkdir PRS_results/
plink_dir=~/Desktop/handson_plink
#input files are in EUR_input_for_PRS/
#EUR summary stats from large GWAS meta analysis
#Meta-analysis_Locke_et_al+UKBiobank_2018_UPDATED.txt
#EUR list of indivs + covariate and pheno files
#EUR_input_for_PRS/EUR_BMI_phenofile.csv
#EUR_input_for_PRS/EUR_covariates.csv
#EUR_input_for_PRS/EUR_indivs.csv
#create geno files for european only as input for the PRS - we used maf 0.01 since we did not filter for call rate and missingness
$plink_dir/plink2 \
--pfile $GWASdir/genotypes/unrelated_hg38_auto_maf_0.01_500k_snp \
--keep EUR_input_for_PRS/EUR_indivs.csv \
--make-bed \
--out EUR_input_for_PRS/EUR_unrelated_hg38_auto_maf_0.01_500k_snp_filtered
#clumping the summary stats from large GWAS meta analysis to get indep SNPs based on pvalue overlapping our genotypes
$plink_dir/plink2 \
--bfile EUR_input_for_PRS/EUR_unrelated_hg38_auto_maf_0.01_500k_snp_filtered \
--clump-p1 1 \
--clump-r2 0.1 \
--clump-kb 250 \
--clump EUR_input_for_PRS/Meta-analysis_Locke_et_al+UKBiobank_2018_UPDATED.txt \
--clump-snp-field SNP \
--clump-field P \
--out EUR_input_for_PRS/EUR_clumped
#valid independent snps
awk 'NR!=1{print $3}' EUR_input_for_PRS/EUR_clumped.clumps > EUR_input_for_PRS/EUR.valid.snp
#SNP and P from large GWAS meta analysis
awk '{print $3,$9}' EUR_input_for_PRS/Meta-analysis_Locke_et_al+UKBiobank_2018_UPDATED.txt > EUR_input_for_PRS/Meta-analysis_Locke_et_al+UKBiobank_2018_UPDATED.SNP.pvalue
#create threshold of SNPs to create scores for different subsets of SNPs
echo "0.001 0 0.001" > EUR_input_for_PRS/range_list
echo "0.05 0 0.05" >> EUR_input_for_PRS/range_list
echo "0.1 0 0.1" >> EUR_input_for_PRS/range_list
echo "0.2 0 0.2" >> EUR_input_for_PRS/range_list
echo "0.3 0 0.3" >> EUR_input_for_PRS/range_list
echo "0.4 0 0.4" >> EUR_input_for_PRS/range_list
echo "0.5 0 0.5" >> EUR_input_for_PRS/range_list
#estimating the score for each range of P using the valid snps (for the score needs snp, allele and BETA)
$plink_dir/plink2 \
--bfile EUR_input_for_PRS/EUR_unrelated_hg38_auto_maf_0.01_500k_snp_filtered \
--score EUR_input_for_PRS/Meta-analysis_Locke_et_al+UKBiobank_2018_UPDATED.txt 3 4 7 header \
--q-score-range EUR_input_for_PRS/range_list EUR_input_for_PRS/Meta-analysis_Locke_et_al+UKBiobank_2018_UPDATED.SNP.pvalue \
--extract EUR_input_for_PRS/EUR.valid.snp \
--out EUR_input_for_PRS/EUR
########################################################################

View file

@ -0,0 +1,69 @@
####################################################
#Best PRS model and Visualization in R
####################################################
setwd("~/Desktop/handson_plink/PRS/")
p.threshold <- c(0.001,0.05,0.1,0.2,0.3,0.4,0.5)
# Read in the phenotype file
phenotype <- read.csv("EUR_input_for_PRS/EUR_BMI_phenofile.csv", header=T, check.names=F, comment.char="")
# Read in the PCs
#pcs <- read.table("EUR.eigenvec", header=F)
# The default output from plink does not include a header
# To make things simple, we will add the appropriate headers
# (1:6 because there are 6 PCs)
#colnames(pcs) <- c("FID", "IID", paste0("PC",1:6))
# Read in the covariates (here, it is sex)
covariate <- read.csv("EUR_input_for_PRS/EUR_covariates.csv", header=T,check.names=F, comment.char="")
# Now merge the files
pheno <- merge(phenotype, covariate, by=c("IID"))
#pheno <- merge(merge(phenotype, covariate, by=c("FID", "IID")), pcs, by=c("FID","IID"))
# We can then calculate the null model (model with PRS) using a linear regression
# (as height is quantitative)
null.model <- lm(BMI~., data=pheno[,-1])
# And the R2 of the null model is
null.r2 <- summary(null.model)$r.squared
prs.result <- NULL
for(i in p.threshold){
# Go through each p-value threshold
prs <- read.table(paste0("EUR_input_for_PRS/EUR.",i,".sscore"), header=T, check.names=F, comment.char="")
# Merge the prs with the phenotype matrix
# We only want the FID, IID and PRS from the PRS file, therefore we only select the
# relevant columns
pheno.prs <- merge(pheno, prs[,c("IID", "SCORE1_AVG")], by= "IID")
# Now perform a linear regression on Height with PRS and the covariates
# ignoring the FID and IID from our model
model <- lm(BMI~., data=pheno.prs[,-1])
# model R2 is obtained as
model.r2 <- summary(model)$r.squared
# R2 of PRS is simply calculated as the model R2 minus the null R2
prs.r2 <- model.r2-null.r2
# We can also obtain the coeffcient and p-value of association of PRS as follow
prs.coef <- summary(model)$coeff["SCORE1_AVG",]
prs.beta <- as.numeric(prs.coef[1])
prs.se <- as.numeric(prs.coef[2])
prs.p <- as.numeric(prs.coef[4])
# We can then store the results
prs.result <- rbind(prs.result, data.frame(Threshold=i, R2=prs.r2, P=prs.p, BETA=prs.beta,SE=prs.se))
}
# Best result is:
prs.result[which.max(prs.result$R2),]
write.csv(prs.result, file="PRS_results/EUR_BMI_sex_age_adjusted_PRS_fit.csv", quote=F, row.names=F)
best_p <- prs.result[which.max(prs.result$R2),1]
best_prs <- read.table(paste0("EUR_input_for_PRS/EUR.",best_p,".sscore"), header=T,check.names=F, comment.char="")
# Rename the sex
covariate$sex <- as.factor(covariate$sex)
levels(covariate$sex) <- c("Male", "Female")
# Merge the files
dat <- merge(merge(best_prs, phenotype, by="IID"), covariate, by="IID")
# Start plotting
pdf("PRS_results/EUR_BMI_sex_age_adjusted_PRS_fit.pdf")
plot(x=dat$SCORE1_AVG, y=dat$BMI, col="white",
xlab="Polygenic Score", ylab="BMI")
with(subset(dat, sex=="Male"), points(x=SCORE1_AVG, y=BMI, col="red", pch=19))
with(subset(dat, sex=="Female"), points(x=SCORE1_AVG, y=BMI, col="blue", pch=19))
dev.off()
####################################################