Add files via upload

This commit is contained in:
Ayushi Agrawal 2022-07-07 11:09:16 -07:00 committed by GitHub
parent f4a77a6943
commit 28c2b03b1d
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
10 changed files with 43197 additions and 0 deletions

View file

@ -0,0 +1,24 @@
>J01859.1 Escherichia coli 16S ribosomal RNA, complete sequence
AAATTGAAGAGTTTGATCATGGCTCAGATTGAACGCTGGCGGCAGGCCTAACACATGCAAGTCGAACGGT
AACAGGAAGAAGCTTGCTCTTTGCTGACGAGTGGCGGACGGGTGAGTAATGTCTGGGAAACTGCCTGATG
GAGGGGGATAACTACTGGAAACGGTAGCTAATACCGCATAACGTCGCAAGACCAAAGAGGGGGACCTTCG
GGCCTCTTGCCATCGGATGTGCCCAGATGGGATTAGCTAGTAGGTGGGGTAACGGCTCACCTAGGCGACG
ATCCCTAGCTGGTCTGAGAGGATGACCAGCCACACTGGAACTGAGACACGGTCCAGACTCCTACGGGAGG
CAGCAGTGGGGAATATTGCACAATGGGCGCAAGCCTGATGCAGCCATGCCGCGTGTATGAAGAAGGCCTT
CGGGTTGTAAAGTACTTTCAGCGGGGAGGAAGGGAGTAAAGTTAATACCTTTGCTCATTGACGTTACCCG
CAGAAGAAGCACCGGCTAACTCCGTGCCAGCAGCCGCGGTAATACGGAGGGTGCAAGCGTTAATCGGAAT
TACTGGGCGTAAAGCGCACGCAGGCGGTTTGTTAAGTCAGATGTGAAATCCCCGGGCTCAACCTGGGAAC
TGCATCTGATACTGGCAAGCTTGAGTCTCGTAGAGGGGGGTAGAATTCCAGGTGTAGCGGTGAAATGCGT
AGAGATCTGGAGGAATACCGGTGGCGAAGGCGGCCCCCTGGACGAAGACTGACGCTCAGGTGCGAAAGCG
TGGGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGTCGACTTGGAGGTTGTGCCC
TTGAGGCGTGGCTTCCGGAGCTAACGCGTTAAGTCGACCGCCTGGGGAGTACGGCCGCAAGGTTAAAACT
CAAATGAATTGACGGGGGCCCGCACAAGCGGTGGAGCATGTGGTTTAATTCGATGCAACGCGAAGAACCT
TACCTGGTCTTGACATCCACGGAAGTTTTCAGAGATGAGAATGTGCCTTCGGGAACCGTGAGACAGGTGC
TGCATGGCTGTCGTCAGCTCGTGTTGTGAAATGTTGGGTTAAGTCCCGCAACGAGCGCAACCCTTATCCT
TTGTTGCCAGCGGTCCGGCCGGGAACTCAAAGGAGACTGCCAGTGATAAACTGGAGGAAGGTGGGGATGA
CGTCAAGTCATCATGGCCCTTACGACCAGGGCTACACACGTGCTACAATGGCGCATACAAAGAGAAGCGA
CCTCGCGAGAGCAAGCGGACCTCATAAAGTGCGTCGTAGTCCGGATTGGAGTCTGCAACTCGACTCCATG
AAGTCGGAATCGCTAGTAATCGTGGATCAGAATGCCACGGTGAATACGTTCCCGGGCCTTGTACACACCG
CCCGTCACACCATGGGAGTGGGTTGCAAAAGAAGTAGGTAGCTTAACCTTCGGGAGGGCGCTTACCACTT
TGTGATTCATGACTGGGGTGAAGTCGTAACAAGGTAACCGTAGGGGAACCTGCGGTTGGATCACCTCCTT
A

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,151 @@
"Sepal.Length";"Sepal.Width";"Petal.Length";"Petal.Width";"Species"
"1";5.1;3.5;1.4;0.2;"setosa"
"2";4.9;3;1.4;0.2;"setosa"
"3";4.7;3.2;1.3;0.2;"setosa"
"4";4.6;3.1;1.5;0.2;"setosa"
"5";5;3.6;1.4;0.2;"setosa"
"6";5.4;3.9;1.7;0.4;"setosa"
"7";4.6;3.4;1.4;0.3;"setosa"
"8";5;3.4;1.5;0.2;"setosa"
"9";4.4;2.9;1.4;0.2;"setosa"
"10";4.9;3.1;1.5;0.1;"setosa"
"11";5.4;3.7;1.5;0.2;"setosa"
"12";4.8;3.4;1.6;0.2;"setosa"
"13";4.8;3;1.4;0.1;"setosa"
"14";4.3;3;1.1;0.1;"setosa"
"15";5.8;4;1.2;0.2;"setosa"
"16";5.7;4.4;1.5;0.4;"setosa"
"17";5.4;3.9;1.3;0.4;"setosa"
"18";5.1;3.5;1.4;0.3;"setosa"
"19";5.7;3.8;1.7;0.3;"setosa"
"20";5.1;3.8;1.5;0.3;"setosa"
"21";5.4;3.4;1.7;0.2;"setosa"
"22";5.1;3.7;1.5;0.4;"setosa"
"23";4.6;3.6;1;0.2;"setosa"
"24";5.1;3.3;1.7;0.5;"setosa"
"25";4.8;3.4;1.9;0.2;"setosa"
"26";5;3;1.6;0.2;"setosa"
"27";5;3.4;1.6;0.4;"setosa"
"28";5.2;3.5;1.5;0.2;"setosa"
"29";5.2;3.4;1.4;0.2;"setosa"
"30";4.7;3.2;1.6;0.2;"setosa"
"31";4.8;3.1;1.6;0.2;"setosa"
"32";5.4;3.4;1.5;0.4;"setosa"
"33";5.2;4.1;1.5;0.1;"setosa"
"34";5.5;4.2;1.4;0.2;"setosa"
"35";4.9;3.1;1.5;0.2;"setosa"
"36";5;3.2;1.2;0.2;"setosa"
"37";5.5;3.5;1.3;0.2;"setosa"
"38";4.9;3.6;1.4;0.1;"setosa"
"39";4.4;3;1.3;0.2;"setosa"
"40";5.1;3.4;1.5;0.2;"setosa"
"41";5;3.5;1.3;0.3;"setosa"
"42";4.5;2.3;1.3;0.3;"setosa"
"43";4.4;3.2;1.3;0.2;"setosa"
"44";5;3.5;1.6;0.6;"setosa"
"45";5.1;3.8;1.9;0.4;"setosa"
"46";4.8;3;1.4;0.3;"setosa"
"47";5.1;3.8;1.6;0.2;"setosa"
"48";4.6;3.2;1.4;0.2;"setosa"
"49";5.3;3.7;1.5;0.2;"setosa"
"50";5;3.3;1.4;0.2;"setosa"
"51";7;3.2;4.7;1.4;"versicolor"
"52";6.4;3.2;4.5;1.5;"versicolor"
"53";6.9;3.1;4.9;1.5;"versicolor"
"54";5.5;2.3;4;1.3;"versicolor"
"55";6.5;2.8;4.6;1.5;"versicolor"
"56";5.7;2.8;4.5;1.3;"versicolor"
"57";6.3;3.3;4.7;1.6;"versicolor"
"58";4.9;2.4;3.3;1;"versicolor"
"59";6.6;2.9;4.6;1.3;"versicolor"
"60";5.2;2.7;3.9;1.4;"versicolor"
"61";5;2;3.5;1;"versicolor"
"62";5.9;3;4.2;1.5;"versicolor"
"63";6;2.2;4;1;"versicolor"
"64";6.1;2.9;4.7;1.4;"versicolor"
"65";5.6;2.9;3.6;1.3;"versicolor"
"66";6.7;3.1;4.4;1.4;"versicolor"
"67";5.6;3;4.5;1.5;"versicolor"
"68";5.8;2.7;4.1;1;"versicolor"
"69";6.2;2.2;4.5;1.5;"versicolor"
"70";5.6;2.5;3.9;1.1;"versicolor"
"71";5.9;3.2;4.8;1.8;"versicolor"
"72";6.1;2.8;4;1.3;"versicolor"
"73";6.3;2.5;4.9;1.5;"versicolor"
"74";6.1;2.8;4.7;1.2;"versicolor"
"75";6.4;2.9;4.3;1.3;"versicolor"
"76";6.6;3;4.4;1.4;"versicolor"
"77";6.8;2.8;4.8;1.4;"versicolor"
"78";6.7;3;5;1.7;"versicolor"
"79";6;2.9;4.5;1.5;"versicolor"
"80";5.7;2.6;3.5;1;"versicolor"
"81";5.5;2.4;3.8;1.1;"versicolor"
"82";5.5;2.4;3.7;1;"versicolor"
"83";5.8;2.7;3.9;1.2;"versicolor"
"84";6;2.7;5.1;1.6;"versicolor"
"85";5.4;3;4.5;1.5;"versicolor"
"86";6;3.4;4.5;1.6;"versicolor"
"87";6.7;3.1;4.7;1.5;"versicolor"
"88";6.3;2.3;4.4;1.3;"versicolor"
"89";5.6;3;4.1;1.3;"versicolor"
"90";5.5;2.5;4;1.3;"versicolor"
"91";5.5;2.6;4.4;1.2;"versicolor"
"92";6.1;3;4.6;1.4;"versicolor"
"93";5.8;2.6;4;1.2;"versicolor"
"94";5;2.3;3.3;1;"versicolor"
"95";5.6;2.7;4.2;1.3;"versicolor"
"96";5.7;3;4.2;1.2;"versicolor"
"97";5.7;2.9;4.2;1.3;"versicolor"
"98";6.2;2.9;4.3;1.3;"versicolor"
"99";5.1;2.5;3;1.1;"versicolor"
"100";5.7;2.8;4.1;1.3;"versicolor"
"101";6.3;3.3;6;2.5;"virginica"
"102";5.8;2.7;5.1;1.9;"virginica"
"103";7.1;3;5.9;2.1;"virginica"
"104";6.3;2.9;5.6;1.8;"virginica"
"105";6.5;3;5.8;2.2;"virginica"
"106";7.6;3;6.6;2.1;"virginica"
"107";4.9;2.5;4.5;1.7;"virginica"
"108";7.3;2.9;6.3;1.8;"virginica"
"109";6.7;2.5;5.8;1.8;"virginica"
"110";7.2;3.6;6.1;2.5;"virginica"
"111";6.5;3.2;5.1;2;"virginica"
"112";6.4;2.7;5.3;1.9;"virginica"
"113";6.8;3;5.5;2.1;"virginica"
"114";5.7;2.5;5;2;"virginica"
"115";5.8;2.8;5.1;2.4;"virginica"
"116";6.4;3.2;5.3;2.3;"virginica"
"117";6.5;3;5.5;1.8;"virginica"
"118";7.7;3.8;6.7;2.2;"virginica"
"119";7.7;2.6;6.9;2.3;"virginica"
"120";6;2.2;5;1.5;"virginica"
"121";6.9;3.2;5.7;2.3;"virginica"
"122";5.6;2.8;4.9;2;"virginica"
"123";7.7;2.8;6.7;2;"virginica"
"124";6.3;2.7;4.9;1.8;"virginica"
"125";6.7;3.3;5.7;2.1;"virginica"
"126";7.2;3.2;6;1.8;"virginica"
"127";6.2;2.8;4.8;1.8;"virginica"
"128";6.1;3;4.9;1.8;"virginica"
"129";6.4;2.8;5.6;2.1;"virginica"
"130";7.2;3;5.8;1.6;"virginica"
"131";7.4;2.8;6.1;1.9;"virginica"
"132";7.9;3.8;6.4;2;"virginica"
"133";6.4;2.8;5.6;2.2;"virginica"
"134";6.3;2.8;5.1;1.5;"virginica"
"135";6.1;2.6;5.6;1.4;"virginica"
"136";7.7;3;6.1;2.3;"virginica"
"137";6.3;3.4;5.6;2.4;"virginica"
"138";6.4;3.1;5.5;1.8;"virginica"
"139";6;3;4.8;1.8;"virginica"
"140";6.9;3.1;5.4;2.1;"virginica"
"141";6.7;3.1;5.6;2.4;"virginica"
"142";6.9;3.1;5.1;2.3;"virginica"
"143";5.8;2.7;5.1;1.9;"virginica"
"144";6.8;3.2;5.9;2.3;"virginica"
"145";6.7;3.3;5.7;2.5;"virginica"
"146";6.7;3;5.2;2.3;"virginica"
"147";6.3;2.5;5;1.9;"virginica"
"148";6.5;3;5.2;2;"virginica"
"149";6.2;3.4;5.4;2.3;"virginica"
"150";5.9;3;5.1;1.8;"virginica"

View file

@ -0,0 +1,172 @@
#Load the gene-wise count table using the defaults of read.table.
dat <- read.table(file = "GSE60450_Lactation-GenewiseCounts.txt")
#Open the table in the data viewer to see how it was parsed.
View(dat)
#All data points appear to be there, but the layout could be better.
#Look up the arguments that read.table accepts.
help("read.table")
#read.table can be told which separator the file uses and that the
#first line is a header.
dat <- read.table(
  file = "GSE60450_Lactation-GenewiseCounts.txt",
  sep = "\t",
  header = TRUE
)
#Inspect the table again after re-reading it.
View(dat)
#Which observations does the table hold?
#colnames lists the names of the columns.
colnames(dat)
#dim reports the number of rows and columns.
dim(dat)
#head shows the first few rows.
head(dat)
#tail shows the last few rows.
tail(dat)
#summary prints basic statistics for each column.
summary(dat)
#Extract a single column from the table.
#Here: EntrezGeneID.
gene_ids <- dat[["EntrezGeneID"]]
#Find out which class the extracted column has.
class(gene_ids)
#Gene ids should be stored as strings, so convert the column to character.
dat[["EntrezGeneID"]] <- as.character(dat[["EntrezGeneID"]])
#Confirm the class of the gene id column after the conversion.
class(dat[["EntrezGeneID"]])
#Information about samples is in another file.
#The header line of targets.csv starts with an empty field, so read.table
#(check.names = TRUE by default) names the first column "X"; that column
#holds the sample names.
phenotype_info <- read.table("targets.csv",
                             header = TRUE,
                             sep = ",")
#The column named GEO in the table represents sample id on GEO.
#Status and CellType are grouping variables for statistical analysis,
#so both are stored as factors.
phenotype_info$CellType <- as.factor(phenotype_info$CellType)
phenotype_info$Status <- as.factor(phenotype_info$Status)
#Check class of CellType. It is a factor variable now.
class(phenotype_info$CellType)
#Currently Status and CellType repeat the same values across samples.
#Get the unique cell types.
celltypes <- unique(phenotype_info$CellType)
#There is no point in keeping the unique cell types as a factor.
#Convert celltypes to a character variable.
celltypes <- as.character(celltypes)
#Check whether a text is present in a character variable.
"cardiomyocyte" %in% celltypes
#Let us say we want the subset of data corresponding to B cells.
#which_B is a logical mask with one entry per row of phenotype_info.
which_B <- phenotype_info$CellType == "B"
phenotype_info_B <- phenotype_info[which_B, ]
#Class of which_B? Logical.
class(which_B)
#Alternative way to subset data.
phenotype_info_B <- subset(phenotype_info,
                           CellType == "B")
#Check mean counts for a sample.
mean(dat$MCL1.DG_BC2CTUACXX_ACTTGA_L002_R1)
#Check mean counts for all genes in the B cell samples.
#The first two columns of dat are skipped as non-sample columns
#(presumably EntrezGeneID and Length - confirm with colnames(dat)).
#The sample name is the part of each remaining column name before the
#first "_"; sub() extracts it directly and does not require every name to
#contain the same number of "_"-separated pieces.
clnames_dat <- sub("_.*$", "", colnames(dat)[-(1:2)])
#Positions, among the sample columns, of the samples listed as B cells.
which_B <- which(clnames_dat %in% phenotype_info_B$X)
#Shift by 2 to skip the two leading non-sample columns.
#drop = FALSE keeps a data frame even if only one sample matches.
cnts_B <- dat[, which_B + 2, drop = FALSE]
#Mean count of every gene across the B cell samples (first few shown).
mean_cnts_B <- rowMeans(cnts_B)
head(mean_cnts_B)
#Estimate median counts for casein protein.
is_casein <- dat$EntrezGeneID == "12992"
#Across all samples.
median(unlist(dat[is_casein, -(1:2)]))
#Across the B cell samples only.
median(unlist(dat[is_casein, which_B + 2]))
#Estimate standard deviation.
sd(unlist(dat[is_casein, -(1:2)]))
sd(unlist(dat[is_casein, which_B + 2]))
#Check histograms of data for two samples.
#According to targets.csv, MCL1.DG is a B cell sample (virgin) and
#MCL1.LC is an L cell sample (pregnant).
#Counts are plotted as log2(1 + count), so a count of 0 maps to 0 instead of -Inf.
#NOTE: the default titles and axis labels of these plots are derived from the
#expressions passed to the plotting functions, so changing an expression
#also changes the labels.
hist(log2(1 + dat$MCL1.DG_BC2CTUACXX_ACTTGA_L002_R1))
hist(log2(1 + dat$MCL1.LC_BC2CTUACXX_GCCAAT_L001_R1))
#These histograms are not easy to compare.
#Perhaps, we can fix the axis limits so that both plots share the same scale.
hist(log2(1 + dat$MCL1.DG_BC2CTUACXX_ACTTGA_L002_R1),
xlim = c(0, 20), ylim = c(0, 15000))
hist(log2(1 + dat$MCL1.LC_BC2CTUACXX_GCCAAT_L001_R1),
xlim = c(0, 20), ylim = c(0, 15000))
#Let us look at boxplots of the same two samples, using the same y-axis range for both.
boxplot(log2(1 + dat$MCL1.DG_BC2CTUACXX_ACTTGA_L002_R1),
ylim = c(0, 25))
boxplot(log2(1 + dat$MCL1.LC_BC2CTUACXX_GCCAAT_L001_R1),
ylim = c(0, 25))
#Scatter plot of one sample against the other; each point is one row of dat.
plot(x= log2(1 + dat$MCL1.DG_BC2CTUACXX_ACTTGA_L002_R1),
y= log2(1 + dat$MCL1.LC_BC2CTUACXX_GCCAAT_L001_R1))
#Are higher counts associated with longer genes?
#Can we color data points based on length?
#install.packages("ggplot2")
library(ggplot2)
#qplot() is deprecated as of ggplot2 3.4.0, so each plot is built with
#ggplot() plus an explicit geom.
#Only the first 1000 rows are plotted; head() is used instead of
#dat[1:1000, ] so that no all-NA rows appear if dat has fewer rows.
ggplot(head(dat, 1000),
       aes(x = log2(1 + MCL1.DG_BC2CTUACXX_ACTTGA_L002_R1),
           y = log2(1 + MCL1.LC_BC2CTUACXX_GCCAAT_L001_R1),
           color = log10(Length))) +
  geom_point()
#Color scale does not give clear insight.
#Can we use discrete color scale of points instead?
dat_subset <- head(dat, 1000)
#Label each gene as long or small relative to the median length in the subset.
is_long <- dat_subset$Length > median(dat_subset$Length)
dat_subset$genetype <- "smallGene"
dat_subset$genetype[is_long] <- "longGene"
ggplot(dat_subset,
       aes(x = log2(1 + MCL1.DG_BC2CTUACXX_ACTTGA_L002_R1),
           y = log2(1 + MCL1.LC_BC2CTUACXX_GCCAAT_L001_R1),
           color = genetype)) +
  geom_point()
#Check the boxplots for long and small genes simultaneously.
ggplot(dat_subset,
       aes(x = genetype,
           y = log2(1 + MCL1.LC_BC2CTUACXX_GCCAAT_L001_R1))) +
  geom_boxplot()
#Additional material.
#Hypothesis test.
#Do the counts of long genes differ significantly from those of small genes?
dat_long <- dat_subset[which(dat_subset$genetype == "longGene"), ]
dat_small <- dat_subset[which(dat_subset$genetype == "smallGene"), ]
test <- t.test(dat_small$MCL1.LC_BC2CTUACXX_GCCAAT_L001_R1,
               dat_long$MCL1.LC_BC2CTUACXX_GCCAAT_L001_R1)
test[["p.value"]]
#A conditional statement evaluates a condition and runs different steps
#depending on whether the condition holds.
if (test[["p.value"]] >= 0.05) {
  print("Counts don't depend on gene length.")
} else {
  print("Counts depend on gene length.")
}
#A loop repeats the same task on different data.
#Here: print the column summaries of dat_subset once per gene type.
for (gene_class in c("smallGene", "longGene")) {
  class_rows <- dat_subset[which(dat_subset$genetype == gene_class), ]
  print(gene_class)
  print(summary(class_rows))
}

View file

@ -0,0 +1,13 @@
,GEO,SRA,CellType,Status
MCL1.DG,GSM1480297,SRR1552450,B,virgin
MCL1.DH,GSM1480298,SRR1552451,B,virgin
MCL1.DI,GSM1480299,SRR1552452,B,pregnant
MCL1.DJ,GSM1480300,SRR1552453,B,pregnant
MCL1.DK,GSM1480301,SRR1552454,B,lactating
MCL1.DL,GSM1480302,SRR1552455,B,lactating
MCL1.LA,GSM1480291,SRR1552444,L,virgin
MCL1.LB,GSM1480292,SRR1552445,L,virgin
MCL1.LC,GSM1480293,SRR1552446,L,pregnant
MCL1.LD,GSM1480294,SRR1552447,L,pregnant
MCL1.LE,GSM1480295,SRR1552448,L,lactating
MCL1.LF,GSM1480296,SRR1552449,L,lactating
1 GEO SRA CellType Status
2 MCL1.DG GSM1480297 SRR1552450 B virgin
3 MCL1.DH GSM1480298 SRR1552451 B virgin
4 MCL1.DI GSM1480299 SRR1552452 B pregnant
5 MCL1.DJ GSM1480300 SRR1552453 B pregnant
6 MCL1.DK GSM1480301 SRR1552454 B lactating
7 MCL1.DL GSM1480302 SRR1552455 B lactating
8 MCL1.LA GSM1480291 SRR1552444 L virgin
9 MCL1.LB GSM1480292 SRR1552445 L virgin
10 MCL1.LC GSM1480293 SRR1552446 L pregnant
11 MCL1.LD GSM1480294 SRR1552447 L pregnant
12 MCL1.LE GSM1480295 SRR1552448 L lactating
13 MCL1.LF GSM1480296 SRR1552449 L lactating