SDS 358 Dr. Woodward
knitr::opts_chunk$set(echo = TRUE)
library(knitr)
library(ggplot2)
install.packages("tidyr")
## Installing package into '/stor/home/jhec819/R/x86_64-pc-linux-gnu-library/3.4'
## (as 'lib' is unspecified)
## Warning in install.packages("tidyr"): installation of package 'tidyr' had
## non-zero exit status
library(tidyr)
library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ tibble 2.1.3 ✔ dplyr 0.8.3
## ✔ readr 1.3.1 ✔ stringr 1.4.0
## ✔ purrr 0.3.2 ✔ forcats 0.4.0
## ── Conflicts ─────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(readr)
install.packages("dplyr")
## Installing package into '/stor/home/jhec819/R/x86_64-pc-linux-gnu-library/3.4'
## (as 'lib' is unspecified)
## Warning in install.packages("dplyr"): installation of package 'dplyr' had
## non-zero exit status
library(dplyr)
library(pheatmap)
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
## The following object is masked from 'package:purrr':
##
## transpose
Workflow Diagram
abundance_pre_6_7 <- read.csv("/stor/work/Hofmann/Shared/Undergraduate_Students/FRI_BigData_TimeSeries/John_Henry_Cruz/Zebrafinch/D6-7Pre_EVA68.fastq/abundance_pre_6_7.csv", header = T, sep = "\t")
#load in the dataframe
pre_6_7_ti_list <- abundance_pre_6_7$target_id[1:18610]
## gather all the transcript id's and putting them into a list
pre_6_7_ti_list[1435]
## [1] ENSTGUT00000000664.1
## 18610 Levels: ENSTGUT00000000001.1 ... ENSTGUT00000019483.1
#test to see if the line above actually worked
write.table(pre_6_7_ti_list, file = "pre_6_7_ti_list", sep = "\t",
row.names = FALSE)
#put list into txt file
pre_6_7_total_list <- read.csv("/stor/work/Hofmann/Shared/Undergraduate_Students/FRI_BigData_TimeSeries/John_Henry_Cruz/Zebrafinch/D6-7Pre_EVA68.fastq/pre_6_7_total_list.csv")
#load in the dataframe which is the product of Ensembl
pre_6_7_giti_df <- pre_6_7_total_list[, c("Transcript.stable.ID.version", "Gene.stable.ID")]
#pull two columns into a new dataframe
pre_6_7_giti_total <- left_join(abundance_pre_6_7, pre_6_7_giti_df, by = c("target_id"="Transcript.stable.ID.version"))
length(unique(pre_6_7_giti_total$Gene.stable.ID))
## [1] 17894
#see if there are repeating gene id's
#we see that there are repeating gene id's
head(pre_6_7_giti_total %>% filter(duplicated(pre_6_7_giti_total$Gene.stable.ID)))
## target_id length eff_length est_counts tpm
## 1 ENSTGUT00000017949.1 369 190 0.00000e+00 0.00000e+00
## 2 ENSTGUT00000017847.1 804 625 0.00000e+00 0.00000e+00
## 3 ENSTGUT00000017827.1 762 583 0.00000e+00 0.00000e+00
## 4 ENSTGUT00000017808.1 993 814 2.92971e-06 1.35037e-05
## 5 ENSTGUT00000017757.1 480 301 0.00000e+00 0.00000e+00
## 6 ENSTGUT00000017760.1 408 229 0.00000e+00 0.00000e+00
## Gene.stable.ID
## 1 ENSTGUG00000017270
## 2 ENSTGUG00000017167
## 3 ENSTGUG00000017153
## 4 ENSTGUG00000017136
## 5 ENSTGUG00000017088
## 6 ENSTGUG00000017087
#find out which gene id's are repeated
pre_6_7_w_means <- pre_6_7_giti_total %>% group_by(Gene.stable.ID) %>% summarise(mean_lenght = mean(length), mean_eff_lenght = mean(eff_length), mean_est_counts = mean(est_counts), mean_tpm = mean(tpm))
#make new column name
pre_6_7_w_means
## # A tibble: 17,894 x 5
## Gene.stable.ID mean_lenght mean_eff_lenght mean_est_counts mean_tpm
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 ENSTGUG00000000001 1434 1255 6 17.9
## 2 ENSTGUG00000000002 1525 1346 24 66.9
## 3 ENSTGUG00000000003 2630 2451 8 12.2
## 4 ENSTGUG00000000004 678 499 0 0
## 5 ENSTGUG00000000005 504 325 0 0
## 6 ENSTGUG00000000006 1863 1684 9 20.1
## 7 ENSTGUG00000000007 5148 4969 16 12.1
## 8 ENSTGUG00000000008 363 184 0 0
## 9 ENSTGUG00000000010 1318 1139 51 168.
## 10 ENSTGUG00000000011 504 325 1 11.5
## # … with 17,884 more rows
#view dataframe
abundance_post_6_7 <- read.csv("/stor/work/Hofmann/Shared/Undergraduate_Students/FRI_BigData_TimeSeries/John_Henry_Cruz/Zebrafinch/EVA-69_D6-7Post.fastq/abundance_post_6_7.csv", header = T, sep = "\t")
#load in the dataframe
post_6_7_ti_list <- abundance_post_6_7$target_id[1:18610]
## gather all the transcript id's and putting them into a list
post_6_7_ti_list[1435]
## [1] ENSTGUT00000000664.1
## 18610 Levels: ENSTGUT00000000001.1 ... ENSTGUT00000019483.1
#test to see if the line above actually worked
write.table(post_6_7_ti_list, file = "post_6_7_ti_list.txt", sep = "\t",
row.names = FALSE)
#put list into txt file
post_6_7_total_list <- read.csv("/stor/work/Hofmann/Shared/Undergraduate_Students/FRI_BigData_TimeSeries/John_Henry_Cruz/Zebrafinch/EVA-69_D6-7Post.fastq/post_6_7_total_list.csv")
#load in the dataframe which is the product of Ensembl
post_6_7_giti_df <- post_6_7_total_list[, c("Transcript.stable.ID.version", "Gene.stable.ID")]
#pull two columns into a new dataframe
post_6_7_giti_total <- left_join(abundance_post_6_7, post_6_7_giti_df, by = c("target_id"="Transcript.stable.ID.version"))
length(unique(post_6_7_giti_total$Gene.stable.ID))
## [1] 17894
#see if there are repeating gene id's
#we see that there are repeating gene id's
head(post_6_7_giti_total %>% filter(duplicated(post_6_7_giti_total$Gene.stable.ID)))
## target_id length eff_length est_counts tpm
## 1 ENSTGUT00000017949.1 369 190 1 26.084
## 2 ENSTGUT00000017847.1 804 625 0 0.000
## 3 ENSTGUT00000017827.1 762 583 0 0.000
## 4 ENSTGUT00000017808.1 993 814 0 0.000
## 5 ENSTGUT00000017757.1 480 301 0 0.000
## 6 ENSTGUT00000017760.1 408 229 0 0.000
## Gene.stable.ID
## 1 ENSTGUG00000017270
## 2 ENSTGUG00000017167
## 3 ENSTGUG00000017153
## 4 ENSTGUG00000017136
## 5 ENSTGUG00000017088
## 6 ENSTGUG00000017087
#find out which gene id's are repeated
post_6_7_w_means <- post_6_7_giti_total %>% group_by(Gene.stable.ID) %>% summarise(mean_lenght = mean(length), mean_eff_lenght = mean(eff_length), mean_est_counts = mean(est_counts), mean_tpm = mean(tpm))
#make new column name
post_6_7_w_means
## # A tibble: 17,894 x 5
## Gene.stable.ID mean_lenght mean_eff_lenght mean_est_counts mean_tpm
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 ENSTGUG00000000001 1434 1255 12 47.4
## 2 ENSTGUG00000000002 1525 1346 23 84.7
## 3 ENSTGUG00000000003 2630 2451 21 42.5
## 4 ENSTGUG00000000004 678 499 0 0
## 5 ENSTGUG00000000005 504 325 0 0
## 6 ENSTGUG00000000006 1863 1684 19 55.9
## 7 ENSTGUG00000000007 5148 4969 34 33.9
## 8 ENSTGUG00000000008 363 184 0 0
## 9 ENSTGUG00000000010 1318 1139 47 205.
## 10 ENSTGUG00000000011 504 325 2 30.5
## # … with 17,884 more rows
#view dataframe
abundance_post_8 <- read.csv("/stor/work/Hofmann/Shared/Undergraduate_Students/FRI_BigData_TimeSeries/John_Henry_Cruz/Zebrafinch/Post8_08PSU12E4_EVA72.fastq/abundance_post_8.csv", header = T, sep = "\t")
#load in the dataframe
post_8_ti_list <- abundance_post_8$target_id[1:18610]
## gather all the transcript id's and putting them into a list
post_8_ti_list[1435]
## [1] ENSTGUT00000000664.1
## 18610 Levels: ENSTGUT00000000001.1 ... ENSTGUT00000019483.1
#test to see if the line above actually worked
write.table(post_8_ti_list, file = "post_8_ti_list.txt", sep = "\t",
row.names = FALSE)
#put list into txt file
post_8_total_list <- read.csv("/stor/work/Hofmann/Shared/Undergraduate_Students/FRI_BigData_TimeSeries/John_Henry_Cruz/Zebrafinch/Post8_08PSU12E4_EVA72.fastq//post_8_total_list.csv")
#load in the dataframe which is the product of Ensembl
post_8_giti_df <- post_8_total_list[, c("Transcript.stable.ID.version", "Gene.stable.ID")]
#pull two columns into a new dataframe
post_8_giti_total <- left_join(abundance_post_8, post_8_giti_df, by = c("target_id"="Transcript.stable.ID.version"))
length(unique(post_8_giti_total$Gene.stable.ID))
## [1] 17894
#see if there are repeating gene id's
#we see that there are repeating gene id's
head(post_8_giti_total %>% filter(duplicated(post_8_total_list$Gene.stable.ID)))
## target_id length eff_length est_counts tpm
## 1 ENSTGUT00000017952.1 1632 1453.0000 0.00000 0.00000
## 2 ENSTGUT00000017841.1 1706 1527.0000 3.00000 6.88357
## 3 ENSTGUT00000017828.1 2658 2479.0000 3.00000 4.24010
## 4 ENSTGUT00000017801.1 903 724.0000 24.00000 116.14600
## 5 ENSTGUT00000017745.1 189 20.4424 0.00000 0.00000
## 6 ENSTGUT00000017751.1 477 298.0000 3.96057 46.56640
## Gene.stable.ID
## 1 ENSTGUG00000017269
## 2 ENSTGUG00000017160
## 3 ENSTGUG00000017145
## 4 ENSTGUG00000017126
## 5 ENSTGUG00000017077
## 6 ENSTGUG00000017075
#find out which gene id's are repeated
post_8_w_means <- post_8_giti_total %>% group_by(Gene.stable.ID) %>% summarise(mean_lenght = mean(length), mean_eff_lenght = mean(eff_length), mean_est_counts = mean(est_counts), mean_tpm = mean(tpm))
#make new column name
post_8_w_means
## # A tibble: 17,894 x 5
## Gene.stable.ID mean_lenght mean_eff_lenght mean_est_counts mean_tpm
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 ENSTGUG00000000001 1434 1255 9 25.1
## 2 ENSTGUG00000000002 1525 1346 29 75.5
## 3 ENSTGUG00000000003 2630 2451 24 34.3
## 4 ENSTGUG00000000004 678 499 0 0
## 5 ENSTGUG00000000005 504 325 0 0
## 6 ENSTGUG00000000006 1863 1684 12 25.0
## 7 ENSTGUG00000000007 5148 4969 27 19.0
## 8 ENSTGUG00000000008 363 184 0 0
## 9 ENSTGUG00000000010 1318 1139 47 145.
## 10 ENSTGUG00000000011 504 325 0 0
## # … with 17,884 more rows
#view dataframe
abundance_post_9 <- read.csv("/stor/work/Hofmann/Shared/Undergraduate_Students/FRI_BigData_TimeSeries/John_Henry_Cruz/Zebrafinch/Post9_08PSU12E2_EVA70.fastq/abundance_post_9.csv", header = T, sep = "\t")
#load in the dataframe
post_9_ti_list <- abundance_post_9$target_id[1:18610]
## gather all the transcript id's and putting them into a list
post_9_ti_list[1435]
## [1] ENSTGUT00000000664.1
## 18610 Levels: ENSTGUT00000000001.1 ... ENSTGUT00000019483.1
#test to see if the line above actually worked
write.table(post_9_ti_list, file = "post_9_ti_list.txt", sep = "\t",
row.names = FALSE)
#put list into txt file
post_9_total_list <- read.csv("/stor/work/Hofmann/Shared/Undergraduate_Students/FRI_BigData_TimeSeries/John_Henry_Cruz/Zebrafinch/Post9_08PSU12E2_EVA70.fastq/post_9_total_list.csv")
#load in the dataframe which is the product of Ensembl
post_9_giti_df <- post_9_total_list[, c("Transcript.stable.ID.version", "Gene.stable.ID")]
#pull two columns into a new dataframe
post_9_giti_total <- left_join(abundance_post_9, post_9_giti_df, by = c("target_id"="Transcript.stable.ID.version"))
length(unique(post_9_giti_total$Gene.stable.ID))
## [1] 17894
#see if there are repeating gene id's
#we see that there are repeating gene id's
head(post_9_giti_total %>% filter(duplicated(post_9_total_list$Gene.stable.ID)))
## target_id length eff_length est_counts tpm
## 1 ENSTGUT00000017952.1 1632 1453.0000 0.00000 0.00000
## 2 ENSTGUT00000017841.1 1706 1527.0000 2.00000 6.57853
## 3 ENSTGUT00000017828.1 2658 2479.0000 2.00000 4.05221
## 4 ENSTGUT00000017801.1 903 724.0000 6.00000 41.62470
## 5 ENSTGUT00000017745.1 189 20.4424 0.00000 0.00000
## 6 ENSTGUT00000017751.1 477 298.0000 6.96057 117.31800
## Gene.stable.ID
## 1 ENSTGUG00000017269
## 2 ENSTGUG00000017160
## 3 ENSTGUG00000017145
## 4 ENSTGUG00000017126
## 5 ENSTGUG00000017077
## 6 ENSTGUG00000017075
#find out which gene id's are repeated
post_9_w_means <- post_9_giti_total %>% group_by(Gene.stable.ID) %>% summarise(mean_lenght = mean(length), mean_eff_lenght = mean(eff_length), mean_est_counts = mean(est_counts), mean_tpm = mean(tpm))
#make new column name
post_9_w_means
## # A tibble: 17,894 x 5
## Gene.stable.ID mean_lenght mean_eff_lenght mean_est_counts mean_tpm
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 ENSTGUG00000000001 1434 1255 8 32.0
## 2 ENSTGUG00000000002 1525 1346 13 48.5
## 3 ENSTGUG00000000003 2630 2451 10 20.5
## 4 ENSTGUG00000000004 678 499 0 0
## 5 ENSTGUG00000000005 504 325 0 0
## 6 ENSTGUG00000000006 1863 1684 11 32.8
## 7 ENSTGUG00000000007 5148 4969 14 14.2
## 8 ENSTGUG00000000008 363 184 2 54.6
## 9 ENSTGUG00000000010 1318 1139 34 150.
## 10 ENSTGUG00000000011 504 325 0 0
## # … with 17,884 more rows
#view dataframe
abundance_pre_8 <- read.csv("/stor/work/Hofmann/Shared/Undergraduate_Students/FRI_BigData_TimeSeries/John_Henry_Cruz/Zebrafinch/Pre8_08CBS6E3_Eva79.fastq/abundance_pre_8.csv", header = T, sep = "\t")
#load in the dataframe
pre_8_ti_list <- abundance_pre_8$target_id[1:18610]
## gather all the transcript id's and putting them into a list
pre_8_ti_list[1435]
## [1] ENSTGUT00000000664.1
## 18610 Levels: ENSTGUT00000000001.1 ... ENSTGUT00000019483.1
#test to see if the line above actually worked
write.table(pre_8_ti_list, file = "pre_8_ti_list.txt", sep = "\t",
row.names = FALSE)
#put list into txt file
pre_8_total_list <- read.csv("/stor/work/Hofmann/Shared/Undergraduate_Students/FRI_BigData_TimeSeries/John_Henry_Cruz/Zebrafinch/Pre8_08CBS6E3_Eva79.fastq//pre_8_total_list.csv")
#load in the dataframe which is the product of Ensembl
pre_8_giti_df <- pre_8_total_list[, c("Transcript.stable.ID.version", "Gene.stable.ID")]
#pull two columns into a new dataframe
pre_8_giti_total <- left_join(abundance_pre_8, pre_8_giti_df, by = c("target_id"="Transcript.stable.ID.version"))
length(unique(pre_8_giti_total$Gene.stable.ID))
## [1] 17894
#see if there are repeating gene id's
#we see that there are repeating gene id's
head(pre_8_giti_total %>% filter(duplicated(pre_8_total_list$Gene.stable.ID)))
## target_id length eff_length est_counts tpm
## 1 ENSTGUT00000017952.1 1632 1453.0000 1.00000 3.12138
## 2 ENSTGUT00000017841.1 1706 1527.0000 3.00000 8.91036
## 3 ENSTGUT00000017828.1 2658 2479.0000 0.00000 0.00000
## 4 ENSTGUT00000017801.1 903 724.0000 16.00000 100.22900
## 5 ENSTGUT00000017745.1 189 20.4424 0.00000 0.00000
## 6 ENSTGUT00000017751.1 477 298.0000 4.74216 72.17270
## Gene.stable.ID
## 1 ENSTGUG00000017269
## 2 ENSTGUG00000017160
## 3 ENSTGUG00000017145
## 4 ENSTGUG00000017126
## 5 ENSTGUG00000017077
## 6 ENSTGUG00000017075
#find out which gene id's are repeated
pre_8_w_means <- pre_8_giti_total %>% group_by(Gene.stable.ID) %>% summarise(mean_lenght = mean(length), mean_eff_lenght = mean(eff_length), mean_est_counts = mean(est_counts), mean_tpm = mean(tpm))
#make new column name
pre_8_w_means
## # A tibble: 17,894 x 5
## Gene.stable.ID mean_lenght mean_eff_lenght mean_est_counts mean_tpm
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 ENSTGUG00000000001 1434 1255 9 32.5
## 2 ENSTGUG00000000002 1525 1346 17 57.3
## 3 ENSTGUG00000000003 2630 2451 7 13.0
## 4 ENSTGUG00000000004 678 499 1 9.09
## 5 ENSTGUG00000000005 504 325 0 0
## 6 ENSTGUG00000000006 1863 1684 12 32.3
## 7 ENSTGUG00000000007 5148 4969 24 21.9
## 8 ENSTGUG00000000008 363 184 0 0
## 9 ENSTGUG00000000010 1318 1139 34 135.
## 10 ENSTGUG00000000011 504 325 3 41.9
## # … with 17,884 more rows
#view dataframe
abundance_pre_9 <- read.csv("/stor/work/Hofmann/Shared/Undergraduate_Students/FRI_BigData_TimeSeries/John_Henry_Cruz/Zebrafinch/Pre9_08CBS6E2_EVA71.fastq/abundance_pre_9.csv", header = T, sep = "\t")
#load in the dataframe
pre_9_ti_list <- abundance_pre_9$target_id[1:18610]
## gather all the transcript id's and putting them into a list
pre_9_ti_list[1435]
## [1] ENSTGUT00000000664.1
## 18610 Levels: ENSTGUT00000000001.1 ... ENSTGUT00000019483.1
#test to see if the line above actually worked
write.table(pre_9_ti_list, file = "pre_9_ti_list.txt", sep = "\t",
row.names = FALSE)
#put list into txt file
pre_9_total_list <- read.csv("/stor/work/Hofmann/Shared/Undergraduate_Students/FRI_BigData_TimeSeries/John_Henry_Cruz/Zebrafinch/Pre9_08CBS6E2_EVA71.fastq/pre_9_total_list.csv")
#load in the dataframe which is the product of Ensembl
pre_9_giti_df <- pre_9_total_list[, c("Transcript.stable.ID.version", "Gene.stable.ID")]
#pull two columns into a new dataframe
pre_9_giti_total <- left_join(abundance_pre_9, pre_9_giti_df, by = c("target_id"="Transcript.stable.ID.version"))
length(unique(pre_9_giti_total$Gene.stable.ID))
## [1] 17894
#see if there are repeating gene id's
#we see that there are repeating gene id's
head(pre_9_giti_total %>% filter(duplicated(pre_9_total_list$Gene.stable.ID)))
## target_id length eff_length est_counts tpm
## 1 ENSTGUT00000017952.1 1632 1453.0000 0.0000 0.00000
## 2 ENSTGUT00000017841.1 1706 1527.0000 4.0000 9.06992
## 3 ENSTGUT00000017828.1 2658 2479.0000 1.0000 1.39671
## 4 ENSTGUT00000017801.1 903 724.0000 20.0000 95.64760
## 5 ENSTGUT00000017745.1 189 20.4424 0.0000 0.00000
## 6 ENSTGUT00000017751.1 477 298.0000 11.8514 137.70000
## Gene.stable.ID
## 1 ENSTGUG00000017269
## 2 ENSTGUG00000017160
## 3 ENSTGUG00000017145
## 4 ENSTGUG00000017126
## 5 ENSTGUG00000017077
## 6 ENSTGUG00000017075
#find out which gene id's are repeated
pre_9_w_means <- pre_9_giti_total %>% group_by(Gene.stable.ID) %>% summarise(mean_lenght = mean(length), mean_eff_lenght = mean(eff_length), mean_est_counts = mean(est_counts), mean_tpm = mean(tpm))
#make new column name
pre_9_w_means
## # A tibble: 17,894 x 5
## Gene.stable.ID mean_lenght mean_eff_lenght mean_est_counts mean_tpm
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 ENSTGUG00000000001 1434 1255 19 52.4
## 2 ENSTGUG00000000002 1525 1346 32 82.3
## 3 ENSTGUG00000000003 2630 2451 21 29.7
## 4 ENSTGUG00000000004 678 499 1 6.94
## 5 ENSTGUG00000000005 504 325 0 0
## 6 ENSTGUG00000000006 1863 1684 8 16.4
## 7 ENSTGUG00000000007 5148 4969 31 21.6
## 8 ENSTGUG00000000008 363 184 0 0
## 9 ENSTGUG00000000010 1318 1139 44 134.
## 10 ENSTGUG00000000011 504 325 5 53.3
## # … with 17,884 more rows
#view dataframe
post_6_7_tpm <- post_6_7_w_means %>% select(Gene.stable.ID, mean_tpm) %>% rename("post_6_7"=mean_tpm)
post_8_tpm <- post_8_w_means %>% select(Gene.stable.ID, mean_tpm) %>% rename("post_8"=mean_tpm)
post_9_tpm <- post_9_w_means %>% select(Gene.stable.ID, mean_tpm) %>% rename("post_9"=mean_tpm)
pre_6_7_tpm <- pre_6_7_w_means %>% select(Gene.stable.ID, mean_tpm) %>% rename("pre_6_7"=mean_tpm)
pre_8_tpm <- pre_8_w_means %>% select(Gene.stable.ID, mean_tpm) %>% rename("pre_8"=mean_tpm)
pre_9_tpm <- pre_9_w_means %>% select(Gene.stable.ID, mean_tpm) %>% rename("pre_9"=mean_tpm)
#for each sample, take the gene id's and the tpm values and make a new dataframe
#rename the mean_tpm column to the sample name
#use gene id's as a way to join the datasets together
test <- left_join(post_6_7_tpm, post_8_tpm, by = "Gene.stable.ID")
test <- left_join(test, pre_6_7_tpm, by = "Gene.stable.ID")
test <- left_join(test, pre_8_tpm, by = "Gene.stable.ID")
test <- left_join(test, pre_9_tpm, by = "Gene.stable.ID")
test <- left_join(test, post_9_tpm, by = "Gene.stable.ID")
tpm_df <- test
#put all the sample's tpm values in one dataframe with the gene id's
sum(is.na(tpm_df$post_6_7)) + sum(is.na(tpm_df$post_8)) + sum(is.na(tpm_df$post_9)) + sum(is.na(tpm_df$pre_6_7)) + sum(is.na(tpm_df$pre_8)) + sum(is.na(tpm_df$pre_9))
## [1] 0
#make sure that no tpm columns have na values to confirm joining was a success
head(tpm_df)
## # A tibble: 6 x 7
## Gene.stable.ID post_6_7 post_8 pre_6_7 pre_8 pre_9 post_9
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ENSTGUG00000000001 47.4 25.1 17.9 32.5 52.4 32.0
## 2 ENSTGUG00000000002 84.7 75.5 66.9 57.3 82.3 48.5
## 3 ENSTGUG00000000003 42.5 34.3 12.2 13.0 29.7 20.5
## 4 ENSTGUG00000000004 0 0 0 9.09 6.94 0
## 5 ENSTGUG00000000005 0 0 0 0 0 0
## 6 ENSTGUG00000000006 55.9 25.0 20.1 32.3 16.4 32.8
tpm_df_frequency <- tpm_df %>% mutate_at(vars(post_6_7:post_9), function(x) ifelse(x >= 2,1,0))
#make a dataframe for presence and absense where presence is 1 and absence is 0
tpm_df_w_cutoff <- tpm_df %>%
mutate(post_6_7_tpm = case_when(post_6_7 < 2 ~ 0, TRUE ~ post_6_7)) %>%
mutate(post_8_tpm = case_when(post_8 < 2 ~ 0, TRUE ~ post_8)) %>%
mutate(post_9_tpm = case_when(post_9 < 2 ~ 0, TRUE ~ post_9)) %>%
mutate(pre_6_7_tpm = case_when(pre_6_7 < 2 ~ 0, TRUE ~ pre_6_7)) %>%
mutate(pre_8_tpm = case_when(pre_8 < 2 ~ 0, TRUE ~ pre_8)) %>%
mutate(pre_9_tpm = case_when(pre_9 < 2 ~ 0, TRUE ~ pre_9)) %>% select(1,8:13)
#make a dataframe of the raw counts but filtering data that is less than 2 and changing those values to 0
frequency_list <- rowSums(tpm_df_frequency[2:7])
#make a list of the sums of the elements in each row of the presence absense dataframe to see how frequently we see that gene throughout the samples
frequency_df <- data.frame(frequency = matrix(unlist(frequency_list), nrow=17894, byrow=T),stringsAsFactors=FALSE)
#make a dataframe of the row sums of the frequency of the presence of the gene throughout the samples
tpm_df_frequency_sums <- cbind.data.frame(tpm_df_frequency, frequency_df)
#make a dataframe with the frequency of the genes across all samples and the dataframe of presence absence
tpm_df_frequency_sums_4_up <- tpm_df_frequency_sums[tpm_df_frequency_sums$frequency > 3,]
#make a new dataframe with only the genes that were seen at least 4 times
tpm_df_frequency_sums_5_up <- tpm_df_frequency_sums[tpm_df_frequency_sums$frequency > 4,]
#make a new dataframe with only the genes that were seen at least 5 times
frequency_4_up_list <- tpm_df_frequency_sums_4_up$Gene.stable.ID[1:9845]
#get list of the gene names that were seen at least 4 times
frequency_5_up_list <- tpm_df_frequency_sums_5_up$Gene.stable.ID[1:8782]
#get list of the gene names that were seen at least 5 times
tpm_df_filtered_4up <- left_join(tpm_df_frequency_sums_4_up, tpm_df_w_cutoff, by ="Gene.stable.ID")
#make a new dataframe of only the raw counts with the genes that were seen at least 4 times, making the filtered dataframe
tpm_df_filtered_4up <- tpm_df_filtered_4up %>% select(1,9:14)
#take only the columns with the tpm values and the gene names
tpm_df_filtered_5up <- left_join(tpm_df_frequency_sums_5_up, tpm_df_w_cutoff, by ="Gene.stable.ID")
#make a new dataframe of only the raw counts with the genes that were seen at least 4 times, making the filtered dataframe
tpm_df_filtered_5up <- tpm_df_filtered_5up %>% select(1, 9:14)
#take only the columns with the tpm values and the gene names
frequency_4_up_df <- as.data.frame(frequency_4_up_list)
frequency_5_up_df <- as.data.frame(frequency_5_up_list)
#make dataframe of the gene id's that were seen at a certain frequency
post_6_7_counts <- post_6_7_w_means %>% select(Gene.stable.ID, mean_est_counts) %>% rename("post_6_7"=mean_est_counts)
post_8_counts <- post_8_w_means %>% select(Gene.stable.ID, mean_est_counts) %>% rename("post_8"=mean_est_counts)
post_9_counts <- post_9_w_means %>% select(Gene.stable.ID, mean_est_counts) %>% rename('post_9'=mean_est_counts)
pre_6_7_counts <- pre_6_7_w_means %>% select(Gene.stable.ID, mean_est_counts) %>% rename('pre_6_7'=mean_est_counts)
pre_8_counts <- pre_8_w_means %>% select(Gene.stable.ID, mean_est_counts) %>% rename('pre_8'=mean_est_counts)
pre_9_counts <- pre_9_w_means %>% select(Gene.stable.ID, mean_est_counts) %>% rename('pre_9'=mean_est_counts)
#for each sample, take the gene id's and the tpm values and make a new dataframe
#rename the mean_tpm column to the sample name
#use gene id's as a way to join the datasets together
test <- left_join(post_6_7_counts, post_8_counts, by = "Gene.stable.ID")
test <- left_join(test, pre_6_7_counts, by = "Gene.stable.ID")
test <- left_join(test, pre_8_counts, by = "Gene.stable.ID")
test <- left_join(test, pre_9_counts, by = "Gene.stable.ID")
test <- left_join(test, post_9_counts, by = "Gene.stable.ID")
counts_df <- test
#put all the sample's tpm values in one dataframe with the gene id's
sum(is.na(counts_df$post_6_7)) + sum(is.na(counts_df$post_8)) + sum(is.na(counts_df$post_9)) + sum(is.na(counts_df$pre_6_7)) + sum(is.na(counts_df$pre_8)) + sum(is.na(counts_df$pre_9))
## [1] 0
#make sure that no tpm columns have na values to confirm joining was a success]
install.packages("UpSetR")
## Installing package into '/stor/home/jhec819/R/x86_64-pc-linux-gnu-library/3.4'
## (as 'lib' is unspecified)
library(UpSetR)
gene_freq_vis <- upset(tpm_df_frequency_sums, sets = c("pre_6_7", "pre_8", "pre_9", "post_6_7", "post_8", "post_9"), sets.x.label = "Intersections Per Sample", mainbar.y.label = 'Gene Intersections', order.by = 'degree', number.angles = -20)
gene_freq_vis
library(DESeq2)
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:dplyr':
##
## combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, cbind, colMeans,
## colnames, colSums, do.call, duplicated, eval, evalq, Filter,
## Find, get, grep, grepl, intersect, is.unsorted, lapply,
## lengths, Map, mapply, match, mget, order, paste, pmax,
## pmax.int, pmin, pmin.int, Position, rank, rbind, Reduce,
## rowMeans, rownames, rowSums, sapply, setdiff, sort, table,
## tapply, union, unique, unsplit, which, which.max, which.min
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:data.table':
##
## first, second
## The following objects are masked from 'package:dplyr':
##
## first, rename
## The following object is masked from 'package:tidyr':
##
## expand
## The following object is masked from 'package:base':
##
## expand.grid
## Loading required package: IRanges
##
## Attaching package: 'IRanges'
## The following object is masked from 'package:data.table':
##
## shift
## The following objects are masked from 'package:dplyr':
##
## collapse, desc, slice
## The following object is masked from 'package:purrr':
##
## reduce
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: DelayedArray
## Loading required package: matrixStats
##
## Attaching package: 'matrixStats'
## The following objects are masked from 'package:Biobase':
##
## anyMissing, rowMedians
## The following object is masked from 'package:dplyr':
##
## count
##
## Attaching package: 'DelayedArray'
## The following objects are masked from 'package:matrixStats':
##
## colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges
## The following object is masked from 'package:base':
##
## apply
row.names(counts_df) <- counts_df$Gene.stable.ID
## Warning: Setting row names on a tibble is deprecated.
row.names(counts_df) <- counts_df$Gene.stable.ID
## Warning: Setting row names on a tibble is deprecated.
counts_df$Gene.stable.ID <- NULL
(condition <- factor(c('post', 'post', 'pre', 'pre', 'pre', 'post')))
## [1] post post pre pre pre post
## Levels: post pre
#make conditions for the DESeq2
testst <- as.data.frame(counts_df)
testst$post_6_7 <- as.integer(testst$post_6_7)
testst$post_8 <- as.integer(testst$post_8)
testst$post_9 <- as.integer(testst$post_9)
testst$pre_6_7 <- as.integer(testst$pre_6_7)
testst$pre_8 <- as.integer(testst$pre_8)
testst$pre_9 <- as.integer(testst$pre_9)
counts_df_int <- testst
#make counts df where doubles are changed to integers
#change all the doubles to integers
#save the test dataframe to counts_df_int
gene_ids <- tpm_df$Gene.stable.ID
#save gene id's into a list
counts_df_int$gene_ids <- gene_ids
#add the gene names into the counts df
counts_df_int_4_up <- left_join(frequency_4_up_df, counts_df_int, by = c('frequency_4_up_list'="gene_ids"))
#make counts df with 4 and up genes
counts_df_int_5_up <- left_join(frequency_5_up_df, counts_df_int, by = c('frequency_5_up_list'="gene_ids"))
#make counts df with 5 and up genes
counts_df_int_4_up <- as.data.frame(counts_df_int_4_up)
#copy dataframe
counts_df_int_4_up <- counts_df_int_4_up %>% remove_rownames %>% column_to_rownames(var="frequency_4_up_list")
#make column elements in rownames
counts_df_int_5_up <- as.data.frame(counts_df_int_5_up)
#copy dataframe
counts_df_int_5_up <- counts_df_int_5_up %>% remove_rownames %>% column_to_rownames(var="frequency_5_up_list")
#make column elements in rownames
(coldata_4 <- data.frame(row.names=colnames(counts_df_int_4_up), condition))
## condition
## post_6_7 post
## post_8 post
## pre_6_7 pre
## pre_8 pre
## pre_9 pre
## post_9 post
coldata_4
## condition
## post_6_7 post
## post_8 post
## pre_6_7 pre
## pre_8 pre
## pre_9 pre
## post_9 post
#save the conditions of the samples so DESeq2 can tell what to compare
dds_4 <- DESeqDataSetFromMatrix(countData = counts_df_int_4_up,
colData = coldata_4,
design= ~ condition)
dds_4
## class: DESeqDataSet
## dim: 9845 6
## metadata(1): version
## assays(1): counts
## rownames(9845): ENSTGUG00000000001 ENSTGUG00000000002 ...
## ENSTGUG00000018764 ENSTGUG00000018767
## rowData names(0):
## colnames(6): post_6_7 post_8 ... pre_9 post_9
## colData names(1): condition
dds_4 <- DESeq(dds_4)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
rld_4 <- rlogTransformation(dds_4)
plotDispEsts(dds_4, main="Dispersion plot")
resultsNames(dds_4) # lists the coefficients
## [1] "Intercept" "condition_pre_vs_post"
res_4 <- results(dds_4, name="condition_pre_vs_post")
# or to shrink log fold changes association with condition:
#res <- lfcShrink(dds, coef="condition_pre_vs_post", type="apeglm")
res_4
## log2 fold change (MLE): condition pre vs post
## Wald test p-value: condition pre vs post
## DataFrame with 9845 rows and 6 columns
## baseMean log2FoldChange lfcSE stat
## <numeric> <numeric> <numeric> <numeric>
## ENSTGUG00000000001 10.02320 0.02182031 0.6106632 0.03573215
## ENSTGUG00000000002 21.73322 0.01950677 0.4267808 0.04570678
## ENSTGUG00000000003 14.33011 -0.79002711 0.5357132 -1.47472012
## ENSTGUG00000000006 11.93298 -0.67352452 0.5749746 -1.17139879
## ENSTGUG00000000007 23.53816 -0.23009868 0.4400195 -0.52292835
## ... ... ... ... ...
## ENSTGUG00000018754 278.039219 0.2941791 0.2399692 1.2259037
## ENSTGUG00000018759 61.020649 0.5113449 0.2991068 1.7095729
## ENSTGUG00000018763 93.978350 0.5138150 0.2622658 1.9591381
## ENSTGUG00000018764 130.652780 0.5466184 0.2590036 2.1104665
## ENSTGUG00000018767 8.416531 0.2383410 0.6304698 0.3780371
## pvalue padj
## <numeric> <numeric>
## ENSTGUG00000000001 0.9714959 0.9999854
## ENSTGUG00000000002 0.9635440 0.9999854
## ENSTGUG00000000003 0.1402878 0.9999854
## ENSTGUG00000000006 0.2414385 0.9999854
## ENSTGUG00000000007 0.6010241 0.9999854
## ... ... ...
## ENSTGUG00000018754 0.22023492 0.9999854
## ENSTGUG00000018759 0.08734487 0.9999854
## ENSTGUG00000018763 0.05009662 0.9999854
## ENSTGUG00000018764 0.03481819 0.9999854
## ENSTGUG00000018767 0.70540303 0.9999854
res_df_4 <- res_4 %>% data.frame
#save the DESeq2 results as a dataframe
res_df_4_sig <- res_df_4 %>% filter(pvalue < 0.05)
#make the results into a dataframe and filter for significant genes
plotMA(res_4, ylim=c(-5,5))
#plot adjusted p values
(coldata_5 <- data.frame(row.names=colnames(counts_df_int_5_up), condition))
## condition
## post_6_7 post
## post_8 post
## pre_6_7 pre
## pre_8 pre
## pre_9 pre
## post_9 post
coldata_5
## condition
## post_6_7 post
## post_8 post
## pre_6_7 pre
## pre_8 pre
## pre_9 pre
## post_9 post
#save the conditions of the samples so DESeq2 can tell what to compare
dds_5 <- DESeqDataSetFromMatrix(countData = counts_df_int_5_up,
colData = coldata_5,
design= ~ condition)
dds_5
## class: DESeqDataSet
## dim: 8782 6
## metadata(1): version
## assays(1): counts
## rownames(8782): ENSTGUG00000000001 ENSTGUG00000000002 ...
## ENSTGUG00000018764 ENSTGUG00000018767
## rowData names(0):
## colnames(6): post_6_7 post_8 ... pre_9 post_9
## colData names(1): condition
dds_5 <- DESeq(dds_5)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
rld_5 <- rlogTransformation(dds_5)
plotDispEsts(dds_5, main="Dispersion plot")
resultsNames(dds_5) # lists the coefficients
## [1] "Intercept" "condition_pre_vs_post"
res_5 <- results(dds_5, name="condition_pre_vs_post")
# or to shrink log fold changes association with condition:
#res <- lfcShrink(dds, coef="condition_pre_vs_post", type="apeglm")
res_5
## log2 fold change (MLE): condition pre vs post
## Wald test p-value: condition pre vs post
## DataFrame with 8782 rows and 6 columns
## baseMean log2FoldChange lfcSE stat
## <numeric> <numeric> <numeric> <numeric>
## ENSTGUG00000000001 10.02580 0.02292516 0.5945471 0.03855903
## ENSTGUG00000000002 21.73591 0.01879556 0.4199061 0.04476133
## ENSTGUG00000000003 14.33397 -0.78970935 0.5237999 -1.50765459
## ENSTGUG00000000006 11.93545 -0.67494182 0.5611877 -1.20270238
## ENSTGUG00000000007 23.54365 -0.23078186 0.4337104 -0.53211053
## ... ... ... ... ...
## ENSTGUG00000018754 278.051188 0.2931725 0.2480241 1.1820322
## ENSTGUG00000018759 61.025746 0.5107431 0.3008852 1.6974680
## ENSTGUG00000018763 93.988381 0.5130987 0.2666135 1.9245038
## ENSTGUG00000018764 130.667535 0.5457025 0.2649402 2.0597198
## ENSTGUG00000018767 8.417191 0.2374668 0.6131921 0.3872633
## pvalue padj
## <numeric> <numeric>
## ENSTGUG00000000001 0.9692420 0.9998844
## ENSTGUG00000000002 0.9642975 0.9998844
## ENSTGUG00000000003 0.1316430 0.9998844
## ENSTGUG00000000006 0.2290915 0.9998844
## ENSTGUG00000000007 0.5946494 0.9998844
## ... ... ...
## ENSTGUG00000018754 0.23719294 0.9998844
## ENSTGUG00000018759 0.08960822 0.9998844
## ENSTGUG00000018763 0.05429146 0.9998844
## ENSTGUG00000018764 0.03942533 0.9998844
## ENSTGUG00000018767 0.69856130 0.9998844
res_df_5 <- res_5 %>% data.frame
#save the results as a dataframe
res_df_5_sig <- res_df_5 %>% filter(pvalue < 0.05)
#make the results into a dataframe and filter for significant genes
plotMA(res_5, ylim=c(-5,5))
#plot adjusted p values
ggplot(res_df_4, aes(x=pvalue)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 13 rows containing non-finite values (stat_bin).
ggplot(res_df_5, aes(x=pvalue)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing non-finite values (stat_bin).
COPY_4 <- as.data.frame(res_df_4)
#make a copy of the dataframe to test with
setDT(COPY_4, keep.rownames = TRUE)[]
## rn baseMean log2FoldChange lfcSE stat
## 1: ENSTGUG00000000001 10.023196 0.02182031 0.6106632 0.03573215
## 2: ENSTGUG00000000002 21.733221 0.01950677 0.4267808 0.04570678
## 3: ENSTGUG00000000003 14.330108 -0.79002711 0.5357132 -1.47472012
## 4: ENSTGUG00000000006 11.932980 -0.67352452 0.5749746 -1.17139879
## 5: ENSTGUG00000000007 23.538161 -0.23009868 0.4400195 -0.52292835
## ---
## 9841: ENSTGUG00000018754 278.039219 0.29417911 0.2399692 1.22590369
## 9842: ENSTGUG00000018759 61.020649 0.51134494 0.2991068 1.70957295
## 9843: ENSTGUG00000018763 93.978350 0.51381498 0.2622658 1.95913807
## 9844: ENSTGUG00000018764 130.652780 0.54661839 0.2590036 2.11046652
## 9845: ENSTGUG00000018767 8.416531 0.23834098 0.6304698 0.37803710
## pvalue padj
## 1: 0.97149593 0.9999854
## 2: 0.96354396 0.9999854
## 3: 0.14028780 0.9999854
## 4: 0.24143852 0.9999854
## 5: 0.60102412 0.9999854
## ---
## 9841: 0.22023492 0.9999854
## 9842: 0.08734487 0.9999854
## 9843: 0.05009662 0.9999854
## 9844: 0.03481819 0.9999854
## 9845: 0.70540303 0.9999854
#make a column of the rowname numbers into their own column
COPY_4_gn_sig <- COPY_4 %>% filter(pvalue < 0.05)
#find the significant genes in the results dataframe using a cutoff on the p values
heatmap_4 <- left_join(COPY_4_gn_sig, tpm_df, by = c('rn' = 'Gene.stable.ID'))
## Warning: Column `rn`/`Gene.stable.ID` joining character vector and factor,
## coercing into character vector
#set up dataframe with info needed for my heatmap
heatmap_4 <- heatmap_4 %>% select(1,8:13)
#select the only columns that have the samples and theiir corresponding tpm values
heatmap_4 <- heatmap_4 %>% remove_rownames %>% column_to_rownames(var="rn")
#make the gene id columns into the row names
heatmap_4 <- heatmap_4[-c(12), ]
#remove a specific gene that has very high expression
pheatmap(heatmap_4, show_rownames = F)
## Gene 4372
COPY_5 <- as.data.frame(res_df_5)
#make a copy of the dataframe to test with
setDT(COPY_5, keep.rownames = TRUE)[]
## rn baseMean log2FoldChange lfcSE stat
## 1: ENSTGUG00000000001 10.025803 0.02292516 0.5945471 0.03855903
## 2: ENSTGUG00000000002 21.735912 0.01879556 0.4199061 0.04476133
## 3: ENSTGUG00000000003 14.333973 -0.78970935 0.5237999 -1.50765459
## 4: ENSTGUG00000000006 11.935452 -0.67494182 0.5611877 -1.20270238
## 5: ENSTGUG00000000007 23.543651 -0.23078186 0.4337104 -0.53211053
## ---
## 8778: ENSTGUG00000018754 278.051188 0.29317246 0.2480241 1.18203216
## 8779: ENSTGUG00000018759 61.025746 0.51074306 0.3008852 1.69746796
## 8780: ENSTGUG00000018763 93.988381 0.51309871 0.2666135 1.92450383
## 8781: ENSTGUG00000018764 130.667535 0.54570249 0.2649402 2.05971981
## 8782: ENSTGUG00000018767 8.417191 0.23746678 0.6131921 0.38726330
## pvalue padj
## 1: 0.96924196 0.9998844
## 2: 0.96429755 0.9998844
## 3: 0.13164295 0.9998844
## 4: 0.22909151 0.9998844
## 5: 0.59464944 0.9998844
## ---
## 8778: 0.23719294 0.9998844
## 8779: 0.08960822 0.9998844
## 8780: 0.05429146 0.9998844
## 8781: 0.03942533 0.9998844
## 8782: 0.69856130 0.9998844
#make a column of the rowname numbers into their own column
COPY_5_gn_sig <- COPY_5 %>% filter(pvalue < 0.05)
#find the significant genes in the results dataframe using a cutoff on the p values
heatmap_5 <- left_join(COPY_5_gn_sig, tpm_df, by = c('rn' = 'Gene.stable.ID'))
## Warning: Column `rn`/`Gene.stable.ID` joining character vector and factor,
## coercing into character vector
#set up dataframe with info needed for my heatmap
heatmap_5 <- heatmap_5 %>% select(1,8:13)
#select the only columns that have the samples and theiir corresponding tpm values
heatmap_5 <- heatmap_5 %>% remove_rownames %>% column_to_rownames(var="rn")
#make the gene id columns into the row names
pheatmap(heatmap_5, show_rownames = F)