diff --git a/CpG_annotations.R b/CpG_annotations.R index 0b0ffc579952c45fe8f6c192523e57b8dc390ec7..fad40006fda0aee7223b78110f87a7ecb3275628 100644 --- a/CpG_annotations.R +++ b/CpG_annotations.R @@ -1,12 +1,16 @@ library(data.table) library(ChIPpeakAnno) library(BSgenome.Ggallus.UCSC.galGal6) -library(TxDb.Ggallus.UCSC.galGal6.refGene) +library(GenomicFeatures) library(ggplot2) +library(Repitools) + +# genome annotation --------- +ens_galgal6 <- makeTxDbFromGFF("http://hgdownload.soe.ucsc.edu/goldenPath/galGal6/bigZips/genes/galGal6.ensGene.gtf.gz") # all CpG sites --------- # https://support.bioconductor.org/p/95239/ -chrs <- names(Ggallus) +chrs <- names(Ggallus) cgs <- lapply(chrs, function(x) start(matchPattern("CG", Ggallus[[x]]))) cgs <- do.call( c, @@ -21,7 +25,6 @@ cgs <- do.call( ) # CpG islands ---------- - cgi <- data.table::fread("http://hgdownload.soe.ucsc.edu/goldenPath/galGal6/database/cpgIslandExtUnmasked.txt.gz") cgi <- with(cgi, GRanges(V2, IRanges(V3, V4))) @@ -41,40 +44,51 @@ rrbs <- with(rrbs, GRanges(chr, IRanges(pos, pos +1))) length(rrbs) # 920670 length(intersect(rrbs, cgs)) # 809995, why are we loosing so much yet so few? -> probably non standards chr - # DMC ? --------- +# DMC ? --------- +dmc_c <- fread("/work2/genphyse/genepi/RRBS/ChickStress/AnalyseDiff/methdiff/run20200515/results/DMC.bed", skip = 1) +dmc_c <- with(dmc_c, GRanges(paste0("chr",V1), IRanges(V2, V3))) - # annotattion ----------- +dmc_s <- fread("/work2/genphyse/genepi/RRBS/ChickStress/AnalyseDiff/methdiff/run20200513/results/DMC.bed", skip = 1) +dmc_s <- with(dmc_s, GRanges(paste0("chr",V1), IRanges(V2, V3), direction = V4, difference = V5)) -regions <- list(cgs = cgs, cgis = cgis, rrbs = rrbs) +# annotattion ----------- +regions <- list(cgs = cgs, cgis = cgis, rrbs = rrbs, dmc_c = dmc_c, dmc_s = dmc_s) +nCpG <- vapply(regions, function(x) length(x), 0L) +nCpG + +# data aquisition and formating aCRs <- lapply(regions, function(x) assignChromosomeRegion( x, - nucleotideLevel = FALSE, - precedence = c("Promoters", "immediateDownstream", "fiveUTRs", "threeUTRs", "Exons", "Introns"), - TxDb = TxDb.Ggallus.UCSC.galGal6.refGene - )) + nucleotideLevel = FALSE, + precedence = c("Promoters", "immediateDownstream", "fiveUTRs", "threeUTRs", "Exons", "Introns"), + TxDb = ens_galgal6 +)) aCRs <- lapply(seq_along(aCRs), function(i) { - res <- as.data.table(aCRs[[i]]) - res[, "regions"] <- names(aCRs)[i] + res <- as.data.table(aCRs[[i]]$percentage, keep.rownames = TRUE) + colnames(res) <- c("regions", "percentage") + res[, "data"] <- names(aCRs)[i] res }) aCRs <- rbindlist(aCRs) -colnames(aCRs) <- c("regions", "percentage", "jaccard", "data") +# renaming and reordering a few things for plot aCRs$regions <- with(aCRs, dplyr::case_when( - regions == "Intergenic.Region" ~ "Intergenic", - regions == "fiveUTRs" ~ "5'UTR", - regions == "threeUTRs" ~ "3'UTR", - regions == "immediateDownstream" ~ "Downstream\n(≤1kb)", - TRUE ~ regions - )) - -col_order <- c("Intergenic", "Promoters", "5'UTR", "Exons", "Introns", "3'UTR", "Downstream\n(≤1kb)") -aCRs$data <- factor(aCRs$data, levels = c("cgs", "cgis", "rrbs")) - - + regions == "Intergenic.Region" ~ "Intergenic", + regions == "fiveUTRs" ~ "5'UTR", + regions == "threeUTRs" ~ "3'UTR", + regions == "immediateDownstream" ~ "Downstream\n(≤1kb)", + regions == "Promoters" ~ "Promoters\n(≤1kb)", + TRUE ~ regions +)) + +col_order <- c("Intergenic", "Promoters\n(≤1kb)", "5'UTR", "Exons", "Introns", "3'UTR", "Downstream\n(≤1kb)") +aCRs$data <- factor(aCRs$data, levels = c("cgs", "cgis", "rrbs", "dmc_c", "dmc_s")) +aCRs$isDMC <- ifelse(aCRs$data %in% c("dmc_c", "dmc_s"), TRUE, FALSE) + +# plot p1 <- ggplot(aCRs, aes(x = factor(regions, levels = col_order), y = percentage, fill = data)) + - geom_col(position = "dodge", color = "black") + + geom_col(aes(linetype = isDMC), position = "dodge", color = "black") + labs( title = "CpG localisations", x = "", @@ -83,10 +97,95 @@ p1 <- ggplot(aCRs, aes(x = factor(regions, levels = col_order), y = percentage, #coord_flip() + scale_fill_manual( name = "", - labels = c("All CpG", "CpG in CGI", "CpG in RRBS"), - values = viridisLite::viridis(3) + labels = c( + "All CpG\n(n=12M)", + "CpG in CGI\n(n=2.5M)", + "CpG in RRBS\n(n=0.9M)", + paste0("DMC per condition\n(n=", length(dmc_c), ")"), + paste0("DMC per sex\n(n=", length(dmc_s), ")") + ), + values = viridisLite::viridis(length(regions)) ) + theme_bw(base_size = 14) + - theme(legend.position = "top") + theme(legend.position = "top") + + guides(linetype = FALSE) + +ggsave("CpG_annotations.png", p1, width = 8) + + +# distance to closest TSS ------------ +transcripts <- as.data.table(toGRanges(ens_galgal6, feature="transcript")) + +# selection of one middle TSS amongst transcripts of the same gene +getMiddleLineFor <- function(gene) { + tempt <- dplyr::filter(transcripts, gene_id == gene) + if(tempt$strand[1] == "+") { + # si le gène est '+', le TSS est au 'start' + tempt <- dplyr::arrange(tempt, start) + } else if(tempt$strand[1] == "-") { + # si le gène est '-', le TSS est au 'end' + tempt <- dplyr::arrange(tempt, end) + } + # retourne la ligne du mileur + dplyr::slice(tempt, ceiling(nrow(tempt)/2)) +} + +# quite a long step :-/ +midTranscript <- lapply( + unique(transcripts$gene_id), + getMiddleLineFor +) +midTranscript <- rbindlist(midTranscript) +midTranscript <- with(midTranscript, GRanges(seqnames, IRanges(start, end), strand = strand, gene_id = gene_id)) + +cpg_arround_tss <- lapply( + seq_along(regions[1:3]), + function(i) { + fs <- featureScores( # utility function from Repitools + regions[[i]], + midTranscript, + up = 9875, + down = 9875, + freq = 250, + use.strand = FALSE, # reads must not be on same strand as feature + ) + fs <- tables(fs)[[1]] + fs <- fs * length(regions[[i]]) * 1000/250 # transform values in nCpG per kb + data.table( + data = names(regions)[i], + pos = as.numeric(colnames(fs)), + mean = apply(fs, 2, mean), + sem = apply(fs, 2, sd)/sqrt(nrow(fs)-1) + ) + } +) +cpg_arround_tss <- rbindlist(cpg_arround_tss) +cpg_arround_tss$data <- factor(cpg_arround_tss$data, levels = c("cgs", "cgis", "rrbs")) + +p2 <- ggplot(cpg_arround_tss, aes(x = pos, y = mean)) + + geom_ribbon(aes(ymin = mean - sem, ymax = mean + sem, fill = data), alpha = 0.5) + + geom_line(aes(colour = data)) + + scale_fill_manual( + name = "", + labels = c( + "All CpG\n(n=12M)", + "CpG in CGI\n(n=2.5M)", + "CpG in RRBS\n(n=0.9M)" + ), + values = viridisLite::viridis(3, begin = 0, end = 0.5) + ) + + scale_color_manual( + name = "", + labels = c( + "All CpG\n(n=12M)", + "CpG in CGI\n(n=2.5M)", + "CpG in RRBS\n(n=0.9M)" + ), + values = viridisLite::viridis(3, begin = 0, end = 0.5) + ) + + labs(title = "CpG distribution arround gene starts", x = "Distance to TSS", y = "Average number of CpG in a 1kb window") + + theme_bw(base_size = 14) + +ggsave("CpG_annotations_arround_TSS.png", p2, width = 8) + -ggsave("CpG_annotations.png", p1) diff --git a/CpG_annotations.png b/CpG_annotations.png index 1b8a55740b17cc81b09d6093dcc796fe382b0e41..6d20e2cbaf4b277d1cabef0973c9f3bae63a6163 100644 Binary files a/CpG_annotations.png and b/CpG_annotations.png differ diff --git a/CpG_annotations_arround_TSS.png b/CpG_annotations_arround_TSS.png new file mode 100644 index 0000000000000000000000000000000000000000..728d5d603ba9ad8f51d27fb02ab36294c9bfbcfc Binary files /dev/null and b/CpG_annotations_arround_TSS.png differ