Report for UU application.
Written by Dat Thanh Nguyen on
Prism highlighter is a very powerful tool. In this article, I am going to show you what you can actually do with it, along with some tips and tricks for editing your post. A table of contents (TOC) is also enabled, as you can see in the summary.
Summary
- Task 1: Identifying patients who have a tumor with a genetic variant matched to a specific cancer treatment
- Task 2: Counting the genetic variants found in each tumor
- Task 3: Identifying tumors that have a genetic variant in a known driver gene: ATRX
- Task 4: Integrating information
- Task 5: Plotting
- Exporting the results of Task 2 and Task 4 to txt files
# Working directory ----
# To reproduce these results, change the path below to the directory that
# contains the three dataset files.
setwd("/Users/datn/Documents/DATA2020/PhD_application/UU_bioinfor_test")

# Input data ----
# Read one delimited dataset file, keeping text columns as character
# vectors instead of factors.
read_dataset <- function(file_name) {
  read.delim(file_name, stringsAsFactors = FALSE)
}

data1 <- read_dataset("200417_Dataset_01.txt")
data2 <- read_dataset("200417_Dataset_02.txt")
data3 <- read_dataset("180413_Dataset_03.txt")
Task 1: Identifying patients who have a tumor with a genetic variant matched to a specific cancer treatment
# Rows of data2 whose gene also appears in data3 (the 'Precision Oncology
# Knowledge Base' table). `%in%` never yields NA, so the logical mask can be
# used for indexing directly.
has_oncokb_gene <- data2$Gene %in% data3$Gene
# New column "OncoKB": NA by default, "OncoKB positive" for the flagged rows.
data2$OncoKB <- NA
data2$OncoKB[has_oncokb_gene] <- "OncoKB positive"
# Count the flagged rows. A patient can contribute more than one row, so this
# number can exceed the number of distinct patients.
table(data2$OncoKB)
##
## OncoKB positive
## 42
# Sample ids of all rows flagged "OncoKB positive"; rows whose flag is NA are
# left out.
is_flagged <- data2$OncoKB %in% "OncoKB positive"
patient_OncoKB_positive <- data2$Sample_id[is_flagged]
# Show each patient with a corresponding treatment only once
unique(patient_OncoKB_positive)
## [1] "QCMG-66-ITNET_0813-SMGres-ASRL-20131114-064"
## [2] "PanNET93PT"
## [3] "PanNET31PT"
## [4] "QCMG-66-ITNET_0783-SMGres-ASRL-20131114-074"
## [5] "TCGA-3A-A9IO"
## [6] "QCMG-66-ITNET_1257-SMGRES-ASRL-20131106-004"
## [7] "QCMG-66-ITNET_0152-SMGres-ASRL-20131114-036"
## [8] "QCMG-66-ICGC_0501-ICGC-ABMB-20131120-014"
## [9] "INS13"
## [10] "QCMG-66-ITNET_1053-SMGres-ASRL-20131114-106"
## [11] "QCMG-66-ITNET_0020-SMGres-ASRL-20131114-024"
## [12] "QCMG-66-NE_0033-ICGC-MGLP-20131004-030"
## [13] "QCMG-66-ITNET_0107-SMGres-ASRL-20131114-008"
## [14] "QCMG-66-ITNET_1308-SMGRES-ASRL-20131106-030"
## [15] "QCMG-66-ICGC_0498-ICGC-ABMB-20131107-127"
## [16] "PanNET10PT"
## [17] "QCMG-66-ITNET_1320-SMGRES-ASRL-20131106-042"
## [18] "QCMG-66-ITNET_0052-SMGres-ASRL-20131114-002"
## [19] "MO_1529.SI_11749.SI_11750.exome.01"
## [20] "QCMG-66-ITNET_0938-SMGres-ASRL-20131114-082"
## [21] "INS3"
## [22] "Case3"
## [23] "TCGA-3A-A9IS"
## [24] "PanNET25PT"
## [25] "QCMG-66-ITNET_0124-SMGres-ASRL-20131114-016"
## [26] "TCGA-3A-A9IV"
## [27] "QCMG-66-ITNET_0128-SMGres-ASRL-20131114-004"
## [28] "QCMG-66-ITNET_0900-SMGres-ASRL-20131114-076"
## [29] "PanNET24PT"
## [30] "QCMG-66-ITNET_0833-SMGres-ASRL-20131114-066"
## [31] "QCMG-66-ICGC_0432-ICGC-ABMB-20120905-010"
## [32] "QCMG-66-NE_0012-ICGC-MGLP-20131004-006"
## [33] "Case2"
## [34] "QCMG-66-ITNET_1047-SMGres-ASRL-20131114-102"
## [35] "QCMG-66-NE_0026-ICGC-MGLP-20131004-020"
Task 2: Counting the genetic variants found in each tumor
# Build a patient-tumor key by joining the two id columns with "--".
data2$unique_tumorID <- paste(
  data2[["Sample_id"]],
  data2[["Sample_2_id"]],
  sep = "--"
)
# Result table for task 2: one row per distinct patient-tumor key.
tumor_ids <- unique(data2$unique_tumorID)
task2 <- data.frame(unique_tumorID = tumor_ids)
2.1 Number of genetic variants per tumor
# Number of data2 rows (variants) recorded for each tumor key.
variants_count <- table(data2$unique_tumorID)
# Align the counts with the rows of task2 and store them as a plain integer
# vector; indexing a table returns another 'table' object (class + dimnames),
# which is not what a data-frame column should hold.
row_index <- match(task2$unique_tumorID, names(variants_count))
task2$variants_count <- as.integer(variants_count[row_index])
2.2 Number of substitutions per tumor
# Rows of data2 that describe a substitution (`%in%` treats NA as no match).
is_substitution <- data2$Type %in% "substitution"
# Tabulate over a factor whose levels are ALL tumor keys of task2, in task2's
# row order. Tumors without any substitution therefore get a count of 0
# instead of being absent from the table (which previously produced NA).
substitution_count <- table(factor(
  data2$unique_tumorID[is_substitution],
  levels = as.character(task2$unique_tumorID)
))
# The table is already in task2 row order; store it as a plain integer vector.
task2$substitution_count <- as.integer(substitution_count)
2.3 Number of INDELs per tumor
# Rows of data2 that describe an insertion or a deletion (`%in%` treats NA as
# no match).
is_indel <- data2$Type %in% c("ins", "del")
# Tabulate over a factor whose levels are ALL tumor keys of task2, in task2's
# row order. Tumors without any INDEL therefore get a count of 0 instead of
# being absent from the table (which previously produced NA).
indel_count <- table(factor(
  data2$unique_tumorID[is_indel],
  levels = as.character(task2$unique_tumorID)
))
# The table is already in task2 row order; store it as a plain integer vector.
task2$indel_count <- as.integer(indel_count)
# Show the first 6 rows of the results
head(task2)
## unique_tumorID variants_count
## 1 NET-009--Diagnosis 36
## 2 QCMG-66-ICGC_0433-ICGC-ABMB-20120905-011-- 45
## 3 QCMG-66-ITNET_0813-SMGres-ASRL-20131114-064-- 37
## 4 PanNET93PT-- 15
## 5 PanNET31PT-- 21
## 6 QCMG-66-ITNET_0783-SMGres-ASRL-20131114-074-- 36
## substitution_count indel_count
## 1 22 14
## 2 42 3
## 3 32 5
## 4 14 1
## 5 19 2
## 6 33 3
Task 3: Identifying tumors that have a genetic variant in a known driver gene: ATRX
# Reuses data2$unique_tumorID generated in task 2.
# Keep the tumor key of every data2 row whose gene is ATRX.
is_atrx <- is.element(data2$Gene, "ATRX")
ATRX_driven <- data2$unique_tumorID[is_atrx]
# Print the unique_tumorID of the ATRX-driven tumors
ATRX_driven
## [1] "QCMG-66-ITNET_1257-SMGRES-ASRL-20131106-004--"
## [2] "QCMG-66-ICGC_0437-ICGC-ABMB-20120905-014--"
## [3] "PanNET3PT--"
## [4] "QCMG-66-ITNET_0052-SMGres-ASRL-20131114-002--"
## [5] "QCMG-66-ICGC_0498-ICGC-ABMB-20131107-127--"
## [6] "Case2--"
## [7] "TCGA-3A-A9IS--"
## [8] "NET-009--Diagnosis"
## [9] "NET-009--Second Biopsy"
## [10] "NET-008--Second Biopsy"
## [11] "QCMG-66-ITNET_0935-SMGres-ASRL-20131114-080--"
## [12] "QCMG-66-ITNET_0020-SMGres-ASRL-20131114-024--"
## [13] "QCMG-66-ITNET_0026-SMGres-ASRL-20131114-022--"
## [14] "QCMG-66-ITNET_0900-SMGres-ASRL-20131114-076--"
## [15] "QCMG-66-ITNET_1000-SMGres-ASRL-20131114-094--"
## [16] "QCMG-66-ICGC_0436-ICGC-ABMB-20120905-087--"
Task 4: Integrating information
# Make ids comparable across datasets: lower-case them and remove all
# whitespace. The datasets spell the same id differently (e.g. "Case 3" vs
# "Case3", "Second biopsy" vs "Second Biopsy"), and an exact comparison
# silently leaves those rows unmatched.
normalise_id <- function(id) {
  tolower(gsub("[[:space:]]+", "", as.character(id)))
}

# Generate unique_tumorID for dataset 1 (the original spelling is kept for
# display; normalisation is applied only while matching).
data1$unique_tumorID <- paste(data1$Patient_id, data1$Sample_id, sep = "--")

# Match task 1 results to data1: flag every row whose patient id appears in
# patient_OncoKB_positive.
data1$OncoKB <- NA
is_oncokb_patient <- normalise_id(data1$Patient_id) %in%
  normalise_id(patient_OncoKB_positive)
data1$OncoKB[is_oncokb_patient] <- "OncoKB positive"

# Match task 2 results to data1
## row of task2 for each row of data1 (NA when the tumor is not in task2;
## if several task2 keys normalise to the same value, the first one is used)
match_Index <- match(
  normalise_id(data1$unique_tumorID),
  normalise_id(task2$unique_tumorID)
)
## copy the count columns over
data1$variants_count <- task2$variants_count[match_Index]
data1$substitution_count <- task2$substitution_count[match_Index]
data1$indel_count <- task2$indel_count[match_Index]

# Show the first 10 rows of the results with the selected columns.
# NOTE(review): the printed table that follows in this report was produced
# with exact id matching and should be regenerated.
head(
  data1[, c("unique_tumorID", "OncoKB", "variants_count",
            "substitution_count", "indel_count")],
  10
)
## unique_tumorID OncoKB variants_count substitution_count indel_count
## 1 NET-001-- <NA> NA NA NA
## 2 NET-003--Diagnosis <NA> 39 28 11
## 3 NET-003--Second biopsy <NA> NA NA NA
## 4 NET-008-- <NA> NA NA NA
## 5 NET-009--Diagnosis <NA> 36 22 14
## 6 NET-009--Second biopsy <NA> NA NA NA
## 7 Case 1-- <NA> NA NA NA
## 8 Case 2-- <NA> NA NA NA
## 9 Case 3-- <NA> NA NA NA
## 10 Case 4-- <NA> NA NA NA
Task 5: Plotting
library(ggplot2)
# Add the dummy x-axis column to a plotting copy only, so that task2 itself
# (which is written to task2.txt later) keeps just the result columns.
task2_plot <- task2
task2_plot$boxplot <- "variant_count_per_patient"
p <- ggplot(task2_plot, aes(x = boxplot, y = variants_count))
# Box plot on a log2 y-axis with the individual tumors overlaid as jittered
# blue points; the box plot's own outlier markers are hidden because the
# jitter layer already draws every point.
p +
  geom_boxplot(outlier.shape = NA) +
  scale_y_continuous(trans = "log2") +
  theme_classic() +
  geom_jitter(shape = 16, position = position_jitter(0.2), color = "blue")
- Note: The number of variants is a count variable; therefore, I applied a log2 transformation to the y-axis to bring the distribution closer to normal before plotting.
Exporting the results of Task 2 and Task 4 to txt files
# Write both result tables to the working directory as tab-separated text,
# without row names. List names are the output file names.
exports <- list(task2.txt = task2, task4.txt = data1)
for (out_file in names(exports)) {
  write.table(
    exports[[out_file]],
    file = out_file,
    sep = "\t",
    row.names = FALSE
  )
}
Please see the exported files in your working directory
Comments