Report for UU application.
Written by Dat Thanh Nguyen
- Task 1: Identifying patients that have a tumor with a genetic variant that is matched to specific cancer treatments
- Task 2: Counting the genetic variants found in each tumor
- Task 3: Identifying tumors that have a genetic variant in a known driver gene: ATRX
- Task 4: Integrating information
- Task 5: Plotting
- Exporting results of task2 and task 4 into txt files
# Input data
# The three dataset files are referenced by relative path, so run this script
# from the directory that contains them in order to reproduce the results.
# Strings are kept as character vectors (no factor conversion).
data1 <- read.delim(file = "200417_Dataset_01.txt", stringsAsFactors = FALSE)
data2 <- read.delim(file = "200417_Dataset_02.txt", stringsAsFactors = FALSE)
data3 <- read.delim(file = "180413_Dataset_03.txt", stringsAsFactors = FALSE)
Task 1: Identifying patients that have a tumor with a genetic variant that is matched to specific cancer treatments
# Flag each variant row of data2 whose gene is listed in the 'Precision
# Oncology Knowledge Base' table (data3) as "OncoKB positive"; all other rows
# stay NA.
# NOTE(review): the match is on gene symbol only, not on the specific
# alteration -- confirm this is the intended definition of a "matched" variant.
gene_in_oncokb <- data2$Gene %in% data3$Gene
data2$OncoKB <- NA
data2$OncoKB[gene_in_oncokb] <- "OncoKB positive"
# Number of flagged variant rows. This counts rows of data2, so a patient with
# several flagged rows is counted more than once.
table(data2$OncoKB)
## OncoKB positive
## 42
# Sample_id values of the variant rows flagged "OncoKB positive"; an ID is
# repeated when it has more than one flagged row. `%in%` treats the NA
# (unflagged) rows as non-matching.
patient_OncoKB_positive <- data2$Sample_id[data2$OncoKB %in% "OncoKB positive"]
# unique patients that have a corresponding treatment
## [1] "QCMG-66-ITNET_0813-SMGres-ASRL-20131114-064"
## [2] "PanNET93PT"
## [3] "PanNET31PT"
## [4] "QCMG-66-ITNET_0783-SMGres-ASRL-20131114-074"
## [5] "TCGA-3A-A9IO"
## [6] "QCMG-66-ITNET_1257-SMGRES-ASRL-20131106-004"
## [7] "QCMG-66-ITNET_0152-SMGres-ASRL-20131114-036"
## [8] "QCMG-66-ICGC_0501-ICGC-ABMB-20131120-014"
## [9] "INS13"
## [10] "QCMG-66-ITNET_1053-SMGres-ASRL-20131114-106"
## [11] "QCMG-66-ITNET_0020-SMGres-ASRL-20131114-024"
## [12] "QCMG-66-NE_0033-ICGC-MGLP-20131004-030"
## [13] "QCMG-66-ITNET_0107-SMGres-ASRL-20131114-008"
## [14] "QCMG-66-ITNET_1308-SMGRES-ASRL-20131106-030"
## [15] "QCMG-66-ICGC_0498-ICGC-ABMB-20131107-127"
## [16] "PanNET10PT"
## [17] "QCMG-66-ITNET_1320-SMGRES-ASRL-20131106-042"
## [18] "QCMG-66-ITNET_0052-SMGres-ASRL-20131114-002"
## [19] "MO_1529.SI_11749.SI_11750.exome.01"
## [20] "QCMG-66-ITNET_0938-SMGres-ASRL-20131114-082"
## [21] "INS3"
## [22] "Case3"
## [23] "TCGA-3A-A9IS"
## [24] "PanNET25PT"
## [25] "QCMG-66-ITNET_0124-SMGres-ASRL-20131114-016"
## [26] "TCGA-3A-A9IV"
## [27] "QCMG-66-ITNET_0128-SMGres-ASRL-20131114-004"
## [28] "QCMG-66-ITNET_0900-SMGres-ASRL-20131114-076"
## [29] "PanNET24PT"
## [30] "QCMG-66-ITNET_0833-SMGres-ASRL-20131114-066"
## [31] "QCMG-66-ICGC_0432-ICGC-ABMB-20120905-010"
## [32] "QCMG-66-NE_0012-ICGC-MGLP-20131004-006"
## [33] "Case2"
## [34] "QCMG-66-ITNET_1047-SMGres-ASRL-20131114-102"
## [35] "QCMG-66-NE_0026-ICGC-MGLP-20131004-020"
Task 2: Counting the genetic variants found in each tumor
# Build a per-tumor key of the form "<Sample_id>--<Sample_2_id>" on every
# variant row of data2.
data2$unique_tumorID <- paste0(data2$Sample_id, "--", data2$Sample_2_id)
# One row per distinct tumor key; the Task 2 counts are added to this frame.
distinct_tumor_ids <- unique(data2$unique_tumorID)
task2 <- data.frame(unique_tumorID = distinct_tumor_ids)
2.1 Number of genetic variants per tumor
# Total number of variant rows per tumor.
# The counts are looked up by tumor ID and stored as a plain integer vector:
# assigning the subsetted `table` object directly would leave a table-classed
# column (carrying dim/dimnames) inside task2.
variants_count <- table(data2$unique_tumorID)
tumor_ids <- as.character(task2$unique_tumorID)
task2$variants_count <- as.integer(
  variants_count[match(tumor_ids, names(variants_count))]
)
2.2 Number of substitutions per tumor
# Number of substitution variants per tumor.
# Counting over a factor whose levels are the task2 tumor IDs keeps tumors
# without any substitution in the result with a count of 0 (a plain
# table()/match() lookup returns NA for them) and yields the counts in task2
# row order.
tumor_ids <- as.character(task2$unique_tumorID)
is_substitution <- data2$Type %in% "substitution"
substitution_count <- table(
  factor(data2$unique_tumorID[is_substitution], levels = tumor_ids)
)
task2$substitution_count <- as.integer(substitution_count)
2.3 Number of INDELs per tumor
# Number of INDEL variants (Type "ins" or "del") per tumor.
# Counting over a factor whose levels are the task2 tumor IDs keeps tumors
# without any INDEL in the result with a count of 0 (a plain table()/match()
# lookup returns NA for them) and yields the counts in task2 row order.
tumor_ids <- as.character(task2$unique_tumorID)
is_indel <- data2$Type %in% c("ins", "del")
indel_count <- table(
  factor(data2$unique_tumorID[is_indel], levels = tumor_ids)
)
task2$indel_count <- as.integer(indel_count)
# see the first 6 rows of results
## unique_tumorID variants_count
## 1 NET-009--Diagnosis 36
## 2 QCMG-66-ICGC_0433-ICGC-ABMB-20120905-011-- 45
## 3 QCMG-66-ITNET_0813-SMGres-ASRL-20131114-064-- 37
## 4 PanNET93PT-- 15
## 5 PanNET31PT-- 21
## 6 QCMG-66-ITNET_0783-SMGres-ASRL-20131114-074-- 36
## substitution_count indel_count
## 1 22 14
## 2 42 3
## 3 32 5
## 4 14 1
## 5 19 2
## 6 33 3
Task 3: Identifying tumors that have a genetic variant in a known driver gene: ATRX
# Tumor keys (data2$unique_tumorID, built in Task 2) of every variant row whose
# gene is ATRX; a tumor is listed once per ATRX variant row.
is_atrx <- !is.na(data2$Gene) & data2$Gene == "ATRX"
ATRX_driven <- data2$unique_tumorID[is_atrx]
# unique_tumorID values of the ATRX-driven tumors
## [1] "QCMG-66-ITNET_1257-SMGRES-ASRL-20131106-004--"
## [2] "QCMG-66-ICGC_0437-ICGC-ABMB-20120905-014--"
## [3] "PanNET3PT--"
## [4] "QCMG-66-ITNET_0052-SMGres-ASRL-20131114-002--"
## [5] "QCMG-66-ICGC_0498-ICGC-ABMB-20131107-127--"
## [6] "Case2--"
## [7] "TCGA-3A-A9IS--"
## [8] "NET-009--Diagnosis"
## [9] "NET-009--Second Biopsy"
## [10] "NET-008--Second Biopsy"
## [11] "QCMG-66-ITNET_0935-SMGres-ASRL-20131114-080--"
## [12] "QCMG-66-ITNET_0020-SMGres-ASRL-20131114-024--"
## [13] "QCMG-66-ITNET_0026-SMGres-ASRL-20131114-022--"
## [14] "QCMG-66-ITNET_0900-SMGres-ASRL-20131114-076--"
## [15] "QCMG-66-ITNET_1000-SMGres-ASRL-20131114-094--"
## [16] "QCMG-66-ICGC_0436-ICGC-ABMB-20120905-087--"
Task 4: Integrating information
# Generate unique_tumorID for dataset 1, using the same
# "<patient>--<sample>" layout as the Task 2 key.
data1$unique_tumorID <- paste(data1$Patient_id, data1$Sample_id, sep = "--")

# The datasets spell the same identifiers differently (e.g. "Case 3" vs
# "Case3", "Second biopsy" vs "Second Biopsy"), so exact string matching
# silently leaves those rows unmatched. Compare on a normalized key instead:
# lower-case with all whitespace removed. The displayed/exported IDs are left
# as they are; only the comparison uses the normalized form.
normalize_id <- function(x) {
  tolower(gsub("[[:space:]]+", "", as.character(x)))
}

# Match Task 1 results to data1 (patient level)
data1$OncoKB <- NA
is_positive <- normalize_id(data1$Patient_id) %in%
  normalize_id(patient_OncoKB_positive)
data1$OncoKB[is_positive] <- "OncoKB positive"

# Match Task 2 results to data1 (tumor level)
## row of task2 for each row of data1; NA when the tumor has no entry in task2.
## If two task2 keys normalize to the same string, the first one is used.
match_Index <- match(
  normalize_id(data1$unique_tumorID),
  normalize_id(task2$unique_tumorID)
)
## copy the counts over by index
data1$variants_count <- as.integer(task2$variants_count[match_Index])
data1$substitution_count <- as.integer(task2$substitution_count[match_Index])
data1$indel_count <- as.integer(task2$indel_count[match_Index])

# Show the first 10 rows (fewer if data1 is shorter) with selected columns
head(
  data1[, c("unique_tumorID", "OncoKB", "variants_count",
            "substitution_count", "indel_count")],
  10
)
## unique_tumorID OncoKB variants_count substitution_count indel_count
## 1 NET-001-- <NA> NA NA NA
## 2 NET-003--Diagnosis <NA> 39 28 11
## 3 NET-003--Second biopsy <NA> NA NA NA
## 4 NET-008-- <NA> NA NA NA
## 5 NET-009--Diagnosis <NA> 36 22 14
## 6 NET-009--Second biopsy <NA> NA NA NA
## 7 Case 1-- <NA> NA NA NA
## 8 Case 2-- <NA> NA NA NA
## 9 Case 3-- <NA> NA NA NA
## 10 Case 4-- <NA> NA NA NA
Task 5: Plotting
# Box plot of the per-tumor variant counts on a log2 y-axis, with the
# individual tumors overlaid as jittered blue points.
# The single x-axis category lives in a plotting-only data frame, so task2 (which
# is exported later) does not pick up a dummy "boxplot" column.
plot_data <- data.frame(
  boxplot = rep("variant_count_per_patient", nrow(task2)),
  variants_count = as.numeric(task2$variants_count)
)
p <- ggplot(plot_data, aes(x = boxplot, y = variants_count))
# Box-plot outliers are hidden because every tumor is already drawn as a point.
# Jitter is horizontal only (height = 0) so each point stays at its true count.
p +
  geom_boxplot(outlier.shape = NA) +
  scale_y_continuous(trans = "log2") +
  theme_classic() +
  geom_jitter(
    shape = 16,
    position = position_jitter(width = 0.2, height = 0),
    color = "blue"
  )
- Note: The number of variants is a count variable, so I applied a log2 transform to the y-axis to bring the distribution closer to normal before plotting.
Exporting results of task2 and task 4 into txt files
# Write the Task 2 summary (task2) and the integrated Task 4 table (data1) as
# tab-separated text files, with a header row and without row names, into the
# working directory.
exports <- list(task2.txt = task2, task4.txt = data1)
for (out_file in names(exports)) {
  write.table(exports[[out_file]], file = out_file, sep = "\t", row.names = FALSE)
}
Please see the exported files in your working directory