Report for UU application.

Written by Dat Thanh Nguyen on 09 Jun 2000

The Prism highlighter is a very powerful tool. In this article I am going to show you what you can actually do with it, along with some tips and tricks for editing your post. A table of contents is also enabled, as you can see in the summary.

Summary

Task 1: Identifying patients who have a tumor with a genetic variant that matches a specific cancer treatment
Task 2: Counting the genetic variants found in each tumor
Task 3: Identifying tumors that have a genetic variant in a known driver gene: ATRX
Task 4: Integrating information
Task 5: Plotting
Exporting the results of Task 2 and Task 4 to txt files

# Working directory ----
# To reproduce these results, point `data_dir` at the folder on your machine
# that contains the three dataset files read below.
data_dir <- "/Users/datn/Documents/DATA2020/PhD_application/UU_bioinfor_test"
setwd(data_dir)

# Load data ----
# Read the three delimited text files, keeping text columns as character
# vectors rather than factors.
data1 <- read.delim("200417_Dataset_01.txt", stringsAsFactors = FALSE)
data2 <- read.delim("200417_Dataset_02.txt", stringsAsFactors = FALSE)
data3 <- read.delim("180413_Dataset_03.txt", stringsAsFactors = FALSE)

Task 1: Identifying patients who have a tumor with a genetic variant that matches a specific cancer treatment

# Add an "OncoKB" column to dataset 2; every row starts out unflagged (NA).
data2$OncoKB <- NA

# A row is flagged "OncoKB positive" when its gene also appears in the Gene
# column of dataset 3 (the Precision Oncology Knowledge Base table).
gene_in_oncokb <- data2$Gene %in% data3$Gene
data2$OncoKB[gene_in_oncokb] <- "OncoKB positive"

# Count the flagged rows. This is a per-row tally, so a sample that carries
# several matching variants is counted more than once.
table(data2$OncoKB)

## 
## OncoKB positive 
##              42

# Sample identifiers of every row flagged "OncoKB positive"; rows whose flag
# is NA are left out. Repeats are kept here on purpose.
is_flagged <- data2$OncoKB %in% "OncoKB positive"
patient_OncoKB_positive <- data2$Sample_id[is_flagged]

# Show each flagged identifier once.
unique(patient_OncoKB_positive)

##  [1] "QCMG-66-ITNET_0813-SMGres-ASRL-20131114-064"
##  [2] "PanNET93PT"                                 
##  [3] "PanNET31PT"                                 
##  [4] "QCMG-66-ITNET_0783-SMGres-ASRL-20131114-074"
##  [5] "TCGA-3A-A9IO"                               
##  [6] "QCMG-66-ITNET_1257-SMGRES-ASRL-20131106-004"
##  [7] "QCMG-66-ITNET_0152-SMGres-ASRL-20131114-036"
##  [8] "QCMG-66-ICGC_0501-ICGC-ABMB-20131120-014"   
##  [9] "INS13"                                      
## [10] "QCMG-66-ITNET_1053-SMGres-ASRL-20131114-106"
## [11] "QCMG-66-ITNET_0020-SMGres-ASRL-20131114-024"
## [12] "QCMG-66-NE_0033-ICGC-MGLP-20131004-030"     
## [13] "QCMG-66-ITNET_0107-SMGres-ASRL-20131114-008"
## [14] "QCMG-66-ITNET_1308-SMGRES-ASRL-20131106-030"
## [15] "QCMG-66-ICGC_0498-ICGC-ABMB-20131107-127"   
## [16] "PanNET10PT"                                 
## [17] "QCMG-66-ITNET_1320-SMGRES-ASRL-20131106-042"
## [18] "QCMG-66-ITNET_0052-SMGres-ASRL-20131114-002"
## [19] "MO_1529.SI_11749.SI_11750.exome.01"         
## [20] "QCMG-66-ITNET_0938-SMGres-ASRL-20131114-082"
## [21] "INS3"                                       
## [22] "Case3"                                      
## [23] "TCGA-3A-A9IS"                               
## [24] "PanNET25PT"                                 
## [25] "QCMG-66-ITNET_0124-SMGres-ASRL-20131114-016"
## [26] "TCGA-3A-A9IV"                               
## [27] "QCMG-66-ITNET_0128-SMGres-ASRL-20131114-004"
## [28] "QCMG-66-ITNET_0900-SMGres-ASRL-20131114-076"
## [29] "PanNET24PT"                                 
## [30] "QCMG-66-ITNET_0833-SMGres-ASRL-20131114-066"
## [31] "QCMG-66-ICGC_0432-ICGC-ABMB-20120905-010"   
## [32] "QCMG-66-NE_0012-ICGC-MGLP-20131004-006"     
## [33] "Case2"                                      
## [34] "QCMG-66-ITNET_1047-SMGres-ASRL-20131114-102"
## [35] "QCMG-66-NE_0026-ICGC-MGLP-20131004-020"

Task 2: Counting the genetic variants found in each tumor

# Build a per-tumor key by joining the two sample identifier columns of
# dataset 2 with "--".
data2$unique_tumorID <- with(data2, paste(Sample_id, Sample_2_id, sep = "--"))

# Result table for task 2: one row per distinct tumor key, in order of first
# appearance in dataset 2.
task2 <- data.frame(unique_tumorID = unique(data2$unique_tumorID))

2.1 Number of genetic variants per each tumor

# Tally the rows of dataset 2 (one row per variant) for each tumor key.
variants_count <- table(data2$unique_tumorID)

# Look up each tumor's tally and store it as a plain integer vector.
# Indexing a table yields another table object, which is an awkward column
# type inside a data frame (plotting and arithmetic expect a plain vector).
task2$variants_count <- as.integer(
  variants_count[match(task2$unique_tumorID, names(variants_count))]
)

2.2 Number of substitutions per tumor

# Tally, per tumor key, the rows whose Type is "substitution".
# %in% treats a missing Type as "not a substitution" instead of yielding NA.
is_substitution <- data2$Type %in% "substitution"
substitution_count <- table(data2$unique_tumorID[is_substitution])

# Look up each tumor's tally as a plain integer vector. A tumor that is absent
# from the tally has no substitutions at all, so it is recorded as 0 rather
# than as a missing value.
n_substitutions <- as.integer(
  substitution_count[match(task2$unique_tumorID, names(substitution_count))]
)
n_substitutions[is.na(n_substitutions)] <- 0L
task2$substitution_count <- n_substitutions

2.3 Number of INDELs per tumor

# Tally, per tumor key, the rows whose Type is an insertion or a deletion.
# %in% treats a missing Type as "not an INDEL" instead of yielding NA.
is_indel <- data2$Type %in% c("ins", "del")
indel_count <- table(data2$unique_tumorID[is_indel])

# Look up each tumor's tally as a plain integer vector. A tumor that is absent
# from the tally has no INDELs at all, so it is recorded as 0 rather than as
# a missing value.
n_indels <- as.integer(
  indel_count[match(task2$unique_tumorID, names(indel_count))]
)
n_indels[is.na(n_indels)] <- 0L
task2$indel_count <- n_indels

# see the first 6 rows of the results
head(task2)

##                                  unique_tumorID variants_count
## 1                            NET-009--Diagnosis             36
## 2    QCMG-66-ICGC_0433-ICGC-ABMB-20120905-011--             45
## 3 QCMG-66-ITNET_0813-SMGres-ASRL-20131114-064--             37
## 4                                  PanNET93PT--             15
## 5                                  PanNET31PT--             21
## 6 QCMG-66-ITNET_0783-SMGres-ASRL-20131114-074--             36
##   substitution_count indel_count
## 1                 22          14
## 2                 42           3
## 3                 32           5
## 4                 14           1
## 5                 19           2
## 6                 33           3

Task 3: Identifying tumors that have a genetic variant in a known driver gene: ATRX

# Re-use data2$unique_tumorID generated in task 2.
# Collect the tumor keys of all rows whose gene is ATRX. unique() makes sure
# a tumor carrying more than one ATRX variant is reported only once.
ATRX_driven <- unique(data2$unique_tumorID[data2$Gene %in% "ATRX"])

# print unique_tumorID of the ATRX-driven tumors
ATRX_driven

##  [1] "QCMG-66-ITNET_1257-SMGRES-ASRL-20131106-004--"
##  [2] "QCMG-66-ICGC_0437-ICGC-ABMB-20120905-014--"   
##  [3] "PanNET3PT--"                                  
##  [4] "QCMG-66-ITNET_0052-SMGres-ASRL-20131114-002--"
##  [5] "QCMG-66-ICGC_0498-ICGC-ABMB-20131107-127--"   
##  [6] "Case2--"                                      
##  [7] "TCGA-3A-A9IS--"                               
##  [8] "NET-009--Diagnosis"                           
##  [9] "NET-009--Second Biopsy"                       
## [10] "NET-008--Second Biopsy"                       
## [11] "QCMG-66-ITNET_0935-SMGres-ASRL-20131114-080--"
## [12] "QCMG-66-ITNET_0020-SMGres-ASRL-20131114-024--"
## [13] "QCMG-66-ITNET_0026-SMGres-ASRL-20131114-022--"
## [14] "QCMG-66-ITNET_0900-SMGres-ASRL-20131114-076--"
## [15] "QCMG-66-ITNET_1000-SMGres-ASRL-20131114-094--"
## [16] "QCMG-66-ICGC_0436-ICGC-ABMB-20120905-087--"

Task 4: Integrating information

# Generating unique_tumorID for dataset 1 (same "--" format as in dataset 2)
data1$unique_tumorID <- paste(data1$Patient_id, data1$Sample_id, sep = "--")

# Identifiers are not spelled consistently across the datasets: the printed
# results show "Second biopsy" vs "Second Biopsy" and "Case 2" vs "Case2".
# Exact string matching silently misses those rows, so comparisons are made
# on a key with letter case and whitespace removed. The stored identifiers
# themselves are left exactly as they were read.
# NOTE(review): assumes identifiers that differ only in case or spacing refer
# to the same patient/tumor - confirm against the data providers.
normalise_id <- function(id) {
  tolower(gsub("[[:space:]]+", "", as.character(id)))
}

# Matching task 1 results to data1: flag patients found among the
# "OncoKB positive" sample identifiers.
data1$OncoKB <- NA
is_oncokb_patient <-
  normalise_id(data1$Patient_id) %in% normalise_id(patient_OncoKB_positive)
data1$OncoKB[is_oncokb_patient] <- "OncoKB positive"

# Matching task 2 results to data1

## row of task2 that corresponds to each row of data1 (NA when there is none)
match_Index <- match(
  normalise_id(data1$unique_tumorID),
  normalise_id(task2$unique_tumorID)
)

## copy the per-tumor counts across using that index
data1$variants_count <- task2$variants_count[match_Index]
data1$substitution_count <- task2$substitution_count[match_Index]
data1$indel_count <- task2$indel_count[match_Index]

# see the first 10 rows of the results with selected columns
# (head() shows fewer rows, not NA-filled rows, if data1 is shorter than 10)
result_columns <- c(
  "unique_tumorID", "OncoKB",
  "variants_count", "substitution_count", "indel_count"
)
head(data1[, result_columns], 10)

##            unique_tumorID OncoKB variants_count substitution_count indel_count
## 1               NET-001--   <NA>             NA                 NA          NA
## 2      NET-003--Diagnosis   <NA>             39                 28          11
## 3  NET-003--Second biopsy   <NA>             NA                 NA          NA
## 4               NET-008--   <NA>             NA                 NA          NA
## 5      NET-009--Diagnosis   <NA>             36                 22          14
## 6  NET-009--Second biopsy   <NA>             NA                 NA          NA
## 7                Case 1--   <NA>             NA                 NA          NA
## 8                Case 2--   <NA>             NA                 NA          NA
## 9                Case 3--   <NA>             NA                 NA          NA
## 10               Case 4--   <NA>             NA                 NA          NA

Task 5: Plotting

library(ggplot2)

# Add a constant column so that every tumor falls into one single box.
task2$boxplot <- "variant_count_per_patient"

p <- ggplot(task2, aes(x = boxplot, y = variants_count))

# Boxplot on a log2 y-axis with the individual tumors overlaid as points.
# The boxplot's own outlier markers are hidden because every observation is
# already drawn by the jitter layer.
# Points are spread horizontally only: height = 0 keeps each point at its
# true count (the default would also shift points vertically), and the seed
# makes the figure identical on every run.
p +
  geom_boxplot(outlier.shape = NA) +
  scale_y_continuous(trans = "log2") +
  theme_classic() +
  geom_jitter(
    shape = 16,
    position = position_jitter(width = 0.2, height = 0, seed = 1),
    color = "blue"
  )

Note: The number of variants is a count variable. Thus, I used a log2 transformation to bring its distribution closer to normal before plotting.

Exporting the results of Task 2 and Task 4 to txt files

# Write the task 2 and task 4 tables as tab-separated text files in the
# working directory, without row names.
# "boxplot" is a constant helper column added to task2 only for plotting; it
# is not a result, so it is left out of the exported table when present.
task2_export <- task2[, setdiff(names(task2), "boxplot"), drop = FALSE]
write.table(task2_export, file = "task2.txt", sep = "\t", row.names = FALSE)
write.table(data1, file = "task4.txt", sep = "\t", row.names = FALSE)

Please see the exported files in your working directory