# Interactive sanity checks: print dimensions, lengths and contents of several
# objects. None of these objects is created in the visible part of this file,
# so they are presumably left over from earlier in the session (TODO confirm).
dim(discoveryData_ISB58)
length(ISB355_id)
length(ISB58_id)
dim(discoveryData_ISB355)
library(tidyverse)
dim(GSE36809_data)
# Auto-print the gene lists and the sample-info table for a visual check.
ISB63_genes
ISB19_genes
tGenes63
GSE36809_sampleinfo
# View() opens the data viewer; NOTE(review): only useful in an interactive session.
View(GSE36809_sampleinfo)
ISB63_genes
# Cohort codes, in processing order. Presumably TCGA tumor-type abbreviations,
# some of them merged cohorts such as COADREAD or LGGGBM (TODO confirm).
tumorType <- c(
  "ACC", "BLCA", "BRCA", "CESC", "COADREAD", "ESCC", "GEA",
  "HNSC", "KIRCKICH", "KIRP", "LGGGBM", "LIHCCHOL", "LUAD", "LUSC",
  "MESO", "OV", "PAAD", "PCPG", "PRAD", "SARC", "SKCM",
  "TGCT", "THCA", "THYM", "UCEC", "UVM"
)
# Cohort codes, in processing order. Presumably TCGA tumor-type abbreviations,
# some of them merged cohorts such as COADREAD or LGGGBM (TODO confirm).
tumorType <- c(
  "ACC", "BLCA", "BRCA", "CESC", "COADREAD", "ESCC", "GEA",
  "HNSC", "KIRCKICH", "KIRP", "LGGGBM", "LIHCCHOL", "LUAD", "LUSC",
  "MESO", "OV", "PAAD", "PCPG", "PRAD", "SARC", "SKCM",
  "TGCT", "THCA", "THYM", "UCEC", "UVM"
)
# Load the sample-prediction table of the first tumor type.
# `sep` and `header` are read.table() options. Passing them to paste0() pasted
# their values onto the file name ("...tsv\tTRUE"), so the read could not
# succeed.
ii <- 1
name <- paste0(tumorType[ii], "_cloudforest_sample_predictions.tsv")
# The file is looked up relative to the current working directory.
d <- read.table(name, sep = "\t", header = TRUE)
name
#==
# Switch to the folder holding the per-tumor-type sample-prediction tables.
# Every relative path used afterwards (the "*_cloudforest_sample_predictions.tsv"
# inputs and the "../../" plot outputs) is resolved against this directory.
# NOTE(review): hard-coded absolute path on drive E: -- the script only runs
# unchanged on a machine with this exact layout.
setwd("E:/GDAN/CCG_TMP_AWG/Output/v12_new/sample_predictions")
# Cohort codes, in processing order. Presumably TCGA tumor-type abbreviations,
# some of them merged cohorts such as COADREAD or LGGGBM (TODO confirm).
tumorType <- c(
  "ACC", "BLCA", "BRCA", "CESC", "COADREAD", "ESCC", "GEA",
  "HNSC", "KIRCKICH", "KIRP", "LGGGBM", "LIHCCHOL", "LUAD", "LUSC",
  "MESO", "OV", "PAAD", "PCPG", "PRAD", "SARC", "SKCM",
  "TGCT", "THCA", "THYM", "UCEC", "UVM"
)
# Load the sample-prediction table of the first tumor type and summarise it.
# The earlier attempt handed `sep`/`header` to paste0(), which appended their
# values to the file name and made read.table() fail. Those options belong to
# read.table().
ii <- 1
name <- paste0(tumorType[ii], "_cloudforest_sample_predictions.tsv")
name
d <- read.table(name, sep = "\t", header = TRUE)
# Total column count; the prediction columns run from column 6 to nC.
nC <- ncol(d)
# Number of distinct samples in the table.
length(unique(d$Sample_ID))
# Number of distinct samples per label.
table(unique(data.frame(d$Sample_ID, d$Label))$d.Label)
library(tidyverse)
# Fraction of correct calls per Repeat x Test x Label for every prediction
# column (columns 6..nC of `d`), spread to one column per prediction column.
dsub <- d %>%
  gather(Feature, PredLabel, 6:nC) %>%
  mutate(Pred = as.numeric(Label == PredLabel)) %>%
  group_by(Repeat, Test, Label, Feature) %>%
  summarise(sPred = sum(Pred) / n()) %>%
  spread(Feature, sPred)

# Rows with Test == 0, tagged "Train": for each Repeat and feature column,
# the mean of the per-label fractions (every label weighs the same).
accuracy_train <- dsub %>%
  filter(Test == 0) %>%
  gather(Feature, sPred, 4:(nC - 2)) %>%
  group_by(Repeat, Feature) %>%
  summarise(accuracy = mean(sPred)) %>%
  mutate(Type = "Train")

# Same summary for the rows with Test == 1, tagged "Test".
accuracy_test <- dsub %>%
  filter(Test == 1) %>%
  gather(Feature, sPred, 4:(nC - 2)) %>%
  group_by(Repeat, Feature) %>%
  summarise(accuracy = mean(sPred)) %>%
  mutate(Type = "Test")

# Stack both splits for plotting.
df <- rbind(accuracy_train, accuracy_test)

# Cut the last 11 characters off every feature name.
# NOTE(review): presumably a fixed-width suffix on the exported column names;
# confirm against the .tsv header.
df$Feature <- gsub(".{11}$", "", df$Feature)
# X-axis order for the accuracy plots: each feature family (All, CNVR, GEXP,
# METH, MIR, MUTA) crossed with each subset size, largest subset first.
# scale_x_discrete(limits = ...) drops every value that is not listed, so the
# CF.MIR_* sets are included here, as they are in the later plotting loops.
featureTypeOrder <- paste0(
  "CF.",
  rep(c("All", "CNVR", "GEXP", "METH", "MIR", "MUTA"), each = 7),
  "_",
  # Recycled once per family.
  c("All", "Top.1000", "Top.100", "Top.50", "Top.10", "Top.5", "Top.1")
)
library(ggplot2)
# Train-vs-test accuracy box plot for the current tumor type, one pair of
# boxes per feature set in the order given by `featureTypeOrder`.
p <- ggplot(df, aes(x = Feature, y = accuracy, fill = Type)) +
  geom_boxplot(lwd = 0.01, outlier.alpha = 0.1) +
  scale_x_discrete(limits = featureTypeOrder) +
  ggtitle("Accuracy")

# Keep the themed plot in a variable so ggsave() receives it explicitly
# instead of relying on whatever ggplot2 last recorded as last_plot().
p_themed <- p + theme(
  plot.title = element_text(size = 20),
  axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size = 15),
  axis.text.y = element_text(size = 12),
  axis.title.x = element_text(size = 15),
  axis.title.y = element_text(size = 15)
)
p_themed

# Written to the current working directory (printed below).
ggsave(
  paste0(tumorType[ii], "_v12new_accuracy_03262021.tiff"),
  plot = p_themed, width = 12, height = 10, units = "cm"
)
getwd()
# Same box plot with smaller axis text, saved both next to the input tables
# and two directory levels up.
p <- ggplot(df, aes(x = Feature, y = accuracy, fill = Type)) +
  geom_boxplot(lwd = 0.01, outlier.alpha = 0.1) +
  scale_x_discrete(limits = featureTypeOrder) +
  ggtitle("Accuracy")

p_themed <- p + theme(
  plot.title = element_text(size = 20),
  axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size = 8),
  axis.text.y = element_text(size = 8),
  axis.title.x = element_text(size = 10),
  axis.title.y = element_text(size = 10)
)
p_themed

out_file <- paste0(tumorType[ii], "_v12new_accuracy_03262021.tiff")

# Copy in the current working directory.
ggsave(out_file, plot = p_themed, width = 12, height = 10, units = "cm")

# Copy two levels up. The directory prefix must be part of the file name:
# ggsave("../../", <name>, ...) would treat "../../" as the file name and the
# name string as the plot, and fail.
ggsave(
  paste0("../../", out_file),
  plot = p_themed, width = 12, height = 10, units = "cm"
)
# Accuracy box plots for every tumor type in `tumorType`.
# For each type: read "<type>_cloudforest_sample_predictions.tsv" from the
# working directory, score every prediction column (column 6 onwards) against
# `Label`, and save a train-vs-test box plot of the per-repeat accuracy to
# "../../<type>_v12new_accuracy_03262021.tiff".
library(tidyverse)
library(ggplot2)

# X-axis order: feature family crossed with subset size. Values missing from
# `limits` are dropped from the plot, so the CF.MIR_* sets are listed here as
# they are in the later versions of this loop.
featureTypeOrder <- paste0(
  "CF.",
  rep(c("All", "CNVR", "GEXP", "METH", "MIR", "MUTA"), each = 7),
  "_",
  c("All", "Top.1000", "Top.100", "Top.50", "Top.10", "Top.5", "Top.1")
)

# seq_along() keeps the loop in step with `tumorType` instead of a fixed 1:26.
for (ii in seq_along(tumorType)) {
  print(ii)
  name <- paste0(tumorType[ii], "_cloudforest_sample_predictions.tsv")
  d <- read.table(name, sep = "\t", header = TRUE)
  nC <- ncol(d)

  # Fraction of correct calls per Repeat x Test x Label, one column per
  # prediction column of `d`.
  dsub <- d %>%
    gather(Feature, PredLabel, 6:nC) %>%
    mutate(Pred = as.numeric(Label == PredLabel)) %>%
    group_by(Repeat, Test, Label, Feature) %>%
    summarise(sPred = sum(Pred) / n()) %>%
    spread(Feature, sPred)

  # Mean of the per-label fractions for the rows with Test == 0 ("Train").
  accuracy_train <- dsub %>%
    gather(Feature, sPred, 4:(nC - 2)) %>%
    filter(Test == 0) %>%
    group_by(Repeat, Feature) %>%
    summarise(accuracy = mean(sPred)) %>%
    mutate(Type = "Train")

  # Same for the rows with Test == 1 ("Test").
  accuracy_test <- dsub %>%
    gather(Feature, sPred, 4:(nC - 2)) %>%
    filter(Test == 1) %>%
    group_by(Repeat, Feature) %>%
    summarise(accuracy = mean(sPred)) %>%
    mutate(Type = "Test")

  df <- rbind(accuracy_train, accuracy_test)
  # Cut the last 11 characters off every feature name.
  # NOTE(review): presumably a fixed-width suffix on the column names; confirm.
  df$Feature <- gsub(".{11}$", "", df$Feature)

  p <- ggplot(df, aes(x = Feature, y = accuracy, fill = Type)) +
    geom_boxplot(lwd = 0.01, outlier.alpha = 0.1) +
    scale_x_discrete(limits = featureTypeOrder) +
    ggtitle("Accuracy")

  # Hand the themed plot to ggsave() explicitly rather than via last_plot().
  p_themed <- p + theme(
    plot.title = element_text(size = 20),
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size = 8),
    axis.text.y = element_text(size = 8),
    axis.title.x = element_text(size = 10),
    axis.title.y = element_text(size = 10)
  )
  ggsave(
    paste0("../../", tumorType[ii], "_v12new_accuracy_03262021.tiff"),
    plot = p_themed, width = 12, height = 10, units = "cm"
  )
}

# Inspect the test-split table of the last tumor type; the data viewer is only
# meaningful in an interactive session.
if (interactive()) {
  View(accuracy_test)
}
# Cohort codes, in processing order. Presumably TCGA tumor-type abbreviations,
# some of them merged cohorts such as COADREAD or LGGGBM (TODO confirm).
tumorType <- c(
  "ACC", "BLCA", "BRCA", "CESC", "COADREAD", "ESCC", "GEA",
  "HNSC", "KIRCKICH", "KIRP", "LGGGBM", "LIHCCHOL", "LUAD", "LUSC",
  "MESO", "OV", "PAAD", "PCPG", "PRAD", "SARC", "SKCM",
  "TGCT", "THCA", "THYM", "UCEC", "UVM"
)
# Accuracy box plots for every tumor type in `tumorType`.
# For each type: read "<type>_cloudforest_sample_predictions.tsv" from the
# working directory, score every prediction column (column 6 onwards) against
# `Label`, and save a train-vs-test box plot of the per-repeat accuracy to
# "../../<type>_v12new_accuracy_03262021.tiff".
library(tidyverse)
library(ggplot2)

# X-axis order: feature family crossed with subset size, largest subset first.
# Values missing from `limits` are dropped from the plot.
featureTypeOrder <- paste0(
  "CF.",
  rep(c("All", "CNVR", "GEXP", "METH", "MIR", "MUTA"), each = 7),
  "_",
  c("All", "Top.1000", "Top.100", "Top.50", "Top.10", "Top.5", "Top.1")
)

# seq_along() keeps the loop in step with `tumorType` instead of a fixed 1:26.
for (ii in seq_along(tumorType)) {
  print(ii)
  name <- paste0(tumorType[ii], "_cloudforest_sample_predictions.tsv")
  d <- read.table(name, sep = "\t", header = TRUE)
  nC <- ncol(d)

  # Fraction of correct calls per Repeat x Test x Label, one column per
  # prediction column of `d`.
  dsub <- d %>%
    gather(Feature, PredLabel, 6:nC) %>%
    mutate(Pred = as.numeric(Label == PredLabel)) %>%
    group_by(Repeat, Test, Label, Feature) %>%
    summarise(sPred = sum(Pred) / n()) %>%
    spread(Feature, sPred)

  # Mean of the per-label fractions for the rows with Test == 0 ("Train").
  accuracy_train <- dsub %>%
    gather(Feature, sPred, 4:(nC - 2)) %>%
    filter(Test == 0) %>%
    group_by(Repeat, Feature) %>%
    summarise(accuracy = mean(sPred)) %>%
    mutate(Type = "Train")

  # Same for the rows with Test == 1 ("Test").
  accuracy_test <- dsub %>%
    gather(Feature, sPred, 4:(nC - 2)) %>%
    filter(Test == 1) %>%
    group_by(Repeat, Feature) %>%
    summarise(accuracy = mean(sPred)) %>%
    mutate(Type = "Test")

  df <- rbind(accuracy_train, accuracy_test)
  # Cut the last 11 characters off every feature name.
  # NOTE(review): presumably a fixed-width suffix on the column names; confirm.
  df$Feature <- gsub(".{11}$", "", df$Feature)

  p <- ggplot(df, aes(x = Feature, y = accuracy, fill = Type)) +
    geom_boxplot(lwd = 0.01, outlier.alpha = 0.1) +
    scale_x_discrete(limits = featureTypeOrder) +
    ggtitle("Accuracy")

  # Hand the themed plot to ggsave() explicitly rather than via last_plot().
  p_themed <- p + theme(
    plot.title = element_text(size = 20),
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size = 8),
    axis.text.y = element_text(size = 8),
    axis.title.x = element_text(size = 10),
    axis.title.y = element_text(size = 10)
  )
  ggsave(
    paste0("../../", tumorType[ii], "_v12new_accuracy_03262021.tiff"),
    plot = p_themed, width = 12, height = 10, units = "cm"
  )
}
# Accuracy box plots for every tumor type in `tumorType`, with smaller axis
# text on a 14 x 12 cm canvas.
# For each type: read "<type>_cloudforest_sample_predictions.tsv" from the
# working directory, score every prediction column (column 6 onwards) against
# `Label`, and save a train-vs-test box plot of the per-repeat accuracy to
# "../../<type>_v12new_accuracy_03262021.tiff".
library(tidyverse)
library(ggplot2)

# X-axis order: feature family crossed with subset size, largest subset first.
# Values missing from `limits` are dropped from the plot.
featureTypeOrder <- paste0(
  "CF.",
  rep(c("All", "CNVR", "GEXP", "METH", "MIR", "MUTA"), each = 7),
  "_",
  c("All", "Top.1000", "Top.100", "Top.50", "Top.10", "Top.5", "Top.1")
)

# seq_along() keeps the loop in step with `tumorType` instead of a fixed 1:26.
for (ii in seq_along(tumorType)) {
  print(ii)
  name <- paste0(tumorType[ii], "_cloudforest_sample_predictions.tsv")
  d <- read.table(name, sep = "\t", header = TRUE)
  nC <- ncol(d)

  # Fraction of correct calls per Repeat x Test x Label, one column per
  # prediction column of `d`.
  dsub <- d %>%
    gather(Feature, PredLabel, 6:nC) %>%
    mutate(Pred = as.numeric(Label == PredLabel)) %>%
    group_by(Repeat, Test, Label, Feature) %>%
    summarise(sPred = sum(Pred) / n()) %>%
    spread(Feature, sPred)

  # Mean of the per-label fractions for the rows with Test == 0 ("Train").
  accuracy_train <- dsub %>%
    gather(Feature, sPred, 4:(nC - 2)) %>%
    filter(Test == 0) %>%
    group_by(Repeat, Feature) %>%
    summarise(accuracy = mean(sPred)) %>%
    mutate(Type = "Train")

  # Same for the rows with Test == 1 ("Test").
  accuracy_test <- dsub %>%
    gather(Feature, sPred, 4:(nC - 2)) %>%
    filter(Test == 1) %>%
    group_by(Repeat, Feature) %>%
    summarise(accuracy = mean(sPred)) %>%
    mutate(Type = "Test")

  df <- rbind(accuracy_train, accuracy_test)
  # Cut the last 11 characters off every feature name.
  # NOTE(review): presumably a fixed-width suffix on the column names; confirm.
  df$Feature <- gsub(".{11}$", "", df$Feature)

  p <- ggplot(df, aes(x = Feature, y = accuracy, fill = Type)) +
    geom_boxplot(lwd = 0.01, outlier.alpha = 0.1) +
    scale_x_discrete(limits = featureTypeOrder) +
    ggtitle("Accuracy")

  # Hand the themed plot to ggsave() explicitly rather than via last_plot().
  p_themed <- p + theme(
    plot.title = element_text(size = 20),
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size = 6),
    axis.text.y = element_text(size = 6),
    axis.title.x = element_text(size = 8),
    axis.title.y = element_text(size = 8)
  )
  ggsave(
    paste0("../../", tumorType[ii], "_v12new_accuracy_03262021.tiff"),
    plot = p_themed, width = 14, height = 12, units = "cm"
  )
}

# Re-save the plot of the last tumor type (`ii` keeps its final loop value) at
# a flatter 14 x 8 cm size, overwriting the file written by the loop.
ggsave(
  paste0("../../", tumorType[ii], "_v12new_accuracy_03262021.tiff"),
  plot = p_themed, width = 14, height = 8, units = "cm"
)
# Accuracy box plots for every tumor type in `tumorType`, on a 14 x 8 cm canvas.
# For each type: read "<type>_cloudforest_sample_predictions.tsv" from the
# working directory, score every prediction column (column 6 onwards) against
# `Label`, and save a train-vs-test box plot of the per-repeat accuracy to
# "../../<type>_v12new_accuracy_03262021.tiff".
library(tidyverse)
library(ggplot2)

# X-axis order: feature family crossed with subset size, largest subset first.
# Values missing from `limits` are dropped from the plot.
featureTypeOrder <- paste0(
  "CF.",
  rep(c("All", "CNVR", "GEXP", "METH", "MIR", "MUTA"), each = 7),
  "_",
  c("All", "Top.1000", "Top.100", "Top.50", "Top.10", "Top.5", "Top.1")
)

# seq_along() keeps the loop in step with `tumorType` instead of a fixed 1:26.
for (ii in seq_along(tumorType)) {
  print(ii)
  name <- paste0(tumorType[ii], "_cloudforest_sample_predictions.tsv")
  d <- read.table(name, sep = "\t", header = TRUE)
  nC <- ncol(d)

  # Fraction of correct calls per Repeat x Test x Label, one column per
  # prediction column of `d`.
  dsub <- d %>%
    gather(Feature, PredLabel, 6:nC) %>%
    mutate(Pred = as.numeric(Label == PredLabel)) %>%
    group_by(Repeat, Test, Label, Feature) %>%
    summarise(sPred = sum(Pred) / n()) %>%
    spread(Feature, sPred)

  # Mean of the per-label fractions for the rows with Test == 0 ("Train").
  accuracy_train <- dsub %>%
    gather(Feature, sPred, 4:(nC - 2)) %>%
    filter(Test == 0) %>%
    group_by(Repeat, Feature) %>%
    summarise(accuracy = mean(sPred)) %>%
    mutate(Type = "Train")

  # Same for the rows with Test == 1 ("Test").
  accuracy_test <- dsub %>%
    gather(Feature, sPred, 4:(nC - 2)) %>%
    filter(Test == 1) %>%
    group_by(Repeat, Feature) %>%
    summarise(accuracy = mean(sPred)) %>%
    mutate(Type = "Test")

  df <- rbind(accuracy_train, accuracy_test)
  # Cut the last 11 characters off every feature name.
  # NOTE(review): presumably a fixed-width suffix on the column names; confirm.
  df$Feature <- gsub(".{11}$", "", df$Feature)

  p <- ggplot(df, aes(x = Feature, y = accuracy, fill = Type)) +
    geom_boxplot(lwd = 0.01, outlier.alpha = 0.1) +
    scale_x_discrete(limits = featureTypeOrder) +
    ggtitle("Accuracy")

  # Hand the themed plot to ggsave() explicitly rather than via last_plot().
  p_themed <- p + theme(
    plot.title = element_text(size = 20),
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size = 6),
    axis.text.y = element_text(size = 6),
    axis.title.x = element_text(size = 8),
    axis.title.y = element_text(size = 8)
  )
  ggsave(
    paste0("../../", tumorType[ii], "_v12new_accuracy_03262021.tiff"),
    plot = p_themed, width = 14, height = 8, units = "cm"
  )
}
# Balanced-accuracy box plots for every tumor type in `tumorType`, each titled
# with its tumor type.
# For each type: read "<type>_cloudforest_sample_predictions.tsv" from the
# working directory, score every prediction column (column 6 onwards) against
# `Label`, and save a train-vs-test box plot of the per-repeat accuracy to
# "../../<type>_v12new_accuracy_03262021.tiff".
library(tidyverse)
library(ggplot2)

# X-axis order: feature family crossed with subset size, largest subset first.
# Values missing from `limits` are dropped from the plot.
featureTypeOrder <- paste0(
  "CF.",
  rep(c("All", "CNVR", "GEXP", "METH", "MIR", "MUTA"), each = 7),
  "_",
  c("All", "Top.1000", "Top.100", "Top.50", "Top.10", "Top.5", "Top.1")
)

# seq_along() keeps the loop in step with `tumorType` instead of a fixed 1:26.
for (ii in seq_along(tumorType)) {
  print(ii)
  name <- paste0(tumorType[ii], "_cloudforest_sample_predictions.tsv")
  d <- read.table(name, sep = "\t", header = TRUE)
  nC <- ncol(d)

  # Fraction of correct calls per Repeat x Test x Label, one column per
  # prediction column of `d`.
  dsub <- d %>%
    gather(Feature, PredLabel, 6:nC) %>%
    mutate(Pred = as.numeric(Label == PredLabel)) %>%
    group_by(Repeat, Test, Label, Feature) %>%
    summarise(sPred = sum(Pred) / n()) %>%
    spread(Feature, sPred)

  # Mean of the per-label fractions for the rows with Test == 0 ("Train").
  # Every label weighs the same, hence "balanced" accuracy.
  accuracy_train <- dsub %>%
    gather(Feature, sPred, 4:(nC - 2)) %>%
    filter(Test == 0) %>%
    group_by(Repeat, Feature) %>%
    summarise(accuracy = mean(sPred)) %>%
    mutate(Type = "Train")

  # Same for the rows with Test == 1 ("Test").
  accuracy_test <- dsub %>%
    gather(Feature, sPred, 4:(nC - 2)) %>%
    filter(Test == 1) %>%
    group_by(Repeat, Feature) %>%
    summarise(accuracy = mean(sPred)) %>%
    mutate(Type = "Test")

  df <- rbind(accuracy_train, accuracy_test)
  # Cut the last 11 characters off every feature name.
  # NOTE(review): presumably a fixed-width suffix on the column names; confirm.
  df$Feature <- gsub(".{11}$", "", df$Feature)

  p <- ggplot(df, aes(x = Feature, y = accuracy, fill = Type)) +
    geom_boxplot(lwd = 0.01, outlier.alpha = 0.1) +
    scale_x_discrete(limits = featureTypeOrder) +
    ggtitle(paste0("Balanced Accuracy - ", tumorType[ii]))

  # Hand the themed plot to ggsave() explicitly rather than via last_plot().
  p_themed <- p + theme(
    plot.title = element_text(size = 20),
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size = 6),
    axis.text.y = element_text(size = 6),
    axis.title.x = element_text(size = 8),
    axis.title.y = element_text(size = 8)
  )
  ggsave(
    paste0("../../", tumorType[ii], "_v12new_accuracy_03262021.tiff"),
    plot = p_themed, width = 14, height = 8, units = "cm"
  )
}
# Balanced-accuracy box plots for every tumor type in `tumorType`, each titled
# with its tumor type (title size 15).
# For each type: read "<type>_cloudforest_sample_predictions.tsv" from the
# working directory, score every prediction column (column 6 onwards) against
# `Label`, and save a train-vs-test box plot of the per-repeat accuracy to
# "../../<type>_v12new_accuracy_03262021.tiff".
library(tidyverse)
library(ggplot2)

# X-axis order: feature family crossed with subset size, largest subset first.
# Values missing from `limits` are dropped from the plot.
featureTypeOrder <- paste0(
  "CF.",
  rep(c("All", "CNVR", "GEXP", "METH", "MIR", "MUTA"), each = 7),
  "_",
  c("All", "Top.1000", "Top.100", "Top.50", "Top.10", "Top.5", "Top.1")
)

# seq_along() keeps the loop in step with `tumorType` instead of a fixed 1:26.
for (ii in seq_along(tumorType)) {
  print(ii)
  name <- paste0(tumorType[ii], "_cloudforest_sample_predictions.tsv")
  d <- read.table(name, sep = "\t", header = TRUE)
  nC <- ncol(d)

  # Fraction of correct calls per Repeat x Test x Label, one column per
  # prediction column of `d`.
  dsub <- d %>%
    gather(Feature, PredLabel, 6:nC) %>%
    mutate(Pred = as.numeric(Label == PredLabel)) %>%
    group_by(Repeat, Test, Label, Feature) %>%
    summarise(sPred = sum(Pred) / n()) %>%
    spread(Feature, sPred)

  # Mean of the per-label fractions for the rows with Test == 0 ("Train").
  # Every label weighs the same, hence "balanced" accuracy.
  accuracy_train <- dsub %>%
    gather(Feature, sPred, 4:(nC - 2)) %>%
    filter(Test == 0) %>%
    group_by(Repeat, Feature) %>%
    summarise(accuracy = mean(sPred)) %>%
    mutate(Type = "Train")

  # Same for the rows with Test == 1 ("Test").
  accuracy_test <- dsub %>%
    gather(Feature, sPred, 4:(nC - 2)) %>%
    filter(Test == 1) %>%
    group_by(Repeat, Feature) %>%
    summarise(accuracy = mean(sPred)) %>%
    mutate(Type = "Test")

  df <- rbind(accuracy_train, accuracy_test)
  # Cut the last 11 characters off every feature name.
  # NOTE(review): presumably a fixed-width suffix on the column names; confirm.
  df$Feature <- gsub(".{11}$", "", df$Feature)

  p <- ggplot(df, aes(x = Feature, y = accuracy, fill = Type)) +
    geom_boxplot(lwd = 0.01, outlier.alpha = 0.1) +
    scale_x_discrete(limits = featureTypeOrder) +
    ggtitle(paste0("Balanced Accuracy - ", tumorType[ii]))

  # Hand the themed plot to ggsave() explicitly rather than via last_plot().
  p_themed <- p + theme(
    plot.title = element_text(size = 15),
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size = 6),
    axis.text.y = element_text(size = 6),
    axis.title.x = element_text(size = 8),
    axis.title.y = element_text(size = 8)
  )
  ggsave(
    paste0("../../", tumorType[ii], "_v12new_accuracy_03262021.tiff"),
    plot = p_themed, width = 14, height = 8, units = "cm"
  )
}
