suppressWarnings({library(readr)
library(readxl)
library(car)
library(PMCMRplus)
library(ggplot2)
library(tidyr)
library(dplyr)
library(skimr)
library(janitor)
library(lattice)})
The goal is to compare models the performance of binary classification models: PLS-DA, KNN, RF, SVM, and Neural Networks Models using the Matthews correlation coefficient (MCC) values.
Why MCC? The MCC is particularly useful for its ability to deal with class imbalances, providing a single number that gives a comprehensive picture of the model’s effectiveness. It comprehensively includes all the observations from the confusion matrix TN, TP, FN and FP.
model_data<-read_excel("Compiled_Model_1SE.xlsx")
model_data<-clean_names(model_data)
#Convert the pre-processing and models to factors
model_data$pre_processing<-as.factor(model_data$pre_processing)
model_data$model<-as.factor(model_data$model)
model_data$model_preprocessing<-as.factor(model_data$model_preprocessing)
head(model_data)
# A tibble: 6 × 4
pre_processing model model_preprocessing mcc
<fct> <fct> <fct> <dbl>
1 Unprocessed PLS-DA PLS-DA&Unprocessed 1
2 SG PLS-DA PLS-DA&SG 1
3 SG_1D PLS-DA PLS-DA&SG_1D 1
4 SG_2D PLS-DA PLS-DA&SG_2D 0.91
5 SNV PLS-DA PLS-DA&SNV 1
6 SNV_SG PLS-DA PLS-DA&SNV_SG 1
colnames(model_data)
[1] "pre_processing" "model" "model_preprocessing"
[4] "mcc"
str(model_data)
tibble [72 × 4] (S3: tbl_df/tbl/data.frame)
$ pre_processing : Factor w/ 11 levels "MSC","MSC_SG",..: 11 5 6 7 8 9 9 10 1 2 ...
$ model : Factor w/ 6 levels "ANN","KNN","NB",..: 4 4 4 4 4 4 4 4 4 4 ...
$ model_preprocessing: Factor w/ 66 levels "ANN&MSC","ANN&MSC_SG",..: 44 38 39 40 41 42 42 43 34 35 ...
$ mcc : num [1:72] 1 1 1 0.91 1 1 0.92 0.97 1 1 ...
histogram(~mcc, data = model_data,main = "", xlab = "MCC")
q1<-qqnorm(model_data$mcc, pch =8)
qqline(model_data$mcc, col="blue",lwd = 2)
shapiro.test(model_data$mcc)
Shapiro-Wilk normality test
data: model_data$mcc
W = 0.73327, p-value = 3.998e-10
lev_test<-leveneTest(mcc~model, data = model_data) # for the model type
lev_test_2<-leveneTest(mcc~pre_processing, data = model_data) # for pre-processing method
leve_test_3<-leveneTest(mcc~model_preprocessing, data = model_data)
print(lev_test)
Levene's Test for Homogeneity of Variance (center = median)
Df F value Pr(>F)
group 5 4.857 0.0007646 ***
66
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
print(lev_test_2)
Levene's Test for Homogeneity of Variance (center = median)
Df F value Pr(>F)
group 10 1.5115 0.1572
61
print(leve_test_3)
Levene's Test for Homogeneity of Variance (center = median)
Df F value Pr(>F)
group 65 2.8492e+29 < 2.2e-16 ***
6
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
** Based on the MCC values and the Levene Test, the assumption of equal variance is violated
kruskalTest(mcc~model, data = model_data)
Kruskal-Wallis test
data: mcc by model
chi-squared = 27.159, df = 5, p-value = 5.312e-05
kruskalTest(mcc~pre_processing, data = model_data)
Kruskal-Wallis test
data: mcc by pre_processing
chi-squared = 11.894, df = 10, p-value = 0.2922
kruskalTest(mcc~model_preprocessing, data = model_data)
Kruskal-Wallis test
data: mcc by model_preprocessing
chi-squared = 62.601, df = 65, p-value = 0.5613
library(ARTool)
# Apply the aligned rank transform for interactions
model_interac<- art(mcc ~ model * pre_processing, data = model_data)
summary(model_interac)
Aligned Rank Transform of Factorial Model
Call:
art(formula = mcc ~ model * pre_processing, data = model_data)
Column sums of aligned responses (should all be ~0):
model pre_processing model:pre_processing
0 0 0
F values of ANOVAs on aligned responses not of interest (should all be ~0):
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.000000 0.000000 0.000000 0.001091 0.000000 0.006548
# ANOVA on the transformed data
anova_results <- anova(model_interac)
# Display the ANOVA results for interactions
print(anova_results)
Analysis of Variance of Aligned Rank Transformed Data
Table Type: Anova Table (Type III tests)
Model: No Repeated Measures (lm)
Response: art(mcc)
Df Df.res F value Pr(>F)
1 model 5 6 6.48729 0.020719 *
2 pre_processing 10 6 1.89225 0.224624
3 model:pre_processing 50 6 0.47313 0.931974
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
library(dunn.test)
dunn_res <- dunn.test(model_data$mcc, model_data$model, method="bonferroni")
Kruskal-Wallis rank sum test
data: x and group
Kruskal-Wallis chi-squared = 27.1589, df = 5, p-value = 0
Comparison of x by group
(Bonferroni)
Col Mean-|
Row Mean | ANN KNN NB PLS-DA RF
---------+-------------------------------------------------------
KNN | 1.993088
| 0.3469
|
NB | 4.229721 2.236632
| 0.0002* 0.1898
|
PLS-DA | -0.407564 -2.400652 -4.637285
| 1.0000 0.1227 0.0000*
|
RF | 1.476177 -0.516910 -2.753543 1.883741
| 1.0000 1.0000 0.0442 0.4470
|
SVM | 1.386712 -0.606375 -2.843008 1.794276 -0.089465
| 1.0000 1.0000 0.0335 0.5458 1.0000
alpha = 0.05
Reject Ho if p <= alpha/2
dunn_res$comparisons
[1] "ANN - KNN" "ANN - NB" "KNN - NB" "ANN - PLS-DA" "KNN - PLS-DA"
[6] "NB - PLS-DA" "ANN - RF" "KNN - RF" "NB - RF" "PLS-DA - RF"
[11] "ANN - SVM" "KNN - SVM" "NB - SVM" "PLS-DA - SVM" "RF - SVM"
model_comb<-kruskalTest(model_data$mcc, model_data$model_preprocessing,
method = "bonferroni")
print(model_comb)
Kruskal-Wallis test
data: model_data$mcc and model_data$model_preprocessing
chi-squared = 62.601, df = 65, p-value = 0.5613
library(patchwork)
p1<-ggplot(data = model_data,mapping = aes(x=model, y = mcc))+
geom_boxplot(aes(fill= model))+labs(x = "Model", y = "MCC (Full-Spectra)")+theme_bw()+
theme(axis.title.x = element_text(color = "black",size= 9),
axis.text.x = element_text(color = "black", angle = 90, size= 8),
axis.text.y = element_text(color = "black",size =8),
axis.title.y = element_text(color = "black",size =9),
panel.grid.major = element_blank(),panel.grid.minor = element_blank(),
legend.position = "right",
legend.key.size = unit(0.3, "cm"),
legend.title = element_text(color = "black",size = 9),
legend.text = element_text(size = 8),
legend.spacing.y = unit(0.03, "cm"),
legend.margin = margin(0.5, 0.5, 0.5, 0.5, "pt"))+
scale_fill_brewer(palette = "Set1")+ stat_summary(fun=mean, geom="point", shape=18, size=2, color="red")
p1
my_colors <- c("#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd", "#8c564b",
"#e377c2", "#7f7f7f", "#bcbd22", "#17becf", "#1a55FF", "#9c8305")
p2<-ggplot(data = model_data,mapping = aes(x=pre_processing, y = mcc))+
geom_boxplot(aes(fill = pre_processing))+labs(x = "Pre-processing", y = "MCC (Full-Spectra)")+
theme_bw()+
theme(axis.title.x = element_text(color = "black", size =9),
axis.title.y = element_text(colour = "black", size =9),
axis.text.y = element_text(color = "black",size =8),
axis.text.x = element_text(color = "black",size = 8,angle = 90),
panel.grid.major = element_blank(),panel.grid.minor = element_blank(),
legend.position = "right")+scale_color_manual(values = my_colors)+
scale_fill_manual(values = my_colors)+ theme(
legend.key.size = unit(0.3, "cm"), legend.text = element_text(size = 8),
legend.spacing.y = unit(0.03, "cm"),
legend.margin = margin(0.5, 0.5, 0.5, 0.5, "pt"))+
theme(legend.title = element_text(color = "black",size = 9),
aspect.ratio = 0.5)+
stat_summary(fun=mean, geom="point", shape=18, size=2, color="red")
p2
#Load data
model_data_2<-read_excel("Compiled_Model_1SE.xlsx",sheet = "Sheet2")
model_data_2<-clean_names(model_data_2)
colnames(model_data_2)
[1] "pre_processing" "model" "model_preprocessing"
[4] "mcc"
#Convert the pre-processing and models to factors
model_data_2$pre_processing<-as.factor(model_data_2$pre_processing)
model_data_2$model<-as.factor(model_data_2$model)
model_data_2$model_preprocessing<-as.factor(model_data_2$model_preprocessing)
histogram(~mcc, data = model_data_2, col = "lightblue")
qqnorm(model_data_2$mcc)
qqline(model_data_2$mcc, col = "red")
#Normality
shapiro.test(model_data_2$mcc)
Shapiro-Wilk normality test
data: model_data_2$mcc
W = 0.87179, p-value = 1.427e-05
#Homogeneity of variance
leveneTest(mcc~model, data = model_data_2)
Levene's Test for Homogeneity of Variance (center = median)
Df F value Pr(>F)
group 4 1.0166 0.4069
55
leveneTest(mcc~pre_processing, data = model_data_2)
Levene's Test for Homogeneity of Variance (center = median)
Df F value Pr(>F)
group 10 1.768 0.09207 .
49
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
leveneTest(mcc~model_preprocessing, data = model_data_2)
Levene's Test for Homogeneity of Variance (center = median)
Df F value Pr(>F)
group 54 5.0119e+28 < 2.2e-16 ***
5
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Apply the aligned rank transform for interactions
model_interac_2<- art(mcc ~ model * pre_processing, data = model_data_2)
summary(model_interac_2)
Aligned Rank Transform of Factorial Model
Call:
art(formula = mcc ~ model * pre_processing, data = model_data_2)
Column sums of aligned responses (should all be ~0):
model pre_processing model:pre_processing
0 0 0
F values of ANOVAs on aligned responses not of interest (should all be ~0):
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00000 0.00000 0.00000 0.02381 0.00000 0.14289
# ANOVA on the transformed data
anova_results_2 <- anova(model_interac_2)
# Display the ANOVA results for interactions
print(anova_results_2)
Analysis of Variance of Aligned Rank Transformed Data
Table Type: Anova Table (Type III tests)
Model: No Repeated Measures (lm)
Response: art(mcc)
Df Df.res F value Pr(>F)
1 model 4 5 8.5313 0.018568 *
2 pre_processing 10 5 1.1668 0.458921
3 model:pre_processing 40 5 1.3491 0.402827
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
dunn_res_2 <- dunn.test(model_data_2$mcc, model_data_2$model, method="bonferroni")
Kruskal-Wallis rank sum test
data: x and group
Kruskal-Wallis chi-squared = 27.6808, df = 4, p-value = 0
Comparison of x by group
(Bonferroni)
Col Mean-|
Row Mean | ANN KNN PLS-DA RF
---------+--------------------------------------------
KNN | 3.917352
| 0.0004*
|
PLS-DA | 0.201802 -3.715549
| 1.0000 0.0010*
|
RF | 3.181364 -0.735987 2.979561
| 0.0073* 1.0000 0.0144*
|
SVM | 3.264460 -0.652892 3.062657 0.083095
| 0.0055* 1.0000 0.0110* 1.0000
alpha = 0.05
Reject Ho if p <= alpha/2
dunn_res_2$comparisons
[1] "ANN - KNN" "ANN - PLS-DA" "KNN - PLS-DA" "ANN - RF" "KNN - RF"
[6] "PLS-DA - RF" "ANN - SVM" "KNN - SVM" "PLS-DA - SVM" "RF - SVM"
p3<-ggplot(data = model_data_2,mapping = aes(x=model, y = mcc))+
geom_boxplot(aes(fill= model))+labs(x = "Model", y = "MCC (Variable Selection)")+
theme_bw()+
theme(axis.title.x = element_text(color = "black",size = 9),
axis.text.x = element_text(color = "black", angle = 90, size = 8),
axis.text.y = element_text(color = "black",size = 8),
axis.title.y = element_text(color = "black", size = 9),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
legend.position = "right",legend.key.size = unit(0.3, "cm"),
legend.title = element_text(color = "black",size = 9),
legend.text = element_text(size = 8),
legend.spacing.y = unit(0.03, "cm"),
legend.margin = margin(0.5, 0.5, 0.5, 0.5, "pt"))+
scale_fill_brewer(palette = "Pastel1")+
stat_summary(fun = mean,geom = "point",shape = 18, size = 2, color = "red")
p3
p4<-ggplot(data = model_data_2,mapping = aes(x=pre_processing, y = mcc))+
geom_boxplot(aes(fill = pre_processing))+labs(x = "Pre-processing", y = "MCC (Variable Selection)")+
theme_bw()+
theme(axis.title.x = element_text(color = "black", size = 9),
axis.title.y = element_text(color = "black", size = 9),
axis.text.x = element_text(color = "black",angle = 90, size = 8),
axis.text.y = element_text(color = "black",size = 8),
panel.grid.major = element_blank(),panel.grid.minor = element_blank(),
legend.position = "right")+
theme(legend.key.size = unit(0.3, "cm"),
legend.title = element_text(color = "black",size = 9),
legend.text = element_text(size = 8),
legend.spacing.y = unit(0.03, "cm"),
legend.margin = margin(0.5, 0.5, 0.5, 0.5, "pt"),aspect.ratio = 0.5)+
stat_summary(fun=mean, geom="point", shape=18, size=2, color="red")+
scale_fill_brewer(palette = "Set3")
p4
# Overall model performance
#Load data
model_data_3<-read_excel("Compiled_Model_1SE.xlsx",sheet = "Sheet3")
model_data_3<-clean_names(model_data_3)
colnames(model_data_3)
[1] "pre_processing" "model" "model_preprocessing"
[4] "mcc_1" "mcc_2"
#Convert the pre-processing and models to factors
model_data_3$pre_processing<-as.factor(model_data_3$pre_processing)
model_data_3$model<-as.factor(model_data_3$model)
model_data_3$model_preprocessing<-as.factor(model_data_3$model_preprocessing)
All the test comparison will be based on the ‘model type’ considering that it is the main significant factor influencing Matthews Correlation Coefficient (MCC).
Additionally, normality assumption is violated, we will therefore proceed with non-parametric t-tests
overall_wilcox_test<-wilcox.test(model_data_3$mcc_1,model_data_3$mcc_2,
paired = TRUE, alternative = "two.sided")
print(overall_wilcox_test)
Wilcoxon signed rank test with continuity correction
data: model_data_3$mcc_1 and model_data_3$mcc_2
V = 748.5, p-value = 0.4237
alternative hypothesis: true location shift is not equal to 0
wilcox.test(model_data_3$mcc_1,model_data_3$mcc_2,
paired = TRUE, alternative = "greater")
Wilcoxon signed rank test with continuity correction
data: model_data_3$mcc_1 and model_data_3$mcc_2
V = 748.5, p-value = 0.2119
alternative hypothesis: true location shift is greater than 0
wilcox.test(model_data_3$mcc_1,model_data_3$mcc_2,
paired = TRUE, alternative = "less")
Wilcoxon signed rank test with continuity correction
data: model_data_3$mcc_1 and model_data_3$mcc_2
V = 748.5, p-value = 0.7909
alternative hypothesis: true location shift is less than 0
pls_data<-subset(model_data_3,model == "PLS-DA",
select = c("model","mcc_1","mcc_2"))
head(pls_data)
# A tibble: 6 × 3
model mcc_1 mcc_2
<fct> <dbl> <dbl>
1 PLS-DA 1 0.97
2 PLS-DA 1 1
3 PLS-DA 1 0.99
4 PLS-DA 0.91 0.99
5 PLS-DA 1 0.95
6 PLS-DA 1 0.99
dim(pls_data)
[1] 12 3
plsda_wilcox_test<-wilcox.test(pls_data$mcc_1,pls_data$mcc_2,
paired = TRUE, alternative = "two.sided")
print(plsda_wilcox_test)
Wilcoxon signed rank test with continuity correction
data: pls_data$mcc_1 and pls_data$mcc_2
V = 30.5, p-value = 0.7972
alternative hypothesis: true location shift is not equal to 0
mean(pls_data$mcc_1)
[1] 0.9833333
mean(pls_data$mcc_2)
[1] 0.985
knn_data<-subset(model_data_3,model == "KNN",
select = c("model","mcc_1","mcc_2"))
head(knn_data)
# A tibble: 6 × 3
model mcc_1 mcc_2
<fct> <dbl> <dbl>
1 KNN 0.7 0.91
2 KNN 0.7 0.88
3 KNN 0.95 0.9
4 KNN 0.95 0.9
5 KNN 0.95 0.94
6 KNN 0.95 0.94
dim(knn_data)
[1] 12 3
knn_wilcox_test<-wilcox.test(knn_data$mcc_1,knn_data$mcc_2,
paired = TRUE, alternative = "two.sided")
print(knn_wilcox_test)
Wilcoxon signed rank test with continuity correction
data: knn_data$mcc_1 and knn_data$mcc_2
V = 42, p-value = 0.4444
alternative hypothesis: true location shift is not equal to 0
mean(knn_data$mcc_1)
[1] 0.9191667
mean(knn_data$mcc_2)
[1] 0.9291667
RF_data<-subset(model_data_3,model == "RF",
select = c("model","mcc_1","mcc_2"))
head(RF_data)
# A tibble: 6 × 3
model mcc_1 mcc_2
<fct> <dbl> <dbl>
1 RF 0.94 0.95
2 RF 0.94 0.95
3 RF 0.98 0.96
4 RF 1 0.96
5 RF 0.84 0.95
6 RF 0.84 0.84
dim(RF_data)
[1] 12 3
RF_wilcox_test<-wilcox.test(RF_data$mcc_1,RF_data$mcc_2,
paired = TRUE, alternative = "two.sided")
print(RF_wilcox_test)
Wilcoxon signed rank test with continuity correction
data: RF_data$mcc_1 and RF_data$mcc_2
V = 36, p-value = 0.8236
alternative hypothesis: true location shift is not equal to 0
mean(RF_data$mcc_1)
[1] 0.9416667
mean(RF_data$mcc_2)
[1] 0.9391667
SVM_data<-subset(model_data_3,model == "SVM",
select = c("model","mcc_1","mcc_2"))
head(SVM_data)
# A tibble: 6 × 3
model mcc_1 mcc_2
<fct> <dbl> <dbl>
1 SVM 0.96 0.96
2 SVM 0.95 0.96
3 SVM 1 0.85
4 SVM 1 0.84
5 SVM 0.79 0.96
6 SVM 0.79 0.96
dim(SVM_data)
[1] 12 3
SVM_wilcox_test<-wilcox.test(SVM_data$mcc_1,SVM_data$mcc_2,
paired = TRUE, alternative = "two.sided")
print(SVM_wilcox_test)
Wilcoxon signed rank test with continuity correction
data: SVM_data$mcc_1 and SVM_data$mcc_2
V = 20, p-value = 0.4724
alternative hypothesis: true location shift is not equal to 0
mean(SVM_data$mcc_1)
[1] 0.9183333
mean(SVM_data$mcc_2)
[1] 0.9266667
ANN_data<-subset(model_data_3,model == "ANN",
select = c("model","mcc_1","mcc_2"))
head(ANN_data)
# A tibble: 6 × 3
model mcc_1 mcc_2
<fct> <dbl> <dbl>
1 ANN 0.98 0.96
2 ANN 0.98 0.96
3 ANN 0.99 1
4 ANN 0.99 1
5 ANN 0.99 0.96
6 ANN 0.99 0.96
dim(ANN_data)
[1] 12 3
ANN_wilcox_test<-wilcox.test(ANN_data$mcc_1,ANN_data$mcc_2,
paired = TRUE, alternative = "two.sided")
print(ANN_wilcox_test)
Wilcoxon signed rank test with continuity correction
data: ANN_data$mcc_1 and ANN_data$mcc_2
V = 34, p-value = 0.1878
alternative hypothesis: true location shift is not equal to 0
mean(ANN_data$mcc_1)
[1] 0.99
mean(ANN_data$mcc_2)
[1] 0.9833333