diff --git a/build_models/evaluation.py b/build_models/evaluation.py
index b6a49fadee2b6cb35b2cb298ffb4319388dc83b0..efe864534043bfc96b2f19b01428a85fbff85cae 100644
--- a/build_models/evaluation.py
+++ b/build_models/evaluation.py
@@ -5,7 +5,7 @@ import seaborn as sns
 import matplotlib.pyplot as plt
 import itertools
 
-from parameters import SAVE_MISCLASSIFIED_PATH, SAVE_CONFUSIONM_PATH, SAVE_PATH_LEARNING_CURVES, MODEL_NAME
+from parameters import SAVE_MISCLASSIFIED_PATH, SAVE_CONFUSIONM_PATH, SAVE_PATH_LEARNING_CURVES, MODEL_NAME, AXIS_CONVERTION
 
 def get_learning_curves(train_losses, val_losses, train_accuracies, val_accuracies, savename = None):
     epochs = range(1, len(train_losses) + 1)
@@ -33,7 +33,59 @@ def get_learning_curves(train_losses, val_losses, train_accuracies, val_accuraci
         plt.savefig(SAVE_PATH_LEARNING_CURVES + ".png", dpi=300, bbox_inches='tight')
 
 
-def averaged_misclassified_data(final_predictions, ground_truth_dict, grouped_predictions):
+def get_misclassified_data(idx_pred, y_true, prob_pred, gt_labels_conf, test_image_paths, test_labels):
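+    """Collect per-slice misclassifications into a nested dict and dump it to JSON.
+
+    Entries are keyed true_label -> volume path -> anatomical plane and hold
+    the predicted label, its confidence, and the confidence assigned to the
+    ground truth; repeats on the same volume/plane accumulate into lists.
+    """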
+    error_indicator = idx_pred != y_true
+    idxs = [i for i, x in enumerate(error_indicator) if x]
+    misclassified_prob = list(itertools.compress(prob_pred, error_indicator))
+    misclassified_gt_prob = list(itertools.compress(gt_labels_conf, error_indicator))
+    misclassified_path = list(itertools.compress(test_image_paths, error_indicator))
+    mislabelled = {}
+
+    for i in range(len(idxs)):  # iterate over the misclassified samples
+        true_label = test_labels[y_true[idxs[i]]]  # Get the true label once
+
+        if isinstance(misclassified_path[i], str):
+            image_path = misclassified_path[i].rsplit("_", 2)[0]     # path without the axis/slice suffix
+            axis = misclassified_path[i].rsplit("_", 2)[1]           # axis token from the filename
+        else:
+            image_path = misclassified_path[i][0].rsplit("_", 2)[0]
+            axis = misclassified_path[i][0].rsplit("_", 2)[1]
+
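+        # build/extend the nested entry: true_label -> volume path -> anatomical plane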
+        model_name_entry = mislabelled.setdefault(true_label, {}).setdefault(image_path, {}).setdefault(AXIS_CONVERTION[axis], {})
+        # store each field as a scalar on the first hit and grow it into a list
+        # when further slices of the same volume/plane are misclassified
+        for key, value in (
+            ('predicted_layer', test_labels[idx_pred[idxs[i]]]),
+            ('confidence', float(misclassified_prob[i])),
+            ('gt_confidence', float(misclassified_gt_prob[i])),
+        ):
+            if key in model_name_entry:
+                if not isinstance(model_name_entry[key], list):
+                    model_name_entry[key] = [model_name_entry[key]]
+                model_name_entry[key].append(value)
+            else:
+                model_name_entry[key] = value
+
+    # Save the mislabelled dictionary to a JSON file once, after the loop
+    with open(SAVE_MISCLASSIFIED_PATH + ".json", 'w') as json_file:
+        json.dump(mislabelled, json_file, indent=4)  # indent for pretty printing
+
+
+def averaged_misclassified_data(final_predictions, ground_truth_dict, grouped_predictions, grouped_gt_predictions, test_labels):
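+    """Evaluate volume-level (averaged) predictions and log misclassified volumes.
+
+    Writes <SAVE_MISCLASSIFIED_PATH>_averaging.json holding, per misclassified
+    volume: the ground-truth label, the per-slice predictions and confidences,
+    the per-slice ground-truth confidences, and the final averaged prediction.
+    Returns y_true and y_pred_major arrays for the confusion matrix.
+    """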
     y_true = []
     y_pred_major = []
     misclassified_path = []
@@ -46,52 +98,23 @@ def averaged_misclassified_data(final_predictions, ground_truth_dict, grouped_pr
         if predicted_class != ground_truth_dict[base_name]:
             misclassified_path.append(base_name)
             misclassified_data[base_name] = {
-                "gt": int(ground_truth_dict[base_name]),  # Convert to standard Python int
-                "each_pred": [int(np.argmax(arr)) for arr in grouped_predictions[base_name]],  # Ensure each_pred is a list of Python ints
+                "gt": test_labels[int(ground_truth_dict[base_name])],  # Convert to standard Python int
+                "each_pred": [test_labels[int(np.argmax(arr))] for arr in grouped_predictions[base_name]],  # Ensure each_pred is a list of Python ints
                 "each_conf": [float(np.max(arr)) for arr in grouped_predictions[base_name]],  # Ensure each_conf is a list of Python floats
+                "each_gt_conf": [float(np.max(arr)) for arr in grouped_gt_predictions[base_name]],  # Ensure each_conf is a list of Python floats
                 "final_pred": int(final_predictions[base_name]),  # Convert to standard Python int
                 "confidence": float(np.max(np.sum(np.array(grouped_predictions[base_name]), axis=0) / 3))  # Convert to Python float
             }
-            # without avergaing
-            """ misclassified_data[base_name] = {
-                "gt": ground_truth_dict[base_name],                                     # Ground truth value
-                "each_pred": [np.argmax(arr) for arr in grouped_predictions[base_name]],   # Each prediction
-                "each_conf": [np.max(arr) for arr in grouped_predictions[base_name]],   # Each confidence
-                "final_pred": final_predictions[base_name],                             # Final prediction
-                "confidence": np.max(np.sum(np.array(grouped_predictions[base_name]), axis = 0)/3)
-            }"""
 
     y_true = np.array(y_true)
     y_pred_major = np.array(y_pred_major)
 
     # Save the mislabelled dictionary to a JSON file
-    SAVE_MISCLASSIFIED_PATH_averaging = SAVE_MISCLASSIFIED_PATH + "averaging.json"
-    with open(SAVE_MISCLASSIFIED_PATH_averaging, 'w') as json_file:
+    with open(SAVE_MISCLASSIFIED_PATH + "_averaging.json", 'w') as json_file:
         json.dump(misclassified_data, json_file, indent=4)  # indent for pretty printing
 
     return y_true, y_pred_major
 
-def different_anatomical_plane_misclassified_data(idx_pred, y_true, prob_pred, gt_labels_conf, test_image_paths, test_labels):
-    error_indicator = idx_pred != y_true
-    idxs = [i for i, x in enumerate(error_indicator) if x]
-    misclassified_prob = list(itertools.compress(prob_pred, error_indicator))
-    misclassified_gt_prob = list(itertools.compress(gt_labels_conf, error_indicator))
-    misclassified_path = list(itertools.compress(test_image_paths, error_indicator))
-    mislabelled = {}
-
-    for i in range(0, len(idxs)):  # Loop through indexes directly
-        true_label = test_labels[y_true[idxs[i]]]  # Get the true label once
-        image_path = misclassified_path[i][0].rsplit("_", 2)[0]       # Get the wrong path once
-        model_name_entry = mislabelled.setdefault(true_label, {}).setdefault(image_path, {}).setdefault(MODEL_NAME, {})
-
-        # Assign predicted layer and confidence
-        model_name_entry['predicted_layer'] = test_labels[idx_pred[idxs[i]]]
-        model_name_entry['confidence'] = float(misclassified_prob[i])
-        model_name_entry['gt_confidence'] = float(misclassified_gt_prob[i])
-
-        # Save the mislabelled dictionary to a JSON file
-        with open(SAVE_MISCLASSIFIED_PATH, 'w') as json_file:
-            json.dump(mislabelled, json_file, indent=4)  # indent for pretty printing
 
 def get_confusionM(y_true, y_pred_major, labels, test_set_len, averaging = False):
     num_errors = np.sum(y_true != y_pred_major)
diff --git a/build_models/main.py b/build_models/main.py
index db10ef7a1c4388facdb1ee58d31ac98202c8bbdd..9a479bddbe85361085859e43a65e0002868db948 100644
--- a/build_models/main.py
+++ b/build_models/main.py
@@ -21,7 +21,7 @@ from own_dataloader import (different_anatomical_plane_in_rgb, same_anatomical_p
                             visualize_RGB_image_from_dataloader, get_data_distribution,visualize_ENSEMBLE_image_from_dataloader, RandomResample)
 from train_functions import train_with_scaler
 from test_functions import predict, averaging_prediction, predict_ENSEMBLE_with_confidence
-from evaluation import get_learning_curves, averaged_misclassified_data, different_anatomical_plane_misclassified_data, get_confusionM
+from evaluation import get_learning_curves, averaged_misclassified_data, get_misclassified_data, get_confusionM
 
 # FIX THE SEEDS
 random.seed(42)
@@ -239,10 +239,7 @@ elif PIPELINE == "different_anatomical_plane_ensemble" or PIPELINE == "ensemble_
   model1 = torch.load(checkpoints_folder1+f'/avp_{BEST_EPOCH1:03d}.pkl')
   model2 = torch.load(checkpoints_folder2+f'/avp_{BEST_EPOCH2:03d}.pkl')
   model3 = torch.load(checkpoints_folder3+f'/avp_{BEST_EPOCH3:03d}.pkl')
-  #print(f"Best epochs: \n model1: {BEST_EPOCH1} \n model2: {BEST_EPOCH2} \n model3: {BEST_EPOCH3}")
   y_pred_majority, y_true, confidence_scores_majority, gt_labels_conf = predict_ENSEMBLE_with_confidence(model1, model2, model3, test_axial_loader, test_coronial_loader, test_sagittal_loader, DEVICE)
-  #print(f"y_pred_majority: {y_pred_majority}, \n y_true: {y_true}, \n confidence_scores_majority: {confidence_scores_majority}, \n confidence_scores_true: {gt_labels_conf}")
-  #print(f"y_pred_majority: {len(y_pred_majority)}, \n y_true: {len(y_true)}, \n confidence_scores_majority: {len(confidence_scores_majority)}, \n confidence_scores_true: {len(gt_labels_conf)}")
 
 #EVALUATION
 if PIPELINE == "different_anatomical_plane_in_rgb":
@@ -250,25 +247,26 @@ if PIPELINE == "different_anatomical_plane_in_rgb":
   y_pred_majority = y_pred.argmax(axis=1) # find the argmax of each of the predictions
   confidence_scores_majority = y_pred.max(axis = 1) #get the confidence score
   y_true = [label for _, label in test_set.samples] # get the true labels and convert to numpy
-  get_confusionM(y_true, y_pred_majority, test_labels, len(test_set)/3)
-  different_anatomical_plane_misclassified_data(y_pred_majority, y_true, confidence_scores_majority, gt_labels_conf, test_image_paths, test_labels)
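+  # each test sample is one RGB image (three planes fused), so the full test-set length matches the prediction count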
+  get_confusionM(y_true, y_pred_majority, test_labels, len(test_set))
+  get_misclassified_data(y_pred_majority, y_true, confidence_scores_majority, gt_labels_conf, test_image_paths, test_labels)
 
 elif PIPELINE == "different_anatomical_plane_ensemble" or PIPELINE == "ensemble_same_anatomical_planes_in_RGB":
   # different and same anatomical plane ensemble
   get_confusionM(y_true, y_pred_majority, test_labels, len(test_datasetAxial))
-  different_anatomical_plane_misclassified_data(y_pred_majority, y_true, confidence_scores_majority, gt_labels_conf, test_image_paths, test_labels)
+  get_misclassified_data(y_pred_majority, y_true, confidence_scores_majority, gt_labels_conf, test_image_paths, test_labels)
 
 elif PIPELINE == "basic" or PIPELINE == "same_anatomical_plane_in_rgb":
   y_pred_majority = y_pred.argmax(axis=1) # find the argmax of each of the predictions
   confidence_scores_majority = y_pred.max(axis = 1) #get the confidence score
   y_true = [label for _, label in test_set.samples] # get the true labels and convert to numpy
   get_confusionM(y_true, y_pred_majority, test_labels, len(test_set))
-  different_anatomical_plane_misclassified_data(y_pred_majority, y_true, confidence_scores_majority, gt_labels_conf, test_image_paths, test_labels)
+  get_misclassified_data(y_pred_majority, y_true, confidence_scores_majority, gt_labels_conf, test_image_paths, test_labels)
 
   # averaging evaluation -> for the basic and the same anatomical planes in RGB pipelines
-  grouped_predictions, ground_truth_dict, final_predictions = averaging_prediction(y_pred, test_set, test_image_paths)
-  y_true, y_pred_major = averaged_misclassified_data(final_predictions, ground_truth_dict, grouped_predictions)
-  get_confusionM(y_true, y_pred_major, test_labels,len(test_set)/3 ,averaging=True)
+  grouped_predictions, ground_truth_dict, final_predictions, grouped_gt_predictions = averaging_prediction(y_pred, test_set, test_image_paths)
+  y_true, y_pred_major = averaged_misclassified_data(final_predictions, ground_truth_dict, grouped_predictions, grouped_gt_predictions, test_labels)
+  get_confusionM(y_true, y_pred_major, test_labels, len(test_set)/3, averaging=True)
 
 
 sys.stdout = sys.__stdout__  # Reset stdout back to normal
diff --git a/build_models/parameters.py b/build_models/parameters.py
index 40db229ffd9c82879aa25b0f3a0bbfd4e0765eca..d01d6a6ccf49b2e19d906cb7eb019297723fdb24 100644
--- a/build_models/parameters.py
+++ b/build_models/parameters.py
@@ -1,13 +1,15 @@
 #basic
-"""PIPELINE = "basic"
+"""
+PIPELINE = "basic"
 DATA_PATH = '/net/travail/bformanek/MRI_dataset'
 SPECIALIZATION = "_basic_N1"
 
+"""
 PIPELINE = "basic"
 DATA_PATH = '/net/travail/mvajay/TRDP/N2'
 SPECIALIZATION = "_basic_N2"
 
-
+"""
 PIPELINE = "basic"
 DATA_PATH = '/net/travail/mvajay/TRDP/N3'
 SPECIALIZATION = "_basic_N3"
@@ -16,12 +18,12 @@ SPECIALIZATION = "_basic_N3"
 PIPELINE = "different_anatomical_plane_in_rgb"
 DATA_PATH = '/net/travail/bformanek/MRI_dataset'
 SPECIALIZATION = "_different_anatomical_plane_in_rgb_N1"
-"""
+
 #same anatomical planes in RGB
 PIPELINE = "same_anatomical_plane_in_rgb"
 DATA_PATH = '/net/travail/mvajay/TRDP/N3_c1'
 SPECIALIZATION = "_same_anatomical_plane_in_rgb_N3_c1"
-""""
+
 PIPELINE = "same_anatomical_plane_in_rgb"
 DATA_PATH = '/net/travail/mvajay/TRDP/N3_c2'
 SPECIALIZATION = "_same_anatomical_plane_in_rgb_N3_c2"
@@ -47,7 +49,7 @@ TRAIN_FOLDER = DATA_PATH + '/train'
 VAL_FOLDER = DATA_PATH + '/val'
 TEST_FOLDER = DATA_PATH + '/test'
 
-MODEL_NAME = 'resnet18.a2_in1k'
+MODEL_NAME = 'resnet18'
 
 BATCH_SIZE = 64
 WORKERS = 8
@@ -59,10 +61,17 @@ SAVE_FOLDER_MAIN = '/net/travail/mvajay/TRDP/'
 
 CHECKPOINTS_FOLDER = SAVE_FOLDER_MAIN+'checkpoints/transfer_checkpoints_'+ MODEL_NAME + SPECIALIZATION
 SAVE_PREDICTION_PATH = '/net/cremi/mvajay/TRDP/predictions/' + MODEL_NAME + SPECIALIZATION + "_b_epoch_"
-SAVE_MISCLASSIFIED_PATH = '/net/cremi/mvajay/TRDP/misclassified_data/'+ MODEL_NAME +'_mislabelled_majority_voting'+'.json'
-SAVE_CONFUSIONM_PATH = '/net/cremi/mvajay/TRDP/confusionM/' + MODEL_NAME + '_majority_voting'+'.png'
+SAVE_MISCLASSIFIED_PATH = '/net/cremi/mvajay/TRDP/misclassified_data/'+ MODEL_NAME + SPECIALIZATION +'_misclassified'
+SAVE_CONFUSIONM_PATH = '/net/cremi/mvajay/TRDP/confusionM/' + MODEL_NAME + SPECIALIZATION +'.png'
 SAVE_INPUT_VISUALIZATION_PATH = '/net/cremi/mvajay/TRDP/input_visualization/' + MODEL_NAME + SPECIALIZATION
 SAVE_PATH_LEARNING_CURVES = SAVE_FOLDER_MAIN + "learning_curves/" + MODEL_NAME + SPECIALIZATION
 
 LOG_PATH = '/net/cremi/mvajay/TRDP/logs/'+ MODEL_NAME + SPECIALIZATION + ".log"
 
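+# Maps the axis token parsed from an image filename (the second-to-last
+# "_"-separated field) to its anatomical plane name.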
+AXIS_CONVERTION = {
+    '0': "axial",
+    '1': "coronial",
+    '2': "sagittal"
+}
\ No newline at end of file
diff --git a/build_models/run_combinations.py b/build_models/run_combinations.py
index 0c9d7441e27bbf17a4f5126579fcfb01c0a02cfa..10b6ebf4156899f46fc71eff9caa4c03871253ee 100644
--- a/build_models/run_combinations.py
+++ b/build_models/run_combinations.py
@@ -46,7 +46,7 @@ SAVE_FOLDER_MAIN = '/net/travail/mvajay/TRDP/'
 
 CHECKPOINTS_FOLDER = SAVE_FOLDER_MAIN+'checkpoints/transfer_checkpoints_'+ MODEL_NAME + SPECIALIZATION
 SAVE_PREDICTION_PATH = SAVE_FOLDER_MAIN+'predictions/' + MODEL_NAME + SPECIALIZATION + "_b_epoch_"
-SAVE_MISCLASSIFIED_PATH = SAVE_FOLDER_MAIN+'misclassified_data/'+ MODEL_NAME + SPECIALIZATION +'_mislabelled_majority_voting'+'.json'
+SAVE_MISCLASSIFIED_PATH = '/net/cremi/mvajay/TRDP/misclassified_data/'+ MODEL_NAME + SPECIALIZATION +'_misclassified'
 SAVE_CONFUSIONM_PATH = SAVE_FOLDER_MAIN+'confusionM/' + MODEL_NAME + SPECIALIZATION + '_majority_voting'+'.png'
 SAVE_INPUT_VISUALIZATION_PATH = SAVE_FOLDER_MAIN+'input_visualization/' + MODEL_NAME + SPECIALIZATION
 SAVE_PATH_LEARNING_CURVES = SAVE_FOLDER_MAIN + "learning_curves/" + MODEL_NAME + SPECIALIZATION
diff --git a/build_models/test_functions.py b/build_models/test_functions.py
index 915007c26491b45b2d5c4965df3e264607c0726e..ae4d80bace8cec793d335ef0787e93d5bfce68fa 100644
--- a/build_models/test_functions.py
+++ b/build_models/test_functions.py
@@ -44,7 +44,16 @@ def predict(model, data_loader):
 def averaging_prediction(y_pred, test_set, test_image_paths):
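+    """Average each volume's per-slice probabilities into a single prediction.
+
+    Returns the grouped per-slice probabilities, the per-volume ground truth,
+    the per-volume argmax of the averaged probabilities, and the grouped
+    per-slice probabilities assigned to the ground-truth class.
+    """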
     gt_y = np.array(test_set.targets) # get the true labels and convert to numpy
     grouped_predictions = defaultdict(list)
-
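+    # also track each slice's ground-truth-class probability and the per-volume label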
+    grouped_gt_predictions = defaultdict(list)
+    ground_truth_dict = {}
+
     for i, image_path in enumerate(test_image_paths):
         # Strip the filename suffix so slices of the same volume share one key
         if isinstance(image_path, list):
@@ -53,15 +62,9 @@ def averaging_prediction(y_pred, test_set, test_image_paths):
             base_name = image_path[:-10]
         # Append the prediction probabilities for the current image to the corresponding group
         grouped_predictions[base_name].append(y_pred[i])
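+        # probability the model assigned to this slice's ground-truth class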
+        grouped_gt_predictions[base_name].append(y_pred[i][gt_y[i]])
 
-    ground_truth_dict = {}
-
-    for i, image_path in enumerate(test_image_paths):
-        if isinstance(image_path, list):
-            base_name = image_path[0][:-10]
-        else:
-            base_name = image_path[:-10]
-        
         if base_name not in ground_truth_dict:
             ground_truth_dict[base_name] = gt_y[i]
         elif ground_truth_dict[base_name] != gt_y[i]:
@@ -77,7 +80,7 @@ def averaging_prediction(y_pred, test_set, test_image_paths):
         final_class = np.argmax(combined_probs)
         final_predictions[base_name] = final_class
 
-    return grouped_predictions, ground_truth_dict, final_predictions
+    return grouped_predictions, ground_truth_dict, final_predictions, grouped_gt_predictions
 
 def predict_ENSEMBLE_with_confidence(model1, model2, model3, data_loader1, data_loader2, data_loader3, device):
     # Set models to evaluation mode