mpi2 · marinak-ebi · May 28, 2025 · May 28, 2025 · May 28, 2025 · May 28, 2025
diff --git a/Late adults stats pipeline/DRrequiredAgeing/DRrequiredAgeingPackage/R/sideFunctions.R b/Late adults stats pipeline/DRrequiredAgeing/DRrequiredAgeingPackage/R/sideFunctions.R
@@ -4551,87 +4551,57 @@ DirectionTagMM = function(x,
   return(tag)
 }
 
-DirectionTagFE = function(x,
-                          threshold = .0001,
-                          group = c('ABNORMAL', 'INCREASED', 'DECREASED')) {
-  tag = if (is.null(x) || length(x) < 1) {
-    'NoEffectCalculated'
-  } else if (x < threshold) {
-    group
-  } else {
-    paste0('NORMAL - Test not significant at the level of ',
-           threshold,
-           ' (pvalue = ',
-           x,
-           ')')
-  }
-  return(tag)
-}
-
-returnWhatBasedOnThreshold = function(x = NULL,
-                                      threshold = .0001,
-                                      Return = 'ABNORMAL') {
-  if (is.numeric(x) && x < threshold) {
-    return(Return)
-  } else {
-    return('IgnoreThisCaseAtALL')
-  }
-}
-
 GenotypeTag = function(obj,
                        threshold = 10 ^ -4,
-                       expDetailsForErrorOnly = NULL,
-                       rrlevel = 10 ^ -4) {
+                       rrlevel = 10 ^ -4,
+                       method = NULL) {
   if (is.null(obj))
     return(NULL)
-  method = GetMethodStPa(x = obj$`Applied method`)
   message('\t The analysis method = ', method)
   message('\t The decision threshold = ',
           threshold,
           ' and for the RR method it is ',
           rrlevel)
 
+  # Initialise an empty data frame.
+  tag <- data.frame(
+    Sex = character(),
+    StatisticalTestResult = character(),
+    Level = character(),
+    PValue = numeric(),
+    stringsAsFactors = FALSE
+  )
+
   if (method %in% 'MM') {
-    # Initialise an empty data frame.
-    tag <- data.frame(
-      Sex = character(),
-      StatisticalTestResult = character(),
-      Level = character()
-    )
     # Define sex/column prefix info.
     sex_column_prefix_pairs <- list(
       c("UNSPECIFIED", "Genotype"),
       c("FEMALE", "Sex FvKO"),
-      c("MALE", "Sex MvKO"),
-      stringsAsFactors = FALSE
+      c("MALE", "Sex MvKO")
     )
     # Iterate over sexes and their corresponding column prefixes.
     for (pair in sex_column_prefix_pairs) {
       sex <- pair[1]
       column_prefix <- pair[2]
+      pvalue = obj[[paste0(column_prefix, " p-value")]]
+      effect_size = obj[[paste0(column_prefix, " estimate")]]$Value
       # Append to existing dataframe.
       tag <- rbind(tag, data.frame(
         Sex = sex,
         StatisticalTestResult = c(
           # Statistical test based on genotype effect estimate and p-value.
           DirectionTagMM(
-            x = obj[[paste0(column_prefix, " estimate")]]$Value,
-            pvalue = obj[[paste0(column_prefix, " p-value")]],
+            x = effect_size,
+            pvalue = pvalue,
             threshold = threshold
           ),
-          # Simple statistical test based on p-value only.
-          returnWhatBasedOnThreshold(
-            x = obj[[paste0(column_prefix, " p-value")]],
-            threshold = threshold,
-            Return = "ABNORMAL"
-          ),
-          returnWhatBasedOnThreshold(
-            x = obj[[paste0(column_prefix, " p-value")]],
-            threshold = threshold,
-            Return = 'INFERRED'
-          )
+          # Comparing test based on p-value only.
+          if (is.numeric(pvalue) && pvalue < threshold) "ABNORMAL" else NA,
+          if (is.numeric(pvalue) && pvalue < threshold) "INFERRED" else NA
         ),
         Level = "OVERALL",
+        PValue = if (is.null(pvalue)) NA else pvalue,
+        EffectSize = if (is.null(effect_size)) NA else effect_size,
         stringsAsFactors = FALSE
       ))
     }
@@ -4646,13 +4616,6 @@ GenotypeTag = function(obj,
       fmodels$Genotype$`Complete table` = fmodels$`Complete table`
       fmodels$`Complete table` = NULL
     }
-    # Initialise an empty data frame.
-    tag <- data.frame(
-      Sex = character(),
-      StatisticalTestResult = character(),
-      Level = character(),
-      stringsAsFactors = FALSE
-    )
     # Define sex/column prefix info.
     sex_column_prefix_pairs <- list(
       c("UNSPECIFIED", "Genotype"),
@@ -4663,16 +4626,16 @@ GenotypeTag = function(obj,
     for (pair in sex_column_prefix_pairs) {
       sex <- pair[1]
       column_prefix <- pair[2]
+      pvalue <- fmodels[[column_prefix]]$`Complete table`$p.value
       # Append to existing dataframe.
       tag <- rbind(
         tag,
         data.frame(
           Sex = sex,
-          StatisticalTestResult = DirectionTagFE(
-            x = fmodels[[column_prefix]]$`Complete table`$p.value,
-            threshold = threshold
-          ),
+          StatisticalTestResult = if (is.numeric(pvalue) && pvalue < threshold) "ABNORMAL" else NA,
           Level = "OVERALL",
+          PValue = if (is.null(pvalue)) NA else pvalue,
+          EffectSize = NA,
           stringsAsFactors = FALSE
         )
       )
@@ -4683,13 +4646,6 @@ GenotypeTag = function(obj,
     fmodels = obj$`Additional information`$Analysis$`Further models`
     if (is.null(fmodels))
       return(NULL)
-    # Initialise an empty data frame.
-    tag <- data.frame(
-      Sex = character(),
-      StatisticalTestResult = character(),
-      Level = character(),
-      stringsAsFactors = FALSE
-    )
     # Define sex/column prefix info.
     sex_column_prefix_pairs <- list(
       c("UNSPECIFIED", "Genotype"),
@@ -4701,42 +4657,35 @@ GenotypeTag = function(obj,
       sex <- pair[1]
       column_prefix <- pair[2]
       # Get data for LOW and HIGH tails.
-      low_results <- DirectionTagFE(
-        x = fmodels[[paste0("Low.data_point.", column_prefix)]]$Genotype$Result$p.value,
-        threshold = rrlevel,
-        group = c("ABNORMAL", "DECREASED")
-      )
-      high_results <- DirectionTagFE(
-        x = fmodels[[paste0("High.data_point.", column_prefix)]]$Genotype$Result$p.value,
-        threshold = rrlevel,
-        group = c("ABNORMAL", "INCREASED")
-      )
-      all_results <- c(low_results, high_results)
+      low_pvalue <- fmodels[[paste0("Low.data_point.", column_prefix)]]$Genotype$Result$p.value
+      high_pvalue <- fmodels[[paste0("High.data_point.", column_prefix)]]$Genotype$Result$p.value
       # Append all to existing dataframe.
-      for (result in all_results) {
-        tag <- rbind(
-          tag,
-          data.frame(
-            Sex = sex,
-            StatisticalTestResult = result,
-            Level = "OVERALL",
-            stringsAsFactors = FALSE
-          )
+      tag <- rbind(
+        tag,
+        data.frame(
+          Sex = sex,
+          StatisticalTestResult = if (is.numeric(low_pvalue) && low_pvalue < rrlevel) "ABNORMAL" else NA,
+          Level = "OVERALL",
+          PValue = if (is.null(low_pvalue)) NA else low_pvalue,
+          EffectSize = NA,
+          stringsAsFactors = FALSE
         )
-      }
+      )
+      tag <- rbind(
+        tag,
+        data.frame(
+          Sex = sex,
+          StatisticalTestResult = if (is.numeric(high_pvalue) && high_pvalue < rrlevel) "ABNORMAL" else NA,
+          Level = "OVERALL",
+          PValue = if (is.null(high_pvalue)) NA else high_pvalue,
+          EffectSize = NA,
+          stringsAsFactors = FALSE
+        )
+      )
     }
 
   } else {
     tag = NULL
-    write(
-      x = paste(
-        head(expDetailsForErrorOnly, 18),
-        sep = '\t',
-        collapse = '\t'
-      ),
-      file = 'ErrorneousCases.tsv.err',
-      ncolumns = 5000
-    )
   }
   return(tag)
 }
@@ -4758,11 +4707,7 @@ flatten_mp_chooser <- function(d) {
   }))
 }
 
-match_mp_terms <- function(Gtag, d, allowed_results = c()) {
-  # If provided, restrict the data to a list of allowed statistical test results.
-  if (length(allowed_results) > 0) {
-    Gtag <- subset(Gtag, StatisticalTestResult %in% allowed_results)
-  }
+match_mp_terms <- function(Gtag, d) {
   # First, try to join by exact sex, for example M call to M term.
   GtagExact <- merge(
     Gtag,
@@ -4773,9 +4718,10 @@ match_mp_terms <- function(Gtag, d, allowed_results = c()) {
   # Separate rows where we did / did not find a MP term.
   GtagExactMatched <- subset(GtagExact, !is.na(MpTerm))
   GtagExactUnmatched <- subset(GtagExact, is.na(MpTerm))
+  GtagExactUnmatched <- subset(GtagExactUnmatched, select = -MpTerm)
   # Now, for the unmatched records, try to find a U term as a fallback.
   GtagUnspecified <- merge(
-    Gtag,
+    GtagExactUnmatched,
     subset(d, Sex == "UNSPECIFIED", select = -Sex),
     by = c("StatisticalTestResult", "Level"),
     all.x = TRUE
@@ -4796,7 +4742,7 @@ annotationChooser = function(statpacket = NULL,
   requireNamespace("jsonlite")
   library(dplyr)
 
-  # Handle unsuccessful StatPackages.
+  # Handle unsuccessful StatPackets.
   if (
     is.null(statpacket) ||
     length(statpacket) < 1 ||
@@ -4821,7 +4767,8 @@ annotationChooser = function(statpacket = NULL,
   Gtag = GenotypeTag(
     obj = json$Result$`Vector output`[[resultKey]],
     threshold = level,
-    rrlevel = rrlevel
+    rrlevel = rrlevel,
+    method = method
   )
 
   # Load mp_chooser Rdata file.
@@ -4849,21 +4796,7 @@ annotationChooser = function(statpacket = NULL,
     d <- flatten_mp_chooser(d)
 
     # 2. Join MP term information from mp_chooser.
-    if (method %in% "MM") {
-      Gtag <- match_mp_terms(Gtag, d)
-    } else if (method %in% "FE") {
-      Gtag <- match_mp_terms(Gtag, d, c("ABNORMAL"))
-    } else if (method %in% "RR") {
-      # By default, only ABNORMAL calls are used for RR.
-      GtagAbnormal <- match_mp_terms(Gtag, d, c("ABNORMAL"))
-      # In case no ABNORMAL MP terms were found, try to match INCREASED/DECREASED terms.
-      # This approach is left unchanged from the original annotation pipeline.
-      if (nrow(subset(GtagAbnormal, !is.na(MpTerm))) > 0) {
-        Gtag <- GtagAbnormal
-      } else {
-        Gtag <- match_mp_terms(Gtag, d, c("INCREASED", "DECREASED"))
-      }
-    }
+    Gtag <- match_mp_terms(Gtag, d)
 
     # 3. Remove records with no assigned MP terms.
     # This filters out all rows with no statistically significant results.
@@ -4876,59 +4809,44 @@ annotationChooser = function(statpacket = NULL,
     } else {
       # Define the priority order for StatisticalTestResult.
       priority_order <- c("INCREASED", "DECREASED", "INFERRED", "ABNORMAL")
-      # Check if the input dataframe is empty.
-      if (nrow(Gtag) == 0) {
-        MPTERMS <- list()
-      } else {
-        # Group by Sex and select one row per group based on priority.
-        filtered_data <- Gtag %>%
-          group_by(Sex) %>%
-          arrange(match(StatisticalTestResult, priority_order)) %>%
-          slice(1) %>%
-          ungroup()
-        # Drop UNSPECIFIED if MALE or FEMALE data is present.
-        if (any(filtered_data$Sex %in% c("MALE", "FEMALE"))) {
-          filtered_data <- filtered_data %>% filter(Sex != "UNSPECIFIED")
-        }
-        # Handle UNSPECIFIED case if no MALE or FEMALE data is present.
-        if (!any(filtered_data$Sex %in% c("MALE", "FEMALE"))) {
-          if (length(sex_levels) == 1) {
-            filtered_data$Sex <- toupper(sex_levels)
-          }
-        }
-        # Convert Sex column to lowercase and replace "unspecified" with "not_considered".
-        filtered_data <- filtered_data %>%
-          mutate(Sex = tolower(Sex),
-                  Sex = ifelse(Sex == "unspecified", "not_considered", Sex))
-        # Ensure female comes before male if both are present.
-        filtered_data <- filtered_data %>%
-          arrange(Sex)
-        # Convert to the desired list format.
-        MPTERMS <- filtered_data %>%
-          mutate(sex = Sex, event = StatisticalTestResult, term_id = MpTerm) %>%
-          select(term_id, event, sex) %>%
-          # Use `purrr::transpose()` to create an unnamed list of objects.
-          purrr::transpose() %>%
-          # Remove names from the list
-          unname()
-        # Special case for RR: add an empty `otherPossibilities` field for compatibility.
-        if (method == "RR") {
-          MPTERMS <- purrr::map(MPTERMS, ~ {
-            .x$otherPossibilities <- ""
-            .x
-          })
+      # Group by Sex and select one row per group based on priority.
+      filtered_data <- Gtag %>%
+        group_by(Sex) %>%
+        arrange(match(StatisticalTestResult, priority_order)) %>%
+        slice(1) %>%
+        ungroup()
+      # Drop UNSPECIFIED if MALE or FEMALE data is present.
+      if (any(filtered_data$Sex %in% c("MALE", "FEMALE"))) {
+        filtered_data <- filtered_data %>% filter(Sex != "UNSPECIFIED")
+      }
+      # Handle UNSPECIFIED case if no MALE or FEMALE data is present.
+      if (!any(filtered_data$Sex %in% c("MALE", "FEMALE"))) {
+        if (length(sex_levels) == 1) {
+          filtered_data$Sex <- toupper(sex_levels)
         }
       }
+      # Convert Sex column to lowercase and replace "unspecified" with "not_considered".
+      filtered_data <- filtered_data %>%
+        mutate(Sex = tolower(Sex),
+                Sex = ifelse(Sex == "unspecified", "not_considered", Sex))
+      # Ensure female comes before male if both are present.
+      filtered_data <- filtered_data %>%
+        arrange(Sex)
+      # Convert to the desired list format.
+      MPTERMS <- filtered_data %>%
+        mutate(sex = Sex, event = StatisticalTestResult, term_id = MpTerm, p_value = PValue, effect_size = EffectSize) %>%
+        select(term_id, event, sex, p_value, effect_size) %>%
+        # Use `purrr::transpose()` to create an unnamed list of objects.
+        purrr::transpose() %>%
+        # Remove names from the list.
+        unname()
     }
-
   }
 
   # Return the final data structure.
   if (length(MPTERMS) > 0) {
     json$Result$Details[[TermKey]] = MPTERMS
     statpacket$V20 = FinalJson2ObjectCreator(FinalList = json)
   }
-  return(invisible(list(
-    MPTERM = MPTERMS, statpacket = statpacket
-  )))
+  return(invisible(list(MPTERM = MPTERMS, statpacket = statpacket)))
 }
diff --git a/orchestration/orchestration.sh b/orchestration/orchestration.sh
@@ -247,6 +247,12 @@ for file in $(find . -maxdepth 1 -type f -name "split_index*"); do
 done
 chmod 775 annotation_jobs.bch
 
+message0 "Installing Python dependencies..."
+module load python-3.10.2-gcc-9.3.0-gswnsij
+python3.10 -m pip install rpy2
+python3.10 -m pip install numpy
+python3.10 -m pip install pandas
+
 message0 "Downloading the action script..."
 fetch_script loader.py annotation_pipeline
 submit_limit_jobs annotation_jobs.bch ../../../../compressed_logs/annotation_job_id.txt