From 1c2a01f6fc81dbdaad3f1255d2d0b8297e8df8c2 Mon Sep 17 00:00:00 2001 From: Marina Kan Date: Fri, 13 Jun 2025 13:55:14 +0100 Subject: [PATCH 01/46] Increase number of rows per file and add shuf --- orchestration/orchestration.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/orchestration/orchestration.sh b/orchestration/orchestration.sh index e50584d8..a2500bc4 100755 --- a/orchestration/orchestration.sh +++ b/orchestration/orchestration.sh @@ -226,10 +226,10 @@ mv *.Ind AnnotationExtractorAndHadoopLoader cd AnnotationExtractorAndHadoopLoader message0 "Concatenating single index files to create a global index for the results..." -cat *.Ind >> AllResultsIndeces.txt +cat *.Ind | shuf >> AllResultsIndeces.txt message0 "Zipping the single indeces..." zip -q -rm allsingleindeces.zip *.Ind -split -50 AllResultsIndeces.txt split_index_ +split -1000 AllResultsIndeces.txt split_index_ message0 "Convert the mp_chooser JSON file to Rdata..." R --quiet -e "a = jsonlite::fromJSON('../../../../mp_chooser.json');save(a,file='../../../../mp_chooser.json.Rdata')" From c5f5b018373d6362d1855688c1795cb4134fcfa6 Mon Sep 17 00:00:00 2001 From: Marina Kan Date: Fri, 13 Jun 2025 15:26:06 +0100 Subject: [PATCH 02/46] Create folder for input_parquet_files --- orchestration/orchestration.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/orchestration/orchestration.sh b/orchestration/orchestration.sh index a2500bc4..767f9b08 100755 --- a/orchestration/orchestration.sh +++ b/orchestration/orchestration.sh @@ -97,7 +97,7 @@ cd SP message0 "Phase I. Convert parquet files into Rdata..." message0 "Step 1. Create jobs" -step1_files=$(find .. 
-type f -name '*.parquet' -exec realpath {} \;) +step1_files=$(find ../input_parquet_files -type f -name '*.parquet' -exec realpath {} \;) for file in $step1_files; do echo "sbatch --job-name=impc_stats_pipeline_job --mem=10G --time=00:10:00 -e ${file}.err -o ${file}.log --wrap='Rscript Step2Parquet2Rdata.R $file'" >> jobs_step2_Parquet2Rdata.bch done @@ -107,8 +107,8 @@ fetch_script 0-ETL/Step2Parquet2Rdata.R sbatch --job-name=impc_stats_pipeline_job --time=01:00:00 --mem=1G -o ../compressed_logs/step2_job_id.txt --wrap="bash jobs_step2_Parquet2Rdata.bch" waitTillCommandFinish rm Step2Parquet2Rdata.R -find ../ -type f -name '*.log' -exec zip -q -m ../compressed_logs/step2_logs.zip {} + -find ../ -type f -name '*.err' -exec zip -q -m ../compressed_logs/step2_logs.zip {} + +find ../input_parquet_files -type f -name '*.log' -exec zip -q -m ../compressed_logs/step2_logs.zip {} + +find ../input_parquet_files -type f -name '*.err' -exec zip -q -m ../compressed_logs/step2_logs.zip {} + message0 "Step 3. Merging pseudo Rdata files into single file for each procedure - jobs creator" dirs=$(find "${sp_results}/ProcedureScatterRdata" -maxdepth 1 -type d) From a4b3c5f35266b366ec0fc28f1f2b220f32011ac7 Mon Sep 17 00:00:00 2001 From: Marina Kan Date: Fri, 13 Jun 2025 15:35:52 +0100 Subject: [PATCH 03/46] Flatten job folder structure --- orchestration/orchestration.sh | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/orchestration/orchestration.sh b/orchestration/orchestration.sh index 767f9b08..28888f43 100755 --- a/orchestration/orchestration.sh +++ b/orchestration/orchestration.sh @@ -146,11 +146,11 @@ zip -q -rm phase2_logs.zip DataGeneratingLog/ mv phase2_logs.zip ../compressed_logs/ message0 "Appending all procedure based jobs into one single file..." -mkdir jobs -find ./*/*_RawData/*.bch -type f | xargs cat >> jobs/AllJobs.bch +mkdir ../jobs +find ./*/*_RawData/*.bch -type f | xargs cat >> ../jobs/AllJobs.bch message0 "Phase III. 
Initialising the statistical analysis..." -cd jobs +cd ../jobs message0 "Updating the dynamic contents from the IMPReSS..." R --quiet -e \ "DRrequiredAgeing:::updateImpress( \ @@ -162,20 +162,20 @@ R --quiet -e \ message0 "Running the IMPC statistical pipeline by submitting jobs..." if [ "${WINDOWING_PIPELINE}" = true ]; then - fetch_script jobs/function_windowed.R + fetch_script ../jobs/function_windowed.R mv function_windowed.R function.R else - fetch_script jobs/function.R + fetch_script ../jobs/function.R fi R --quiet -e \ "DRrequiredAgeing:::ReplaceWordInFile( \ - '$(realpath function.R)', \ + '$(realpath ../jobs/function.R)', \ 'DRversionNotSpecified', \ ${VERSION} \ )" chmod 775 AllJobs.bch -submit_limit_jobs AllJobs.bch ../../compressed_logs/phase3_job_id.txt +submit_limit_jobs AllJobs.bch ../compressed_logs/phase3_job_id.txt waitTillCommandFinish message0 "Postprocessing the IMPC statistical analysis results..." @@ -203,7 +203,7 @@ message0 "This is the last step. If you see no file in the list below, the SP is # Annotation pipeline. message0 "Starting the IMPC annotation pipeline..." -cd jobs/Results_IMPC_SP_Windowed/ +cd ../jobs/Results_IMPC_SP_Windowed/ message0 "Step 1: Clean ups and creating the global index for the results." message0 "Indexing the results..." for dir in $(find . -mindepth 2 -maxdepth 2 -type d); do @@ -213,12 +213,12 @@ for dir in $(find . -mindepth 2 -maxdepth 2 -type d); do -e ${base_dir}_error.err -o ${base_dir}_output.log --wrap=\"find $dir -type f -name '*.tsv' -exec realpath {} \; > $output_file\"" >> minijobs.bch done chmod 775 minijobs.bch -submit_limit_jobs minijobs.bch ../../../compressed_logs/minijobs_job_id.txt +submit_limit_jobs minijobs.bch ../../compressed_logs/minijobs_job_id.txt waitTillCommandFinish -mv minijobs.bch ../../../compressed_logs +mv minijobs.bch ../../compressed_logs -find . -type f -name '*_output.log' -exec zip -q -m ../../../compressed_logs/minijobs_logs.zip {} + -find . 
-type f -name '*_error.err' -exec zip -q -m ../../../compressed_logs/minijobs_logs.zip {} + +find . -type f -name '*_output.log' -exec zip -q -m ../../compressed_logs/minijobs_logs.zip {} + +find . -type f -name '*_error.err' -exec zip -q -m ../../compressed_logs/minijobs_logs.zip {} + message0 "Moving single indeces into a separate directory called AnnotationExtractorAndHadoopLoader..." mkdir AnnotationExtractorAndHadoopLoader chmod 775 AnnotationExtractorAndHadoopLoader From 869801da0cf9646c63f281387b1448a1fbb169a8 Mon Sep 17 00:00:00 2001 From: Marina Kan Date: Fri, 13 Jun 2025 15:39:08 +0100 Subject: [PATCH 04/46] Rename AnnotationExtractorAndHadoopLoader to annotation_extractor --- orchestration/orchestration.sh | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/orchestration/orchestration.sh b/orchestration/orchestration.sh index 28888f43..63501255 100755 --- a/orchestration/orchestration.sh +++ b/orchestration/orchestration.sh @@ -219,11 +219,11 @@ mv minijobs.bch ../../compressed_logs find . -type f -name '*_output.log' -exec zip -q -m ../../compressed_logs/minijobs_logs.zip {} + find . -type f -name '*_error.err' -exec zip -q -m ../../compressed_logs/minijobs_logs.zip {} + -message0 "Moving single indeces into a separate directory called AnnotationExtractorAndHadoopLoader..." -mkdir AnnotationExtractorAndHadoopLoader -chmod 775 AnnotationExtractorAndHadoopLoader -mv *.Ind AnnotationExtractorAndHadoopLoader -cd AnnotationExtractorAndHadoopLoader +message0 "Moving single indeces into a separate directory called annotation_extractor..." +mkdir annotation_extractor +chmod 775 annotation_extractor +mv *.Ind annotation_extractor +cd annotation_extractor message0 "Concatenating single index files to create a global index for the results..." 
cat *.Ind | shuf >> AllResultsIndeces.txt From 5af5109a6e9dfa80cc205f8899f5bde0f4248adf Mon Sep 17 00:00:00 2001 From: Marina Kan Date: Fri, 13 Jun 2025 15:42:15 +0100 Subject: [PATCH 05/46] Flatten annotation_extractor --- orchestration/orchestration.sh | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/orchestration/orchestration.sh b/orchestration/orchestration.sh index 63501255..7c7cab1c 100755 --- a/orchestration/orchestration.sh +++ b/orchestration/orchestration.sh @@ -220,10 +220,10 @@ mv minijobs.bch ../../compressed_logs find . -type f -name '*_output.log' -exec zip -q -m ../../compressed_logs/minijobs_logs.zip {} + find . -type f -name '*_error.err' -exec zip -q -m ../../compressed_logs/minijobs_logs.zip {} + message0 "Moving single indeces into a separate directory called annotation_extractor..." -mkdir annotation_extractor -chmod 775 annotation_extractor -mv *.Ind annotation_extractor -cd annotation_extractor +mkdir ../annotation_extractor +chmod 775 ../annotation_extractor +mv *.Ind ../annotation_extractor +cd ../annotation_extractor message0 "Concatenating single index files to create a global index for the results..." cat *.Ind | shuf >> AllResultsIndeces.txt @@ -232,8 +232,8 @@ zip -q -rm allsingleindeces.zip *.Ind split -1000 AllResultsIndeces.txt split_index_ message0 "Convert the mp_chooser JSON file to Rdata..." -R --quiet -e "a = jsonlite::fromJSON('../../../../mp_chooser.json');save(a,file='../../../../mp_chooser.json.Rdata')" -export MP_CHOOSER_FILE=$(realpath ../../../../mp_chooser.json.Rdata | tr -d '\n') +R --quiet -e "a = jsonlite::fromJSON('../mp_chooser.json');save(a,file='../mp_chooser.json.Rdata')" +export MP_CHOOSER_FILE=$(realpath ../mp_chooser.json.Rdata | tr -d '\n') if [[ -z "${MP_CHOOSER_FILE}" || ! 
-f "${MP_CHOOSER_FILE}" ]]; then echo -e "ERROR: mp_chooser not found at location\n\t${MP_CHOOSER_FILE}" @@ -255,11 +255,11 @@ python3.10 -m pip install pandas message0 "Downloading the action script..." fetch_script loader.py annotation_pipeline -submit_limit_jobs annotation_jobs.bch ../../../../compressed_logs/annotation_job_id.txt +submit_limit_jobs annotation_jobs.bch ../compressed_logs/annotation_job_id.txt waitTillCommandFinish message0 "Zipping logs..." -mv annotation_jobs.bch ../../../../compressed_logs -zip -q -rm ../../../../compressed_logs/annotation_logs.zip log/* err/* out/* +mv annotation_jobs.bch ../compressed_logs +zip -q -rm ../compressed_logs/annotation_logs.zip log/* err/* out/* zip -q -rm splits.zip split_index_* message0 "Job done." From b292c7c9e6bbf5268df114103f7547c8d4177fa6 Mon Sep 17 00:00:00 2001 From: Marina Kan Date: Fri, 13 Jun 2025 15:43:27 +0100 Subject: [PATCH 06/46] Rename tmp to annotation_pipeline_output --- annotation_pipeline/loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/annotation_pipeline/loader.py b/annotation_pipeline/loader.py index 1427e686..4548f684 100644 --- a/annotation_pipeline/loader.py +++ b/annotation_pipeline/loader.py @@ -87,7 +87,7 @@ def main(): total_files = len(file_list) # Store StatPackets temporary. - output_dir = Path("tmp") + output_dir = Path("annotation_pipeline_output") if not output_dir.exists(): output_dir.mkdir() From 4988caacb7448b5ac9a9d74e8cf71f9ab35f9769 Mon Sep 17 00:00:00 2001 From: Marina Kan Date: Fri, 13 Jun 2025 16:04:33 +0100 Subject: [PATCH 07/46] Create folder input_parquet_files --- orchestration/orchestration.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/orchestration/orchestration.sh b/orchestration/orchestration.sh index 7c7cab1c..71da0841 100755 --- a/orchestration/orchestration.sh +++ b/orchestration/orchestration.sh @@ -76,7 +76,8 @@ function submit_limit_jobs() { # Preparation. 
mkdir --mode=775 ${KOMP_PATH}/impc_statistical_pipeline/IMPC_DRs/stats_pipeline_input_dr${VERSION} cd ${KOMP_PATH}/impc_statistical_pipeline/IMPC_DRs/stats_pipeline_input_dr${VERSION} -cp ${PARQUET_FOLDER}/*.parquet ./ +mkdir input_parquet_files +cp ${PARQUET_FOLDER}/*.parquet ./input_parquet_files cp ${MP_CHOOSER_FOLDER}/part*.txt ./mp_chooser.json message0 "Update packages to the latest version" From 808b0bf01cdfe6a2944293d2d7d1a52b203dd22e Mon Sep 17 00:00:00 2001 From: Marina Kan Date: Wed, 18 Jun 2025 09:45:35 +0100 Subject: [PATCH 08/46] Fix paths --- orchestration/orchestration.sh | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/orchestration/orchestration.sh b/orchestration/orchestration.sh index 71da0841..31b306c3 100755 --- a/orchestration/orchestration.sh +++ b/orchestration/orchestration.sh @@ -88,12 +88,12 @@ message0 "Update completed" # Statistical pipeline. message0 "Starting the IMPC statistical pipeline..." -mkdir SP compressed_logs +mkdir stats_batching compressed_logs export input_path=$(realpath .) -export sp_results=$(realpath SP) +export sp_results=$(realpath stats_batching) message0 "Parquet files path: ${input_path}" message0 "Output path: ${sp_results}" -cd SP +cd stats_batching message0 "Phase I. Convert parquet files into Rdata..." @@ -147,11 +147,11 @@ zip -q -rm phase2_logs.zip DataGeneratingLog/ mv phase2_logs.zip ../compressed_logs/ message0 "Appending all procedure based jobs into one single file..." -mkdir ../jobs -find ./*/*_RawData/*.bch -type f | xargs cat >> ../jobs/AllJobs.bch +mkdir ../stats_results +find ./*/*_RawData/*.bch -type f | xargs cat >> ../stats_results/AllJobs.bch message0 "Phase III. Initialising the statistical analysis..." -cd ../jobs +cd ../stats_results message0 "Updating the dynamic contents from the IMPReSS..." 
R --quiet -e \ "DRrequiredAgeing:::updateImpress( \ @@ -163,15 +163,15 @@ R --quiet -e \ message0 "Running the IMPC statistical pipeline by submitting jobs..." if [ "${WINDOWING_PIPELINE}" = true ]; then - fetch_script ../jobs/function_windowed.R - mv function_windowed.R function.R + fetch_script ../stats_results/function_windowed.R + mv ../stats_results/function_windowed.R ../stats_results/function.R else - fetch_script ../jobs/function.R + fetch_script ../stats_results/function.R fi R --quiet -e \ "DRrequiredAgeing:::ReplaceWordInFile( \ - '$(realpath ../jobs/function.R)', \ + '$(realpath ../stats_results/function.R)', \ 'DRversionNotSpecified', \ ${VERSION} \ )" @@ -204,7 +204,7 @@ message0 "This is the last step. If you see no file in the list below, the SP is # Annotation pipeline. message0 "Starting the IMPC annotation pipeline..." -cd ../jobs/Results_IMPC_SP_Windowed/ +cd ../stats_results/Results_IMPC_SP_Windowed/ message0 "Step 1: Clean ups and creating the global index for the results." message0 "Indexing the results..." for dir in $(find . -mindepth 2 -maxdepth 2 -type d); do From 24bb576ab17382ac972b3dde462a3072517c3d39 Mon Sep 17 00:00:00 2001 From: Marina Kan Date: Wed, 18 Jun 2025 09:49:44 +0100 Subject: [PATCH 09/46] Temporarily turn off updateImpress function --- orchestration/orchestration.sh | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/orchestration/orchestration.sh b/orchestration/orchestration.sh index 31b306c3..6572731b 100755 --- a/orchestration/orchestration.sh +++ b/orchestration/orchestration.sh @@ -152,14 +152,14 @@ find ./*/*_RawData/*.bch -type f | xargs cat >> ../stats_results/AllJobs.bch message0 "Phase III. Initialising the statistical analysis..." cd ../stats_results -message0 "Updating the dynamic contents from the IMPReSS..." 
-R --quiet -e \ -"DRrequiredAgeing:::updateImpress( \ - updateImpressFileInThePackage = TRUE, \ - updateOptionalParametersList = TRUE, \ - updateTheSkipList = TRUE, \ - saveRdata = FALSE \ -)" +# message0 "Updating the dynamic contents from the IMPReSS..." +# R --quiet -e \ +# "DRrequiredAgeing:::updateImpress( \ +# updateImpressFileInThePackage = TRUE, \ +# updateOptionalParametersList = TRUE, \ +# updateTheSkipList = TRUE, \ +# saveRdata = FALSE \ +# )" message0 "Running the IMPC statistical pipeline by submitting jobs..." if [ "${WINDOWING_PIPELINE}" = true ]; then From a9ca8a636a58f2ea93344e69e1138be67bf64a24 Mon Sep 17 00:00:00 2001 From: Marina Kan Date: Wed, 18 Jun 2025 11:18:10 +0100 Subject: [PATCH 10/46] Fix path for function_windowed --- orchestration/orchestration.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/orchestration/orchestration.sh b/orchestration/orchestration.sh index 6572731b..47cd94aa 100755 --- a/orchestration/orchestration.sh +++ b/orchestration/orchestration.sh @@ -163,10 +163,10 @@ cd ../stats_results message0 "Running the IMPC statistical pipeline by submitting jobs..." 
if [ "${WINDOWING_PIPELINE}" = true ]; then - fetch_script ../stats_results/function_windowed.R - mv ../stats_results/function_windowed.R ../stats_results/function.R + fetch_script function_windowed.R + mv function_windowed.R function.R else - fetch_script ../stats_results/function.R + fetch_script function.R fi R --quiet -e \ From 161368015ff4642c818a26f628bd0545aad28101 Mon Sep 17 00:00:00 2001 From: Marina Kan Date: Wed, 18 Jun 2025 11:30:04 +0100 Subject: [PATCH 11/46] Add additional logs --- orchestration/orchestration.sh | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/orchestration/orchestration.sh b/orchestration/orchestration.sh index 47cd94aa..dc62939f 100755 --- a/orchestration/orchestration.sh +++ b/orchestration/orchestration.sh @@ -163,18 +163,23 @@ cd ../stats_results message0 "Running the IMPC statistical pipeline by submitting jobs..." if [ "${WINDOWING_PIPELINE}" = true ]; then + message0 "Inside loop" fetch_script function_windowed.R + message0 "Downloaded" mv function_windowed.R function.R + message0 "Renamed" else fetch_script function.R fi +message0 "Before replace" R --quiet -e \ "DRrequiredAgeing:::ReplaceWordInFile( \ - '$(realpath ../stats_results/function.R)', \ + '$(realpath function.R)', \ 'DRversionNotSpecified', \ ${VERSION} \ )" +message0 "After replace" chmod 775 AllJobs.bch submit_limit_jobs AllJobs.bch ../compressed_logs/phase3_job_id.txt waitTillCommandFinish From 29207ff71612fb402bce7630c4b8f96f09838d8e Mon Sep 17 00:00:00 2001 From: Marina Kan Date: Wed, 18 Jun 2025 12:13:01 +0100 Subject: [PATCH 12/46] Fix path to function_windowed.R --- orchestration/orchestration.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/orchestration/orchestration.sh b/orchestration/orchestration.sh index dc62939f..48ef7517 100755 --- a/orchestration/orchestration.sh +++ b/orchestration/orchestration.sh @@ -164,7 +164,7 @@ cd ../stats_results message0 "Running the IMPC statistical pipeline by submitting 
jobs..." if [ "${WINDOWING_PIPELINE}" = true ]; then message0 "Inside loop" - fetch_script function_windowed.R + fetch_script jobs/function_windowed.R message0 "Downloaded" mv function_windowed.R function.R message0 "Renamed" From 3e1acfd18406dcbed8c1f649dd4a25f5836cc31e Mon Sep 17 00:00:00 2001 From: Marina Kan Date: Wed, 18 Jun 2025 12:31:38 +0100 Subject: [PATCH 13/46] Fix mp_chooser path --- orchestration/orchestration.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/orchestration/orchestration.sh b/orchestration/orchestration.sh index 48ef7517..2ef732f2 100755 --- a/orchestration/orchestration.sh +++ b/orchestration/orchestration.sh @@ -238,7 +238,7 @@ zip -q -rm allsingleindeces.zip *.Ind split -1000 AllResultsIndeces.txt split_index_ message0 "Convert the mp_chooser JSON file to Rdata..." -R --quiet -e "a = jsonlite::fromJSON('../mp_chooser.json');save(a,file='../mp_chooser.json.Rdata')" +R --quiet -e "a = jsonlite::fromJSON('../../mp_chooser.json');save(a,file='../mp_chooser.json.Rdata')" export MP_CHOOSER_FILE=$(realpath ../mp_chooser.json.Rdata | tr -d '\n') if [[ -z "${MP_CHOOSER_FILE}" || ! -f "${MP_CHOOSER_FILE}" ]]; then From 59e4b3b11181ba211eabb89f41ca25592bf1612b Mon Sep 17 00:00:00 2001 From: Marina Kan Date: Wed, 18 Jun 2025 12:42:05 +0100 Subject: [PATCH 14/46] Change path to annotation_extractor --- orchestration/orchestration.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/orchestration/orchestration.sh b/orchestration/orchestration.sh index 2ef732f2..415df71f 100755 --- a/orchestration/orchestration.sh +++ b/orchestration/orchestration.sh @@ -226,10 +226,10 @@ mv minijobs.bch ../../compressed_logs find . -type f -name '*_output.log' -exec zip -q -m ../../compressed_logs/minijobs_logs.zip {} + find . -type f -name '*_error.err' -exec zip -q -m ../../compressed_logs/minijobs_logs.zip {} + message0 "Moving single indeces into a separate directory called annotation_extractor..." 
-mkdir ../annotation_extractor -chmod 775 ../annotation_extractor -mv *.Ind ../annotation_extractor -cd ../annotation_extractor +mkdir ../../annotation_extractor +chmod 775 ../../annotation_extractor +cd ../../annotation_extractor +mv stats_results/Results_IMPC_SP_Windowed/*.Ind . message0 "Concatenating single index files to create a global index for the results..." cat *.Ind | shuf >> AllResultsIndeces.txt From 6774f5df544e09dfe11716443d661bf254460cbe Mon Sep 17 00:00:00 2001 From: Marina Kan Date: Wed, 25 Jun 2025 09:51:19 +0100 Subject: [PATCH 15/46] Fix path for indeces --- orchestration/orchestration.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/orchestration/orchestration.sh b/orchestration/orchestration.sh index 415df71f..12fbed68 100755 --- a/orchestration/orchestration.sh +++ b/orchestration/orchestration.sh @@ -229,7 +229,7 @@ message0 "Moving single indeces into a separate directory called annotation_extr mkdir ../../annotation_extractor chmod 775 ../../annotation_extractor cd ../../annotation_extractor -mv stats_results/Results_IMPC_SP_Windowed/*.Ind . +mv ../stats_results/Results_IMPC_SP_Windowed/*.Ind . message0 "Concatenating single index files to create a global index for the results..." cat *.Ind | shuf >> AllResultsIndeces.txt From 36759c4c16af00cfc072ff4e7146e5f83a2db58c Mon Sep 17 00:00:00 2001 From: Marina Kan Date: Wed, 25 Jun 2025 10:15:33 +0100 Subject: [PATCH 16/46] Fix path to mp_chooser --- orchestration/orchestration.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/orchestration/orchestration.sh b/orchestration/orchestration.sh index 12fbed68..f84fd275 100755 --- a/orchestration/orchestration.sh +++ b/orchestration/orchestration.sh @@ -238,7 +238,7 @@ zip -q -rm allsingleindeces.zip *.Ind split -1000 AllResultsIndeces.txt split_index_ message0 "Convert the mp_chooser JSON file to Rdata..." 
-R --quiet -e "a = jsonlite::fromJSON('../../mp_chooser.json');save(a,file='../mp_chooser.json.Rdata')" +R --quiet -e "a = jsonlite::fromJSON('../mp_chooser.json');save(a,file='../mp_chooser.json.Rdata')" export MP_CHOOSER_FILE=$(realpath ../mp_chooser.json.Rdata | tr -d '\n') if [[ -z "${MP_CHOOSER_FILE}" || ! -f "${MP_CHOOSER_FILE}" ]]; then From 5ced41900babdf180d67083913e1939c7d543daf Mon Sep 17 00:00:00 2001 From: Marina Kan Date: Wed, 25 Jun 2025 15:28:18 +0100 Subject: [PATCH 17/46] Change path of step 2 logs --- orchestration/orchestration.sh | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/orchestration/orchestration.sh b/orchestration/orchestration.sh index f84fd275..832b7caf 100755 --- a/orchestration/orchestration.sh +++ b/orchestration/orchestration.sh @@ -100,7 +100,8 @@ message0 "Phase I. Convert parquet files into Rdata..." message0 "Step 1. Create jobs" step1_files=$(find ../input_parquet_files -type f -name '*.parquet' -exec realpath {} \;) for file in $step1_files; do - echo "sbatch --job-name=impc_stats_pipeline_job --mem=10G --time=00:10:00 -e ${file}.err -o ${file}.log --wrap='Rscript Step2Parquet2Rdata.R $file'" >> jobs_step2_Parquet2Rdata.bch + file_name=$(basename "${file}" .parquet) + echo "sbatch --job-name=impc_stats_pipeline_job --mem=10G --time=00:10:00 -e ../compressed_logs/step2_logs/${file_name}.err -o ../compressed_logs/step2_logs${file_name}.log --wrap='Rscript Step2Parquet2Rdata.R $file'" >> jobs_step2_Parquet2Rdata.bch done message0 "Step 2. 
Read parquet files and create pseudo Rdata" @@ -108,8 +109,6 @@ fetch_script 0-ETL/Step2Parquet2Rdata.R sbatch --job-name=impc_stats_pipeline_job --time=01:00:00 --mem=1G -o ../compressed_logs/step2_job_id.txt --wrap="bash jobs_step2_Parquet2Rdata.bch" waitTillCommandFinish rm Step2Parquet2Rdata.R -find ../input_parquet_files -type f -name '*.log' -exec zip -q -m ../compressed_logs/step2_logs.zip {} + -find ../input_parquet_files -type f -name '*.err' -exec zip -q -m ../compressed_logs/step2_logs.zip {} + message0 "Step 3. Merging pseudo Rdata files into single file for each procedure - jobs creator" dirs=$(find "${sp_results}/ProcedureScatterRdata" -maxdepth 1 -type d) From 30188dda0c7bd04f7b1ea190702bfb0f0b79ab29 Mon Sep 17 00:00:00 2001 From: Marina Kan Date: Thu, 26 Jun 2025 13:36:27 +0100 Subject: [PATCH 18/46] Fix path --- orchestration/orchestration.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/orchestration/orchestration.sh b/orchestration/orchestration.sh index 832b7caf..a8a03ef3 100755 --- a/orchestration/orchestration.sh +++ b/orchestration/orchestration.sh @@ -101,7 +101,7 @@ message0 "Step 1. Create jobs" step1_files=$(find ../input_parquet_files -type f -name '*.parquet' -exec realpath {} \;) for file in $step1_files; do file_name=$(basename "${file}" .parquet) - echo "sbatch --job-name=impc_stats_pipeline_job --mem=10G --time=00:10:00 -e ../compressed_logs/step2_logs/${file_name}.err -o ../compressed_logs/step2_logs${file_name}.log --wrap='Rscript Step2Parquet2Rdata.R $file'" >> jobs_step2_Parquet2Rdata.bch + echo "sbatch --job-name=impc_stats_pipeline_job --mem=10G --time=00:10:00 -e ../compressed_logs/step2_logs/${file_name}.err -o ../compressed_logs/step2_logs/${file_name}.log --wrap='Rscript Step2Parquet2Rdata.R $file'" >> jobs_step2_Parquet2Rdata.bch done message0 "Step 2. 
Read parquet files and create pseudo Rdata" From fed0a031d712dea8dc135facaca3c24998ef9269 Mon Sep 17 00:00:00 2001 From: Marina Kan Date: Thu, 26 Jun 2025 13:43:44 +0100 Subject: [PATCH 19/46] Compress files in step 2 --- orchestration/orchestration.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/orchestration/orchestration.sh b/orchestration/orchestration.sh index a8a03ef3..ef646cac 100755 --- a/orchestration/orchestration.sh +++ b/orchestration/orchestration.sh @@ -109,6 +109,7 @@ fetch_script 0-ETL/Step2Parquet2Rdata.R sbatch --job-name=impc_stats_pipeline_job --time=01:00:00 --mem=1G -o ../compressed_logs/step2_job_id.txt --wrap="bash jobs_step2_Parquet2Rdata.bch" waitTillCommandFinish rm Step2Parquet2Rdata.R +sbatch --job-name=zip_step2 --time=15:00:00 --mem=1G -o ../compressed_logs/zip_step2.txt --wrap="zip -r -m -q ../compressed_logs/step2_logs.zip ../compressed_logs/step2_logs/" message0 "Step 3. Merging pseudo Rdata files into single file for each procedure - jobs creator" dirs=$(find "${sp_results}/ProcedureScatterRdata" -maxdepth 1 -type d) From 5277621af5e7a79a6c94afd8901c701152dc1fc0 Mon Sep 17 00:00:00 2001 From: Marina Kan Date: Thu, 26 Jun 2025 14:10:54 +0100 Subject: [PATCH 20/46] Change path of step 4 logs --- orchestration/orchestration.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/orchestration/orchestration.sh b/orchestration/orchestration.sh index ef646cac..9bf37f40 100755 --- a/orchestration/orchestration.sh +++ b/orchestration/orchestration.sh @@ -114,7 +114,8 @@ sbatch --job-name=zip_step2 --time=15:00:00 --mem=1G -o ../compressed_logs/zip_s message0 "Step 3. 
Merging pseudo Rdata files into single file for each procedure - jobs creator" dirs=$(find "${sp_results}/ProcedureScatterRdata" -maxdepth 1 -type d) for dir in $dirs; do - echo "sbatch --job-name=impc_stats_pipeline_job --mem=50G --time=01:30:00 -e ${dir}/step4_merge_rdatas.err -o ${dir}/step4_merge_rdatas.log --wrap='Rscript Step4MergingRdataFiles.R ${dir}'" >> jobs_step4_MergeRdatas.bch + file_name=$(basename "${dir}") + echo "sbatch --job-name=impc_stats_pipeline_job --mem=50G --time=01:30:00 -e ../compressed_logs/step4_logs/${file_name}_step4.err -o ../compressed_logs/step4_logs/${file_name}_step4.log --wrap='Rscript Step4MergingRdataFiles.R ${dir}'" >> jobs_step4_MergeRdatas.bch done message0 "Step 4. Merging pseudo Rdata files into single files per procedure" @@ -122,8 +123,7 @@ fetch_script 0-ETL/Step4MergingRdataFiles.R sbatch --job-name=impc_stats_pipeline_job --time=01:00:00 --mem=1G -o ../compressed_logs/step4_job_id.txt --wrap="bash jobs_step4_MergeRdatas.bch" waitTillCommandFinish rm Step4MergingRdataFiles.R -find . -type f -name '*.log' -exec zip -q -m ../compressed_logs/step4_logs.zip {} + -find . -type f -name '*.err' -exec zip -q -m ../compressed_logs/step4_logs.zip {} + +sbatch --job-name=zip_step2 --time=15:00:00 --mem=1G -o ../compressed_logs/zip_step4.txt --wrap="zip -r -m -q ../compressed_logs/step4_logs.zip ../compressed_logs/step4_logs/" message0 "Phase I. Compressing the log files and house cleaning..." 
zip -q -rm ../compressed_logs/phase1_jobs.zip *.bch From 8c7d2b85d89e38b8af92e1c1ef50ed4d642f42bc Mon Sep 17 00:00:00 2001 From: Marina Kan Date: Thu, 26 Jun 2025 14:12:12 +0100 Subject: [PATCH 21/46] Add mindepth to fix minor error --- orchestration/orchestration.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/orchestration/orchestration.sh b/orchestration/orchestration.sh index 9bf37f40..9f43bb6d 100755 --- a/orchestration/orchestration.sh +++ b/orchestration/orchestration.sh @@ -112,7 +112,7 @@ rm Step2Parquet2Rdata.R sbatch --job-name=zip_step2 --time=15:00:00 --mem=1G -o ../compressed_logs/zip_step2.txt --wrap="zip -r -m -q ../compressed_logs/step2_logs.zip ../compressed_logs/step2_logs/" message0 "Step 3. Merging pseudo Rdata files into single file for each procedure - jobs creator" -dirs=$(find "${sp_results}/ProcedureScatterRdata" -maxdepth 1 -type d) +dirs=$(find "${sp_results}/ProcedureScatterRdata" -maxdepth 1 -mindepth 1 -type d) for dir in $dirs; do file_name=$(basename "${dir}") echo "sbatch --job-name=impc_stats_pipeline_job --mem=50G --time=01:30:00 -e ../compressed_logs/step4_logs/${file_name}_step4.err -o ../compressed_logs/step4_logs/${file_name}_step4.log --wrap='Rscript Step4MergingRdataFiles.R ${dir}'" >> jobs_step4_MergeRdatas.bch From 24dd1bca37848b200cb3f11ca5518f0dfaa6c091 Mon Sep 17 00:00:00 2001 From: Marina Kan Date: Thu, 26 Jun 2025 14:17:43 +0100 Subject: [PATCH 22/46] Fix typo --- orchestration/orchestration.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/orchestration/orchestration.sh b/orchestration/orchestration.sh index 9f43bb6d..b93f2dfb 100755 --- a/orchestration/orchestration.sh +++ b/orchestration/orchestration.sh @@ -123,7 +123,7 @@ fetch_script 0-ETL/Step4MergingRdataFiles.R sbatch --job-name=impc_stats_pipeline_job --time=01:00:00 --mem=1G -o ../compressed_logs/step4_job_id.txt --wrap="bash jobs_step4_MergeRdatas.bch" waitTillCommandFinish rm Step4MergingRdataFiles.R -sbatch 
--job-name=zip_step2 --time=15:00:00 --mem=1G -o ../compressed_logs/zip_step4.txt --wrap="zip -r -m -q ../compressed_logs/step4_logs.zip ../compressed_logs/step4_logs/" +sbatch --job-name=zip_step4 --time=15:00:00 --mem=1G -o ../compressed_logs/zip_step4.txt --wrap="zip -r -m -q ../compressed_logs/step4_logs.zip ../compressed_logs/step4_logs/" message0 "Phase I. Compressing the log files and house cleaning..." zip -q -rm ../compressed_logs/phase1_jobs.zip *.bch From 1471b790102cb153807398b5fe219628e447541f Mon Sep 17 00:00:00 2001 From: Marina Kan Date: Thu, 26 Jun 2025 14:35:30 +0100 Subject: [PATCH 23/46] Change path of phase II logs --- orchestration/orchestration.sh | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/orchestration/orchestration.sh b/orchestration/orchestration.sh index b93f2dfb..5e3cf03b 100755 --- a/orchestration/orchestration.sh +++ b/orchestration/orchestration.sh @@ -130,10 +130,9 @@ zip -q -rm ../compressed_logs/phase1_jobs.zip *.bch rm -rf ProcedureScatterRdata message0 "Starting Phase II, packaging the big data into small packages ..." 
-mkdir DataGeneratingLog for file in $(find Rdata -type f -exec realpath {} \;); do file_basename=$(basename $file .Rdata) - echo "sbatch --job-name=impc_stats_pipeline_job --mem=45G --time=6-00 -e DataGeneratingLog/${file_basename}_errorlog.log -o DataGeneratingLog/${file_basename}_outputlog.log --wrap='Rscript InputDataGenerator.R ${file} ${file_basename}'" >> DataGenerationJobList.bch + echo "sbatch --job-name=impc_stats_pipeline_job --mem=45G --time=6-00 -e ../compressed_logs/phase2_logs/${file_basename}.err -o ../compressed_logs/phase2_logs/${file_basename}.log --wrap='Rscript InputDataGenerator.R ${file} ${file_basename}'" >> DataGenerationJobList.bch done fetch_script jobs/InputDataGenerator.R sbatch --job-name=impc_stats_pipeline_job --time=01:00:00 --mem=1G -o ../compressed_logs/phase2_job_id.txt --wrap="bash DataGenerationJobList.bch" @@ -142,9 +141,8 @@ rm InputDataGenerator.R message0 "End of packaging data." message0 "Phase II. Compressing the log files and house cleaning..." -mv *.bch DataGeneratingLog/ -zip -q -rm phase2_logs.zip DataGeneratingLog/ -mv phase2_logs.zip ../compressed_logs/ +mv *.bch ../compressed_logs/phase2_logs/ +sbatch --job-name=zip_phase2 --time=15:00:00 --mem=1G -o ../compressed_logs/zip_phase2.txt --wrap="zip -r -m -q ../compressed_logs/phase2_logs.zip ../compressed_logs/phase2_logs/" message0 "Appending all procedure based jobs into one single file..." 
mkdir ../stats_results From 69f2230d9fb1981b4b76afe0ee3c9cf1a2f07306 Mon Sep 17 00:00:00 2001 From: Marina Kan Date: Thu, 26 Jun 2025 14:36:49 +0100 Subject: [PATCH 24/46] Rename slurm jobs --- orchestration/orchestration.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/orchestration/orchestration.sh b/orchestration/orchestration.sh index 5e3cf03b..503e8668 100755 --- a/orchestration/orchestration.sh +++ b/orchestration/orchestration.sh @@ -109,7 +109,7 @@ fetch_script 0-ETL/Step2Parquet2Rdata.R sbatch --job-name=impc_stats_pipeline_job --time=01:00:00 --mem=1G -o ../compressed_logs/step2_job_id.txt --wrap="bash jobs_step2_Parquet2Rdata.bch" waitTillCommandFinish rm Step2Parquet2Rdata.R -sbatch --job-name=zip_step2 --time=15:00:00 --mem=1G -o ../compressed_logs/zip_step2.txt --wrap="zip -r -m -q ../compressed_logs/step2_logs.zip ../compressed_logs/step2_logs/" +sbatch --job-name=compress_logs --time=15:00:00 --mem=1G -o ../compressed_logs/zip_step2.txt --wrap="zip -r -m -q ../compressed_logs/step2_logs.zip ../compressed_logs/step2_logs/" message0 "Step 3. Merging pseudo Rdata files into single file for each procedure - jobs creator" dirs=$(find "${sp_results}/ProcedureScatterRdata" -maxdepth 1 -mindepth 1 -type d) @@ -123,7 +123,7 @@ fetch_script 0-ETL/Step4MergingRdataFiles.R sbatch --job-name=impc_stats_pipeline_job --time=01:00:00 --mem=1G -o ../compressed_logs/step4_job_id.txt --wrap="bash jobs_step4_MergeRdatas.bch" waitTillCommandFinish rm Step4MergingRdataFiles.R -sbatch --job-name=zip_step4 --time=15:00:00 --mem=1G -o ../compressed_logs/zip_step4.txt --wrap="zip -r -m -q ../compressed_logs/step4_logs.zip ../compressed_logs/step4_logs/" +sbatch --job-name=compress_logs --time=15:00:00 --mem=1G -o ../compressed_logs/zip_step4.txt --wrap="zip -r -m -q ../compressed_logs/step4_logs.zip ../compressed_logs/step4_logs/" message0 "Phase I. Compressing the log files and house cleaning..." 
zip -q -rm ../compressed_logs/phase1_jobs.zip *.bch @@ -142,7 +142,7 @@ rm InputDataGenerator.R message0 "End of packaging data." message0 "Phase II. Compressing the log files and house cleaning..." mv *.bch ../compressed_logs/phase2_logs/ -sbatch --job-name=zip_phase2 --time=15:00:00 --mem=1G -o ../compressed_logs/zip_phase2.txt --wrap="zip -r -m -q ../compressed_logs/phase2_logs.zip ../compressed_logs/phase2_logs/" +sbatch --job-name=compress_logs --time=15:00:00 --mem=1G -o ../compressed_logs/zip_phase2.txt --wrap="zip -r -m -q ../compressed_logs/phase2_logs.zip ../compressed_logs/phase2_logs/" message0 "Appending all procedure based jobs into one single file..." mkdir ../stats_results From 233bc71fe28dd8ed0994d0d30321955fa4f662f7 Mon Sep 17 00:00:00 2001 From: Marina Kan Date: Thu, 26 Jun 2025 15:00:42 +0100 Subject: [PATCH 25/46] Change compression of phase 3 logs --- orchestration/orchestration.sh | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/orchestration/orchestration.sh b/orchestration/orchestration.sh index 503e8668..8efbd980 100755 --- a/orchestration/orchestration.sh +++ b/orchestration/orchestration.sh @@ -198,10 +198,9 @@ R --quiet -e \ storepath='$(realpath RPackage_backup)' \ )" -message0 "Compress phase III log files" -find . -type f -name '*.ClusterOut' -exec zip -q -m ../compressed_logs/phase3_logs.zip {} + -message0 "Compress phase III error files" -find . -type f -name '*.ClusterErr' -exec zip -q -m ../compressed_logs/phase3_errs.zip {} + +message0 "Submit phase III log and err files compression" +sbatch --job-name=compress_logs --time=1-00:00:00 --mem=1G -o ../compressed_logs/zip_phase3_logs.txt --wrap="find . -type d -name 'ClusterOut' -exec zip -q -r -m ../compressed_logs/phase3_logs.zip {} \;" +sbatch --job-name=compress_logs --time=1-00:00:00 --mem=1G -o ../compressed_logs/zip_phase3_errs.txt --wrap="find . 
-type d -name 'ClusterErr' -exec zip -q -r -m ../compressed_logs/phase3_errs.zip {} \;" message0 "This is the last step. If you see no file in the list below, the SP is successfully completed." From c6769ed974ed5945bf6cccc8483568c006992845 Mon Sep 17 00:00:00 2001 From: Marina Kan Date: Thu, 26 Jun 2025 15:17:56 +0100 Subject: [PATCH 26/46] Remove intermediate logs --- orchestration/orchestration.sh | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/orchestration/orchestration.sh b/orchestration/orchestration.sh index 8efbd980..eea3c1fb 100755 --- a/orchestration/orchestration.sh +++ b/orchestration/orchestration.sh @@ -161,23 +161,18 @@ cd ../stats_results message0 "Running the IMPC statistical pipeline by submitting jobs..." if [ "${WINDOWING_PIPELINE}" = true ]; then - message0 "Inside loop" fetch_script jobs/function_windowed.R - message0 "Downloaded" mv function_windowed.R function.R - message0 "Renamed" else fetch_script function.R fi - -message0 "Before replace" R --quiet -e \ "DRrequiredAgeing:::ReplaceWordInFile( \ '$(realpath function.R)', \ 'DRversionNotSpecified', \ ${VERSION} \ )" -message0 "After replace" + chmod 775 AllJobs.bch submit_limit_jobs AllJobs.bch ../compressed_logs/phase3_job_id.txt waitTillCommandFinish From 9db99215a4244446961360a0e4bc1f0e6950fd85 Mon Sep 17 00:00:00 2001 From: Marina Kan Date: Thu, 26 Jun 2025 15:34:25 +0100 Subject: [PATCH 27/46] Edit path of annotation pipeline logs --- orchestration/orchestration.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/orchestration/orchestration.sh b/orchestration/orchestration.sh index eea3c1fb..2e11f538 100755 --- a/orchestration/orchestration.sh +++ b/orchestration/orchestration.sh @@ -208,15 +208,15 @@ for dir in $(find . 
-mindepth 2 -maxdepth 2 -type d); do base_dir=$(basename "$dir") output_file="FileIndex_${base_dir}_$(printf "%.6f" $(echo $RANDOM/32767 | bc -l)).Ind" echo "sbatch --job-name=impc_stats_pipeline_job --mem=1G --time=2-00 \ --e ${base_dir}_error.err -o ${base_dir}_output.log --wrap=\"find $dir -type f -name '*.tsv' -exec realpath {} \; > $output_file\"" >> minijobs.bch +-e ../../compressed_logs/minijobs_logs/ ${base_dir}.err -o ../../compressed_logs/minijobs_logs/${base_dir}.log \ +--wrap=\"find $dir -type f -name '*.tsv' -exec realpath {} \; > $output_file\"" >> minijobs.bch done chmod 775 minijobs.bch submit_limit_jobs minijobs.bch ../../compressed_logs/minijobs_job_id.txt waitTillCommandFinish mv minijobs.bch ../../compressed_logs +sbatch --job-name=compress_logs --time=15:00:00 --mem=1G -o ../compressed_logs/zip_minijobs.txt --wrap="zip -r -m -q ../../compressed_logs/minijobs_logs.zip ../../compressed_logs/minijobs_logs/" -find . -type f -name '*_output.log' -exec zip -q -m ../../compressed_logs/minijobs_logs.zip {} + -find . -type f -name '*_error.err' -exec zip -q -m ../../compressed_logs/minijobs_logs.zip {} + message0 "Moving single indeces into a separate directory called annotation_extractor..." mkdir ../../annotation_extractor chmod 775 ../../annotation_extractor @@ -226,7 +226,7 @@ mv ../stats_results/Results_IMPC_SP_Windowed/*.Ind . message0 "Concatenating single index files to create a global index for the results..." cat *.Ind | shuf >> AllResultsIndeces.txt message0 "Zipping the single indeces..." -zip -q -rm allsingleindeces.zip *.Ind +sbatch --job-name=compress_logs --time=15:00:00 --mem=1G -o ../compressed_logs/zip_indeces.txt --wrap="zip -r -m -q allsingleindeces.zip *.Ind" split -1000 AllResultsIndeces.txt split_index_ message0 "Convert the mp_chooser JSON file to Rdata..." 
From d1255597eda909e540624e0ab28bbddc6316eef5 Mon Sep 17 00:00:00 2001 From: Marina Kan Date: Thu, 26 Jun 2025 15:41:43 +0100 Subject: [PATCH 28/46] Create slurm job for annotation pipeline logs compression --- orchestration/orchestration.sh | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/orchestration/orchestration.sh b/orchestration/orchestration.sh index 2e11f538..99c7ec04 100755 --- a/orchestration/orchestration.sh +++ b/orchestration/orchestration.sh @@ -238,10 +238,9 @@ if [[ -z "${MP_CHOOSER_FILE}" || ! -f "${MP_CHOOSER_FILE}" ]]; then exit 1 fi -mkdir err log out for file in $(find . -maxdepth 1 -type f -name "split_index*"); do echo "sbatch --job-name=impc_stats_pipeline_job --mem=5G --time=2-00 \ - -e err/$(basename "$file").err -o out/$(basename "$file").out --wrap='python3 loader.py $(basename "$file") ${MP_CHOOSER_FILE}'" >> annotation_jobs.bch +-e ../compressed_logs/annotation_logs/$(basename "$file").err -o ../compressed_logs/annotation_logs/$(basename "$file").out --wrap='python3 loader.py $(basename "$file") ${MP_CHOOSER_FILE}'" >> annotation_jobs.bch done chmod 775 annotation_jobs.bch @@ -256,8 +255,8 @@ fetch_script loader.py annotation_pipeline submit_limit_jobs annotation_jobs.bch ../compressed_logs/annotation_job_id.txt waitTillCommandFinish -message0 "Zipping logs..." +message0 "Running Slurm jobs to compress logs..." mv annotation_jobs.bch ../compressed_logs -zip -q -rm ../compressed_logs/annotation_logs.zip log/* err/* out/* -zip -q -rm splits.zip split_index_* +sbatch --job-name=compress_logs --time=15:00:00 --mem=1G -o ../compressed_logs/zip_annotations.txt --wrap="zip -r -m -q ../compressed_logs/annotation_logs/.zip ../compressed_logs/annotation_logs/" +sbatch --job-name=compress_logs --time=15:00:00 --mem=1G -o ../compressed_logs/zip_indeces.txt --wrap="find ../ -type f -name 'split_index_*' -exec zip -q -m splits.zip {} +" message0 "Job done." 
From ab336e7f8fecbd33e40a5e64d24440e0f39ef349 Mon Sep 17 00:00:00 2001 From: Marina Kan Date: Thu, 26 Jun 2025 15:43:15 +0100 Subject: [PATCH 29/46] Change path to annotation_pipeline_output --- annotation_pipeline/loader.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/annotation_pipeline/loader.py b/annotation_pipeline/loader.py index 4548f684..24362a84 100644 --- a/annotation_pipeline/loader.py +++ b/annotation_pipeline/loader.py @@ -86,8 +86,8 @@ def main(): file_list = [line.strip() for line in f] total_files = len(file_list) - # Store StatPackets temporary. - output_dir = Path("annotation_pipeline_output") + # Store StatPackets. + output_dir = Path("../annotation_pipeline_output") if not output_dir.exists(): output_dir.mkdir() From e8e17bdfa26d9ef67497b131130c15a6eb0f490b Mon Sep 17 00:00:00 2001 From: Marina Kan Date: Thu, 26 Jun 2025 16:01:03 +0100 Subject: [PATCH 30/46] Fix Typo --- orchestration/orchestration.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/orchestration/orchestration.sh b/orchestration/orchestration.sh index 99c7ec04..65919ac6 100755 --- a/orchestration/orchestration.sh +++ b/orchestration/orchestration.sh @@ -208,7 +208,7 @@ for dir in $(find . 
-mindepth 2 -maxdepth 2 -type d); do base_dir=$(basename "$dir") output_file="FileIndex_${base_dir}_$(printf "%.6f" $(echo $RANDOM/32767 | bc -l)).Ind" echo "sbatch --job-name=impc_stats_pipeline_job --mem=1G --time=2-00 \ --e ../../compressed_logs/minijobs_logs/ ${base_dir}.err -o ../../compressed_logs/minijobs_logs/${base_dir}.log \ +-e ../../compressed_logs/minijobs_logs/${base_dir}.err -o ../../compressed_logs/minijobs_logs/${base_dir}.log \ --wrap=\"find $dir -type f -name '*.tsv' -exec realpath {} \; > $output_file\"" >> minijobs.bch done chmod 775 minijobs.bch From b558dd72e5c1d1204fb0cfe0a96e76dfc2bb452e Mon Sep 17 00:00:00 2001 From: Marina Kan Date: Thu, 26 Jun 2025 16:25:28 +0100 Subject: [PATCH 31/46] Fix Typo --- orchestration/orchestration.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/orchestration/orchestration.sh b/orchestration/orchestration.sh index 65919ac6..af97f0a7 100755 --- a/orchestration/orchestration.sh +++ b/orchestration/orchestration.sh @@ -258,5 +258,5 @@ waitTillCommandFinish message0 "Running Slurm jobs to compress logs..." mv annotation_jobs.bch ../compressed_logs sbatch --job-name=compress_logs --time=15:00:00 --mem=1G -o ../compressed_logs/zip_annotations.txt --wrap="zip -r -m -q ../compressed_logs/annotation_logs/.zip ../compressed_logs/annotation_logs/" -sbatch --job-name=compress_logs --time=15:00:00 --mem=1G -o ../compressed_logs/zip_indeces.txt --wrap="find ../ -type f -name 'split_index_*' -exec zip -q -m splits.zip {} +" +sbatch --job-name=compress_logs --time=15:00:00 --mem=1G -o ../compressed_logs/zip_indeces.txt --wrap="find . -type f -name 'split_index_*' -exec zip -q -m splits.zip {} +" message0 "Job done." 
From 7d15079e915818fa18f7b3d6728347a4a8896bb2 Mon Sep 17 00:00:00 2001 From: Marina Kan Date: Thu, 26 Jun 2025 16:29:32 +0100 Subject: [PATCH 32/46] Fix Typo --- orchestration/orchestration.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/orchestration/orchestration.sh b/orchestration/orchestration.sh index af97f0a7..aa0f518c 100755 --- a/orchestration/orchestration.sh +++ b/orchestration/orchestration.sh @@ -215,7 +215,7 @@ chmod 775 minijobs.bch submit_limit_jobs minijobs.bch ../../compressed_logs/minijobs_job_id.txt waitTillCommandFinish mv minijobs.bch ../../compressed_logs -sbatch --job-name=compress_logs --time=15:00:00 --mem=1G -o ../compressed_logs/zip_minijobs.txt --wrap="zip -r -m -q ../../compressed_logs/minijobs_logs.zip ../../compressed_logs/minijobs_logs/" +sbatch --job-name=compress_logs --time=15:00:00 --mem=1G -o ../../compressed_logs/zip_minijobs.txt --wrap="zip -r -m -q ../../compressed_logs/minijobs_logs.zip ../../compressed_logs/minijobs_logs/" message0 "Moving single indeces into a separate directory called annotation_extractor..." mkdir ../../annotation_extractor From d76e66889cfc193c8629f6648343fe9365a21064 Mon Sep 17 00:00:00 2001 From: Marina Kan Date: Thu, 26 Jun 2025 16:41:38 +0100 Subject: [PATCH 33/46] Fix path to annotation_logs --- orchestration/orchestration.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/orchestration/orchestration.sh b/orchestration/orchestration.sh index aa0f518c..c22b5972 100755 --- a/orchestration/orchestration.sh +++ b/orchestration/orchestration.sh @@ -257,6 +257,6 @@ waitTillCommandFinish message0 "Running Slurm jobs to compress logs..." 
mv annotation_jobs.bch ../compressed_logs -sbatch --job-name=compress_logs --time=15:00:00 --mem=1G -o ../compressed_logs/zip_annotations.txt --wrap="zip -r -m -q ../compressed_logs/annotation_logs/.zip ../compressed_logs/annotation_logs/" +sbatch --job-name=compress_logs --time=15:00:00 --mem=1G -o ../compressed_logs/zip_annotations.txt --wrap="zip -r -m -q ../compressed_logs/annotation_logs.zip ../compressed_logs/annotation_logs/" sbatch --job-name=compress_logs --time=15:00:00 --mem=1G -o ../compressed_logs/zip_indeces.txt --wrap="find . -type f -name 'split_index_*' -exec zip -q -m splits.zip {} +" message0 "Job done." From 28a3785208e289172610ee9c878e459944af38d6 Mon Sep 17 00:00:00 2001 From: Marina Kan Date: Thu, 26 Jun 2025 16:53:19 +0100 Subject: [PATCH 34/46] Rename AllResultsIndeces.txt to global_results_index.txt --- orchestration/orchestration.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/orchestration/orchestration.sh b/orchestration/orchestration.sh index c22b5972..6c56e2a0 100755 --- a/orchestration/orchestration.sh +++ b/orchestration/orchestration.sh @@ -224,10 +224,10 @@ cd ../../annotation_extractor mv ../stats_results/Results_IMPC_SP_Windowed/*.Ind . message0 "Concatenating single index files to create a global index for the results..." -cat *.Ind | shuf >> AllResultsIndeces.txt +cat *.Ind | shuf --random-source=<(yes "42") >> global_results_index.txt message0 "Zipping the single indeces..." sbatch --job-name=compress_logs --time=15:00:00 --mem=1G -o ../compressed_logs/zip_indeces.txt --wrap="zip -r -m -q allsingleindeces.zip *.Ind" -split -1000 AllResultsIndeces.txt split_index_ +split -1000 global_results_index.txt split_index_ message0 "Convert the mp_chooser JSON file to Rdata..." 
R --quiet -e "a = jsonlite::fromJSON('../mp_chooser.json');save(a,file='../mp_chooser.json.Rdata')" From 052460600fc146f4872e60b0903acc79d0a10be1 Mon Sep 17 00:00:00 2001 From: Marina Kan Date: Thu, 26 Jun 2025 16:55:43 +0100 Subject: [PATCH 35/46] Remove unneeded underscore --- annotation_pipeline/loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/annotation_pipeline/loader.py b/annotation_pipeline/loader.py index 24362a84..3b4c9310 100644 --- a/annotation_pipeline/loader.py +++ b/annotation_pipeline/loader.py @@ -91,7 +91,7 @@ def main(): if not output_dir.exists(): output_dir.mkdir() - output_file = output_dir / (file_list_path.name + "_.statpackets") + output_file = output_dir / (file_list_path.name + ".statpackets") for i, file in enumerate(file_list): file_path = Path(file) From 255327f1429d669bd08c6c2db6a274d9445b1eab Mon Sep 17 00:00:00 2001 From: Marina Kan Date: Thu, 26 Jun 2025 17:00:07 +0100 Subject: [PATCH 36/46] Rename allsingleindeces.zip --- orchestration/orchestration.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/orchestration/orchestration.sh b/orchestration/orchestration.sh index 6c56e2a0..671ca2f1 100755 --- a/orchestration/orchestration.sh +++ b/orchestration/orchestration.sh @@ -217,7 +217,7 @@ waitTillCommandFinish mv minijobs.bch ../../compressed_logs sbatch --job-name=compress_logs --time=15:00:00 --mem=1G -o ../../compressed_logs/zip_minijobs.txt --wrap="zip -r -m -q ../../compressed_logs/minijobs_logs.zip ../../compressed_logs/minijobs_logs/" -message0 "Moving single indeces into a separate directory called annotation_extractor..." +message0 "Moving single indices into a separate directory called annotation_extractor..." mkdir ../../annotation_extractor chmod 775 ../../annotation_extractor cd ../../annotation_extractor @@ -225,8 +225,8 @@ mv ../stats_results/Results_IMPC_SP_Windowed/*.Ind . message0 "Concatenating single index files to create a global index for the results..." 
cat *.Ind | shuf --random-source=<(yes "42") >> global_results_index.txt -message0 "Zipping the single indeces..." -sbatch --job-name=compress_logs --time=15:00:00 --mem=1G -o ../compressed_logs/zip_indeces.txt --wrap="zip -r -m -q allsingleindeces.zip *.Ind" +message0 "Zipping the single indices..." +sbatch --job-name=compress_logs --time=15:00:00 --mem=1G -o ../compressed_logs/zip_indices.txt --wrap="zip -r -m -q individual_indices.zip *.Ind" split -1000 global_results_index.txt split_index_ message0 "Convert the mp_chooser JSON file to Rdata..." @@ -258,5 +258,5 @@ waitTillCommandFinish message0 "Running Slurm jobs to compress logs..." mv annotation_jobs.bch ../compressed_logs sbatch --job-name=compress_logs --time=15:00:00 --mem=1G -o ../compressed_logs/zip_annotations.txt --wrap="zip -r -m -q ../compressed_logs/annotation_logs.zip ../compressed_logs/annotation_logs/" -sbatch --job-name=compress_logs --time=15:00:00 --mem=1G -o ../compressed_logs/zip_indeces.txt --wrap="find . -type f -name 'split_index_*' -exec zip -q -m splits.zip {} +" +sbatch --job-name=compress_logs --time=15:00:00 --mem=1G -o ../compressed_logs/zip_splits.txt --wrap="find . -type f -name 'split_index_*' -exec zip -q -m splits.zip {} +" message0 "Job done." From 2fa86a12a2cd64ba2b529411eafd1a04b4975661 Mon Sep 17 00:00:00 2001 From: Marina Kan Date: Thu, 26 Jun 2025 17:17:22 +0100 Subject: [PATCH 37/46] Rename AllJobs.bch --- orchestration/orchestration.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/orchestration/orchestration.sh b/orchestration/orchestration.sh index 671ca2f1..62dbc7d9 100755 --- a/orchestration/orchestration.sh +++ b/orchestration/orchestration.sh @@ -146,7 +146,7 @@ sbatch --job-name=compress_logs --time=15:00:00 --mem=1G -o ../compressed_logs/z message0 "Appending all procedure based jobs into one single file..." 
mkdir ../stats_results -find ./*/*_RawData/*.bch -type f | xargs cat >> ../stats_results/AllJobs.bch +find ./*/*_RawData/*.bch -type f | xargs cat >> ../stats_results/phase3_jobs.bch message0 "Phase III. Initialising the statistical analysis..." cd ../stats_results @@ -173,8 +173,8 @@ R --quiet -e \ ${VERSION} \ )" -chmod 775 AllJobs.bch -submit_limit_jobs AllJobs.bch ../compressed_logs/phase3_job_id.txt +chmod 775 phase3_jobs.bch +submit_limit_jobs phase3_jobs.bch ../compressed_logs/phase3_job_id.txt waitTillCommandFinish message0 "Postprocessing the IMPC statistical analysis results..." From 09071f0a0f3fb19fd5fc2aabcec37f15dcdba4e8 Mon Sep 17 00:00:00 2001 From: Marina Kan Date: Thu, 26 Jun 2025 17:20:26 +0100 Subject: [PATCH 38/46] Rewrite log messages --- orchestration/orchestration.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/orchestration/orchestration.sh b/orchestration/orchestration.sh index 62dbc7d9..7c4d23d4 100755 --- a/orchestration/orchestration.sh +++ b/orchestration/orchestration.sh @@ -238,6 +238,7 @@ if [[ -z "${MP_CHOOSER_FILE}" || ! -f "${MP_CHOOSER_FILE}" ]]; then exit 1 fi +message0 "Generate annotation jobs..." for file in $(find . -maxdepth 1 -type f -name "split_index*"); do echo "sbatch --job-name=impc_stats_pipeline_job --mem=5G --time=2-00 \ -e ../compressed_logs/annotation_logs/$(basename "$file").err -o ../compressed_logs/annotation_logs/$(basename "$file").out --wrap='python3 loader.py $(basename "$file") ${MP_CHOOSER_FILE}'" >> annotation_jobs.bch @@ -250,7 +251,7 @@ python3.10 -m pip install rpy2 python3.10 -m pip install numpy python3.10 -m pip install pandas -message0 "Downloading the action script..." +message0 "Downloading the action script loader.py..." 
fetch_script loader.py annotation_pipeline submit_limit_jobs annotation_jobs.bch ../compressed_logs/annotation_job_id.txt waitTillCommandFinish From 40d23c0f275ef77bf28b7018757251c559ee27d2 Mon Sep 17 00:00:00 2001 From: Marina Kan Date: Fri, 27 Jun 2025 09:44:35 +0100 Subject: [PATCH 39/46] Uncomment updateImpress --- orchestration/orchestration.sh | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/orchestration/orchestration.sh b/orchestration/orchestration.sh index 7c4d23d4..7820b29b 100755 --- a/orchestration/orchestration.sh +++ b/orchestration/orchestration.sh @@ -150,14 +150,14 @@ find ./*/*_RawData/*.bch -type f | xargs cat >> ../stats_results/phase3_jobs.bc message0 "Phase III. Initialising the statistical analysis..." cd ../stats_results -# message0 "Updating the dynamic contents from the IMPReSS..." -# R --quiet -e \ -# "DRrequiredAgeing:::updateImpress( \ -# updateImpressFileInThePackage = TRUE, \ -# updateOptionalParametersList = TRUE, \ -# updateTheSkipList = TRUE, \ -# saveRdata = FALSE \ -# )" +message0 "Updating the dynamic contents from the IMPReSS..." +R --quiet -e \ +"DRrequiredAgeing:::updateImpress( \ + updateImpressFileInThePackage = TRUE, \ + updateOptionalParametersList = TRUE, \ + updateTheSkipList = TRUE, \ + saveRdata = FALSE \ +)" message0 "Running the IMPC statistical pipeline by submitting jobs..." if [ "${WINDOWING_PIPELINE}" = true ]; then From 6787277fce79f4c5006da2e43a44fc9e22fce218 Mon Sep 17 00:00:00 2001 From: Marina Kan Date: Fri, 27 Jun 2025 09:50:22 +0100 Subject: [PATCH 40/46] Rename upper level folders --- orchestration/orchestration.sh | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/orchestration/orchestration.sh b/orchestration/orchestration.sh index 7820b29b..ce68dcd7 100755 --- a/orchestration/orchestration.sh +++ b/orchestration/orchestration.sh @@ -88,12 +88,12 @@ message0 "Update completed" # Statistical pipeline. 
message0 "Starting the IMPC statistical pipeline..." -mkdir stats_batching compressed_logs +mkdir 01_batching compressed_logs export input_path=$(realpath .) -export sp_results=$(realpath stats_batching) +export sp_results=$(realpath 01_batching) message0 "Parquet files path: ${input_path}" message0 "Output path: ${sp_results}" -cd stats_batching +cd 01_batching message0 "Phase I. Convert parquet files into Rdata..." @@ -145,11 +145,11 @@ mv *.bch ../compressed_logs/phase2_logs/ sbatch --job-name=compress_logs --time=15:00:00 --mem=1G -o ../compressed_logs/zip_phase2.txt --wrap="zip -r -m -q ../compressed_logs/phase2_logs.zip ../compressed_logs/phase2_logs/" message0 "Appending all procedure based jobs into one single file..." -mkdir ../stats_results -find ./*/*_RawData/*.bch -type f | xargs cat >> ../stats_results/phase3_jobs.bch +mkdir ../02_sp_output +find ./*/*_RawData/*.bch -type f | xargs cat >> ../02_sp_output/phase3_jobs.bch message0 "Phase III. Initialising the statistical analysis..." -cd ../stats_results +cd ../02_sp_output message0 "Updating the dynamic contents from the IMPReSS..." R --quiet -e \ "DRrequiredAgeing:::updateImpress( \ @@ -201,7 +201,7 @@ message0 "This is the last step. If you see no file in the list below, the SP is # Annotation pipeline. message0 "Starting the IMPC annotation pipeline..." -cd ../stats_results/Results_IMPC_SP_Windowed/ +cd ../02_sp_output/Results_IMPC_SP_Windowed/ message0 "Step 1: Clean ups and creating the global index for the results." message0 "Indexing the results..." for dir in $(find . -mindepth 2 -maxdepth 2 -type d); do @@ -217,11 +217,11 @@ waitTillCommandFinish mv minijobs.bch ../../compressed_logs sbatch --job-name=compress_logs --time=15:00:00 --mem=1G -o ../../compressed_logs/zip_minijobs.txt --wrap="zip -r -m -q ../../compressed_logs/minijobs_logs.zip ../../compressed_logs/minijobs_logs/" -message0 "Moving single indices into a separate directory called annotation_extractor..." 
-mkdir ../../annotation_extractor -chmod 775 ../../annotation_extractor -cd ../../annotation_extractor -mv ../stats_results/Results_IMPC_SP_Windowed/*.Ind . +message0 "Moving single indices into a separate directory called 03_indices_and_splits..." +mkdir ../../03_indices_and_splits +chmod 775 ../../03_indices_and_splits +cd ../../03_indices_and_splits +mv ../02_sp_output/Results_IMPC_SP_Windowed/*.Ind . message0 "Concatenating single index files to create a global index for the results..." cat *.Ind | shuf --random-source=<(yes "42") >> global_results_index.txt From 5b58c83767e7ede00f09e0fa54ffe3ca2cf0194a Mon Sep 17 00:00:00 2001 From: Marina Kan Date: Fri, 27 Jun 2025 10:05:04 +0100 Subject: [PATCH 41/46] Make job name version specific --- orchestration/orchestration.sh | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/orchestration/orchestration.sh b/orchestration/orchestration.sh index ce68dcd7..d5f0e1f0 100755 --- a/orchestration/orchestration.sh +++ b/orchestration/orchestration.sh @@ -18,6 +18,7 @@ WINDOWING_PIPELINE=${7:-"true"} # Redirect all output and errors to the log file. LOGFILE=${KOMP_PATH}/impc_statistical_pipeline/IMPC_DRs/stats_pipeline_logs/orchestration_${VERSION}.log +JOBNAME="impc_job_${VERSION}" exec > >(tee -a "$LOGFILE") 2>&1 echo "Starting pipeline run. Data release $VERSION. Fetching packages from $REMOTE / $BRANCH." @@ -33,7 +34,7 @@ function message0() { # Function to wait until all jobs on SLURM complete. function waitTillCommandFinish() { while true; do - if ! (squeue --format="%A %.30j" | grep -q "impc_stats_pipeline_job"); then + if ! (squeue --format="%A %.30j" | grep -q "$JOBNAME"); then message0 "Done waiting for SLURM jobs to complete." break fi @@ -101,12 +102,12 @@ message0 "Step 1. 
Create jobs" step1_files=$(find ../input_parquet_files -type f -name '*.parquet' -exec realpath {} \;) for file in $step1_files; do file_name=$(basename "${file}" .parquet) - echo "sbatch --job-name=impc_stats_pipeline_job --mem=10G --time=00:10:00 -e ../compressed_logs/step2_logs/${file_name}.err -o ../compressed_logs/step2_logs/${file_name}.log --wrap='Rscript Step2Parquet2Rdata.R $file'" >> jobs_step2_Parquet2Rdata.bch + echo "sbatch --job-name=${JOBNAME} --mem=10G --time=00:10:00 -e ../compressed_logs/step2_logs/${file_name}.err -o ../compressed_logs/step2_logs/${file_name}.log --wrap='Rscript Step2Parquet2Rdata.R $file'" >> jobs_step2_Parquet2Rdata.bch done message0 "Step 2. Read parquet files and create pseudo Rdata" fetch_script 0-ETL/Step2Parquet2Rdata.R -sbatch --job-name=impc_stats_pipeline_job --time=01:00:00 --mem=1G -o ../compressed_logs/step2_job_id.txt --wrap="bash jobs_step2_Parquet2Rdata.bch" +sbatch --job-name=${JOBNAME} --time=01:00:00 --mem=1G -o ../compressed_logs/step2_job_id.txt --wrap="bash jobs_step2_Parquet2Rdata.bch" waitTillCommandFinish rm Step2Parquet2Rdata.R sbatch --job-name=compress_logs --time=15:00:00 --mem=1G -o ../compressed_logs/zip_step2.txt --wrap="zip -r -m -q ../compressed_logs/step2_logs.zip ../compressed_logs/step2_logs/" @@ -115,12 +116,12 @@ message0 "Step 3. 
Merging pseudo Rdata files into single file for each procedure dirs=$(find "${sp_results}/ProcedureScatterRdata" -maxdepth 1 -mindepth 1 -type d) for dir in $dirs; do file_name=$(basename "${dir}") - echo "sbatch --job-name=impc_stats_pipeline_job --mem=50G --time=01:30:00 -e ../compressed_logs/step4_logs/${file_name}_step4.err -o ../compressed_logs/step4_logs/${file_name}_step4.log --wrap='Rscript Step4MergingRdataFiles.R ${dir}'" >> jobs_step4_MergeRdatas.bch + echo "sbatch --job-name=${JOBNAME} --mem=50G --time=01:30:00 -e ../compressed_logs/step4_logs/${file_name}_step4.err -o ../compressed_logs/step4_logs/${file_name}_step4.log --wrap='Rscript Step4MergingRdataFiles.R ${dir}'" >> jobs_step4_MergeRdatas.bch done message0 "Step 4. Merging pseudo Rdata files into single files per procedure" fetch_script 0-ETL/Step4MergingRdataFiles.R -sbatch --job-name=impc_stats_pipeline_job --time=01:00:00 --mem=1G -o ../compressed_logs/step4_job_id.txt --wrap="bash jobs_step4_MergeRdatas.bch" +sbatch --job-name=${JOBNAME} --time=01:00:00 --mem=1G -o ../compressed_logs/step4_job_id.txt --wrap="bash jobs_step4_MergeRdatas.bch" waitTillCommandFinish rm Step4MergingRdataFiles.R sbatch --job-name=compress_logs --time=15:00:00 --mem=1G -o ../compressed_logs/zip_step4.txt --wrap="zip -r -m -q ../compressed_logs/step4_logs.zip ../compressed_logs/step4_logs/" @@ -132,10 +133,10 @@ rm -rf ProcedureScatterRdata message0 "Starting Phase II, packaging the big data into small packages ..." 
for file in $(find Rdata -type f -exec realpath {} \;); do file_basename=$(basename $file .Rdata) - echo "sbatch --job-name=impc_stats_pipeline_job --mem=45G --time=6-00 -e ../compressed_logs/phase2_logs/${file_basename}.err -o ../compressed_logs/phase2_logs/${file_basename}.log --wrap='Rscript InputDataGenerator.R ${file} ${file_basename}'" >> DataGenerationJobList.bch + echo "sbatch --job-name=${JOBNAME} --mem=45G --time=6-00 -e ../compressed_logs/phase2_logs/${file_basename}.err -o ../compressed_logs/phase2_logs/${file_basename}.log --wrap='Rscript InputDataGenerator.R ${file} ${file_basename}'" >> DataGenerationJobList.bch done fetch_script jobs/InputDataGenerator.R -sbatch --job-name=impc_stats_pipeline_job --time=01:00:00 --mem=1G -o ../compressed_logs/phase2_job_id.txt --wrap="bash DataGenerationJobList.bch" +sbatch --job-name=${JOBNAME} --time=01:00:00 --mem=1G -o ../compressed_logs/phase2_job_id.txt --wrap="bash DataGenerationJobList.bch" waitTillCommandFinish rm InputDataGenerator.R @@ -207,7 +208,7 @@ message0 "Indexing the results..." for dir in $(find . -mindepth 2 -maxdepth 2 -type d); do base_dir=$(basename "$dir") output_file="FileIndex_${base_dir}_$(printf "%.6f" $(echo $RANDOM/32767 | bc -l)).Ind" - echo "sbatch --job-name=impc_stats_pipeline_job --mem=1G --time=2-00 \ + echo "sbatch --job-name=${JOBNAME} --mem=1G --time=2-00 \ -e ../../compressed_logs/minijobs_logs/${base_dir}.err -o ../../compressed_logs/minijobs_logs/${base_dir}.log \ --wrap=\"find $dir -type f -name '*.tsv' -exec realpath {} \; > $output_file\"" >> minijobs.bch done @@ -240,7 +241,7 @@ fi message0 "Generate annotation jobs..." for file in $(find . 
-maxdepth 1 -type f -name "split_index*"); do - echo "sbatch --job-name=impc_stats_pipeline_job --mem=5G --time=2-00 \ + echo "sbatch --job-name=${JOBNAME} --mem=5G --time=2-00 \ -e ../compressed_logs/annotation_logs/$(basename "$file").err -o ../compressed_logs/annotation_logs/$(basename "$file").out --wrap='python3 loader.py $(basename "$file") ${MP_CHOOSER_FILE}'" >> annotation_jobs.bch done chmod 775 annotation_jobs.bch From ba4cf84310ebbfb6da835e8a8408e06e03a60fc3 Mon Sep 17 00:00:00 2001 From: Marina Kan Date: Fri, 27 Jun 2025 11:50:02 +0100 Subject: [PATCH 42/46] Rename job for phase III --- .../DRrequiredAgeing/DRrequiredAgeingPackage/R/main.R | 1 + .../DRrequiredAgeing/DRrequiredAgeingPackage/R/sideFunctions.R | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/Late adults stats pipeline/DRrequiredAgeing/DRrequiredAgeingPackage/R/main.R b/Late adults stats pipeline/DRrequiredAgeing/DRrequiredAgeingPackage/R/main.R index 0f9243cc..f29010a5 100644 --- a/Late adults stats pipeline/DRrequiredAgeing/DRrequiredAgeingPackage/R/main.R +++ b/Late adults stats pipeline/DRrequiredAgeing/DRrequiredAgeingPackage/R/main.R @@ -300,6 +300,7 @@ mainAgeing = function(file = NULL , cpu = cpu , memory = memory , time = time , + jobname = "impc_job_" + DRversion, extraBatchParameters = extraBatchParameters ) write( diff --git a/Late adults stats pipeline/DRrequiredAgeing/DRrequiredAgeingPackage/R/sideFunctions.R b/Late adults stats pipeline/DRrequiredAgeing/DRrequiredAgeingPackage/R/sideFunctions.R index 13cacd5c..fe814d47 100644 --- a/Late adults stats pipeline/DRrequiredAgeing/DRrequiredAgeingPackage/R/sideFunctions.R +++ b/Late adults stats pipeline/DRrequiredAgeing/DRrequiredAgeingPackage/R/sideFunctions.R @@ -688,6 +688,7 @@ BatchGenerator = function(file , cpu = 1 , memory = "8G" , time = "10:00:00" , + jobname = NULL , extraBatchParameters = NULL) { dirOut = file.path(dir, 'ClusterOut') dirErr = file.path(dir, 'ClusterErr') @@ -701,7 +702,7 @@ 
BatchGenerator = function(file , ro = paste(' -o ', paste0('"', oname, '.ClusterOut', '"'), sep = '') re = paste(' -e ', paste0('"', ename, '.ClusterErr', '"'), sep = '') rf = paste( - "sbatch --job-name=impc_stats_pipeline_job --mem=", memory, + "sbatch --job-name= ", jobname, " --mem=", memory, " --time=", time, extraBatchParameters , ' --cpus-per-task=' , From 54b588a0467197b9322d5990d360eb1e8ca173c1 Mon Sep 17 00:00:00 2001 From: Marina Kan Date: Fri, 27 Jun 2025 13:32:04 +0100 Subject: [PATCH 43/46] Fix concatenation bug --- .../DRrequiredAgeing/DRrequiredAgeingPackage/R/main.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Late adults stats pipeline/DRrequiredAgeing/DRrequiredAgeingPackage/R/main.R b/Late adults stats pipeline/DRrequiredAgeing/DRrequiredAgeingPackage/R/main.R index f29010a5..2eb6423b 100644 --- a/Late adults stats pipeline/DRrequiredAgeing/DRrequiredAgeingPackage/R/main.R +++ b/Late adults stats pipeline/DRrequiredAgeing/DRrequiredAgeingPackage/R/main.R @@ -300,7 +300,7 @@ mainAgeing = function(file = NULL , cpu = cpu , memory = memory , time = time , - jobname = "impc_job_" + DRversion, + jobname = paste0("impc_job_", DRversion), extraBatchParameters = extraBatchParameters ) write( From 9005b614934783cca336f4670d5e9fd5f250bbfd Mon Sep 17 00:00:00 2001 From: Marina Kan Date: Fri, 27 Jun 2025 13:32:44 +0100 Subject: [PATCH 44/46] Temporarily disable updateImpress --- orchestration/orchestration.sh | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/orchestration/orchestration.sh b/orchestration/orchestration.sh index d5f0e1f0..7fa5863c 100755 --- a/orchestration/orchestration.sh +++ b/orchestration/orchestration.sh @@ -152,13 +152,13 @@ find ./*/*_RawData/*.bch -type f | xargs cat >> ../02_sp_output/phase3_jobs.bch message0 "Phase III. Initialising the statistical analysis..." cd ../02_sp_output message0 "Updating the dynamic contents from the IMPReSS..." 
-R --quiet -e \ -"DRrequiredAgeing:::updateImpress( \ - updateImpressFileInThePackage = TRUE, \ - updateOptionalParametersList = TRUE, \ - updateTheSkipList = TRUE, \ - saveRdata = FALSE \ -)" +# R --quiet -e \ +# "DRrequiredAgeing:::updateImpress( \ +# updateImpressFileInThePackage = TRUE, \ +# updateOptionalParametersList = TRUE, \ +# updateTheSkipList = TRUE, \ +# saveRdata = FALSE \ +# )" message0 "Running the IMPC statistical pipeline by submitting jobs..." if [ "${WINDOWING_PIPELINE}" = true ]; then From ccba35684cf77c0da4119b6a95698c9f66f339de Mon Sep 17 00:00:00 2001 From: Marina Kan Date: Fri, 27 Jun 2025 13:43:46 +0100 Subject: [PATCH 45/46] Fix bug for mainAgeing --- .../inst/extdata/StatsPipeline/jobs/InputDataGenerator.R | 3 ++- orchestration/orchestration.sh | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/Late adults stats pipeline/DRrequiredAgeing/DRrequiredAgeingPackage/inst/extdata/StatsPipeline/jobs/InputDataGenerator.R b/Late adults stats pipeline/DRrequiredAgeing/DRrequiredAgeingPackage/inst/extdata/StatsPipeline/jobs/InputDataGenerator.R index ce241a4e..cdf352d0 100644 --- a/Late adults stats pipeline/DRrequiredAgeing/DRrequiredAgeingPackage/inst/extdata/StatsPipeline/jobs/InputDataGenerator.R +++ b/Late adults stats pipeline/DRrequiredAgeing/DRrequiredAgeingPackage/inst/extdata/StatsPipeline/jobs/InputDataGenerator.R @@ -20,7 +20,8 @@ generate_data <- function(args, thresh = 4) { controlSize = 1500, extraBatchParameters = NULL, combineEAandLA = FALSE, - solrBaseURL = NULL + solrBaseURL = NULL, + DRversion = args[3] ) trash <- NULL gc() diff --git a/orchestration/orchestration.sh b/orchestration/orchestration.sh index 7fa5863c..23167a92 100755 --- a/orchestration/orchestration.sh +++ b/orchestration/orchestration.sh @@ -133,7 +133,7 @@ rm -rf ProcedureScatterRdata message0 "Starting Phase II, packaging the big data into small packages ..." 
for file in $(find Rdata -type f -exec realpath {} \;); do file_basename=$(basename $file .Rdata) - echo "sbatch --job-name=${JOBNAME} --mem=45G --time=6-00 -e ../compressed_logs/phase2_logs/${file_basename}.err -o ../compressed_logs/phase2_logs/${file_basename}.log --wrap='Rscript InputDataGenerator.R ${file} ${file_basename}'" >> DataGenerationJobList.bch + echo "sbatch --job-name=${JOBNAME} --mem=45G --time=6-00 -e ../compressed_logs/phase2_logs/${file_basename}.err -o ../compressed_logs/phase2_logs/${file_basename}.log --wrap='Rscript InputDataGenerator.R ${file} ${file_basename} ${VERSION}'" >> DataGenerationJobList.bch done fetch_script jobs/InputDataGenerator.R sbatch --job-name=${JOBNAME} --time=01:00:00 --mem=1G -o ../compressed_logs/phase2_job_id.txt --wrap="bash DataGenerationJobList.bch" From bebc4e71a2c2bea5d5104aa7f0a7a90e5e7db9dc Mon Sep 17 00:00:00 2001 From: Marina Kan Date: Fri, 27 Jun 2025 14:23:02 +0100 Subject: [PATCH 46/46] Remove extra space --- .../DRrequiredAgeing/DRrequiredAgeingPackage/R/sideFunctions.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Late adults stats pipeline/DRrequiredAgeing/DRrequiredAgeingPackage/R/sideFunctions.R b/Late adults stats pipeline/DRrequiredAgeing/DRrequiredAgeingPackage/R/sideFunctions.R index fe814d47..e1ec3174 100644 --- a/Late adults stats pipeline/DRrequiredAgeing/DRrequiredAgeingPackage/R/sideFunctions.R +++ b/Late adults stats pipeline/DRrequiredAgeing/DRrequiredAgeingPackage/R/sideFunctions.R @@ -702,7 +702,7 @@ BatchGenerator = function(file , ro = paste(' -o ', paste0('"', oname, '.ClusterOut', '"'), sep = '') re = paste(' -e ', paste0('"', ename, '.ClusterErr', '"'), sep = '') rf = paste( - "sbatch --job-name= ", jobname, " --mem=", memory, + "sbatch --job-name=", jobname, " --mem=", memory, " --time=", time, extraBatchParameters , ' --cpus-per-task=' ,