Commit 8f94f9f

Merge remote-tracking branch 'origin/dev' into feature/impc-spa
2 parents: 93cc684 + cae0cc7

2 files changed: 54 additions (+), 24 deletions (-)

impc_etl/jobs/ingest/data_ingestion.py

Lines changed: 4 additions & 3 deletions
@@ -314,7 +314,7 @@ def fetch_dcc_xml_data(context):

     # Define the output file path
     zip_file_path = f"{data_archive_path}/{dr_tag}/{dcc_xml_zip_file_name}"
-    extract_path = f"{data_archive_path}/{dr_tag}/dcc_xml_extracted"
+    extract_path = f"{data_archive_path}/{dr_tag}/dcc_xml_extracted/"
     impc_xml_output_path = f"{input_data_path}/xml/impc/"

     try:
@@ -358,10 +358,11 @@ def fetch_dcc_xml_data(context):
         task_logger.info(f"File extracted successfully: {extract_path}")

         # Copy extracted data to final output directory skipping the top-level directory 'extracted'
-        for item in os.listdir(extract_path):
+        data_path = f"{extract_path}/data/"
+        for item in os.listdir(data_path):
             if item.startswith("._"):
                 continue  # Skip macOS dot files
-            s = os.path.join(extract_path, item)
+            s = os.path.join(data_path, item)
             d = os.path.join(impc_xml_output_path, item)
             if os.path.isdir(s):
                 shutil.copytree(s, d, dirs_exist_ok=True)
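
In isolation, the change above means the ingestion step now descends into the archive's data/ subfolder before copying into the IMPC XML input directory. A minimal standalone sketch of that copy step, using illustrative placeholder paths (the real ones come from the DAG configuration) and an assumed plain-file branch, since the hunk ends before showing it:

import os
import shutil

extract_path = "/tmp/dcc_xml_extracted"        # placeholder for the archive extraction dir
impc_xml_output_path = "/tmp/input/xml/impc"   # placeholder for the XML output dir

data_path = f"{extract_path}/data/"            # new behaviour: read from the data/ subfolder
os.makedirs(impc_xml_output_path, exist_ok=True)

for item in os.listdir(data_path):
    if item.startswith("._"):
        continue  # skip macOS resource-fork files
    s = os.path.join(data_path, item)
    d = os.path.join(impc_xml_output_path, item)
    if os.path.isdir(s):
        shutil.copytree(s, d, dirs_exist_ok=True)  # merge directories into the output
    else:
        shutil.copy2(s, d)  # assumed handling for plain files, not shown in the hunk
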

impc_etl/utils/spark.py

Lines changed: 50 additions & 21 deletions
@@ -1,5 +1,6 @@
 import logging
 import os
+import sys
 from functools import wraps

 from airflow.hooks.base import BaseHook
@@ -26,7 +27,9 @@ def wrapper():
         spark_master = spark_connection.get("spark_master", "localhost")
         environment = Variable.get("environment", "development")
         task_logger.info(f"Spark connection URL: {spark_connection}")
-        task_logger.info("::group::SPARK LOGS")
+        task_logger.info(f"Initializing Spark session for function: {func.__name__} on {environment} environment")
+        task_logger.info(f"Driver Python executable: {sys.executable}")
+        task_logger.info(f"Driver Python version: {sys.version}")

         conf = (
             SparkConf()
@@ -40,15 +43,15 @@ def wrapper():
                 "spark.driver.extraJavaOptions",
                 "-Dlog4j.configuration=file:/opt/airflow/log4j.properties",
             )
-            .set("spark.pyspark.python", ".")
+
         )

         if environment == "development":
             conf.set("spark.driver.memory", "2g").set(
                 "spark.executor.memory", "3g"
             ).set("spark.driver.maxResultSize", "2g").set(
                 "spark.executor.cores", "2"
-            )
+            ).set("spark.pyspark.python", ".")
         else:
             # Production configuration matching Luigi settings
             conf.set("spark.driver.memory", "30g").set(
@@ -65,7 +68,7 @@ def wrapper():
                 "spark.sql.parquet.datetimeRebaseModeInRead", "CORRECTED"
             ).set("spark.sql.parquet.datetimeRebaseModeInWrite", "CORRECTED").set(
                 "spark.sql.session.timeZone", "UTC"
-            )
+            ).set("spark.pyspark.python", sys.executable).set("spark.pyspark.driver.python", sys.executable)

         jars_packages = []
         if postgres_database:
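
The production branch now pins both executor and driver Python to the interpreter that launched the task. A minimal sketch of that configuration idea outside the decorator, with an assumed local master and a hypothetical app name; the real connection details come from Airflow:

import sys

from pyspark import SparkConf
from pyspark.sql import SparkSession

conf = (
    SparkConf()
    .setMaster("local[2]")                                # assumed master for illustration
    .setAppName("pyspark-python-pinning-demo")            # hypothetical app name
    .set("spark.pyspark.python", sys.executable)          # Python used by executors
    .set("spark.pyspark.driver.python", sys.executable)   # Python used by the driver
)

spark = SparkSession.builder.config(conf=conf).getOrCreate()
print(spark.sparkContext.getConf().get("spark.pyspark.python"))
spark.stop()
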
@@ -81,37 +84,56 @@ def wrapper():
         conf.set("spark.mongodb.read.connection.uri", mongo_url)
         conf.set("spark.mongodb.write.connection.uri", mongo_url)

-        # Add the impc_etl module to Spark workers using PYTHONPATH
-        # Since all workers run on the same file system, we can use PYTHONPATH
+        # Add the impc_etl module to Spark workers
         import impc_etl
+        import zipfile
+        import tempfile

         impc_etl_path = os.path.dirname(impc_etl.__file__)
         parent_path = os.path.dirname(impc_etl_path)
         task_logger.info(
-            f"Adding impc_etl module path to PYTHONPATH: {parent_path}"
+            f"Preparing impc_etl module from: {parent_path}"
         )

-        # Get an existing PYTHONPATH if any and append our path
-        existing_python_path = os.environ.get("PYTHONPATH", "")
-        if existing_python_path:
-            updated_python_path = f"{parent_path}:{existing_python_path}"
-        else:
-            updated_python_path = parent_path
+        # Create a temporary zip file of the impc_etl module
+        temp_zip_path = None
+        temp_zip_fd, temp_zip_path = tempfile.mkstemp(suffix='.zip', prefix='impc_etl_')
+        os.close(temp_zip_fd)  # Close the file descriptor

-        # Set PYTHONPATH for executors through Spark configuration
-        conf.set("spark.executorEnv.PYTHONPATH", updated_python_path)
-        conf.set("spark.yarn.appMasterEnv.PYTHONPATH", updated_python_path)
-
-        # Also set it for the current environment to ensure consistency
-        os.environ["PYTHONPATH"] = updated_python_path
-
-        spark = SparkSession.builder.config(conf=conf).getOrCreate()
+        try:
+            # Create zip file containing all Python files
+            with zipfile.ZipFile(temp_zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
+                for root, dirs, files in os.walk(impc_etl_path):
+                    # Skip __pycache__ directories
+                    dirs[:] = [d for d in dirs if d != '__pycache__']
+                    for file in files:
+                        if file.endswith('.py'):
+                            file_path = os.path.join(root, file)
+                            arcname = os.path.join('impc_etl', os.path.relpath(file_path, impc_etl_path))
+                            zipf.write(file_path, arcname)
+
+            task_logger.info(f"Created impc_etl zip: {temp_zip_path} (size: {os.path.getsize(temp_zip_path)} bytes)")
+
+            # Add zip to Spark configuration before creating session
+            conf.set("spark.submit.pyFiles", temp_zip_path)
+
+            spark = SparkSession.builder.config(conf=conf).getOrCreate()
+
+            # Also add via sparkContext for runtime distribution
+            spark.sparkContext.addPyFile(temp_zip_path)
+            task_logger.info(f"Added impc_etl module to Spark workers")
+        except Exception as e:
+            task_logger.error(f"Failed to prepare impc_etl module: {e}")
+            if temp_zip_path and os.path.exists(temp_zip_path):
+                os.unlink(temp_zip_path)
+            raise

         spark_logger = logging.getLogger("spark")
         spark_logger.setLevel(logging.WARNING)
         spark_logger.handlers = task_logger.handlers
         spark.sparkContext.setLogLevel("WARN")

+        task_logger.info("::group::SPARK LOGS")
         try:
             result = func()
             return result
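
The key change in this hunk is the distribution mechanism: instead of pointing executors at a shared PYTHONPATH, the decorator zips the impc_etl package and ships it via spark.submit.pyFiles plus addPyFile. A self-contained sketch of that pattern, where mypkg is a hypothetical stand-in for any locally importable package (impc_etl plays this role in the commit):

import os
import tempfile
import zipfile

from pyspark.sql import SparkSession


def zip_package(package_path: str, package_name: str) -> str:
    """Zip a package's .py files so Spark can ship it to executors."""
    fd, zip_path = tempfile.mkstemp(suffix=".zip", prefix=f"{package_name}_")
    os.close(fd)
    with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
        for root, dirs, files in os.walk(package_path):
            dirs[:] = [d for d in dirs if d != "__pycache__"]  # skip bytecode caches
            for name in files:
                if name.endswith(".py"):
                    full = os.path.join(root, name)
                    # Keep the package name as the top-level folder inside the zip
                    zf.write(full, os.path.join(package_name, os.path.relpath(full, package_path)))
    return zip_path


if __name__ == "__main__":
    import mypkg  # hypothetical package; impc_etl plays this role in the commit

    archive = zip_package(os.path.dirname(mypkg.__file__), "mypkg")
    spark = SparkSession.builder.master("local[2]").getOrCreate()
    spark.sparkContext.addPyFile(archive)  # executors can now `import mypkg`
    spark.stop()
    os.unlink(archive)  # mirror the commit's cleanup in `finally`

Setting spark.submit.pyFiles before the session is created and calling addPyFile afterwards mirrors the belt-and-braces approach in the diff above.
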
@@ -124,6 +146,13 @@ def wrapper():
                 task_logger.warning(f"Error stopping Spark session: {e}")
                 # Continue execution as the task may have completed successfully
         finally:
+            # Clean up temporary zip file
+            if temp_zip_path and os.path.exists(temp_zip_path):
+                try:
+                    os.unlink(temp_zip_path)
+                    task_logger.info("Cleaned up temporary impc_etl zip file")
+                except Exception as e:
+                    task_logger.warning(f"Failed to clean up zip file: {e}")
             task_logger.info("::endgroup::")

     return wrapper
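
The finally block guarantees the temporary archive is removed even when the wrapped task fails, and catching the unlink error keeps cleanup from masking the task's own outcome. A standalone sketch of that cleanup pattern, assuming the zip was created with tempfile.mkstemp as in the earlier hunk:

import logging
import os
import tempfile

log = logging.getLogger(__name__)

fd, temp_zip_path = tempfile.mkstemp(suffix=".zip", prefix="impc_etl_")
os.close(fd)

try:
    pass  # ... work that ships the zip to Spark would happen here ...
finally:
    if temp_zip_path and os.path.exists(temp_zip_path):
        try:
            os.unlink(temp_zip_path)
            log.info("Cleaned up temporary zip file")
        except OSError as e:
            log.warning(f"Failed to clean up zip file: {e}")
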
