
Commit 56fa69f

Change pyspark path setting for production environment
1 parent bd4eddb commit 56fa69f
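
The impc_etl package was previously exposed to Spark workers by prepending its parent directory to PYTHONPATH (via spark.executorEnv.PYTHONPATH and spark.yarn.appMasterEnv.PYTHONPATH), which only works when the driver and executors share a filesystem, an assumption the production environment evidently does not guarantee. The package is now zipped into a temporary archive and shipped to executors through spark.submit.pyFiles and SparkContext.addPyFile, with the archive removed in the task's finally block.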

File tree

1 file changed (+41 −22 lines)

impc_etl/utils/spark.py

Lines changed: 41 additions & 22 deletions
@@ -84,36 +84,48 @@ def wrapper():
         conf.set("spark.mongodb.read.connection.uri", mongo_url)
         conf.set("spark.mongodb.write.connection.uri", mongo_url)
 
-        # Add the impc_etl module to Spark workers using PYTHONPATH
-        # Since all workers run on the same file system, we can use PYTHONPATH
+        # Add the impc_etl module to Spark workers
         import impc_etl
+        import zipfile
+        import tempfile
 
         impc_etl_path = os.path.dirname(impc_etl.__file__)
         parent_path = os.path.dirname(impc_etl_path)
         task_logger.info(
-            f"Adding impc_etl module path to PYTHONPATH: {parent_path}"
+            f"Preparing impc_etl module from: {parent_path}"
         )
 
-        # Get an existing PYTHONPATH if any and append our path
-        existing_python_path = os.environ.get("PYTHONPATH", "")
-        if existing_python_path:
-            updated_python_path = f"{parent_path}:{existing_python_path}"
-        else:
-            updated_python_path = parent_path
-
-        # Set PYTHONPATH for executors through Spark configuration
-        conf.set("spark.executorEnv.PYTHONPATH", updated_python_path)
-        conf.set("spark.yarn.appMasterEnv.PYTHONPATH", updated_python_path)
-
-        # Also set it for the current environment to ensure consistency
-        os.environ["PYTHONPATH"] = updated_python_path
+        # Create a temporary zip file of the impc_etl module
+        temp_zip_fd, temp_zip_path = tempfile.mkstemp(suffix='.zip', prefix='impc_etl_')
+        os.close(temp_zip_fd)  # Close the file descriptor; only the path is needed
 
-        spark = SparkSession.builder.config(conf=conf).getOrCreate()
-        task_logger.info(f"Adding impc_etl module path: {parent_path}")
-        spark.sparkContext.addPyFile(parent_path)
-        task_logger.info(
-            f"Added impc_etl parent directory to Spark workers: {parent_path}"
-        )
+        try:
+            # Create zip file containing all Python files
+            with zipfile.ZipFile(temp_zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
+                for root, dirs, files in os.walk(impc_etl_path):
+                    # Skip __pycache__ directories
+                    dirs[:] = [d for d in dirs if d != '__pycache__']
+                    for file in files:
+                        if file.endswith('.py'):
+                            file_path = os.path.join(root, file)
+                            arcname = os.path.join('impc_etl', os.path.relpath(file_path, impc_etl_path))
+                            zipf.write(file_path, arcname)
+
+            task_logger.info(f"Created impc_etl zip: {temp_zip_path} (size: {os.path.getsize(temp_zip_path)} bytes)")
+
+            # Add zip to Spark configuration before creating the session
+            conf.set("spark.submit.pyFiles", temp_zip_path)
+
+            spark = SparkSession.builder.config(conf=conf).getOrCreate()
+
+            # Also add via sparkContext for runtime distribution
+            spark.sparkContext.addPyFile(temp_zip_path)
+            task_logger.info("Added impc_etl module to Spark workers")
+        except Exception as e:
+            task_logger.error(f"Failed to prepare impc_etl module: {e}")
+            if os.path.exists(temp_zip_path):
+                os.unlink(temp_zip_path)
+            raise
 
         spark_logger = logging.getLogger("spark")
         spark_logger.setLevel(logging.WARNING)
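
The arcname prefix is what makes the archive importable: every entry is stored under impc_etl/, so the zip root acts like a site-packages directory and Python's zipimport machinery can resolve "import impc_etl" once the archive lands on an executor's sys.path. A minimal, self-contained sketch of that mechanism (demo_pkg and the temporary paths are illustrative, not part of the commit):

import importlib
import os
import sys
import tempfile
import zipfile

# Build a tiny archive whose entries sit under a package-named prefix,
# mirroring the arcname layout used in the diff above.
fd, zip_path = tempfile.mkstemp(suffix=".zip", prefix="demo_")
os.close(fd)
with zipfile.ZipFile(zip_path, "w") as zf:
    zf.writestr("demo_pkg/__init__.py", "VALUE = 42\n")

# Adding the zip to sys.path is essentially what Spark does on each
# executor for archives shipped via --py-files / addPyFile.
sys.path.insert(0, zip_path)
demo_pkg = importlib.import_module("demo_pkg")
print(demo_pkg.VALUE)  # prints 42

sys.path.remove(zip_path)
os.unlink(zip_path)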
@@ -133,6 +145,13 @@ def wrapper():
             task_logger.warning(f"Error stopping Spark session: {e}")
             # Continue execution as the task may have completed successfully
         finally:
+            # Clean up temporary zip file
+            if 'temp_zip_path' in locals() and os.path.exists(temp_zip_path):
+                try:
+                    os.unlink(temp_zip_path)
+                    task_logger.info("Cleaned up temporary impc_etl zip file")
+                except Exception as e:
+                    task_logger.warning(f"Failed to clean up zip file: {e}")
             task_logger.info("::endgroup::")
 
     return wrapper
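
One way to smoke-test the distribution after getOrCreate() is to force the import on the executors. This is a hypothetical check, not part of the commit, and it assumes the spark session built above is in scope:

def _probe(_):
    # Runs on an executor; the import should resolve from the shipped zip.
    import impc_etl
    return impc_etl.__file__

# Two partitions, so the import is exercised by more than one task.
print(spark.sparkContext.parallelize(range(2), 2).map(_probe).collect())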
