Commit b0c1b91

Merge pull request #321 from MITLibraries/TIMX-521-new-etl-records-data-location
TIMX 521 - New ETL records data location
2 parents 5619aad + 504a392 · commit b0c1b91

12 files changed: +712 -652 lines changed

Pipfile.lock

Lines changed: 418 additions & 430 deletions
Some generated files are not rendered by default.

README.md

Lines changed: 1 addition & 3 deletions
````diff
@@ -178,9 +178,7 @@ WORKSPACE=### Set to `dev` for local development; this will be set to `stage` an
 
 ### Optional
 
-```shell
-ETL_VERSION=### Version number of the TIMDEX ETL infrastructure. This can be used to align application behavior with the requirements of other applications in the TIMDEX ETL pipeline.
-```
+None at this time.
 
 
 
````

lambdas/alma_prep.py

Lines changed: 4 additions & 2 deletions
```diff
@@ -1,5 +1,4 @@
 import logging
-import os
 import tarfile
 from collections.abc import Generator
 from typing import IO, TYPE_CHECKING
@@ -11,9 +10,12 @@
     from mypy_boto3_s3.client import S3Client  # pragma: no cover
 
 from lambdas import helpers
+from lambdas.config import Config
 
 logger = logging.getLogger(__name__)
 
+CONFIG = Config()
+
 
 def extract_file_from_source_bucket_to_target_bucket(
     s3_client: "S3Client",
@@ -100,7 +102,7 @@ def prepare_alma_export_files(run_date: str, run_type: str, timdex_bucket: str)
     then performs the extract, unzip, rename and upload steps.
     """
     export_job_date = run_date.replace("-", "")
-    alma_bucket = os.environ["TIMDEX_ALMA_EXPORT_BUCKET_ID"]
+    alma_bucket = CONFIG.alma_export_bucket
     alma_export_files = helpers.list_s3_files_by_prefix(
         alma_bucket,
         f"exlibris/timdex/TIMDEX_ALMA_EXPORT_{run_type.upper()}_{export_job_date}",
```

lambdas/commands.py

Lines changed: 17 additions & 9 deletions
```diff
@@ -1,9 +1,12 @@
 import logging
 
-from lambdas import config, helpers
+from lambdas import helpers
+from lambdas.config import Config
 
 logger = logging.getLogger(__name__)
 
+CONFIG = Config()
+
 
 def generate_extract_command(
     # ruff: noqa: FBT001
@@ -28,7 +31,7 @@ def generate_extract_command(
     if verbose:
         extract_command.append("--verbose")
 
-    if source in config.GIS_SOURCES:
+    if source in CONFIG.GIS_SOURCES:
         extract_command.append("harvest")
         if run_type == "daily":
             extract_command.append("--harvest-type=incremental")
@@ -80,7 +83,7 @@ def generate_transform_commands(
     for extract_output_file in extract_output_files:
         transform_command = [
             f"--input-file=s3://{timdex_bucket}/{extract_output_file}",
-            f"--output-location=s3://{timdex_bucket}/dataset",
+            f"--output-location={CONFIG.s3_timdex_dataset_data_location}",
             f"--source={source}",
             f"--run-id={run_id}",
             f"--run-timestamp={run_timestamp}",
@@ -94,11 +97,8 @@ def generate_load_commands(
     run_date: str,
     run_type: str,
     run_id: str,
-    timdex_bucket: str,
 ) -> dict:
     """Generate task run command for TIMDEX load."""
-    dataset_location = f"s3://{timdex_bucket}/dataset"
-
     update_command = [
         "bulk-update",
         "--run-date",
@@ -108,14 +108,22 @@ def generate_load_commands(
     ]
 
     if run_type == "daily":
-        update_command.extend(["--source", source, dataset_location])
+        update_command.extend(
+            [
+                "--source",
+                source,
+                CONFIG.s3_timdex_dataset_data_location,
+            ]
+        )
         return {"bulk-update-command": update_command}
 
     if run_type == "full":
         new_index_name = helpers.generate_index_name(source)
-        update_command.extend(["--index", new_index_name, dataset_location])
+        update_command.extend(
+            ["--index", new_index_name, CONFIG.s3_timdex_dataset_data_location]
+        )
         promote_index_command = ["promote", "--index", new_index_name]
-        for alias, sources in config.INDEX_ALIASES.items():
+        for alias, sources in CONFIG.INDEX_ALIASES.items():
             if source in sources:
                 promote_index_command.append("--alias")
                 promote_index_command.append(alias)
```
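
Since `generate_load_commands` no longer takes a `timdex_bucket` argument, the dataset URI it emits comes entirely from `Config`. A sketch of what a daily-run call might produce, assuming `TIMDEX_S3_EXTRACT_BUCKET_ID` is set to the placeholder `timdex-extract-dev` (all argument values are illustrative):

```python
import os

from lambdas import commands

# Placeholder bucket for illustration only.
os.environ["TIMDEX_S3_EXTRACT_BUCKET_ID"] = "timdex-extract-dev"

result = commands.generate_load_commands(
    source="alma",
    run_date="2024-01-02",  # illustrative values
    run_type="daily",
    run_id="run-abc-123",
)

# Expected shape per the diff above (middle arguments elided):
# {"bulk-update-command": ["bulk-update", "--run-date", ...,
#     "--source", "alma", "s3://timdex-extract-dev/dataset/data/records"]}
print(result["bulk-update-command"][-1])
# s3://timdex-extract-dev/dataset/data/records
```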

lambdas/config.py

Lines changed: 68 additions & 78 deletions
```diff
@@ -1,29 +1,71 @@
+# ruff: noqa: EM102, TRY003
+
 import logging
 import os
+from typing import Any, ClassVar
+
 
-GIS_SOURCES = ["gismit", "gisogm"]
-INDEX_ALIASES = {
-    "rdi": ["jpal", "whoas", "zenodo"],
-    "timdex": ["alma", "aspace", "dspace"],
-    "geo": GIS_SOURCES,
-}
-REQUIRED_ENV = {
-    "TIMDEX_ALMA_EXPORT_BUCKET_ID",
-    "TIMDEX_S3_EXTRACT_BUCKET_ID",
-    "WORKSPACE",
-}
-REQUIRED_FIELDS = ("next-step", "run-date", "run-type", "source")
-REQUIRED_OAI_HARVEST_FIELDS = ("oai-pmh-host", "oai-metadata-format")
-VALID_DATE_FORMATS = ("%Y-%m-%d", "%Y-%m-%dT%H:%M:%SZ")
-VALID_RUN_TYPES = ("full", "daily")
-VALID_STEPS = ("extract", "transform", "load")
-
-
-def check_verbosity(verbose: bool | str) -> bool:  # noqa: FBT001
-    """Determine whether verbose is True or False given a boolean or string value."""
-    if isinstance(verbose, bool):
-        return verbose
-    return verbose.lower() == "true"
+class Config:
+    REQUIRED_ENV_VARS = (
+        "TIMDEX_ALMA_EXPORT_BUCKET_ID",
+        "TIMDEX_S3_EXTRACT_BUCKET_ID",
+        "WORKSPACE",
+    )
+    OPTIONAL_ENV_VARS = ()
+
+    GIS_SOURCES = ("gismit", "gisogm")
+    INDEX_ALIASES: ClassVar = {
+        "rdi": ["jpal", "whoas", "zenodo"],
+        "timdex": ["alma", "aspace", "dspace"],
+        "geo": GIS_SOURCES,
+    }
+    REQUIRED_FIELDS = ("next-step", "run-date", "run-type", "source")
+    REQUIRED_OAI_HARVEST_FIELDS = ("oai-pmh-host", "oai-metadata-format")
+    VALID_DATE_FORMATS = ("%Y-%m-%d", "%Y-%m-%dT%H:%M:%SZ")
+    VALID_RUN_TYPES = ("full", "daily")
+    VALID_STEPS = ("extract", "transform", "load")
+
+    def __getattr__(self, name: str) -> Any:  # noqa: ANN401
+        """Provide dot notation access to configurations and env vars on this class."""
+        if name in self.REQUIRED_ENV_VARS or name in self.OPTIONAL_ENV_VARS:
+            return os.getenv(name)
+        message = f"'{name}' not a valid configuration variable"
+        raise AttributeError(message)
+
+    def check_required_env_vars(self) -> None:
+        """Method to raise exception if required env vars not set."""
+        missing_vars = [var for var in self.REQUIRED_ENV_VARS if not os.getenv(var)]
+        if missing_vars:
+            message = f"Missing required environment variables: {', '.join(missing_vars)}"
+            raise OSError(message)
+
+    @staticmethod
+    def get_verbose_flag(verbose: bool | str) -> bool:  # noqa: FBT001
+        """Determine whether verbose is True or False given a boolean or string value."""
+        if isinstance(verbose, bool):
+            return verbose
+        return verbose.lower() == "true"
+
+    @property
+    def alma_export_bucket(self) -> str:
+        var = "TIMDEX_ALMA_EXPORT_BUCKET_ID"
+        value = os.getenv(var)
+        if not value:
+            raise OSError(f"Env var '{var}' must be defined")
+        return value
+
+    @property
+    def timdex_bucket(self) -> str:
+        var = "TIMDEX_S3_EXTRACT_BUCKET_ID"
+        value = os.getenv(var)
+        if not value:
+            raise OSError(f"Env var '{var}' must be defined")
+        return value
+
+    @property
+    def s3_timdex_dataset_data_location(self) -> str:
+        """Return full S3 URI (bucket + prefix) of ETL records data location."""
+        return f"s3://{self.timdex_bucket}/dataset/data/records"
 
 
 def configure_logger(
@@ -34,9 +76,9 @@ def configure_logger(
 ) -> str:
     """Configure application via passed application root logger.
 
-    If verbose=True, 3rd party libraries can be quite chatty. For convenience, they can
-    be set to WARNING level by either passing a comma seperated list of logger names to
-    'warning_only_loggers' or by setting the env var WARNING_ONLY_LOGGERS.
+    If verbose=True, 3rd party libraries can be quite chatty. For convenience, they
+    can be set to WARNING level by either passing a comma seperated list of logger
+    names to 'warning_only_loggers' or by setting the env var WARNING_ONLY_LOGGERS.
     """
     if verbose:
         root_logger.setLevel(logging.DEBUG)
@@ -61,55 +103,3 @@ def configure_logger(
         f"Logger '{root_logger.name}' configured with level="
         f"{logging.getLevelName(root_logger.getEffectiveLevel())}"
     )
-
-
-def validate_input(input_data: dict) -> None:
-    """Validate input to the lambda function.
-
-    Ensures that all requiered input fields are present and contain valid data.
-    """
-    # All required fields are present
-    if missing_fields := [field for field in REQUIRED_FIELDS if field not in input_data]:
-        message = (
-            f"Input must include all required fields. Missing fields: {missing_fields}"
-        )
-        raise ValueError(message)
-
-    # Valid next step
-    next_step = input_data["next-step"]
-    if next_step not in VALID_STEPS:
-        message = (
-            f"Input 'next-step' value must be one of: {VALID_STEPS}. Value "
-            f"provided was '{next_step}'"
-        )
-        raise ValueError(message)
-
-    # Valid run type
-    run_type = input_data["run-type"]
-    if run_type not in VALID_RUN_TYPES:
-        message = (
-            f"Input 'run-type' value must be one of: {VALID_RUN_TYPES}. Value "
-            f"provided was '{run_type}'"
-        )
-        raise ValueError(message)
-
-    # If next step is extract step, required harvest fields are present
-    # ruff: noqa: SIM102
-    if input_data["next-step"] == "extract":
-        if input_data["source"] not in GIS_SOURCES:
-            if missing_harvest_fields := [
-                field for field in REQUIRED_OAI_HARVEST_FIELDS if field not in input_data
-            ]:
-                message = (
-                    "Input must include all required harvest fields when starting with "
-                    f"harvest step. Missing fields: {missing_harvest_fields}"
-                )
-                raise ValueError(message)
-
-
-def verify_env() -> None:
-    """Confirm that required env variables are set."""
-    for key in REQUIRED_ENV:
-        if not os.getenv(key):
-            message = f"Required env variable {key} is not set"
-            raise RuntimeError(message)
```
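
The rewritten module replaces free functions (`check_verbosity`, `verify_env`, `validate_input`) with a `Config` class: constants become class attributes, names listed in `REQUIRED_ENV_VARS`/`OPTIONAL_ENV_VARS` resolve through `__getattr__`, and the bucket properties enforce presence. A brief usage sketch (env values are placeholders):

```python
import os

from lambdas.config import Config

CONFIG = Config()

# Class-level constants are plain attributes; __getattr__ is never invoked.
assert "gismit" in CONFIG.GIS_SOURCES

# Names listed in REQUIRED_ENV_VARS/OPTIONAL_ENV_VARS fall through
# __getattr__ to os.getenv, so they may be None if unset.
os.environ["WORKSPACE"] = "dev"  # placeholder
assert CONFIG.WORKSPACE == "dev"

# Anything else raises AttributeError rather than silently returning None.
try:
    _ = CONFIG.NOT_A_REAL_SETTING
except AttributeError as exc:
    print(exc)  # 'NOT_A_REAL_SETTING' not a valid configuration variable
```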

lambdas/format_input.py

Lines changed: 33 additions & 15 deletions
```diff
@@ -1,29 +1,30 @@
 import json
 import logging
-import os
 import uuid
 from datetime import UTC, datetime
 
-from lambdas import alma_prep, commands, config, errors, helpers
+from lambdas import alma_prep, commands, errors, helpers
+from lambdas.config import Config, configure_logger
 
 logger = logging.getLogger(__name__)
 
+CONFIG = Config()
+
 
 def lambda_handler(event: dict, _context: dict) -> dict:
     """Format data into the necessary input for TIMDEX pipeline processing."""
-    config.verify_env()
-    verbose = config.check_verbosity(event.get("verbose", False))
-    config.configure_logger(logging.getLogger(), verbose=verbose)
+    verbose = CONFIG.get_verbose_flag(event.get("verbose", False))
+    configure_logger(logging.getLogger(), verbose=verbose)
     logger.debug(json.dumps(event))
-    config.validate_input(event)
+
+    helpers.validate_input(event)
 
     run_date = helpers.format_run_date(event["run-date"])
     run_type = event["run-type"]
     source = event["source"]
     next_step = event["next-step"]
     run_id = event.get("run-id", str(uuid.uuid4()))
     run_timestamp = event.get("run-timestamp", datetime.now(UTC).isoformat())
-    timdex_bucket = os.environ["TIMDEX_S3_EXTRACT_BUCKET_ID"]
 
     result = {
         "run-date": run_date,
@@ -33,24 +34,34 @@ def lambda_handler(event: dict, _context: dict) -> dict:
     }
 
     if next_step == "extract":
-        if source in config.GIS_SOURCES:
+        if source in CONFIG.GIS_SOURCES:
             result["harvester-type"] = "geo"
         else:
             result["harvester-type"] = "oai"
         result["next-step"] = "transform"
         result["extract"] = commands.generate_extract_command(
-            event, run_date, timdex_bucket, verbose
+            event,
+            run_date,
+            CONFIG.timdex_bucket,
+            verbose,
         )
         return result
 
     if next_step == "transform":
         try:
             if source == "alma":
-                alma_prep.prepare_alma_export_files(run_date, run_type, timdex_bucket)
+                alma_prep.prepare_alma_export_files(
+                    run_date,
+                    run_type,
+                    CONFIG.timdex_bucket,
+                )
             extract_output_files = helpers.list_s3_files_by_prefix(
-                timdex_bucket,
+                CONFIG.timdex_bucket,
                 helpers.generate_step_output_prefix(
-                    source, run_date, run_type, "extract"
+                    source,
+                    run_date,
+                    run_type,
+                    "extract",
                 ),
             )
         except errors.NoFilesError:
@@ -72,19 +83,26 @@ def lambda_handler(event: dict, _context: dict) -> dict:
         )
         result["next-step"] = "load"
         result["transform"] = commands.generate_transform_commands(
-            extract_output_files, event, timdex_bucket, run_id, run_timestamp
+            extract_output_files,
+            event,
+            CONFIG.timdex_bucket,
+            run_id,
+            run_timestamp,
        )
         return result
 
     if next_step == "load":
-        if not helpers.dataset_records_exist_for_run(timdex_bucket, run_date, run_id):
+        if not helpers.dataset_records_exist_for_run(run_date, run_id):
             result["failure"] = (
                 "No records were found in the TIMDEX dataset for run_date "
                 f"'{run_date}', run_id '{run_id}'."
             )
             return result
         result["load"] = commands.generate_load_commands(
-            source, run_date, run_type, run_id, timdex_bucket
+            source,
+            run_date,
+            run_type,
+            run_id,
         )
         return result
 
```
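
Taken together, the handler now leans on `CONFIG` for buckets and on `helpers.validate_input` (relocated from `config.py`) for input checks. A hedged sketch of invoking it locally with a made-up event; the field names come from `REQUIRED_FIELDS` and `REQUIRED_OAI_HARVEST_FIELDS` in `config.py`, and all values are illustrative:

```python
import os

from lambdas.format_input import lambda_handler

# Placeholder env vars required by Config (values are illustrative).
os.environ["TIMDEX_ALMA_EXPORT_BUCKET_ID"] = "example-alma-export-bucket"
os.environ["TIMDEX_S3_EXTRACT_BUCKET_ID"] = "timdex-extract-dev"
os.environ["WORKSPACE"] = "dev"

event = {
    "next-step": "extract",
    "run-date": "2024-01-02",
    "run-type": "daily",
    "source": "aspace",
    "oai-pmh-host": "https://example.org/oai",  # assumed OAI fields for a
    "oai-metadata-format": "oai_dc",            # non-GIS source
    "verbose": "true",
}

result = lambda_handler(event, {})
print(result["next-step"])  # "transform" -- extract hands off to transform
```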
