Skip to content

Commit 5619aad

Browse files
authored
Merge pull request #320 from MITLibraries/TIMX-509-run-timestamp-argument
TIMX 509 - run-timestamp argument for all transform commands
2 parents ff2b58a + 8ab6fff commit 5619aad

File tree

9 files changed

+518
-457
lines changed

9 files changed

+518
-457
lines changed

Pipfile.lock

Lines changed: 401 additions & 410 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

README.md

Lines changed: 40 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,8 @@ Takes input JSON (usually from EventBridge although it can be passed to a manual
2727

2828
- `oai-set-spec`: optional, only used when limiting the OAI-PMH record harvest to a single set from the source repository.
2929
- `verbose`: optional, if provided with value `"true"` (case-insensitive) will pass the `--verbose` option (debug level logging) to all pipeline task run commands.
30+
- `run-id`: optional, an ETL run ID that is included in the generated CLI commands; if not provided, a new UUID is minted.
31+
- `run-timestamp`: optional, an ETL run timestamp that is included in the generated transform commands; if not provided, the current UTC time in ISO 8601 format is minted.
3032

3133
### Example Format Input Event
3234

@@ -110,50 +112,51 @@ GitHub Actions is configured to update the Lambda function with every push to th
110112

111113
- Run the default handler for the container
112114

113-
```bash
114-
docker run -e TIMDEX_ALMA_EXPORT_BUCKET_ID=alma-bucket-name \
115-
-e TIMDEX_S3_EXTRACT_BUCKET_ID=timdex-bucket-name \
116-
-e WORKSPACE=dev \
117-
-p 9000:8080 timdex-pipeline-lambdas-dev:latest
118-
```
115+
```bash
116+
docker run -e TIMDEX_ALMA_EXPORT_BUCKET_ID=alma-bucket-name \
117+
-e TIMDEX_S3_EXTRACT_BUCKET_ID=timdex-bucket-name \
118+
-e WORKSPACE=dev \
119+
-p 9000:8080 timdex-pipeline-lambdas-dev:latest
120+
```
119121

120122
- POST to the container
121123
Note: running this with a `next-step` value of `transform` or `load` requires an actual S3 connection and is therefore tricky to test locally. It is better to push the image to Dev1 and test there.
122124

123-
```bash
124-
curl -XPOST "http://localhost:9000/2015-03-31/functions/function/invocations" -d '{
125-
"next-step": "extract",
126-
"run-date": "2022-03-10T16:30:23Z",
127-
"run-type": "daily",
128-
"source": "YOURSOURCE",
129-
"verbose": "true",
130-
"oai-pmh-host": "https://YOUR-OAI-SOURCE/oai",
131-
"oai-metadata-format": "oai_dc",
132-
"oai-set-spec": "YOUR-SET-SPEC"
133-
}'
134-
```
125+
```bash
126+
curl -XPOST "http://localhost:9000/2015-03-31/functions/function/invocations" -d '{
127+
"next-step": "extract",
128+
"run-date": "2022-03-10T16:30:23Z",
129+
"run-type": "daily",
130+
"source": "YOURSOURCE",
131+
"verbose": "true",
132+
"oai-pmh-host": "https://YOUR-OAI-SOURCE/oai",
133+
"oai-metadata-format": "oai_dc",
134+
"oai-set-spec": "YOUR-SET-SPEC"
135+
}'
136+
```
135137

136138
- Observe output
137-
```json
138-
{
139-
"run-date": "2022-03-10",
140-
"run-type": "daily",
141-
"source": "YOURSOURCE",
142-
"verbose": true,
143-
"next-step": "transform",
144-
"extract": {
145-
"extract-command": [
146-
"--host=https://YOUR-OAI-SOURCE/oai",
147-
"--output-file=s3://timdex-bucket-name/YOURSOURCE/YOURSOURCE-2022-03-09-daily-extracted-records-to-index.xml",
148-
"--verbose",
149-
"harvest",
150-
"--metadata-format=oai_dc",
151-
"--set-spec=YOUR-SET-SPEC",
152-
"--from-date=2022-03-09"
153-
]
154-
}
139+
-
140+
```json
141+
{
142+
"run-date": "2022-03-10",
143+
"run-type": "daily",
144+
"source": "YOURSOURCE",
145+
"verbose": true,
146+
"next-step": "transform",
147+
"extract": {
148+
"extract-command": [
149+
"--host=https://YOUR-OAI-SOURCE/oai",
150+
"--output-file=s3://timdex-bucket-name/YOURSOURCE/YOURSOURCE-2022-03-09-daily-extracted-records-to-index.xml",
151+
"--verbose",
152+
"harvest",
153+
"--metadata-format=oai_dc",
154+
"--set-spec=YOUR-SET-SPEC",
155+
"--from-date=2022-03-09"
156+
]
155157
}
156-
```
158+
}
159+
```
157160

158161
### Running a Specific Handler Locally with Docker
159162
You can call any handler you copy into the container (see Dockerfile) by name as part of the `docker run` command.

lambdas/commands.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@ def generate_transform_commands(
7272
input_data: dict,
7373
timdex_bucket: str,
7474
run_id: str,
75+
run_timestamp: str,
7576
) -> dict[str, list[dict]]:
7677
"""Generate task run command for TIMDEX transform."""
7778
files_to_transform: list[dict] = []
@@ -82,6 +83,7 @@ def generate_transform_commands(
8283
f"--output-location=s3://{timdex_bucket}/dataset",
8384
f"--source={source}",
8485
f"--run-id={run_id}",
86+
f"--run-timestamp={run_timestamp}",
8587
]
8688
files_to_transform.append({"transform-command": transform_command})
8789
return {"files-to-transform": files_to_transform}

lambdas/config.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
VALID_STEPS = ("extract", "transform", "load")
2020

2121

22-
def check_verbosity(verbose: bool | str) -> bool:
22+
def check_verbosity(verbose: bool | str) -> bool: # noqa: FBT001
2323
"""Determine whether verbose is True or False given a boolean or string value."""
2424
if isinstance(verbose, bool):
2525
return verbose

lambdas/format_input.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,8 @@
1-
# ruff: noqa: PLR0911, PLR2004
2-
31
import json
42
import logging
53
import os
64
import uuid
5+
from datetime import UTC, datetime
76

87
from lambdas import alma_prep, commands, config, errors, helpers
98

@@ -23,6 +22,7 @@ def lambda_handler(event: dict, _context: dict) -> dict:
2322
source = event["source"]
2423
next_step = event["next-step"]
2524
run_id = event.get("run-id", str(uuid.uuid4()))
25+
run_timestamp = event.get("run-timestamp", datetime.now(UTC).isoformat())
2626
timdex_bucket = os.environ["TIMDEX_S3_EXTRACT_BUCKET_ID"]
2727

2828
result = {
@@ -72,7 +72,7 @@ def lambda_handler(event: dict, _context: dict) -> dict:
7272
)
7373
result["next-step"] = "load"
7474
result["transform"] = commands.generate_transform_commands(
75-
extract_output_files, event, timdex_bucket, run_id
75+
extract_output_files, event, timdex_bucket, run_id, run_timestamp
7676
)
7777
return result
7878

pyproject.toml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,6 @@ ignore = [
3838
"PLR0912",
3939
"PLR0913",
4040
"PLR0915",
41-
"S320",
4241
"S321",
4342
]
4443

tests/conftest.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,3 +63,8 @@ def s3_client():
6363
@pytest.fixture
6464
def run_id():
6565
return "run-abc-123"
66+
67+
68+
@pytest.fixture
69+
def run_timestamp():
70+
return "2025-06-18T12:34:56.789000"

tests/test_commands.py

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ def test_generate_extract_command_geoharvester():
7777
}
7878

7979

80-
def test_generate_transform_commands_required_input_fields(run_id):
80+
def test_generate_transform_commands_required_input_fields(run_id, run_timestamp):
8181
input_data = {
8282
"next-step": "transform",
8383
"run-date": "2022-01-02T12:13:14Z",
@@ -92,6 +92,7 @@ def test_generate_transform_commands_required_input_fields(run_id):
9292
input_data,
9393
"test-timdex-bucket",
9494
run_id,
95+
run_timestamp,
9596
) == {
9697
"files-to-transform": [
9798
{
@@ -101,13 +102,14 @@ def test_generate_transform_commands_required_input_fields(run_id):
101102
"--output-location=s3://test-timdex-bucket/dataset",
102103
"--source=testsource",
103104
f"--run-id={run_id}",
105+
f"--run-timestamp={run_timestamp}",
104106
]
105107
}
106108
]
107109
}
108110

109111

110-
def test_generate_transform_commands_all_input_fields(run_id):
112+
def test_generate_transform_commands_all_input_fields(run_id, run_timestamp):
111113
input_data = {
112114
"next-step": "transform",
113115
"run-date": "2022-01-02T12:13:14Z",
@@ -120,7 +122,11 @@ def test_generate_transform_commands_all_input_fields(run_id):
120122
"testsource/testsource-2022-01-02-daily-extracted-records-to-delete.xml",
121123
]
122124
assert commands.generate_transform_commands(
123-
extract_output_files, input_data, "test-timdex-bucket", run_id
125+
extract_output_files,
126+
input_data,
127+
"test-timdex-bucket",
128+
run_id,
129+
run_timestamp,
124130
) == {
125131
"files-to-transform": [
126132
{
@@ -130,6 +136,7 @@ def test_generate_transform_commands_all_input_fields(run_id):
130136
"--output-location=s3://test-timdex-bucket/dataset",
131137
"--source=testsource",
132138
f"--run-id={run_id}",
139+
f"--run-timestamp={run_timestamp}",
133140
]
134141
},
135142
{
@@ -139,6 +146,7 @@ def test_generate_transform_commands_all_input_fields(run_id):
139146
"--output-location=s3://test-timdex-bucket/dataset",
140147
"--source=testsource",
141148
f"--run-id={run_id}",
149+
f"--run-timestamp={run_timestamp}",
142150
]
143151
},
144152
{
@@ -148,6 +156,7 @@ def test_generate_transform_commands_all_input_fields(run_id):
148156
"--output-location=s3://test-timdex-bucket/dataset",
149157
"--source=testsource",
150158
f"--run-id={run_id}",
159+
f"--run-timestamp={run_timestamp}",
151160
]
152161
},
153162
]

tests/test_format_input.py

Lines changed: 54 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ def test_lambda_handler_with_next_step_extract():
3434
}
3535

3636

37-
def test_lambda_handler_with_next_step_transform_files_present(s3_client):
37+
def test_lambda_handler_with_next_step_transform_files_present(s3_client, run_timestamp):
3838
s3_client.put_object(
3939
Bucket="test-timdex-bucket",
4040
Key="testsource/testsource-2022-01-02-daily-extracted-records-to-index.xml",
@@ -46,6 +46,7 @@ def test_lambda_handler_with_next_step_transform_files_present(s3_client):
4646
"next-step": "transform",
4747
"source": "testsource",
4848
"run-id": "run-abc-123",
49+
"run-timestamp": run_timestamp,
4950
"verbose": "true",
5051
}
5152
assert format_input.lambda_handler(event, {}) == {
@@ -63,20 +64,22 @@ def test_lambda_handler_with_next_step_transform_files_present(s3_client):
6364
"--output-location=s3://test-timdex-bucket/dataset",
6465
"--source=testsource",
6566
"--run-id=run-abc-123",
67+
f"--run-timestamp={run_timestamp}",
6668
]
6769
}
6870
]
6971
},
7072
}
7173

7274

73-
def test_lambda_handler_with_next_step_transform_alma_files_present():
75+
def test_lambda_handler_with_next_step_transform_alma_files_present(run_timestamp):
7476
event = {
7577
"run-date": "2022-09-12",
7678
"run-type": "daily",
7779
"next-step": "transform",
7880
"source": "alma",
7981
"run-id": "run-abc-123",
82+
"run-timestamp": run_timestamp,
8083
"verbose": "False",
8184
}
8285
assert format_input.lambda_handler(event, {}) == {
@@ -94,6 +97,7 @@ def test_lambda_handler_with_next_step_transform_alma_files_present():
9497
"--output-location=s3://test-timdex-bucket/dataset",
9598
"--source=alma",
9699
"--run-id=run-abc-123",
100+
f"--run-timestamp={run_timestamp}",
97101
]
98102
},
99103
{
@@ -103,6 +107,7 @@ def test_lambda_handler_with_next_step_transform_alma_files_present():
103107
"--output-location=s3://test-timdex-bucket/dataset",
104108
"--source=alma",
105109
"--run-id=run-abc-123",
110+
f"--run-timestamp={run_timestamp}",
106111
]
107112
},
108113
{
@@ -112,13 +117,60 @@ def test_lambda_handler_with_next_step_transform_alma_files_present():
112117
"--output-location=s3://test-timdex-bucket/dataset",
113118
"--source=alma",
114119
"--run-id=run-abc-123",
120+
f"--run-timestamp={run_timestamp}",
115121
]
116122
},
117123
]
118124
},
119125
}
120126

121127

128+
def test_lambda_handler_with_next_step_transform_auto_generated_timestamp(s3_client):
129+
s3_client.put_object(
130+
Bucket="test-timdex-bucket",
131+
Key="testsource/testsource-2022-01-02-daily-extracted-records-to-index.xml",
132+
Body="I am a file",
133+
)
134+
event = {
135+
"run-date": "2022-01-02T12:13:14Z",
136+
"run-type": "daily",
137+
"next-step": "transform",
138+
"source": "testsource",
139+
"run-id": "run-abc-123",
140+
"verbose": "true",
141+
}
142+
143+
with patch("lambdas.format_input.datetime") as mock_datetime:
144+
mock_datetime.now.return_value.isoformat.return_value = (
145+
"2025-06-18T12:34:56.789000"
146+
)
147+
mock_datetime.UTC = format_input.datetime.UTC
148+
149+
result = format_input.lambda_handler(event, {})
150+
151+
assert result == {
152+
"run-date": "2022-01-02",
153+
"run-type": "daily",
154+
"source": "testsource",
155+
"verbose": True,
156+
"next-step": "load",
157+
"transform": {
158+
"files-to-transform": [
159+
{
160+
"transform-command": [
161+
"--input-file=s3://test-timdex-bucket/testsource/"
162+
"testsource-2022-01-02-daily-extracted-records-to-index.xml",
163+
"--output-location=s3://test-timdex-bucket/dataset",
164+
"--source=testsource",
165+
"--run-id=run-abc-123",
166+
"--run-timestamp=2025-06-18T12:34:56.789000",
167+
]
168+
}
169+
]
170+
},
171+
}
172+
173+
122174
def test_lambda_handler_with_next_step_transform_no_files_present_alma():
123175
event = {
124176
"run-date": "2022-01-02",

0 commit comments

Comments
 (0)