Skip to content

Commit 113c7a7

Browse files
authored
Merge pull request #322 from MITLibraries/TIMX-537-531-535
TIMX 537 531 535 - Updates for TDA v3
2 parents b0c1b91 + b35bbb0 commit 113c7a7

File tree

7 files changed

+624
-489
lines changed

7 files changed

+624
-489
lines changed

Pipfile.lock

Lines changed: 587 additions & 460 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

lambdas/commands.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,7 @@ def generate_transform_commands(
8383
for extract_output_file in extract_output_files:
8484
transform_command = [
8585
f"--input-file=s3://{timdex_bucket}/{extract_output_file}",
86-
f"--output-location={CONFIG.s3_timdex_dataset_data_location}",
86+
f"--output-location={CONFIG.s3_timdex_dataset_location}",
8787
f"--source={source}",
8888
f"--run-id={run_id}",
8989
f"--run-timestamp={run_timestamp}",
@@ -112,15 +112,15 @@ def generate_load_commands(
112112
[
113113
"--source",
114114
source,
115-
CONFIG.s3_timdex_dataset_data_location,
115+
CONFIG.s3_timdex_dataset_location,
116116
]
117117
)
118118
return {"bulk-update-command": update_command}
119119

120120
if run_type == "full":
121121
new_index_name = helpers.generate_index_name(source)
122122
update_command.extend(
123-
["--index", new_index_name, CONFIG.s3_timdex_dataset_data_location]
123+
["--index", new_index_name, CONFIG.s3_timdex_dataset_location]
124124
)
125125
promote_index_command = ["promote", "--index", new_index_name]
126126
for alias, sources in CONFIG.INDEX_ALIASES.items():

lambdas/config.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -63,9 +63,9 @@ def timdex_bucket(self) -> str:
6363
return value
6464

6565
@property
66-
def s3_timdex_dataset_data_location(self) -> str:
67-
"""Return full S3 URI (bucket + prefix) of ETL records data location."""
68-
return f"s3://{self.timdex_bucket}/dataset/data/records"
66+
def s3_timdex_dataset_location(self) -> str:
67+
"""Return full S3 URI (bucket + prefix) of dataset root location."""
68+
return f"s3://{self.timdex_bucket}/dataset"
6969

7070

7171
def configure_logger(

lambdas/format_input.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -92,10 +92,9 @@ def lambda_handler(event: dict, _context: dict) -> dict:
9292
return result
9393

9494
if next_step == "load":
95-
if not helpers.dataset_records_exist_for_run(run_date, run_id):
95+
if not helpers.dataset_records_exist_for_run(run_id):
9696
result["failure"] = (
97-
"No records were found in the TIMDEX dataset for run_date "
98-
f"'{run_date}', run_id '{run_id}'."
97+
f"No records were found in the TIMDEX dataset for run_id '{run_id}'."
9998
)
10099
return result
101100
result["load"] = commands.generate_load_commands(

lambdas/helpers.py

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
# ruff: noqa: S608
2+
13
import contextlib
24
import logging
35
from datetime import UTC, datetime, timedelta
@@ -157,14 +159,22 @@ def list_s3_files_by_prefix(bucket: str, prefix: str) -> list[str]:
157159
return s3_files
158160

159161

160-
def dataset_records_exist_for_run(run_date: str, run_id: str) -> bool:
161-
"""Query TIMDEX dataset to confirm records to load and/or delete.
162+
def dataset_records_exist_for_run(run_id: str) -> bool:
163+
"""Query TIMDEX dataset metadata to confirm records to load and/or delete.
162164
163165
A "run" is identified by its run-id, which is passed as an input to this lambda
164166
invocation by the StepFunction. We are interested only in records where
165167
action is "index" or "delete". If zero records exist, or have action "skip" or
166168
"error", we do not need to perform any load commands.
167169
"""
168-
td = TIMDEXDataset(location=CONFIG.s3_timdex_dataset_data_location)
169-
td.load(run_date=run_date, run_id=run_id, action=["index", "delete"])
170-
return td.row_count > 0
170+
td = TIMDEXDataset(location=CONFIG.s3_timdex_dataset_location)
171+
172+
etl_run_count = td.metadata.conn.query(
173+
f"""
174+
select count(*)
175+
from metadata.records
176+
where run_id = '{run_id}'
177+
and action in ('index','delete')
178+
"""
179+
).fetchone()[0]
180+
return etl_run_count > 0

tests/test_commands.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@ def test_generate_transform_commands_required_input_fields(run_id, run_timestamp
9999
"transform-command": [
100100
"--input-file=s3://test-timdex-bucket/testsource/"
101101
"testsource-2022-01-02-full-extracted-records-to-index.xml",
102-
"--output-location=s3://test-timdex-bucket/dataset/data/records",
102+
"--output-location=s3://test-timdex-bucket/dataset",
103103
"--source=testsource",
104104
f"--run-id={run_id}",
105105
f"--run-timestamp={run_timestamp}",
@@ -133,7 +133,7 @@ def test_generate_transform_commands_all_input_fields(run_id, run_timestamp):
133133
"transform-command": [
134134
"--input-file=s3://test-timdex-bucket/testsource/"
135135
"testsource-2022-01-02-daily-extracted-records-to-index_01.xml",
136-
"--output-location=s3://test-timdex-bucket/dataset/data/records",
136+
"--output-location=s3://test-timdex-bucket/dataset",
137137
"--source=testsource",
138138
f"--run-id={run_id}",
139139
f"--run-timestamp={run_timestamp}",
@@ -143,7 +143,7 @@ def test_generate_transform_commands_all_input_fields(run_id, run_timestamp):
143143
"transform-command": [
144144
"--input-file=s3://test-timdex-bucket/testsource/"
145145
"testsource-2022-01-02-daily-extracted-records-to-index_02.xml",
146-
"--output-location=s3://test-timdex-bucket/dataset/data/records",
146+
"--output-location=s3://test-timdex-bucket/dataset",
147147
"--source=testsource",
148148
f"--run-id={run_id}",
149149
f"--run-timestamp={run_timestamp}",
@@ -153,7 +153,7 @@ def test_generate_transform_commands_all_input_fields(run_id, run_timestamp):
153153
"transform-command": [
154154
"--input-file=s3://test-timdex-bucket/testsource/"
155155
"testsource-2022-01-02-daily-extracted-records-to-delete.xml",
156-
"--output-location=s3://test-timdex-bucket/dataset/data/records",
156+
"--output-location=s3://test-timdex-bucket/dataset",
157157
"--source=testsource",
158158
f"--run-id={run_id}",
159159
f"--run-timestamp={run_timestamp}",
@@ -178,7 +178,7 @@ def test_generate_load_commands_daily(run_id):
178178
"run-abc-123",
179179
"--source",
180180
"testsource",
181-
"s3://test-timdex-bucket/dataset/data/records",
181+
"s3://test-timdex-bucket/dataset",
182182
]
183183
}
184184

@@ -200,7 +200,7 @@ def test_generate_load_commands_full_no_alias(run_id):
200200
"run-abc-123",
201201
"--index",
202202
"testsource-2022-01-02t12-13-14",
203-
"s3://test-timdex-bucket/dataset/data/records",
203+
"s3://test-timdex-bucket/dataset",
204204
],
205205
"promote-index-command": ["promote", "--index", "testsource-2022-01-02t12-13-14"],
206206
}
@@ -223,7 +223,7 @@ def test_generate_load_commands_full_with_alias(run_id):
223223
"run-abc-123",
224224
"--index",
225225
"alma-2022-01-02t12-13-14",
226-
"s3://test-timdex-bucket/dataset/data/records",
226+
"s3://test-timdex-bucket/dataset",
227227
],
228228
"promote-index-command": [
229229
"promote",

tests/test_format_input.py

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ def test_lambda_handler_with_next_step_transform_files_present(s3_client, run_ti
6161
"transform-command": [
6262
"--input-file=s3://test-timdex-bucket/testsource/"
6363
"testsource-2022-01-02-daily-extracted-records-to-index.xml",
64-
"--output-location=s3://test-timdex-bucket/dataset/data/records",
64+
"--output-location=s3://test-timdex-bucket/dataset",
6565
"--source=testsource",
6666
"--run-id=run-abc-123",
6767
f"--run-timestamp={run_timestamp}",
@@ -94,7 +94,7 @@ def test_lambda_handler_with_next_step_transform_alma_files_present(run_timestam
9494
"transform-command": [
9595
"--input-file=s3://test-timdex-bucket/alma/"
9696
"alma-2022-09-12-daily-extracted-records-to-delete.xml",
97-
"--output-location=s3://test-timdex-bucket/dataset/data/records",
97+
"--output-location=s3://test-timdex-bucket/dataset",
9898
"--source=alma",
9999
"--run-id=run-abc-123",
100100
f"--run-timestamp={run_timestamp}",
@@ -104,7 +104,7 @@ def test_lambda_handler_with_next_step_transform_alma_files_present(run_timestam
104104
"transform-command": [
105105
"--input-file=s3://test-timdex-bucket/alma/"
106106
"alma-2022-09-12-daily-extracted-records-to-index_01.xml",
107-
"--output-location=s3://test-timdex-bucket/dataset/data/records",
107+
"--output-location=s3://test-timdex-bucket/dataset",
108108
"--source=alma",
109109
"--run-id=run-abc-123",
110110
f"--run-timestamp={run_timestamp}",
@@ -114,7 +114,7 @@ def test_lambda_handler_with_next_step_transform_alma_files_present(run_timestam
114114
"transform-command": [
115115
"--input-file=s3://test-timdex-bucket/alma/"
116116
"alma-2022-09-12-daily-extracted-records-to-index_02.xml",
117-
"--output-location=s3://test-timdex-bucket/dataset/data/records",
117+
"--output-location=s3://test-timdex-bucket/dataset",
118118
"--source=alma",
119119
"--run-id=run-abc-123",
120120
f"--run-timestamp={run_timestamp}",
@@ -160,7 +160,7 @@ def test_lambda_handler_with_next_step_transform_auto_generated_timestamp(s3_cli
160160
"transform-command": [
161161
"--input-file=s3://test-timdex-bucket/testsource/"
162162
"testsource-2022-01-02-daily-extracted-records-to-index.xml",
163-
"--output-location=s3://test-timdex-bucket/dataset/data/records",
163+
"--output-location=s3://test-timdex-bucket/dataset",
164164
"--source=testsource",
165165
"--run-id=run-abc-123",
166166
"--run-timestamp=2025-06-18T12:34:56.789000",
@@ -253,7 +253,7 @@ def test_lambda_handler_with_next_step_load_files_present(s3_client):
253253
"run-abc-123",
254254
"--source",
255255
"testsource",
256-
"s3://test-timdex-bucket/dataset/data/records",
256+
"s3://test-timdex-bucket/dataset",
257257
]
258258
},
259259
}
@@ -280,7 +280,6 @@ def test_lambda_handler_with_next_step_load_no_files_present():
280280
"source": "testsource",
281281
"verbose": False,
282282
"failure": (
283-
"No records were found in the TIMDEX dataset for "
284-
"run_date '2022-01-02', run_id 'run-abc-123'."
283+
"No records were found in the TIMDEX dataset for run_id 'run-abc-123'."
285284
),
286285
}

0 commit comments

Comments
 (0)