Commit eddcc77
Add ability to use wal2json format-version 2
This option removes the need to use `write-in-chunks`.
1 parent 443e67d commit eddcc77
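
The motivation is easiest to see from sample wal2json output. Format 1 wraps every change in a transaction into a single `change` array, so a large transaction yields one huge JSON document unless `write-in-chunks` splits it; format 2 emits one small JSON document per change. A sketch with abridged samples (fields such as `xid`, `timestamp`, and `columntypes` are omitted here):

```python
import json

# wal2json format-version 1: one document per transaction; every change in
# the transaction lands in a single "change" array, hence write-in-chunks.
v1_message = json.loads('''
{"change": [
    {"kind": "insert", "schema": "public", "table": "customers",
     "columnnames": ["id", "name"], "columnvalues": [1, "jane"]}
]}
''')

# wal2json format-version 2: one document per change, tagged with a
# single-letter action; begin/commit arrive as separate "B"/"C" messages.
v2_message = json.loads('''
{"action": "I", "schema": "public", "table": "customers",
 "columns": [{"name": "id", "value": 1}, {"name": "name", "value": "jane"}]}
''')

assert v1_message['change'][0]['kind'] == 'insert'
assert v2_message['action'] == 'I'
```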

6 files changed: +256 -41 lines changed

README.md (+1)

```diff
@@ -69,6 +69,7 @@ Full list of options in `config.json`:
 | use_secondary | Boolean | No | Use a database replica for `INCREMENTAL` and `FULL_TABLE` replication (Default : False) |
 | secondary_host | String | No | PostgreSQL Replica host (required if `use_secondary` is `True`) |
 | secondary_port | Integer | No | PostgreSQL Replica port (required if `use_secondary` is `True`) |
+| wal2json_message_format | Integer | No | Which `wal2json` message format to use (1 or 2). |
 
 
 ### Run the tap in Discovery Mode
```

tap_postgres/__init__.py (+1)

```diff
@@ -407,6 +407,7 @@ def main_impl():
         'break_at_end_lsn': args.config.get('break_at_end_lsn', True),
         'logical_poll_total_seconds': float(args.config.get('logical_poll_total_seconds', 0)),
         'use_secondary': args.config.get('use_secondary', False),
+        'wal2json_message_format': args.config.get('wal2json_message_format', 1)
     }
 
     if conn_config['use_secondary']:
```
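
Together with the README entry above, the option flows straight from `config.json` into `conn_config`. A minimal sketch (the connection values are placeholders; only the last key is new):

```python
import json

# Hypothetical config.json fragment enabling format-version 2.
raw_config = '''
{
    "host": "localhost",
    "port": 5432,
    "user": "replication_user",
    "password": "secret",
    "dbname": "analytics",
    "wal2json_message_format": 2
}
'''

config = json.loads(raw_config)
# Mirrors main_impl() above: when the key is absent, the tap falls back to
# format 1, so existing configs keep working unchanged.
message_format = config.get('wal2json_message_format', 1)
assert message_format in (1, 2)
```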

tap_postgres/sync_strategies/logical_replication.py (+128 -28)

```diff
@@ -377,18 +377,24 @@ def row_to_singer_message(stream, row, version, columns, time_extracted, md_map,
                                 time_extracted=time_extracted)
 
 
-# pylint: disable=unused-argument,too-many-locals
-def consume_message(streams, state, msg, time_extracted, conn_info):
-    # Strip leading comma generated by write-in-chunks and parse valid JSON
-    try:
-        payload = json.loads(msg.payload.lstrip(','))
-    except Exception:
-        return state
+def check_for_new_columns(columns, target_stream, conn_info):
+    diff = set(columns).difference(target_stream['schema']['properties'].keys())
 
-    lsn = msg.data_start
+    if diff:
+        LOGGER.info('Detected new columns "%s", refreshing schema of stream %s', diff, target_stream['stream'])
+        # encountered a column that is not in the schema
+        # refresh the stream schema and metadata by running discovery
+        refresh_streams_schema(conn_info, [target_stream])
 
-    streams_lookup = {s['tap_stream_id']: s for s in streams}
+        # add the automatic properties back to the stream
+        add_automatic_properties(target_stream, conn_info.get('debug_lsn', False))
 
+        # publish new schema
+        sync_common.send_schema_message(target_stream, ['lsn'])
+
+
+# pylint: disable=too-many-locals
+def consume_message_format_1(payload, conn_info, streams_lookup, state, time_extracted, lsn):
     tap_stream_id = post_db.compute_tap_stream_id(payload['schema'], payload['table'])
     if streams_lookup.get(tap_stream_id) is None:
         return state
@@ -400,22 +406,8 @@ def consume_message(streams, state, msg, time_extracted, conn_info):
 
     # Get the additional fields in payload that are not in schema properties:
     # only inserts and updates have the list of columns that can be used to detect any different in columns
-    diff = set()
     if payload['kind'] in {'insert', 'update'}:
-        diff = set(payload['columnnames']).difference(target_stream['schema']['properties'].keys())
-
-    # if there is new columns in the payload that are not in the schema properties then refresh the stream schema
-    if diff:
-        LOGGER.info('Detected new columns "%s", refreshing schema of stream %s', diff, target_stream['stream'])
-        # encountered a column that is not in the schema
-        # refresh the stream schema and metadata by running discovery
-        refresh_streams_schema(conn_info, [target_stream])
-
-        # add the automatic properties back to the stream
-        add_automatic_properties(target_stream, conn_info.get('debug_lsn', False))
-
-        # publish new schema
-        sync_common.send_schema_message(target_stream, ['lsn'])
+        check_for_new_columns(payload['columnnames'], target_stream, conn_info)
 
     stream_version = get_stream_version(target_stream['tap_stream_id'], state)
     stream_md_map = metadata.to_map(target_stream['metadata'])
```
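
The refactor hinges on `check_for_new_columns` taking plain column names, which both formats can supply even though they shape their payloads differently. A sketch of the two shapes (abridged) and the expressions the call sites pass:

```python
# Format 1 carries column names directly on the change...
payload_v1 = {'kind': 'update',
              'columnnames': ['id', 'name', 'added_col'],
              'columnvalues': [1, 'jane', 'x']}
columns_v1 = payload_v1['columnnames']

# ...while format 2 nests them inside per-column dicts.
payload_v2 = {'action': 'U',
              'columns': [{'name': 'id', 'value': 1},
                          {'name': 'name', 'value': 'jane'},
                          {'name': 'added_col', 'value': 'x'}]}
columns_v2 = {column['name'] for column in payload_v2['columns']}

# Either way, check_for_new_columns diffs the names against the stream's
# schema properties and reruns discovery when something new shows up.
assert set(columns_v1) == columns_v2
```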
```diff
@@ -476,6 +468,109 @@ def consume_message(streams, state, msg, time_extracted, conn_info):
     return state
 
 
+def consume_message_format_2(payload, conn_info, streams_lookup, state, time_extracted, lsn):
+    ## Action Types:
+    # I = Insert
+    # U = Update
+    # D = Delete
+    # B = Begin Transaction
+    # C = Commit Transaction
+    # M = Message
+    # T = Truncate
+    action = payload['action']
+    if action not in {'U', 'I', 'D'}:
+        raise UnsupportedPayloadKindError(f"unrecognized replication operation: {action}")
+
+    tap_stream_id = post_db.compute_tap_stream_id(payload['schema'], payload['table'])
+    if streams_lookup.get(tap_stream_id) is not None:
+        target_stream = streams_lookup[tap_stream_id]
+
+        # Get the additional fields in payload that are not in schema properties:
+        # only inserts and updates have the list of columns that can be used to detect any different in columns
+        if payload['action'] in {'I', 'U'}:
+            check_for_new_columns({column['name'] for column in payload['columns']}, target_stream, conn_info)
+
+        stream_version = get_stream_version(target_stream['tap_stream_id'], state)
+        stream_md_map = metadata.to_map(target_stream['metadata'])
+
+        desired_columns = {c for c in target_stream['schema']['properties'].keys() if sync_common.should_sync_column(
+            stream_md_map, c)}
+
+        stream_version = get_stream_version(target_stream['tap_stream_id'], state)
+        stream_md_map = metadata.to_map(target_stream['metadata'])
+
+        desired_columns = [
+            col for col in target_stream['schema']['properties'].keys()
+            if sync_common.should_sync_column(stream_md_map, col)
+        ]
+
+        col_names = []
+        col_vals = []
+        if payload['action'] in ['I', 'U']:
+            for column in payload['columns']:
+                if column['name'] in set(desired_columns):
+                    col_names.append(column['name'])
+                    col_vals.append(column['value'])
+
+            col_names = col_names + ['_sdc_deleted_at']
+            col_vals = col_vals + [None]
+
+            if conn_info.get('debug_lsn'):
+                col_names = col_names + ['_sdc_lsn']
+                col_vals = col_vals + [str(lsn)]
+
+        elif payload['action'] == 'D':
+            for column in payload['identity']:
+                if column['name'] in set(desired_columns):
+                    col_names.append(column['name'])
+                    col_vals.append(column['value'])
+
+            col_names = col_names + ['_sdc_deleted_at']
+            col_vals = col_vals + [singer.utils.strftime(singer.utils.strptime_to_utc(payload['timestamp']))]
+
+            if conn_info.get('debug_lsn'):
+                col_vals = col_vals + [str(lsn)]
+                col_names = col_names + ['_sdc_lsn']
+
+        # Write 1 record to match the API of V1
+        record_message = row_to_singer_message(
+            target_stream,
+            col_vals,
+            stream_version,
+            col_names,
+            time_extracted,
+            stream_md_map,
+            conn_info,
+        )
+
+        singer.write_message(record_message)
+        state = singer.write_bookmark(state, target_stream['tap_stream_id'], 'lsn', lsn)
+
+    return state
+
+
+def consume_message(streams, state, msg, time_extracted, conn_info):
+    # Strip leading comma generated by write-in-chunks and parse valid JSON
+    try:
+        payload = json.loads(msg.payload.lstrip(','))
+    except Exception:
+        return state
+
+    lsn = msg.data_start
+
+    streams_lookup = {s['tap_stream_id']: s for s in streams}
+
+    message_format = conn_info['wal2json_message_format']
+    if message_format == 1:
+        state = consume_message_format_1(payload, conn_info, streams_lookup, state, time_extracted, lsn)
+    elif message_format == 2:
+        state = consume_message_format_2(payload, conn_info, streams_lookup, state, time_extracted, lsn)
+    else:
+        raise Exception(f"Unknown wal2json message format version: {message_format}")
+
+    return state
+
+
 def generate_replication_slot_name(dbname, tap_id=None, prefix='pipelinewise'):
     """Generate replication slot name with
```
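
To make the new delete path concrete: format 2 reports the old row under `identity`, and the tap emits it as a record with `_sdc_deleted_at` set to the commit timestamp. A standalone sketch of that branch (payload abridged; `desired_columns` hard-coded here, where the real code derives it from stream metadata):

```python
import singer.utils

# Sample format-2 delete message (wal2json also includes fields like xid).
payload = {
    'action': 'D',
    'schema': 'public',
    'table': 'customers',
    'timestamp': '2021-01-01 12:00:00.000000+00',
    'identity': [{'name': 'id', 'type': 'integer', 'value': 42}],
}

desired_columns = ['id', 'name']  # assumed selected columns

col_names, col_vals = [], []
for column in payload['identity']:
    if column['name'] in set(desired_columns):
        col_names.append(column['name'])
        col_vals.append(column['value'])

# Deletes become records whose _sdc_deleted_at carries the commit time.
col_names.append('_sdc_deleted_at')
col_vals.append(singer.utils.strftime(singer.utils.strptime_to_utc(payload['timestamp'])))

print(dict(zip(col_names, col_vals)))
# -> {'id': 42, '_sdc_deleted_at': '2021-01-01T12:00:00.000000Z'}
```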
```diff
@@ -591,14 +686,19 @@ def sync_tables(conn_info, logical_streams, state, end_lsn, state_file):
                     int_to_lsn(end_lsn),
                     slot)
         # psycopg2 2.8.4 will send a keep-alive message to postgres every status_interval
+        options = {
+            'add-tables': streams_to_wal2json_tables(logical_streams),
+            'format-version': conn_info['wal2json_message_format'],
+        }
+        if options['format-version'] == 1:
+            options['write-in-chunks'] = 1
+        else:
+            options['actions'] = ['insert', 'update', 'delete']
         cur.start_replication(slot_name=slot,
                               decode=True,
                               start_lsn=start_lsn,
                               status_interval=poll_interval,
-                              options={
-                                  'write-in-chunks': 1,
-                                  'add-tables': streams_to_wal2json_tables(logical_streams)
-                              })
+                              options=options)
 
     except psycopg2.ProgrammingError as ex:
         raise Exception(f"Unable to start replication with logical replication (slot {ex})") from ex
```
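
The branch on `format-version` is the heart of the change: format 1 still needs `write-in-chunks` to keep transaction-sized payloads parseable, while format 2 instead restricts the stream to the actions `consume_message_format_2` understands. Restated as a standalone helper (a sketch; `wal2json_options` is not a function in the tap, and the table list stands in for `streams_to_wal2json_tables`):

```python
def wal2json_options(format_version, tables):
    """Build the wal2json plugin options passed to start_replication."""
    options = {'add-tables': tables, 'format-version': format_version}
    if format_version == 1:
        # one JSON document per transaction: chunk it so the tap can
        # parse it incrementally
        options['write-in-chunks'] = 1
    else:
        # one document per change: no chunking needed, but filter out
        # begin/commit/truncate messages the consumer does not handle
        options['actions'] = ['insert', 'update', 'delete']
    return options

assert 'write-in-chunks' in wal2json_options(1, ['public.customers'])
assert wal2json_options(2, ['public.customers'])['actions'] == ['insert', 'update', 'delete']
```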

tests/test_full_table_interruption.py (+5 -5)

```diff
@@ -48,7 +48,7 @@ def do_not_dump_catalog(catalog):
 tap_postgres.dump_catalog = do_not_dump_catalog
 full_table.UPDATE_BOOKMARK_PERIOD = 1
 
-@pytest.mark.parametrize('use_secondary', [False, True])
+@pytest.mark.parametrize('use_secondary,message_format', [(False, 1), (True, 2)])
 @unittest.mock.patch('psycopg2.connect', wraps=psycopg2.connect)
 class TestLogicalInterruption:
     maxDiff = None
@@ -67,11 +67,11 @@ def setup_method(self):
         global CAUGHT_MESSAGES
         CAUGHT_MESSAGES.clear()
 
-    def test_catalog(self, mock_connect, use_secondary):
+    def test_catalog(self, mock_connect, use_secondary, message_format):
         singer.write_message = singer_write_message_no_cow
         pg_common.write_schema_message = singer_write_message_ok
 
-        conn_config = get_test_connection_config(use_secondary=use_secondary)
+        conn_config = get_test_connection_config(use_secondary=use_secondary, message_format=message_format)
         streams = tap_postgres.do_discovery(conn_config)
 
         # Assert that we connected to the correct database
@@ -115,7 +115,7 @@ def test_catalog(self, mock_connect, use_secondary):
         #the initial phase of cows logical replication will be a full table.
         #it will sync the first record and then blow up on the 2nd record
         try:
-            tap_postgres.do_sync(get_test_connection_config(use_secondary=use_secondary), {'streams' : streams}, None, state)
+            tap_postgres.do_sync(conn_config, {'streams' : streams}, None, state)
         except Exception:
             blew_up_on_cow = True
 
@@ -171,7 +171,7 @@ def test_catalog(self, mock_connect, use_secondary):
         global COW_RECORD_COUNT
         COW_RECORD_COUNT = 0
         CAUGHT_MESSAGES.clear()
-        tap_postgres.do_sync(get_test_connection_config(use_secondary=use_secondary), {'streams' : streams}, None, old_state)
+        tap_postgres.do_sync(conn_config, {'streams' : streams}, None, old_state)
 
         mock_connect.assert_called_with(**expected_connection)
         mock_connect.reset_mock()
```
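
One consequence of the new parametrization is that the suite runs exactly the listed pairs, not a cross-product. A minimal sketch of the same pattern outside the repo's fixtures:

```python
import pytest

# Each tuple becomes one generated test case: test_pairs[False-1] and
# test_pairs[True-2]; (False, 2) and (True, 1) are not exercised.
@pytest.mark.parametrize('use_secondary,message_format', [(False, 1), (True, 2)])
def test_pairs(use_secondary, message_format):
    assert message_format in (1, 2)
```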
