From 7bef2d178808c7de77749d9fffaa36f88eb97713 Mon Sep 17 00:00:00 2001
From: Benjamin Dornel
Date: Sun, 28 Jul 2024 12:55:12 +0800
Subject: [PATCH 01/96] Add force_on_cluster config option

---
 README.md                                     |  12 ++-
 dbt/adapters/clickhouse/impl.py               |   1 +
 dbt/adapters/clickhouse/relation.py           |  14 +--
 .../test_clickhouse_table_materializations.py | 102 ++++++++++++++++++
 4 files changed, 121 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index 09dba132..e1fca5e5 100644
--- a/README.md
+++ b/README.md
@@ -126,7 +126,17 @@ if `cluster` is set in profile, `on_cluster_clause` now will return cluster info
 - Distributed materializations
 - Models with Replicated engines
 
-table and incremental materializations with non-replicated engine will not be affected by `cluster` setting (model would be created on the connected node only).
+By default, table and incremental materializations with non-replicated engines are not affected by the `cluster` setting (the model is created on the connected node only).
+
+To force relations to be created on a cluster regardless of their engine or materialization, use the `force_on_cluster` argument:
+```sql
+{{ config(
+        engine='Null',
+        materialized='materialized_view',
+        force_on_cluster='true'
+    )
+}}
+```
 
 ### Compatibility

diff --git a/dbt/adapters/clickhouse/impl.py b/dbt/adapters/clickhouse/impl.py
index 067c719a..bd9cde62 100644
--- a/dbt/adapters/clickhouse/impl.py
+++ b/dbt/adapters/clickhouse/impl.py
@@ -51,6 +51,7 @@
 @dataclass
 class ClickHouseConfig(AdapterConfig):
     engine: str = 'MergeTree()'
+    force_on_cluster: Optional[bool] = False
     order_by: Optional[Union[List[str], str]] = 'tuple()'
     partition_by: Optional[Union[List[str], str]] = None
     sharding_key: Optional[Union[List[str], str]] = 'rand()'

diff --git a/dbt/adapters/clickhouse/relation.py b/dbt/adapters/clickhouse/relation.py
index 8ad3b39a..87134c5a 100644
--- a/dbt/adapters/clickhouse/relation.py
+++ b/dbt/adapters/clickhouse/relation.py
@@ -112,20 +112,20 @@ def create_from(
         # If the database is set, and the source schema is "defaulted" to the source.name, override the
         # schema with the database instead, since that's presumably what's intended for clickhouse
         schema = relation_config.schema
+
+        cluster = quoting.credentials.cluster or ''
         can_on_cluster = None
         # We placed a hardcoded const (instead of importing it from dbt-core) in order to decouple the packages
         if relation_config.resource_type == NODE_TYPE_SOURCE:
             if schema == relation_config.source_name and relation_config.database:
                 schema = relation_config.database
+            if cluster and str(relation_config.config.get("force_on_cluster")).lower() == "true":
+                can_on_cluster = True
+
         else:
-            cluster = quoting.credentials.cluster if quoting.credentials.cluster else ''
-            materialized = (
-                relation_config.config.materialized if relation_config.config.materialized else ''
-            )
-            engine = (
-                relation_config.config.get('engine') if relation_config.config.get('engine') else ''
-            )
+            materialized = relation_config.config.get('materialized') or ''
+            engine = relation_config.config.get('engine') or ''
             can_on_cluster = cls.get_on_cluster(cluster, materialized, engine)
 
         return cls.create(

diff --git a/tests/integration/adapter/clickhouse/test_clickhouse_table_materializations.py b/tests/integration/adapter/clickhouse/test_clickhouse_table_materializations.py
index ff6e2efb..c75396ff 100644
--- a/tests/integration/adapter/clickhouse/test_clickhouse_table_materializations.py
+++ b/tests/integration/adapter/clickhouse/test_clickhouse_table_materializations.py
@@ -249,3 +249,105 @@ def test_base(self, project):
         assert len(results) == 1
 
         self.assert_total_count_correct(project)
+
+
+class TestMergeTreeForceClusterMaterialization(BaseSimpleMaterializations):
+    '''Test that a MergeTree materialized view is created across a cluster using the
+    `force_on_cluster` config argument
+    '''
+
+    @pytest.fixture(scope="class")
+    def models(self):
+        config_force_on_cluster = """
+        {{ config(
+                engine='MergeTree',
+                materialized='materialized_view',
+                force_on_cluster='true'
+            )
+        }}
+        """
+
+        return {
+            "force_on_cluster.sql": config_force_on_cluster + model_base,
+            "schema.yml": schema_base_yml,
+        }
+
+    @pytest.fixture(scope="class")
+    def seeds(self):
+        return {
+            "schema.yml": base_seeds_schema_yml,
+            "base.csv": seeds_base_csv,
+        }
+
+    def assert_total_count_correct(self, project):
+        '''Check if table is created on cluster'''
+        cluster = project.test_config['cluster']
+
+        # check if data is properly distributed/replicated
+        table_relation = relation_from_name(project.adapter, "force_on_cluster")
+        # ClickHouse cluster in the docker-compose file
+        # under tests/integration is configured with 3 nodes
+        host_count = project.run_sql(
+            f"select count(host_name) as host_count from system.clusters where cluster='{cluster}'",
+            fetch="one",
+        )
+        assert host_count[0] > 1
+
+        table_count = project.run_sql(
+            f"select count() From clusterAllReplicas('{cluster}', system.tables) "
+            f"where database='{table_relation.schema}' and name='{table_relation.identifier}'",
+            fetch="one",
+        )
+
+        assert table_count[0] == 3
+
+        mv_count = project.run_sql(
+            f"select count() From clusterAllReplicas('{cluster}', system.tables) "
+            f"where database='{table_relation.schema}' and name='{table_relation.identifier}_mv'",
+            fetch="one",
+        )
+
+        assert mv_count[0] == 3
+
+    @pytest.mark.skipif(
+        os.environ.get('DBT_CH_TEST_CLUSTER', '').strip() == '', reason='Not on a cluster'
+    )
+    def test_base(self, project):
+        # cluster setting must exist
+        cluster = project.test_config['cluster']
+        assert cluster
+
+        # seed command
+        results = run_dbt(["seed"])
+        # seed result length
+        assert len(results) == 1
+
+        # run command
+        results = run_dbt()
+        # run result length
+        assert len(results) == 1
+
+        # names exist in result nodes
+        check_result_nodes_by_name(results, ["force_on_cluster"])
+
+        # check relation types
+        expected = {
+            "base": "table",
+            "replicated": "table",
+        }
+        check_relation_types(project.adapter, expected)
+
+        relation = relation_from_name(project.adapter, "base")
+        # table rowcount
+        result = project.run_sql(f"select count(*) as num_rows from {relation}", fetch="one")
+        assert result[0] == 10
+
+        # relations_equal
+        self.assert_total_count_correct(project)
+
+        # run full refresh
+        results = run_dbt(['--debug', 'run', '--full-refresh'])
+        # run result length
+        assert len(results) == 1
+
+        self.assert_total_count_correct(project)

From a99ef52321469c7505047b4fc8a2a53857e339bd Mon Sep 17 00:00:00 2001
From: Vladimir Trifonov <3fonov@gmail.com>
Date: Wed, 14 Aug 2024 18:22:01 +0300
Subject: [PATCH 02/96] Fix quotation in split_part macro

It seems that there are extra quotes in the split_part macro: running
the dbt_utils.get_url_parameter code against ClickHouse results in
extra quotes around the field name.
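A minimal sketch of the rendering problem (the column name `url_path` is illustrative, and the exact output depends on the dbt-utils version):

```sql
-- Before this patch the macro wrapped the delimiter in its own quotes, so a
-- pre-quoted delimiter such as "'&'" rendered with doubled quotes:
--   splitByChar(''&'', url_path)[1]   -- fails to parse
-- With this patch the caller controls the quoting, and the SQL renders as:
splitByChar('&', url_path)[1]
```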
--- dbt/include/clickhouse/macros/utils/utils.sql | 128 ++++++++++-------- 1 file changed, 73 insertions(+), 55 deletions(-) diff --git a/dbt/include/clickhouse/macros/utils/utils.sql b/dbt/include/clickhouse/macros/utils/utils.sql index 1bf0ea5f..f3b32c2b 100644 --- a/dbt/include/clickhouse/macros/utils/utils.sql +++ b/dbt/include/clickhouse/macros/utils/utils.sql @@ -1,110 +1,128 @@ {% macro clickhouse__get_test_sql(main_sql, fail_calc, warn_if, error_if, limit) -%} - {% set main_sql_formatted = clickhouse__place_limit(main_sql, limit) if limit !=None else main_sql%} + {% set main_sql_formatted = ( + clickhouse__place_limit(main_sql, limit) + if limit != None + else main_sql + ) %} select - {{ fail_calc }} as failures, - {{ fail_calc }} {{ warn_if }} as should_warn, - {{ fail_calc }} {{ error_if }} as should_error - from ( - {{ main_sql_formatted }} - ) dbt_internal_test + {{ fail_calc }} as failures, + {{ fail_calc }} {{ warn_if }} as should_warn, + {{ fail_calc }} {{ error_if }} as should_error + from ({{ main_sql_formatted }}) dbt_internal_test {%- endmacro %} --- This macro is designed to add a LIMIT clause to a ClickHouse SQL query while preserving any ClickHouse settings specified in the query. +-- This macro is designed to add a LIMIT clause to a ClickHouse SQL query while +-- preserving any ClickHouse settings specified in the query. -- When multiple queries are nested, the limit will be attached to the outer query {% macro clickhouse__place_limit(query, limit) -%} - {% if 'settings' in query.lower()%} - {% if '-- end_of_sql' not in query.lower()%} - {{exceptions.raise_compiler_error("-- end_of_sql must be set when using ClickHouse settings")}} + {% if "settings" in query.lower() %} + {% if "-- end_of_sql" not in query.lower() %} + {{ + exceptions.raise_compiler_error( + "-- end_of_sql must be set when using ClickHouse settings" + ) + }} {% endif %} - {% set split_by_settings_sections = query.split("-- end_of_sql")%} - {% set split_by_settings_sections_with_limit = split_by_settings_sections[-2] + "\n LIMIT " + limit|string + "\n" %} - {% set query_with_limit = "-- end_of_sql".join(split_by_settings_sections[:-2] + [split_by_settings_sections_with_limit, split_by_settings_sections[-1]])%} - {{query_with_limit}} - {% else %} - {{query}} - {{"limit " ~ limit}} + {% set split_by_settings_sections = query.split("-- end_of_sql") %} + {% set split_by_settings_sections_with_limit = ( + split_by_settings_sections[-2] + "\n LIMIT " + limit + | string + "\n" + ) %} + {% set query_with_limit = "-- end_of_sql".join( + split_by_settings_sections[:-2] + + [ + split_by_settings_sections_with_limit, + split_by_settings_sections[-1], + ] + ) %} + {{ query_with_limit }} + {% else %} {{ query }} {{ "limit " ~ limit }} {% endif %} {%- endmacro %} -{% macro clickhouse__any_value(expression) -%} - any({{ expression }}) -{%- endmacro %} +{% macro clickhouse__any_value(expression) -%} any({{ expression }}) {%- endmacro %} -{% macro clickhouse__bool_or(expression) -%} - max({{ expression }}) > 0 -{%- endmacro %} +{% macro clickhouse__bool_or(expression) -%} max({{ expression }}) > 0 {%- endmacro %} {% macro clickhouse__cast_bool_to_text(field) %} - multiIf({{ field }} > 0, 'true', {{ field }} = 0, 'false', NULL) + multiif({{ field }} > 0, 'true', {{ field }} = 0, 'false', null) {% endmacro %} {% macro clickhouse__hash(field) -%} - lower(hex(MD5(toString({{ field }} )))) + lower(hex(md5(tostring({{ field }})))) {%- endmacro %} {%- macro clickhouse__last_day(date, datepart) -%} - {{ dbt.dateadd('day', 
'-1', dbt.dateadd(datepart, '1', dbt.date_trunc(datepart, date)))}} + {{ + dbt.dateadd( + "day", "-1", dbt.dateadd(datepart, "1", dbt.date_trunc(datepart, date)) + ) + }} {%- endmacro -%} {% macro clickhouse__split_part(string_text, delimiter_text, part_number) %} - splitByChar('{{delimiter_text}}', {{ string_text }})[{{ part_number }}] + splitbychar({{ delimiter_text }}, {{ string_text }})[{{ part_number }}] {% endmacro %} {% macro clickhouse__replace(field, old_chars, new_chars) %} - replaceAll({{ field }},'{{ old_chars }}','{{ new_chars }}') + replaceall({{ field }}, '{{ old_chars }}', '{{ new_chars }}') {% endmacro %} {% macro clickhouse__listagg(measure, delimiter_text, order_by_clause, limit_num) -%} - {% if order_by_clause and 'order by' == ' '.join(order_by_clause.split()[:2]).lower() -%} - {% set order_by_clause_tokens = order_by_clause.split() %} - {% if ',' in order_by_clause_tokens %} - {{ exceptions.raise_compiler_error( - 'ClickHouse does not support multiple order by fields.') - }} - {%- endif %} - {% set order_by_clause_tokens = order_by_clause_tokens[2:] %} - {% set sort_direction = '' %} - {% if 'desc' in ''.join(order_by_clause_tokens[1:]).lower() %} - {% set sort_direction = 'Reverse' %} - {% endif %} - {% set order_by_field = order_by_clause_tokens[0] %} - - {% set arr = "arrayMap(x -> x.1, array{}Sort(x -> x.2, arrayZip(array_agg({}), array_agg({}))))".format(sort_direction, measure, order_by_field) %} - {% else -%} - {% set arr = "array_agg({})".format(measure) %} + {% if order_by_clause and "order by" == " ".join( + order_by_clause.split()[:2] + ).lower() -%} + {% set order_by_clause_tokens = order_by_clause.split() %} + {% if "," in order_by_clause_tokens %} + {{ + exceptions.raise_compiler_error( + "ClickHouse does not support multiple order by fields." 
+ ) + }} + {%- endif %} + {% set order_by_clause_tokens = order_by_clause_tokens[2:] %} + {% set sort_direction = "" %} + {% if "desc" in "".join(order_by_clause_tokens[1:]).lower() %} + {% set sort_direction = "Reverse" %} + {% endif %} + {% set order_by_field = order_by_clause_tokens[0] %} + + {% set arr = "arrayMap(x -> x.1, array{}Sort(x -> x.2, arrayZip(array_agg({}), array_agg({}))))".format( + sort_direction, measure, order_by_field + ) %} + {% else -%} {% set arr = "array_agg({})".format(measure) %} {%- endif %} {% if limit_num -%} - arrayStringConcat(arraySlice({{ arr }}, 1, {{ limit_num }}), {{delimiter_text}}) - {% else -%} - arrayStringConcat({{ arr }}, {{delimiter_text}}) + arraystringconcat( + arrayslice({{ arr }}, 1, {{ limit_num }}), {{ delimiter_text }} + ) + {% else -%} arraystringconcat({{ arr }}, {{ delimiter_text }}) {%- endif %} {%- endmacro %} {% macro clickhouse__array_construct(inputs, data_type) -%} - {% if inputs|length > 0 %} - [ {{ inputs|join(' , ') }} ] - {% else %} - emptyArray{{data_type}}() + {% if inputs | length > 0 %}[{{ inputs | join(" , ") }}] + {% else %} emptyarray{{ data_type }} () {% endif %} {%- endmacro %} {% macro clickhouse__array_append(array, new_element) -%} - arrayPushBack({{ array }}, {{ new_element }}) + arraypushback({{ array }}, {{ new_element }}) {% endmacro %} {% macro clickhouse__array_concat(array_1, array_2) -%} - arrayConcat({{ array_1 }}, {{ array_2 }}) + arrayconcat({{ array_1 }}, {{ array_2 }}) {% endmacro %} From 1ec00fec097a3adba08a37961b514513429bd8c2 Mon Sep 17 00:00:00 2001 From: Vladimir Trifonov <3fonov@gmail.com> Date: Thu, 15 Aug 2024 10:13:18 +0300 Subject: [PATCH 03/96] Remove stack trace from errors --- dbt/adapters/clickhouse/httpclient.py | 3 ++- dbt/adapters/clickhouse/nativeclient.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/dbt/adapters/clickhouse/httpclient.py b/dbt/adapters/clickhouse/httpclient.py index e809bca7..f5d81bf8 100644 --- a/dbt/adapters/clickhouse/httpclient.py +++ b/dbt/adapters/clickhouse/httpclient.py @@ -21,7 +21,8 @@ def command(self, sql, **kwargs): try: return self._client.command(sql, **kwargs) except DatabaseError as ex: - raise DbtDatabaseError(str(ex).strip()) from ex + err_msg = str(ex).strip().split("Stack trace")[0] + raise DbtDatabaseError(err_msg) from ex def columns_in_query(self, sql: str, **kwargs) -> List[ClickHouseColumn]: try: diff --git a/dbt/adapters/clickhouse/nativeclient.py b/dbt/adapters/clickhouse/nativeclient.py index d0d7fdd5..8e46e38b 100644 --- a/dbt/adapters/clickhouse/nativeclient.py +++ b/dbt/adapters/clickhouse/nativeclient.py @@ -30,7 +30,8 @@ def command(self, sql, **kwargs): if len(result) and len(result[0]): return result[0][0] except clickhouse_driver.errors.Error as ex: - raise DbtDatabaseError(str(ex).strip()) from ex + err_msg = str(ex).strip().split("Stack trace")[0] + raise DbtDatabaseError(err_msg) from ex def columns_in_query(self, sql: str, **kwargs) -> List[ClickHouseColumn]: try: From 5302194fcf16f71333a75d3722808a07e4224480 Mon Sep 17 00:00:00 2001 From: Vladimir Trifonov <3fonov@gmail.com> Date: Thu, 15 Aug 2024 10:14:07 +0300 Subject: [PATCH 04/96] Revert "Fix quotation in split_part macro" This reverts commit a99ef52321469c7505047b4fc8a2a53857e339bd. 
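The quoting fix itself was arguably sound; judging by the next commit's message, the problem is that the same commit also auto-formatted the file and lowercased function names, which ClickHouse mostly resolves case-sensitively. A minimal illustration (assuming a stock ClickHouse server):

```sql
select toString(42);  -- ok
select tostring(42);  -- UNKNOWN_FUNCTION: names such as toString, multiIf
                      -- and splitByChar must keep their exact casing
```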
--- dbt/include/clickhouse/macros/utils/utils.sql | 128 ++++++++---------- 1 file changed, 55 insertions(+), 73 deletions(-) diff --git a/dbt/include/clickhouse/macros/utils/utils.sql b/dbt/include/clickhouse/macros/utils/utils.sql index f3b32c2b..1bf0ea5f 100644 --- a/dbt/include/clickhouse/macros/utils/utils.sql +++ b/dbt/include/clickhouse/macros/utils/utils.sql @@ -1,128 +1,110 @@ {% macro clickhouse__get_test_sql(main_sql, fail_calc, warn_if, error_if, limit) -%} - {% set main_sql_formatted = ( - clickhouse__place_limit(main_sql, limit) - if limit != None - else main_sql - ) %} + {% set main_sql_formatted = clickhouse__place_limit(main_sql, limit) if limit !=None else main_sql%} select - {{ fail_calc }} as failures, - {{ fail_calc }} {{ warn_if }} as should_warn, - {{ fail_calc }} {{ error_if }} as should_error - from ({{ main_sql_formatted }}) dbt_internal_test + {{ fail_calc }} as failures, + {{ fail_calc }} {{ warn_if }} as should_warn, + {{ fail_calc }} {{ error_if }} as should_error + from ( + {{ main_sql_formatted }} + ) dbt_internal_test {%- endmacro %} --- This macro is designed to add a LIMIT clause to a ClickHouse SQL query while --- preserving any ClickHouse settings specified in the query. +-- This macro is designed to add a LIMIT clause to a ClickHouse SQL query while preserving any ClickHouse settings specified in the query. -- When multiple queries are nested, the limit will be attached to the outer query {% macro clickhouse__place_limit(query, limit) -%} - {% if "settings" in query.lower() %} - {% if "-- end_of_sql" not in query.lower() %} - {{ - exceptions.raise_compiler_error( - "-- end_of_sql must be set when using ClickHouse settings" - ) - }} + {% if 'settings' in query.lower()%} + {% if '-- end_of_sql' not in query.lower()%} + {{exceptions.raise_compiler_error("-- end_of_sql must be set when using ClickHouse settings")}} {% endif %} - {% set split_by_settings_sections = query.split("-- end_of_sql") %} - {% set split_by_settings_sections_with_limit = ( - split_by_settings_sections[-2] + "\n LIMIT " + limit - | string + "\n" - ) %} - {% set query_with_limit = "-- end_of_sql".join( - split_by_settings_sections[:-2] - + [ - split_by_settings_sections_with_limit, - split_by_settings_sections[-1], - ] - ) %} - {{ query_with_limit }} - {% else %} {{ query }} {{ "limit " ~ limit }} + {% set split_by_settings_sections = query.split("-- end_of_sql")%} + {% set split_by_settings_sections_with_limit = split_by_settings_sections[-2] + "\n LIMIT " + limit|string + "\n" %} + {% set query_with_limit = "-- end_of_sql".join(split_by_settings_sections[:-2] + [split_by_settings_sections_with_limit, split_by_settings_sections[-1]])%} + {{query_with_limit}} + {% else %} + {{query}} + {{"limit " ~ limit}} {% endif %} {%- endmacro %} -{% macro clickhouse__any_value(expression) -%} any({{ expression }}) {%- endmacro %} +{% macro clickhouse__any_value(expression) -%} + any({{ expression }}) +{%- endmacro %} -{% macro clickhouse__bool_or(expression) -%} max({{ expression }}) > 0 {%- endmacro %} +{% macro clickhouse__bool_or(expression) -%} + max({{ expression }}) > 0 +{%- endmacro %} {% macro clickhouse__cast_bool_to_text(field) %} - multiif({{ field }} > 0, 'true', {{ field }} = 0, 'false', null) + multiIf({{ field }} > 0, 'true', {{ field }} = 0, 'false', NULL) {% endmacro %} {% macro clickhouse__hash(field) -%} - lower(hex(md5(tostring({{ field }})))) + lower(hex(MD5(toString({{ field }} )))) {%- endmacro %} {%- macro clickhouse__last_day(date, datepart) -%} - {{ - dbt.dateadd( - "day", 
"-1", dbt.dateadd(datepart, "1", dbt.date_trunc(datepart, date)) - ) - }} + {{ dbt.dateadd('day', '-1', dbt.dateadd(datepart, '1', dbt.date_trunc(datepart, date)))}} {%- endmacro -%} {% macro clickhouse__split_part(string_text, delimiter_text, part_number) %} - splitbychar({{ delimiter_text }}, {{ string_text }})[{{ part_number }}] + splitByChar('{{delimiter_text}}', {{ string_text }})[{{ part_number }}] {% endmacro %} {% macro clickhouse__replace(field, old_chars, new_chars) %} - replaceall({{ field }}, '{{ old_chars }}', '{{ new_chars }}') + replaceAll({{ field }},'{{ old_chars }}','{{ new_chars }}') {% endmacro %} {% macro clickhouse__listagg(measure, delimiter_text, order_by_clause, limit_num) -%} - {% if order_by_clause and "order by" == " ".join( - order_by_clause.split()[:2] - ).lower() -%} - {% set order_by_clause_tokens = order_by_clause.split() %} - {% if "," in order_by_clause_tokens %} - {{ - exceptions.raise_compiler_error( - "ClickHouse does not support multiple order by fields." - ) - }} - {%- endif %} - {% set order_by_clause_tokens = order_by_clause_tokens[2:] %} - {% set sort_direction = "" %} - {% if "desc" in "".join(order_by_clause_tokens[1:]).lower() %} - {% set sort_direction = "Reverse" %} - {% endif %} - {% set order_by_field = order_by_clause_tokens[0] %} - - {% set arr = "arrayMap(x -> x.1, array{}Sort(x -> x.2, arrayZip(array_agg({}), array_agg({}))))".format( - sort_direction, measure, order_by_field - ) %} - {% else -%} {% set arr = "array_agg({})".format(measure) %} + {% if order_by_clause and 'order by' == ' '.join(order_by_clause.split()[:2]).lower() -%} + {% set order_by_clause_tokens = order_by_clause.split() %} + {% if ',' in order_by_clause_tokens %} + {{ exceptions.raise_compiler_error( + 'ClickHouse does not support multiple order by fields.') + }} + {%- endif %} + {% set order_by_clause_tokens = order_by_clause_tokens[2:] %} + {% set sort_direction = '' %} + {% if 'desc' in ''.join(order_by_clause_tokens[1:]).lower() %} + {% set sort_direction = 'Reverse' %} + {% endif %} + {% set order_by_field = order_by_clause_tokens[0] %} + + {% set arr = "arrayMap(x -> x.1, array{}Sort(x -> x.2, arrayZip(array_agg({}), array_agg({}))))".format(sort_direction, measure, order_by_field) %} + {% else -%} + {% set arr = "array_agg({})".format(measure) %} {%- endif %} {% if limit_num -%} - arraystringconcat( - arrayslice({{ arr }}, 1, {{ limit_num }}), {{ delimiter_text }} - ) - {% else -%} arraystringconcat({{ arr }}, {{ delimiter_text }}) + arrayStringConcat(arraySlice({{ arr }}, 1, {{ limit_num }}), {{delimiter_text}}) + {% else -%} + arrayStringConcat({{ arr }}, {{delimiter_text}}) {%- endif %} {%- endmacro %} {% macro clickhouse__array_construct(inputs, data_type) -%} - {% if inputs | length > 0 %}[{{ inputs | join(" , ") }}] - {% else %} emptyarray{{ data_type }} () + {% if inputs|length > 0 %} + [ {{ inputs|join(' , ') }} ] + {% else %} + emptyArray{{data_type}}() {% endif %} {%- endmacro %} {% macro clickhouse__array_append(array, new_element) -%} - arraypushback({{ array }}, {{ new_element }}) + arrayPushBack({{ array }}, {{ new_element }}) {% endmacro %} {% macro clickhouse__array_concat(array_1, array_2) -%} - arrayconcat({{ array_1 }}, {{ array_2 }}) + arrayConcat({{ array_1 }}, {{ array_2 }}) {% endmacro %} From 9281cf36342e3a896c2b542564a0462bf0b2bfa2 Mon Sep 17 00:00:00 2001 From: Vladimir Trifonov <3fonov@gmail.com> Date: Thu, 15 Aug 2024 10:20:56 +0300 Subject: [PATCH 05/96] Fix letters case Previous commit changed all sql to lowercase. Sorry. Fixed. 
--- dbt/include/clickhouse/macros/utils/utils.sql | 124 ++++++++++-------- 1 file changed, 71 insertions(+), 53 deletions(-) diff --git a/dbt/include/clickhouse/macros/utils/utils.sql b/dbt/include/clickhouse/macros/utils/utils.sql index 1bf0ea5f..b43bcc6a 100644 --- a/dbt/include/clickhouse/macros/utils/utils.sql +++ b/dbt/include/clickhouse/macros/utils/utils.sql @@ -1,41 +1,51 @@ {% macro clickhouse__get_test_sql(main_sql, fail_calc, warn_if, error_if, limit) -%} - {% set main_sql_formatted = clickhouse__place_limit(main_sql, limit) if limit !=None else main_sql%} + {% set main_sql_formatted = ( + clickhouse__place_limit(main_sql, limit) + if limit != None + else main_sql + ) %} select - {{ fail_calc }} as failures, - {{ fail_calc }} {{ warn_if }} as should_warn, - {{ fail_calc }} {{ error_if }} as should_error - from ( - {{ main_sql_formatted }} - ) dbt_internal_test + {{ fail_calc }} as failures, + {{ fail_calc }} {{ warn_if }} as should_warn, + {{ fail_calc }} {{ error_if }} as should_error + from ({{ main_sql_formatted }}) dbt_internal_test {%- endmacro %} --- This macro is designed to add a LIMIT clause to a ClickHouse SQL query while preserving any ClickHouse settings specified in the query. +-- This macro is designed to add a LIMIT clause to a ClickHouse SQL query while +-- preserving any ClickHouse settings specified in the query. -- When multiple queries are nested, the limit will be attached to the outer query {% macro clickhouse__place_limit(query, limit) -%} - {% if 'settings' in query.lower()%} - {% if '-- end_of_sql' not in query.lower()%} - {{exceptions.raise_compiler_error("-- end_of_sql must be set when using ClickHouse settings")}} + {% if "settings" in query.lower() %} + {% if "-- end_of_sql" not in query.lower() %} + {{ + exceptions.raise_compiler_error( + "-- end_of_sql must be set when using ClickHouse settings" + ) + }} {% endif %} - {% set split_by_settings_sections = query.split("-- end_of_sql")%} - {% set split_by_settings_sections_with_limit = split_by_settings_sections[-2] + "\n LIMIT " + limit|string + "\n" %} - {% set query_with_limit = "-- end_of_sql".join(split_by_settings_sections[:-2] + [split_by_settings_sections_with_limit, split_by_settings_sections[-1]])%} - {{query_with_limit}} - {% else %} - {{query}} - {{"limit " ~ limit}} + {% set split_by_settings_sections = query.split("-- end_of_sql") %} + {% set split_by_settings_sections_with_limit = ( + split_by_settings_sections[-2] + "\n LIMIT " + limit + | string + "\n" + ) %} + {% set query_with_limit = "-- end_of_sql".join( + split_by_settings_sections[:-2] + + [ + split_by_settings_sections_with_limit, + split_by_settings_sections[-1], + ] + ) %} + {{ query_with_limit }} + {% else %} {{ query }} {{ "limit " ~ limit }} {% endif %} {%- endmacro %} -{% macro clickhouse__any_value(expression) -%} - any({{ expression }}) -{%- endmacro %} +{% macro clickhouse__any_value(expression) -%} any({{ expression }}) {%- endmacro %} -{% macro clickhouse__bool_or(expression) -%} - max({{ expression }}) > 0 -{%- endmacro %} +{% macro clickhouse__bool_or(expression) -%} max({{ expression }}) > 0 {%- endmacro %} {% macro clickhouse__cast_bool_to_text(field) %} @@ -44,58 +54,66 @@ {% macro clickhouse__hash(field) -%} - lower(hex(MD5(toString({{ field }} )))) + lower(hex(MD5(toString({{ field }})))) {%- endmacro %} {%- macro clickhouse__last_day(date, datepart) -%} - {{ dbt.dateadd('day', '-1', dbt.dateadd(datepart, '1', dbt.date_trunc(datepart, date)))}} + {{ + dbt.dateadd( + "day", "-1", dbt.dateadd(datepart, "1", 
dbt.date_trunc(datepart, date))
+        )
+    }}
 {%- endmacro -%}
 
 
 {% macro clickhouse__split_part(string_text, delimiter_text, part_number) %}
-    splitByChar('{{delimiter_text}}', {{ string_text }})[{{ part_number }}]
+    splitByChar({{ delimiter_text }}, {{ string_text }})[{{ part_number }}]
 {% endmacro %}
 
 
 {% macro clickhouse__replace(field, old_chars, new_chars) %}
-    replaceAll({{ field }},'{{ old_chars }}','{{ new_chars }}')
+    replaceAll({{ field }}, '{{ old_chars }}', '{{ new_chars }}')
 {% endmacro %}
 
 
 {% macro clickhouse__listagg(measure, delimiter_text, order_by_clause, limit_num) -%}
-    {% if order_by_clause and 'order by' == ' '.join(order_by_clause.split()[:2]).lower() -%}
-        {% set order_by_clause_tokens = order_by_clause.split() %}
-        {% if ',' in order_by_clause_tokens %}
-            {{ exceptions.raise_compiler_error(
-                'ClickHouse does not support multiple order by fields.')
-            }}
-        {%- endif %}
-        {% set order_by_clause_tokens = order_by_clause_tokens[2:] %}
-        {% set sort_direction = '' %}
-        {% if 'desc' in ''.join(order_by_clause_tokens[1:]).lower() %}
-            {% set sort_direction = 'Reverse' %}
-        {% endif %}
-        {% set order_by_field = order_by_clause_tokens[0] %}
-
-        {% set arr = "arrayMap(x -> x.1, array{}Sort(x -> x.2, arrayZip(array_agg({}), array_agg({}))))".format(sort_direction, measure, order_by_field) %}
-    {% else -%}
-        {% set arr = "array_agg({})".format(measure) %}
+    {% if order_by_clause and "order by" == " ".join(
+        order_by_clause.split()[:2]
+    ).lower() -%}
+        {% set order_by_clause_tokens = order_by_clause.split() %}
+        {% if "," in order_by_clause_tokens %}
+            {{
+                exceptions.raise_compiler_error(
+                    "ClickHouse does not support multiple order by fields."
+                )
+            }}
+        {%- endif %}
+        {% set order_by_clause_tokens = order_by_clause_tokens[2:] %}
+        {% set sort_direction = "" %}
+        {% if "desc" in "".join(order_by_clause_tokens[1:]).lower() %}
+            {% set sort_direction = "Reverse" %}
+        {% endif %}
+        {% set order_by_field = order_by_clause_tokens[0] %}
+
+        {% set arr = "arrayMap(x -> x.1, array{}Sort(x -> x.2, arrayZip(array_agg({}), array_agg({}))))".format(
+            sort_direction, measure, order_by_field
+        ) %}
+    {% else -%} {% set arr = "array_agg({})".format(measure) %}
     {%- endif %}
 
     {% if limit_num -%}
-        arrayStringConcat(arraySlice({{ arr }}, 1, {{ limit_num }}), {{delimiter_text}})
-    {% else -%}
-        arrayStringConcat({{ arr }}, {{delimiter_text}})
+        arrayStringConcat(
+            arraySlice({{ arr }}, 1, {{ limit_num }}), {{ delimiter_text }}
+        )
+    {% else -%} arrayStringConcat({{ arr }}, {{ delimiter_text }})
     {%- endif %}
 {%- endmacro %}
 
 
 {% macro clickhouse__array_construct(inputs, data_type) -%}
-    {% if inputs|length > 0 %}
-        [ {{ inputs|join(' , ') }} ]
-    {% else %}
-        emptyArray{{data_type}}()
+    {% if inputs | length > 0 %}[{{ inputs | join(" , ") }}]
+    {% else %} emptyArray{{ data_type }} ()
     {% endif %}
 {%- endmacro %}
 
@@ -106,5 +124,5 @@
 
 
 {% macro clickhouse__array_concat(array_1, array_2) -%}
-  arrayConcat({{ array_1 }}, {{ array_2 }})
+    arrayConcat({{ array_1 }}, {{ array_2 }})
 {% endmacro %}

From 678c8b60370e60b0f74f02de298e28e3c1558694 Mon Sep 17 00:00:00 2001
From: Vladimir Trifonov <3fonov@gmail.com>
Date: Thu, 15 Aug 2024 10:31:04 +0300
Subject: [PATCH 06/96] Change to splitByString

Changed from splitByChar to splitByString because sometimes you need
more than one character to split on.
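A short illustration of the difference (assuming a stock ClickHouse server; note that ClickHouse arrays are 1-indexed):

```sql
select splitByChar('::', 'a::b::c');    -- throws: delimiter must be exactly one character
select splitByString('::', 'a::b::c');  -- ['a', 'b', 'c']
select splitByString('|', 'a|b|c')[2];  -- 'b'
```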
--- dbt/include/clickhouse/macros/utils/utils.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbt/include/clickhouse/macros/utils/utils.sql b/dbt/include/clickhouse/macros/utils/utils.sql index b43bcc6a..e4978180 100644 --- a/dbt/include/clickhouse/macros/utils/utils.sql +++ b/dbt/include/clickhouse/macros/utils/utils.sql @@ -68,7 +68,7 @@ {% macro clickhouse__split_part(string_text, delimiter_text, part_number) %} - splitByChar({{ delimiter_text }}, {{ string_text }})[{{ part_number }}] + splitByString({{ delimiter_text }}, {{ string_text }})[{{ part_number }}] {% endmacro %} From 5d119a46a96803ccc4f34dbae39d76490e87cabe Mon Sep 17 00:00:00 2001 From: Vladimir Trifonov <3fonov@gmail.com> Date: Wed, 21 Aug 2024 17:53:07 +0300 Subject: [PATCH 07/96] Fix formatting --- dbt/include/clickhouse/macros/utils/utils.sql | 125 ++++++++---------- 1 file changed, 54 insertions(+), 71 deletions(-) diff --git a/dbt/include/clickhouse/macros/utils/utils.sql b/dbt/include/clickhouse/macros/utils/utils.sql index e4978180..a07a6137 100644 --- a/dbt/include/clickhouse/macros/utils/utils.sql +++ b/dbt/include/clickhouse/macros/utils/utils.sql @@ -1,51 +1,41 @@ {% macro clickhouse__get_test_sql(main_sql, fail_calc, warn_if, error_if, limit) -%} - {% set main_sql_formatted = ( - clickhouse__place_limit(main_sql, limit) - if limit != None - else main_sql - ) %} + {% set main_sql_formatted = clickhouse__place_limit(main_sql, limit) if limit !=None else main_sql%} select - {{ fail_calc }} as failures, - {{ fail_calc }} {{ warn_if }} as should_warn, - {{ fail_calc }} {{ error_if }} as should_error - from ({{ main_sql_formatted }}) dbt_internal_test + {{ fail_calc }} as failures, + {{ fail_calc }} {{ warn_if }} as should_warn, + {{ fail_calc }} {{ error_if }} as should_error + from ( + {{ main_sql_formatted }} + ) dbt_internal_test {%- endmacro %} --- This macro is designed to add a LIMIT clause to a ClickHouse SQL query while --- preserving any ClickHouse settings specified in the query. +-- This macro is designed to add a LIMIT clause to a ClickHouse SQL query while preserving any ClickHouse settings specified in the query. 
-- When multiple queries are nested, the limit will be attached to the outer query {% macro clickhouse__place_limit(query, limit) -%} - {% if "settings" in query.lower() %} - {% if "-- end_of_sql" not in query.lower() %} - {{ - exceptions.raise_compiler_error( - "-- end_of_sql must be set when using ClickHouse settings" - ) - }} + {% if 'settings' in query.lower()%} + {% if '-- end_of_sql' not in query.lower()%} + {{exceptions.raise_compiler_error("-- end_of_sql must be set when using ClickHouse settings")}} {% endif %} - {% set split_by_settings_sections = query.split("-- end_of_sql") %} - {% set split_by_settings_sections_with_limit = ( - split_by_settings_sections[-2] + "\n LIMIT " + limit - | string + "\n" - ) %} - {% set query_with_limit = "-- end_of_sql".join( - split_by_settings_sections[:-2] - + [ - split_by_settings_sections_with_limit, - split_by_settings_sections[-1], - ] - ) %} - {{ query_with_limit }} - {% else %} {{ query }} {{ "limit " ~ limit }} + {% set split_by_settings_sections = query.split("-- end_of_sql")%} + {% set split_by_settings_sections_with_limit = split_by_settings_sections[-2] + "\n LIMIT " + limit|string + "\n" %} + {% set query_with_limit = "-- end_of_sql".join(split_by_settings_sections[:-2] + [split_by_settings_sections_with_limit, split_by_settings_sections[-1]])%} + {{query_with_limit}} + {% else %} + {{query}} + {{"limit " ~ limit}} {% endif %} {%- endmacro %} -{% macro clickhouse__any_value(expression) -%} any({{ expression }}) {%- endmacro %} +{% macro clickhouse__any_value(expression) -%} + any({{ expression }}) +{%- endmacro %} -{% macro clickhouse__bool_or(expression) -%} max({{ expression }}) > 0 {%- endmacro %} +{% macro clickhouse__bool_or(expression) -%} + max({{ expression }}) > 0 +{%- endmacro %} {% macro clickhouse__cast_bool_to_text(field) %} @@ -54,66 +44,58 @@ {% macro clickhouse__hash(field) -%} - lower(hex(MD5(toString({{ field }})))) + lower(hex(MD5(toString({{ field }} )))) {%- endmacro %} {%- macro clickhouse__last_day(date, datepart) -%} - {{ - dbt.dateadd( - "day", "-1", dbt.dateadd(datepart, "1", dbt.date_trunc(datepart, date)) - ) - }} + {{ dbt.dateadd('day', '-1', dbt.dateadd(datepart, '1', dbt.date_trunc(datepart, date)))}} {%- endmacro -%} {% macro clickhouse__split_part(string_text, delimiter_text, part_number) %} - splitByString({{ delimiter_text }}, {{ string_text }})[{{ part_number }}] + splitByString({{delimiter_text}}, {{ string_text }})[{{ part_number }}] {% endmacro %} {% macro clickhouse__replace(field, old_chars, new_chars) %} - replaceAll({{ field }}, '{{ old_chars }}', '{{ new_chars }}') + replaceAll({{ field }},'{{ old_chars }}','{{ new_chars }}') {% endmacro %} {% macro clickhouse__listagg(measure, delimiter_text, order_by_clause, limit_num) -%} - {% if order_by_clause and "order by" == " ".join( - order_by_clause.split()[:2] - ).lower() -%} - {% set order_by_clause_tokens = order_by_clause.split() %} - {% if "," in order_by_clause_tokens %} - {{ - exceptions.raise_compiler_error( - "ClickHouse does not support multiple order by fields." 
- ) - }} - {%- endif %} - {% set order_by_clause_tokens = order_by_clause_tokens[2:] %} - {% set sort_direction = "" %} - {% if "desc" in "".join(order_by_clause_tokens[1:]).lower() %} - {% set sort_direction = "Reverse" %} - {% endif %} - {% set order_by_field = order_by_clause_tokens[0] %} - - {% set arr = "arrayMap(x -> x.1, array{}Sort(x -> x.2, arrayZip(array_agg({}), array_agg({}))))".format( - sort_direction, measure, order_by_field - ) %} - {% else -%} {% set arr = "array_agg({})".format(measure) %} + {% if order_by_clause and 'order by' == ' '.join(order_by_clause.split()[:2]).lower() -%} + {% set order_by_clause_tokens = order_by_clause.split() %} + {% if ',' in order_by_clause_tokens %} + {{ exceptions.raise_compiler_error( + 'ClickHouse does not support multiple order by fields.') + }} + {%- endif %} + {% set order_by_clause_tokens = order_by_clause_tokens[2:] %} + {% set sort_direction = '' %} + {% if 'desc' in ''.join(order_by_clause_tokens[1:]).lower() %} + {% set sort_direction = 'Reverse' %} + {% endif %} + {% set order_by_field = order_by_clause_tokens[0] %} + + {% set arr = "arrayMap(x -> x.1, array{}Sort(x -> x.2, arrayZip(array_agg({}), array_agg({}))))".format(sort_direction, measure, order_by_field) %} + {% else -%} + {% set arr = "array_agg({})".format(measure) %} {%- endif %} {% if limit_num -%} - arrayStringConcat( - arraySlice({{ arr }}, 1, {{ limit_num }}), {{ delimiter_text }} - ) - {% else -%} arrayStringConcat({{ arr }}, {{ delimiter_text }}) + arrayStringConcat(arraySlice({{ arr }}, 1, {{ limit_num }}), {{delimiter_text}}) + {% else -%} + arrayStringConcat({{ arr }}, {{delimiter_text}}) {%- endif %} {%- endmacro %} {% macro clickhouse__array_construct(inputs, data_type) -%} - {% if inputs | length > 0 %}[{{ inputs | join(" , ") }}] - {% else %} emptyArray{{ data_type }} () + {% if inputs|length > 0 %} + [ {{ inputs|join(' , ') }} ] + {% else %} + emptyArray{{data_type}}() {% endif %} {%- endmacro %} @@ -124,5 +106,6 @@ {% macro clickhouse__array_concat(array_1, array_2) -%} - arrayConcat({{ array_1 }}, {{ array_2 }}) + arrayConcat({{ array_1 }}, {{ array_2 }}) {% endmacro %} + From efec99795cefbf878db910b339fbfa82cba323e4 Mon Sep 17 00:00:00 2001 From: Vladimir Trifonov <3fonov@gmail.com> Date: Fri, 23 Aug 2024 16:58:00 +0300 Subject: [PATCH 08/96] Fix quotation in split_part test --- tests/integration/adapter/utils/test_split_part.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/integration/adapter/utils/test_split_part.py b/tests/integration/adapter/utils/test_split_part.py index 81e1afaa..95677075 100644 --- a/tests/integration/adapter/utils/test_split_part.py +++ b/tests/integration/adapter/utils/test_split_part.py @@ -10,7 +10,7 @@ ) select - {{ split_part('parts', '|', 1) }} as actual, + {{ split_part('parts', "'|'", 1) }} as actual, result_1 as expected from data @@ -18,7 +18,7 @@ union all select - {{ split_part('parts', '|', 2) }} as actual, + {{ split_part('parts', "'|'", 2) }} as actual, result_2 as expected from data @@ -26,7 +26,7 @@ union all select - {{ split_part('parts', '|', 3) }} as actual, + {{ split_part('parts', "'|'", 3) }} as actual, result_3 as expected from data From 6e155e8d52c1c37f7b0d82dcadf51406b3d4f757 Mon Sep 17 00:00:00 2001 From: Robin Norgren <68205730+rjoelnorgren@users.noreply.github.com> Date: Tue, 24 Sep 2024 14:01:03 -0700 Subject: [PATCH 09/96] feat: support range_hashed and complex_key_range_hashed dictionary materialization layout --- 
dbt/include/clickhouse/macros/materializations/dictionary.sql | 3 +++ 1 file changed, 3 insertions(+) diff --git a/dbt/include/clickhouse/macros/materializations/dictionary.sql b/dbt/include/clickhouse/macros/materializations/dictionary.sql index 486ff0b0..80fd7982 100644 --- a/dbt/include/clickhouse/macros/materializations/dictionary.sql +++ b/dbt/include/clickhouse/macros/materializations/dictionary.sql @@ -71,6 +71,9 @@ ) LAYOUT({{ config.get('layout') }}) LIFETIME({{ config.get('lifetime') }}) + {%- if config.get('range') %} + RANGE({{ config.get('range') }}) + {%- endif %} {% endmacro %} From 7a84e51c1a4eaee1c4cf2969f86323c5639de008 Mon Sep 17 00:00:00 2001 From: alexsubota Date: Wed, 25 Sep 2024 11:57:33 +0300 Subject: [PATCH 10/96] Apply query settings for empty model --- dbt/include/clickhouse/macros/materializations/table.sql | 1 + 1 file changed, 1 insertion(+) diff --git a/dbt/include/clickhouse/macros/materializations/table.sql b/dbt/include/clickhouse/macros/materializations/table.sql index 1ef0e8b5..a7601552 100644 --- a/dbt/include/clickhouse/macros/materializations/table.sql +++ b/dbt/include/clickhouse/macros/materializations/table.sql @@ -203,6 +203,7 @@ {{ sql }} ) {%- endif %} + {{ adapter.get_model_query_settings(model) }} {%- endif %} {%- endmacro %} From d992b9f39a4c75a056fa8236a3d10c02bc9550a1 Mon Sep 17 00:00:00 2001 From: alexsubota Date: Thu, 26 Sep 2024 13:26:11 +0300 Subject: [PATCH 11/96] Create test_query_settings.py --- .../query_settings/test_query_settings.py | 60 +++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100644 tests/integration/adapter/query_settings/test_query_settings.py diff --git a/tests/integration/adapter/query_settings/test_query_settings.py b/tests/integration/adapter/query_settings/test_query_settings.py new file mode 100644 index 00000000..43fb6c39 --- /dev/null +++ b/tests/integration/adapter/query_settings/test_query_settings.py @@ -0,0 +1,60 @@ +import pytest +from dbt.tests.util import run_dbt + +nullable_column_model = """ +{{ + config( + materialized='table', + query_settings={ + 'join_use_nulls': 1 + } + ) +}} +select t2.id as test_id +from (select 1 as id) t1 + left join (select 2 as id) t2 +on t1.id=t2.id +""" + + +class TestNullableColumnJoin: + @pytest.fixture(scope="class") + def models(self): + return { + "nullable_column_model.sql": nullable_column_model, + } + + def test_nullable_column_join(self, project): + run_dbt(["run", "--select", "nullable_column_model"]) + result = project.run_sql("select isNullable(test_id) as is_nullable_column from nullable_column_model", fetch="one") + assert result[0] == 1 + + + +not_nullable_column_model = """ +{{ + config( + materialized='table', + query_settings={ + 'join_use_nulls': 0 + } + ) +}} +select t2.id as test_id +from (select 1 as id) t1 + left join (select 2 as id) t2 +on t1.id=t2.id +""" + + +class TestNotNullableColumnJoin: + @pytest.fixture(scope="class") + def models(self): + return { + "not_nullable_column_model.sql": not_nullable_column_model, + } + + def test_nullable_column_join(self, project): + run_dbt(["run", "--select", "not_nullable_column_model"]) + result = project.run_sql("select isNullable(test_id) as is_nullable_column from not_nullable_column_model", fetch="one") + assert result[0] == 0 From 52cbf4ff37a6990647dfc568078674d92a32ec5d Mon Sep 17 00:00:00 2001 From: alexsubota Date: Fri, 27 Sep 2024 09:22:04 +0300 Subject: [PATCH 12/96] fix black test_query_settings.py --- .../adapter/query_settings/test_query_settings.py | 11 ++++++++--- 1 file 
changed, 8 insertions(+), 3 deletions(-) diff --git a/tests/integration/adapter/query_settings/test_query_settings.py b/tests/integration/adapter/query_settings/test_query_settings.py index 43fb6c39..83ef370c 100644 --- a/tests/integration/adapter/query_settings/test_query_settings.py +++ b/tests/integration/adapter/query_settings/test_query_settings.py @@ -26,11 +26,13 @@ def models(self): def test_nullable_column_join(self, project): run_dbt(["run", "--select", "nullable_column_model"]) - result = project.run_sql("select isNullable(test_id) as is_nullable_column from nullable_column_model", fetch="one") + result = project.run_sql( + "select isNullable(test_id) as is_nullable_column from nullable_column_model", + fetch="one", + ) assert result[0] == 1 - not_nullable_column_model = """ {{ config( @@ -56,5 +58,8 @@ def models(self): def test_nullable_column_join(self, project): run_dbt(["run", "--select", "not_nullable_column_model"]) - result = project.run_sql("select isNullable(test_id) as is_nullable_column from not_nullable_column_model", fetch="one") + result = project.run_sql( + "select isNullable(test_id) as is_nullable_column from not_nullable_column_model", + fetch="one", + ) assert result[0] == 0 From 58f7fa97a846dd90e3b6906ce68013c2e664e9ae Mon Sep 17 00:00:00 2001 From: Can Bekleyici Date: Tue, 1 Oct 2024 15:36:09 +0200 Subject: [PATCH 13/96] don't drop local tables when there is no distributed table --- .../incremental/distributed_incremental.sql | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/dbt/include/clickhouse/macros/materializations/incremental/distributed_incremental.sql b/dbt/include/clickhouse/macros/materializations/incremental/distributed_incremental.sql index ef31a76c..26676ed1 100644 --- a/dbt/include/clickhouse/macros/materializations/incremental/distributed_incremental.sql +++ b/dbt/include/clickhouse/macros/materializations/incremental/distributed_incremental.sql @@ -15,7 +15,7 @@ {% do exceptions.raise_compiler_error('To use distributed materializations cluster setting in dbt profile must be set') %} {% endif %} - {% set existing_relation_local = existing_relation.incorporate(path={"identifier": this.identifier + local_suffix, "schema": local_db_prefix + this.schema}) if existing_relation is not none else none %} + {% set existing_relation_local = load_cached_relation(this.incorporate(path={"identifier": this.identifier + local_suffix, "schema": local_db_prefix + this.schema})) %} {% set target_relation_local = target_relation.incorporate(path={"identifier": this.identifier + local_suffix, "schema": local_db_prefix + this.schema}) if target_relation is not none else none %} {%- set unique_key = config.get('unique_key') -%} @@ -55,8 +55,8 @@ {{ create_view_as(view_relation, sql) }} {% endcall %} - {% if existing_relation is none %} - -- No existing table, simply create a new one + {% if existing_relation_local is none %} + -- No existing local table, recreate local and distributed tables {{ create_distributed_local_table(target_relation, target_relation_local, view_relation, sql) }} {% elif full_refresh_mode %} @@ -74,6 +74,11 @@ {% endcall %} {% else %} + {% if existing_relation is none %} + {% do run_query(create_distributed_table(target_relation, target_relation_local)) %} + {% set existing_relation = target_relation %} + {% endif %} + {% set incremental_strategy = adapter.calculate_incremental_strategy(config.get('incremental_strategy')) %} {% set incremental_predicates = config.get('predicates', none) or 
config.get('incremental_predicates', none) %} {%- if on_schema_change != 'ignore' %} From 01c2e9f8585e9ab9bb8d5b6096f5671d40522323 Mon Sep 17 00:00:00 2001 From: Shachi Bista Date: Fri, 11 Oct 2024 18:14:12 +0200 Subject: [PATCH 14/96] fix: Filter settings based on Engine --- CHANGELOG.md | 6 +- dbt/adapters/clickhouse/impl.py | 184 +++++++++++++++++- .../materializations/distributed_table.sql | 2 +- .../macros/materializations/table.sql | 4 +- .../macros/materializations/view.sql | 2 +- 5 files changed, 192 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index fe3d4536..6663e5b7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +### Release [x.x.x] +### Improvements +* Ignores incompatible settings based on the configured Engine. + ### Release [1.8.4], 2024-09-17 ### Improvement * The S3 help macro now support a `role_arn` parameter as an alternative way to provide authentication for S3 based models. Thanks to @@ -527,4 +531,4 @@ for Replicated tables that use the `{uuid}` macro in the path to avoid name conf [0.19.1]: https://github.com/ClickHouse/dbt-clickhouse/compare/v0.19.0.2...v0.19.1 [0.19.0.2]: https://github.com/ClickHouse/dbt-clickhouse/compare/v0.19.0.1...v0.19.0.2 [0.19.0.1]: https://github.com/ClickHouse/dbt-clickhouse/compare/v0.19.0...v0.19.0.1 -[0.19.0]: https://github.com/ClickHouse/dbt-clickhouse/compare/eb3020a...v0.19.0 \ No newline at end of file +[0.19.0]: https://github.com/ClickHouse/dbt-clickhouse/compare/eb3020a...v0.19.0 diff --git a/dbt/adapters/clickhouse/impl.py b/dbt/adapters/clickhouse/impl.py index c40a01f6..5decb3d6 100644 --- a/dbt/adapters/clickhouse/impl.py +++ b/dbt/adapters/clickhouse/impl.py @@ -47,6 +47,165 @@ GET_CATALOG_MACRO_NAME = 'get_catalog' LIST_SCHEMAS_MACRO_NAME = 'list_schemas' +ENGINE_SETTINGS = { + 'MergeTree': [ + "index_granularity", + "index_granularity_bytes", + "min_index_granularity_bytes", + "enable_mixed_granularity_parts", + "use_minimalistic_part_header_in_zookeeper", + "min_merge_bytes_to_use_direct_io", + "merge_with_ttl_timeout", + "merge_with_recompression_ttl_timeout", + "write_final_mark", + "storage_policy", + "min_bytes_for_wide_part", + "max_compress_block_size", + "min_compress_block_size", + "max_suspicious_broken_parts", + "parts_to_throw_insert", + "parts_to_delay_insert", + "inactive_parts_to_throw_insert", + "inactive_parts_to_delay_insert", + "max_delay_to_insert", + "max_parts_in_total", + "simultaneous_parts_removal_limit", + "replicated_deduplication_window", + "non_replicated_deduplication_window", + "replicated_deduplication_window_seconds", + "replicated_deduplication_window_for_async_inserts", + "replicated_deduplication_window_seconds_for_async_inserts", + "use_async_block_ids_cache", + "async_block_ids_cache_min_update_interval_ms", + "max_replicated_logs_to_keep", + "min_replicated_logs_to_keep", + "prefer_fetch_merged_part_time_threshold", + "prefer_fetch_merged_part_size_threshold", + "execute_merges_on_single_replica_time_threshold", + "remote_fs_execute_merges_on_single_replica_time_threshold", + "try_fetch_recompressed_part_timeout", + "always_fetch_merged_part", + "max_suspicious_broken_parts", + "max_suspicious_broken_parts_bytes", + "max_files_to_modify_in_alter_columns", + "max_files_to_remove_in_alter_columns", + "replicated_max_ratio_of_wrong_parts", + "replicated_max_parallel_fetches_for_host", + "replicated_fetches_http_connection_timeout", + "replicated_can_become_leader", + "zookeeper_session_expiration_check_period", + 
"detach_old_local_parts_when_cloning_replica", + "replicated_fetches_http_connection_timeout", + "replicated_fetches_http_send_timeout", + "replicated_fetches_http_receive_timeout", + "max_replicated_fetches_network_bandwidth", + "max_replicated_sends_network_bandwidth", + "old_parts_lifetime", + "max_bytes_to_merge_at_max_space_in_pool", + "max_bytes_to_merge_at_min_space_in_pool", + "merge_max_block_size", + "number_of_free_entries_in_pool_to_lower_max_size_of_merge", + "number_of_free_entries_in_pool_to_execute_mutation", + "max_part_loading_threads", + "max_partitions_to_read", + "min_age_to_force_merge_seconds", + "min_age_to_force_merge_on_partition_only", + "number_of_free_entries_in_pool_to_execute_optimize_entire_partition", + "allow_floating_point_partition_key", + "check_sample_column_is_correct", + "min_bytes_to_rebalance_partition_over_jbod", + "detach_not_byte_identical_parts", + "merge_tree_clear_old_temporary_directories_interval_seconds", + "merge_tree_clear_old_parts_interval_seconds", + "max_concurrent_queries", + "min_marks_to_honor_max_concurrent_queries", + "ratio_of_defaults_for_sparse_serialization", + "replace_long_file_name_to_hash", + "max_file_name_length", + "allow_experimental_block_number_column", + "exclude_deleted_rows_for_part_size_in_merge", + "load_existing_rows_count_for_old_parts", + "use_compact_variant_discriminators_serialization", + "merge_workload", + "mutation_workload", + "lightweight_mutation_projection_mode", + "deduplicate_merge_projection_mode", + "min_free_disk_bytes_to_perform_insert", + "min_free_disk_ratio_to_perform_insert" + ], + 'Memory': [ + 'min_bytes_to_keep', + 'max_bytes_to_keep', + 'min_rows_to_keep', + 'max_rows_to_keep' + ], + 'URL': [ + 'engine_url_skip_empty_files', + 'enable_url_encoding' + ], + 'File': [ + 'engine_file_empty_if_not_exists', + 'engine_file_truncate_on_insert', + 'engine_file_allow_create_multiple_files', + 'engine_file_skip_empty_files', + 'storage_file_read_method' + ], + 'Distributed': [ + "fsync_after_insert", + "fsync_directories", + "skip_unavailable_shards", + "bytes_to_throw_insert", + "bytes_to_delay_insert", + "max_delay_to_insert", + "background_insert_batch", + "background_insert_split_batch_on_failure", + "background_insert_sleep_time_ms", + "background_insert_max_sleep_time_ms", + "flush_on_detach" + ], + 'MySQL': [ + 'connection_pool_size', + 'connection_max_tries', + 'connection_wait_timeout', + 'connection_auto_close', + 'connection_timeout', + 'read_write_timeout' + ], + 'S3': [ + 's3_truncate_on_insert', + 's3_create_new_file_on_insert', + 's3_skip_empty_files', + 's3_max_single_part_upload_size', + 's3_min_upload_part_size', + 's3_max_redirects', + 's3_single_read_retries', + 's3_max_put_rps', + 's3_max_put_burst', + 's3_max_get_rps', + 's3_max_get_burst', + 's3_upload_part_size_multiply_factor', + 's3_upload_part_size_multiply_parts_count_threshold', + 's3_max_inflight_parts_for_one_file', + 'endpoint', + 'access_key_id', + 'secret_access_key', + 'use_environment_credentials', + 'region', + 'use_insecure_imds_request', + 'expiration_window_seconds', + 'no_sign_request', + 'header', + 'server_side_encryption_customer_key_base64', + 'server_side_encryption_kms_key_id', + 'server_side_encryption_kms_encryption_context', + 'server_side_encryption_kms_bucket_key_enabled', + 'max_single_read_retries', + 'max_put_rps', + 'max_put_burst', + 'max_get_rps', + 'max_get_burst' + ] +} @dataclass class ClickHouseConfig(AdapterConfig): @@ -455,12 +614,13 @@ def run_sql_for_tests(self, sql, fetch, 
conn): conn.state = 'close' @available - def get_model_settings(self, model): + def get_model_settings(self, model, engine='MergeTree'): settings = model['config'].get('settings', {}) materialization_type = model['config'].get('materialized') conn = self.connections.get_if_exists() conn.handle.update_model_settings(settings, materialization_type) res = [] + settings = self.filter_settings_by_engine(settings, engine) for key in settings: res.append(f' {key}={settings[key]}') settings_str = '' if len(res) == 0 else 'SETTINGS ' + ', '.join(res) + '\n' @@ -468,6 +628,28 @@ def get_model_settings(self, model): -- end_of_sql {settings_str} """ + + @available + def filter_settings_by_engine(self, settings, engine): + filtered_settings = {} + + if engine not in ENGINE_SETTINGS: + # If the engine has no settings it will not be in the ENGINE_SETTINGS map. + return filtered_settings + + if engine.endswith('MergeTree'): + # Special case for MergeTree due to all its variations. + allowed_settings = ENGINE_SETTINGS['MergeTree'] + else: + allowed_settings = ENGINE_SETTINGS[engine] + + for key, value in settings.items(): + if key in allowed_settings: + filtered_settings[key] = value + else: + logger.warning(f"Setting {key} not available for engine {engine}, ignoring.") + + return filtered_settings @available def get_model_query_settings(self, model): diff --git a/dbt/include/clickhouse/macros/materializations/distributed_table.sql b/dbt/include/clickhouse/macros/materializations/distributed_table.sql index e84e8396..f9a7bea1 100644 --- a/dbt/include/clickhouse/macros/materializations/distributed_table.sql +++ b/dbt/include/clickhouse/macros/materializations/distributed_table.sql @@ -127,7 +127,7 @@ {{ order_cols(label="order by") }} {{ primary_key_clause(label="primary key") }} {{ partition_cols(label="partition by") }} - {{ adapter.get_model_settings(model) }} + {{ adapter.get_model_settings(model, config.get('engine', default='MergeTree')) }} {%- endmacro %} {% macro create_distributed_local_table(distributed_relation, shard_relation, structure_relation, sql_query=none) -%} diff --git a/dbt/include/clickhouse/macros/materializations/table.sql b/dbt/include/clickhouse/macros/materializations/table.sql index 1ef0e8b5..afb3d5d4 100644 --- a/dbt/include/clickhouse/macros/materializations/table.sql +++ b/dbt/include/clickhouse/macros/materializations/table.sql @@ -177,7 +177,7 @@ {% if temporary -%} create temporary table {{ relation }} engine Memory - {{ adapter.get_model_settings(model) }} + {{ adapter.get_model_settings(model, 'Memory') }} as ( {{ sql }} ) @@ -193,7 +193,7 @@ {{ primary_key_clause(label="primary key") }} {{ partition_cols(label="partition by") }} {{ ttl_config(label="ttl")}} - {{ adapter.get_model_settings(model) }} + {{ adapter.get_model_settings(model, config.get('engine', default='MergeTree')) }} {%- if not has_contract %} {%- if not adapter.is_before_version('22.7.1.2484') %} diff --git a/dbt/include/clickhouse/macros/materializations/view.sql b/dbt/include/clickhouse/macros/materializations/view.sql index 09063fd4..8b4fde5f 100644 --- a/dbt/include/clickhouse/macros/materializations/view.sql +++ b/dbt/include/clickhouse/macros/materializations/view.sql @@ -81,7 +81,7 @@ {{ adapter.get_model_query_settings(model) }} ) {% if model.get('config').get('materialized') == 'view' %} - {{ adapter.get_model_settings(model) }} + {{ adapter.get_model_settings(model, config.get('engine', default='MergeTree')) }} {%- endif %} {%- endmacro %} From 8f49ae968a07eb7bcc053583bdf61a1bfa917100 Mon Sep 17 
00:00:00 2001 From: Shachi Bista Date: Fri, 11 Oct 2024 18:58:46 +0200 Subject: [PATCH 15/96] Make linter happy --- dbt/adapters/clickhouse/impl.py | 29 +++++++++++------------------ 1 file changed, 11 insertions(+), 18 deletions(-) diff --git a/dbt/adapters/clickhouse/impl.py b/dbt/adapters/clickhouse/impl.py index 5decb3d6..65df492f 100644 --- a/dbt/adapters/clickhouse/impl.py +++ b/dbt/adapters/clickhouse/impl.py @@ -131,24 +131,16 @@ "lightweight_mutation_projection_mode", "deduplicate_merge_projection_mode", "min_free_disk_bytes_to_perform_insert", - "min_free_disk_ratio_to_perform_insert" - ], - 'Memory': [ - 'min_bytes_to_keep', - 'max_bytes_to_keep', - 'min_rows_to_keep', - 'max_rows_to_keep' - ], - 'URL': [ - 'engine_url_skip_empty_files', - 'enable_url_encoding' + "min_free_disk_ratio_to_perform_insert", ], + 'Memory': ['min_bytes_to_keep', 'max_bytes_to_keep', 'min_rows_to_keep', 'max_rows_to_keep'], + 'URL': ['engine_url_skip_empty_files', 'enable_url_encoding'], 'File': [ 'engine_file_empty_if_not_exists', 'engine_file_truncate_on_insert', 'engine_file_allow_create_multiple_files', 'engine_file_skip_empty_files', - 'storage_file_read_method' + 'storage_file_read_method', ], 'Distributed': [ "fsync_after_insert", @@ -161,7 +153,7 @@ "background_insert_split_batch_on_failure", "background_insert_sleep_time_ms", "background_insert_max_sleep_time_ms", - "flush_on_detach" + "flush_on_detach", ], 'MySQL': [ 'connection_pool_size', @@ -169,7 +161,7 @@ 'connection_wait_timeout', 'connection_auto_close', 'connection_timeout', - 'read_write_timeout' + 'read_write_timeout', ], 'S3': [ 's3_truncate_on_insert', @@ -203,10 +195,11 @@ 'max_put_rps', 'max_put_burst', 'max_get_rps', - 'max_get_burst' - ] + 'max_get_burst', + ], } + @dataclass class ClickHouseConfig(AdapterConfig): engine: str = 'MergeTree()' @@ -628,7 +621,7 @@ def get_model_settings(self, model, engine='MergeTree'): -- end_of_sql {settings_str} """ - + @available def filter_settings_by_engine(self, settings, engine): filtered_settings = {} @@ -648,7 +641,7 @@ def filter_settings_by_engine(self, settings, engine): filtered_settings[key] = value else: logger.warning(f"Setting {key} not available for engine {engine}, ignoring.") - + return filtered_settings @available From 01dd31abd6bd4537793598bfaa392e9a6492b67f Mon Sep 17 00:00:00 2001 From: Robin Norgren <68205730+rjoelnorgren@users.noreply.github.com> Date: Thu, 31 Oct 2024 10:01:38 -0700 Subject: [PATCH 16/96] update CHANGELOG --- CHANGELOG.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index fe3d4536..09ae63d4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +### Unreleased +### Improvement +* Added support for [range_hashed](https://clickhouse.com/docs/en/sql-reference/dictionaries#range_hashed) and [complex_key_range_hashed](https://clickhouse.com/docs/en/sql-reference/dictionaries#complex_key_range_hashed) layouts to the dictionary materialization. ([#361](https://github.com/ClickHouse/dbt-clickhouse/pull/361)) + ### Release [1.8.4], 2024-09-17 ### Improvement * The S3 help macro now support a `role_arn` parameter as an alternative way to provide authentication for S3 based models. 
Thanks to @@ -527,4 +531,4 @@ for Replicated tables that use the `{uuid}` macro in the path to avoid name conf [0.19.1]: https://github.com/ClickHouse/dbt-clickhouse/compare/v0.19.0.2...v0.19.1 [0.19.0.2]: https://github.com/ClickHouse/dbt-clickhouse/compare/v0.19.0.1...v0.19.0.2 [0.19.0.1]: https://github.com/ClickHouse/dbt-clickhouse/compare/v0.19.0...v0.19.0.1 -[0.19.0]: https://github.com/ClickHouse/dbt-clickhouse/compare/eb3020a...v0.19.0 \ No newline at end of file +[0.19.0]: https://github.com/ClickHouse/dbt-clickhouse/compare/eb3020a...v0.19.0 From 86f88438fd8f8eb398750fb2bcf1d18ec2fa7c98 Mon Sep 17 00:00:00 2001 From: Robin Norgren <68205730+rjoelnorgren@users.noreply.github.com> Date: Thu, 31 Oct 2024 13:17:55 -0700 Subject: [PATCH 17/96] add test for range dictionary layout --- .../adapter/dictionary/test_dictionary.py | 42 +++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/tests/integration/adapter/dictionary/test_dictionary.py b/tests/integration/adapter/dictionary/test_dictionary.py index 77ee1aae..563d6b1a 100644 --- a/tests/integration/adapter/dictionary/test_dictionary.py +++ b/tests/integration/adapter/dictionary/test_dictionary.py @@ -6,6 +6,7 @@ import os import pytest + from dbt.tests.util import run_dbt testing_s3 = os.environ.get('DBT_CH_TEST_INCLUDE_S3', '').lower() in ('1', 'true', 'yes') @@ -114,6 +115,33 @@ - name: people """ +RANGE_DICTIONARY = """ +{{ config( + materialized='dictionary', + fields=[ + ('id', 'UInt8'), + ('start', 'UInt8'), + ('stop', 'UInt8'), + ('value', 'String') + ], + primary_key='id', + layout='RANGE_HASHED()', + lifetime='MIN 0 MAX 0', + source_type='clickhouse', + range='min start max stop' +) }} + +select + c1 as id, + c2 as start, + c3 as stop, + c4 as value +from values( + (0, 0, 2, 'foo'), + (0, 3, 5, 'bar') +) +""" + class TestQueryDictionary: @pytest.fixture(scope="class") @@ -193,3 +221,17 @@ def test_create(self, project): "select count(distinct LocationID) from taxi_zone_dictionary", fetch="all" ) assert results[0][0] == 265 + + +class TestRangeDictionary: + @pytest.fixture(scope="class") + def models(self): + return {"range_dictionary.sql": RANGE_DICTIONARY} + + def test_create(self, project): + run_dbt() + + results = project.run_sql("select dictGet(range_dictionary, 'value', 0, 1)", fetch="all") + assert results[0][0] == "foo" + results = project.run_sql("select dictGet(range_dictionary, 'value', 0, 5)", fetch="all") + assert results[0][0] == "bar" From ae8a7b69a83fd3973f7554b5af938cb9d35eb134 Mon Sep 17 00:00:00 2001 From: Lucas Fernando Cardoso Nunes Date: Sat, 2 Nov 2024 21:42:16 -0300 Subject: [PATCH 18/96] feat: add list support to `primary_key` Signed-off-by: Lucas Fernando Cardoso Nunes --- .../clickhouse/macros/materializations/table.sql | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/dbt/include/clickhouse/macros/materializations/table.sql b/dbt/include/clickhouse/macros/materializations/table.sql index a7601552..feb0130c 100644 --- a/dbt/include/clickhouse/macros/materializations/table.sql +++ b/dbt/include/clickhouse/macros/materializations/table.sql @@ -86,10 +86,18 @@ {%- endmacro -%} {% macro primary_key_clause(label) %} - {%- set primary_key = config.get('primary_key', validator=validation.any[basestring]) -%} + {%- set cols = config.get('primary_key', validator=validation.any[list, basestring]) -%} - {%- if primary_key is not none %} - {{ label }} {{ primary_key }} + {%- if cols is not none %} + {%- if cols is string -%} + {%- set cols = [cols] -%} + {%- endif 
-%} + {{ label }} ( + {%- for item in cols -%} + {{ item }} + {%- if not loop.last -%},{%- endif -%} + {%- endfor -%} + ) {%- endif %} {%- endmacro -%} From 90ba9205482e16fdad4dd3910ff23a71961c079e Mon Sep 17 00:00:00 2001 From: Lucas Fernando Cardoso Nunes Date: Sat, 2 Nov 2024 22:18:52 -0300 Subject: [PATCH 19/96] add pk to tests Signed-off-by: Lucas Fernando Cardoso Nunes --- tests/integration/adapter/basic/test_incremental.py | 2 +- .../integration/adapter/clickhouse/test_clickhouse_s3.py | 1 + .../adapter/incremental/test_base_incremental.py | 7 ++++++- .../adapter/incremental/test_distributed_incremental.py | 8 ++++++-- .../integration/adapter/incremental/test_schema_change.py | 3 +++ 5 files changed, 17 insertions(+), 4 deletions(-) diff --git a/tests/integration/adapter/basic/test_incremental.py b/tests/integration/adapter/basic/test_incremental.py index c50d477a..4e3df690 100644 --- a/tests/integration/adapter/basic/test_incremental.py +++ b/tests/integration/adapter/basic/test_incremental.py @@ -7,7 +7,7 @@ class TestIncremental(BaseIncremental): incremental_not_schema_change_sql = """ -{{ config(materialized="incremental", unique_key="user_id_current_time",on_schema_change="append_new_columns") }} +{{ config(materialized="incremental", primary_key="user_id_current_time", unique_key="user_id_current_time",on_schema_change="append_new_columns") }} select toString(1) || '-' || toString(now64()) as user_id_current_time, {% if is_incremental() %} diff --git a/tests/integration/adapter/clickhouse/test_clickhouse_s3.py b/tests/integration/adapter/clickhouse/test_clickhouse_s3.py index 10f1289e..25d70c50 100644 --- a/tests/integration/adapter/clickhouse/test_clickhouse_s3.py +++ b/tests/integration/adapter/clickhouse/test_clickhouse_s3.py @@ -36,6 +36,7 @@ materialized='incremental', order_by='pickup_datetime', incremental_strategy='delete+insert', + primary_key='trip_id', unique_key='trip_id', taxi_s3={"structure":['trip_id UInt32', 'pickup_datetime DateTime', 'passenger_count UInt8']} ) diff --git a/tests/integration/adapter/incremental/test_base_incremental.py b/tests/integration/adapter/incremental/test_base_incremental.py index 0c522df7..a949d96f 100644 --- a/tests/integration/adapter/incremental/test_base_incremental.py +++ b/tests/integration/adapter/incremental/test_base_incremental.py @@ -20,6 +20,7 @@ materialized='table', engine='MergeTree()', order_by=['ts'], + primary_key=['impid'], unique_key=['impid'] ) }} @@ -33,6 +34,7 @@ materialized='incremental', engine='MergeTree()', order_by=['ts'], + primary_key=['impid'], unique_key=['impid'], settings={'allow_nullable_key':'1'} ) @@ -62,6 +64,7 @@ def test_simple_incremental(self, project): {{ config( materialized='incremental', order_by=['key1'], + primary_key='key1', unique_key='key1', incremental_strategy='delete+insert', settings={'allow_nullable_key':1} @@ -97,6 +100,7 @@ def test_lw_delete(self, project): {{ config( materialized='incremental', order_by=['key1'], + primary_key='key1', unique_key='key1', incremental_strategy='legacy', settings={'allow_nullable_key':1} @@ -140,6 +144,7 @@ def test_legacy(self, project): {{ config( materialized='incremental', order_by=['key1', 'key2'], + primary_key=['key1', 'key2'], unique_key='key1, key2', incremental_strategy='delete+insert' ) @@ -174,7 +179,7 @@ class TestInsertsOnlyIncrementalMaterialization(BaseIncremental): @pytest.fixture(scope="class") def models(self): config_materialized_incremental = """ - {{ config(order_by='(some_date, id, name)', inserts_only=True, 
materialized='incremental', unique_key='id') }} + {{ config(order_by='(some_date, id, name)', inserts_only=True, materialized='incremental', primary_key='id', unique_key='id') }} """ incremental_sql = config_materialized_incremental + model_incremental return { diff --git a/tests/integration/adapter/incremental/test_distributed_incremental.py b/tests/integration/adapter/incremental/test_distributed_incremental.py index f132933d..9c74b239 100644 --- a/tests/integration/adapter/incremental/test_distributed_incremental.py +++ b/tests/integration/adapter/incremental/test_distributed_incremental.py @@ -17,6 +17,7 @@ materialized='distributed_table', engine='MergeTree()', order_by=['ts'], + primary_key=['impid'], unique_key=['impid'] ) }} @@ -29,6 +30,7 @@ config( materialized='distributed_incremental', engine='MergeTree()', + primary_key=['impid'], order_by=['ts'], unique_key=['impid'] ) @@ -69,6 +71,7 @@ def test_simple_incremental(self, project): {{ config( materialized='distributed_incremental', order_by=['key1'], + primary_key='key1', unique_key='key1', incremental_strategy='delete+insert' ) @@ -111,6 +114,7 @@ def test_lw_delete(self, project): {{ config( materialized='distributed_incremental', order_by=['key1', 'key2'], + primary_key=['key1', 'key2'], unique_key='key1, key2', incremental_strategy='delete+insert' ) @@ -158,7 +162,7 @@ class TestInsertsOnlyDistributedIncrementalMaterialization(BaseIncremental): @pytest.fixture(scope="class") def models(self): config_materialized_incremental = """ - {{ config(order_by='(some_date, id, name)', inserts_only=True, materialized='distributed_incremental', unique_key='id') }} + {{ config(order_by='(some_date, id, name)', inserts_only=True, materialized='distributed_incremental', primary_key='id', unique_key='id') }} """ incremental_sql = config_materialized_incremental + model_incremental return { @@ -182,7 +186,7 @@ def test_incremental(self, project): incremental_not_schema_change_sql = """ -{{ config(materialized="distributed_incremental", unique_key="user_id_current_time",on_schema_change="sync_all_columns") }} +{{ config(materialized="distributed_incremental", primary_key="user_id_current_time", unique_key="user_id_current_time", on_schema_change="sync_all_columns") }} select toString(1) || '-' || toString(now64()) as user_id_current_time, {% if is_incremental() %} diff --git a/tests/integration/adapter/incremental/test_schema_change.py b/tests/integration/adapter/incremental/test_schema_change.py index e3efcb5f..9a8871b0 100644 --- a/tests/integration/adapter/incremental/test_schema_change.py +++ b/tests/integration/adapter/incremental/test_schema_change.py @@ -8,6 +8,7 @@ {{ config( materialized='%s', + primary_key='col_1', unique_key='col_1', on_schema_change='%s' ) @@ -101,6 +102,7 @@ def test_append(self, project, model): {{ config( materialized='%s', + primary_key='col_1', unique_key='col_1', on_schema_change='%s' ) @@ -189,6 +191,7 @@ def test_sync(self, project, model): {{ config( materialized='%s', + primary_key='col_1', unique_key='col_1', on_schema_change='fail' ) From d40abf4a739857d4634e79d080b45c923c00b1bf Mon Sep 17 00:00:00 2001 From: Can Bekleyici Date: Mon, 4 Nov 2024 10:09:32 +0100 Subject: [PATCH 20/96] add changelog --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 09ae63d4..db012d4f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,9 @@ ### Improvement * Added support for [range_hashed](https://clickhouse.com/docs/en/sql-reference/dictionaries#range_hashed) and 
[complex_key_range_hashed](https://clickhouse.com/docs/en/sql-reference/dictionaries#complex_key_range_hashed) layouts to the dictionary materialization. ([#361](https://github.com/ClickHouse/dbt-clickhouse/pull/361)) +### Bug Fixes +* Existing local tables are no longer dropped/recreated in case of missing distributed tables in `distributed_incremental` materialization mode. ([#363](https://github.com/ClickHouse/dbt-clickhouse/pull/363)) + ### Release [1.8.4], 2024-09-17 ### Improvement * The S3 help macro now support a `role_arn` parameter as an alternative way to provide authentication for S3 based models. Thanks to From 83564fb7d640c3399d273e8690a6143a89799574 Mon Sep 17 00:00:00 2001 From: Lucas Fernando Cardoso Nunes Date: Tue, 5 Nov 2024 10:31:10 -0300 Subject: [PATCH 21/96] Update CHANGELOG.md --- CHANGELOG.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index fe3d4536..106fac0b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,8 @@ +### Unreleased +### Improvement + +* Enhance the `primary_key` macro to accept a list of columns, allowing for primary keys with multiple columns. ([#337](https://github.com/ClickHouse/dbt-clickhouse/pull/337)) + ### Release [1.8.4], 2024-09-17 ### Improvement * The S3 help macro now support a `role_arn` parameter as an alternative way to provide authentication for S3 based models. Thanks to From 2e87f58051db5ce3c7964cc92c9c333ef2143338 Mon Sep 17 00:00:00 2001 From: Vladimir Trifonov <3fonov@gmail.com> Date: Mon, 11 Nov 2024 08:05:15 +0300 Subject: [PATCH 22/96] Add in other places --- dbt/adapters/clickhouse/nativeclient.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/dbt/adapters/clickhouse/nativeclient.py b/dbt/adapters/clickhouse/nativeclient.py index 8e46e38b..ec609c7f 100644 --- a/dbt/adapters/clickhouse/nativeclient.py +++ b/dbt/adapters/clickhouse/nativeclient.py @@ -22,7 +22,8 @@ def query(self, sql, **kwargs): try: return NativeClientResult(self._client.execute(sql, with_column_types=True, **kwargs)) except clickhouse_driver.errors.Error as ex: - raise DbtDatabaseError(str(ex).strip()) from ex + err_msg = str(ex).strip().split("Stack trace")[0] + raise DbtDatabaseError(err_msg) from ex def command(self, sql, **kwargs): try: @@ -41,7 +42,8 @@ def columns_in_query(self, sql: str, **kwargs) -> List[ClickHouseColumn]: ) return [ClickHouseColumn.create(column[0], column[1]) for column in columns] except clickhouse_driver.errors.Error as ex: - raise DbtDatabaseError(str(ex).strip()) from ex + err_msg = str(ex).strip().split("Stack trace")[0] + raise DbtDatabaseError(err_msg) from ex def get_ch_setting(self, setting_name): try: From 3f00634d0fc1831d09e0ef8c8565c754a71b37e1 Mon Sep 17 00:00:00 2001 From: Vladimir Trifonov <3fonov@gmail.com> Date: Mon, 11 Nov 2024 08:10:26 +0300 Subject: [PATCH 23/96] Changelog update --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 09ae63d4..553fc840 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ ### Unreleased ### Improvement * Added support for [range_hashed](https://clickhouse.com/docs/en/sql-reference/dictionaries#range_hashed) and [complex_key_range_hashed](https://clickhouse.com/docs/en/sql-reference/dictionaries#complex_key_range_hashed) layouts to the dictionary materialization. 
([#361](https://github.com/ClickHouse/dbt-clickhouse/pull/361)) +* Truncated stack trace for database errors for cleaner output ### Release [1.8.4], 2024-09-17 ### Improvement From e26bacc784295673bba875712975b3c36244c86f Mon Sep 17 00:00:00 2001 From: Vladimir Trifonov <3fonov@gmail.com> Date: Mon, 11 Nov 2024 08:22:10 +0300 Subject: [PATCH 24/96] Update httpclient --- dbt/adapters/clickhouse/httpclient.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/dbt/adapters/clickhouse/httpclient.py b/dbt/adapters/clickhouse/httpclient.py index f5d81bf8..cb8ba4f7 100644 --- a/dbt/adapters/clickhouse/httpclient.py +++ b/dbt/adapters/clickhouse/httpclient.py @@ -15,7 +15,8 @@ def query(self, sql, **kwargs): try: return self._client.query(sql, **kwargs) except DatabaseError as ex: - raise DbtDatabaseError(str(ex).strip()) from ex + err_msg = str(ex).strip().split("Stack trace")[0] + raise DbtDatabaseError(err_msg) from ex def command(self, sql, **kwargs): try: @@ -35,7 +36,8 @@ def columns_in_query(self, sql: str, **kwargs) -> List[ClickHouseColumn]: for name, ch_type in zip(query_result.column_names, query_result.column_types) ] except DatabaseError as ex: - raise DbtDatabaseError(str(ex).strip()) from ex + err_msg = str(ex).strip().split("Stack trace")[0] + raise DbtDatabaseError(err_msg) from ex def get_ch_setting(self, setting_name): setting = self._client.server_settings.get(setting_name) From 4a64149c5da98a25d2e2f89f965928e7cfce0622 Mon Sep 17 00:00:00 2001 From: Vladimir Trifonov <3fonov@gmail.com> Date: Mon, 11 Nov 2024 12:26:03 +0300 Subject: [PATCH 25/96] Created ENV VAR to control behavior --- dbt/adapters/clickhouse/httpclient.py | 7 ++++--- dbt/adapters/clickhouse/nativeclient.py | 7 ++++--- dbt/adapters/clickhouse/util.py | 11 ++++++++++- tests/unit/test_util.py | 20 +++++++++++++++++++- 4 files changed, 37 insertions(+), 8 deletions(-) diff --git a/dbt/adapters/clickhouse/httpclient.py b/dbt/adapters/clickhouse/httpclient.py index cb8ba4f7..bd707e5f 100644 --- a/dbt/adapters/clickhouse/httpclient.py +++ b/dbt/adapters/clickhouse/httpclient.py @@ -8,6 +8,7 @@ from dbt.adapters.clickhouse import ClickHouseColumn from dbt.adapters.clickhouse.__version__ import version as dbt_clickhouse_version from dbt.adapters.clickhouse.dbclient import ChClientWrapper, ChRetryableException +from dbt.adapters.clickhouse.util import hide_stack_trace class ChHttpClient(ChClientWrapper): @@ -15,14 +16,14 @@ def query(self, sql, **kwargs): try: return self._client.query(sql, **kwargs) except DatabaseError as ex: - err_msg = str(ex).strip().split("Stack trace")[0] + err_msg = hide_stack_trace(ex) raise DbtDatabaseError(err_msg) from ex def command(self, sql, **kwargs): try: return self._client.command(sql, **kwargs) except DatabaseError as ex: - err_msg = str(ex).strip().split("Stack trace")[0] + err_msg = hide_stack_trace(ex) raise DbtDatabaseError(err_msg) from ex def columns_in_query(self, sql: str, **kwargs) -> List[ClickHouseColumn]: @@ -36,7 +37,7 @@ def columns_in_query(self, sql: str, **kwargs) -> List[ClickHouseColumn]: for name, ch_type in zip(query_result.column_names, query_result.column_types) ] except DatabaseError as ex: - err_msg = str(ex).strip().split("Stack trace")[0] + err_msg = hide_stack_trace(ex) raise DbtDatabaseError(err_msg) from ex def get_ch_setting(self, setting_name): diff --git a/dbt/adapters/clickhouse/nativeclient.py b/dbt/adapters/clickhouse/nativeclient.py index ec609c7f..676b48a4 100644 --- a/dbt/adapters/clickhouse/nativeclient.py +++ 
b/dbt/adapters/clickhouse/nativeclient.py @@ -10,6 +10,7 @@ from dbt.adapters.clickhouse.__version__ import version as dbt_clickhouse_version from dbt.adapters.clickhouse.dbclient import ChClientWrapper, ChRetryableException from dbt.adapters.clickhouse.logger import logger +from dbt.adapters.clickhouse.util import hide_stack_trace try: driver_version = pkg_resources.get_distribution('clickhouse-driver').version @@ -22,7 +23,7 @@ def query(self, sql, **kwargs): try: return NativeClientResult(self._client.execute(sql, with_column_types=True, **kwargs)) except clickhouse_driver.errors.Error as ex: - err_msg = str(ex).strip().split("Stack trace")[0] + err_msg = hide_stack_trace(ex) raise DbtDatabaseError(err_msg) from ex def command(self, sql, **kwargs): @@ -31,7 +32,7 @@ def command(self, sql, **kwargs): if len(result) and len(result[0]): return result[0][0] except clickhouse_driver.errors.Error as ex: - err_msg = str(ex).strip().split("Stack trace")[0] + err_msg = hide_stack_trace(ex) raise DbtDatabaseError(err_msg) from ex def columns_in_query(self, sql: str, **kwargs) -> List[ClickHouseColumn]: @@ -42,7 +43,7 @@ def columns_in_query(self, sql: str, **kwargs) -> List[ClickHouseColumn]: ) return [ClickHouseColumn.create(column[0], column[1]) for column in columns] except clickhouse_driver.errors.Error as ex: - err_msg = str(ex).strip().split("Stack trace")[0] + err_msg = hide_stack_trace(ex) raise DbtDatabaseError(err_msg) from ex def get_ch_setting(self, setting_name): diff --git a/dbt/adapters/clickhouse/util.py b/dbt/adapters/clickhouse/util.py index 9410ad7d..7c120386 100644 --- a/dbt/adapters/clickhouse/util.py +++ b/dbt/adapters/clickhouse/util.py @@ -1,4 +1,4 @@ -from dataclasses import dataclass +import os from dbt_common.exceptions import DbtRuntimeError @@ -13,3 +13,12 @@ def compare_versions(v1: str, v2: str) -> int: except ValueError: raise DbtRuntimeError("Version must consist of only numbers separated by '.'") return 0 + + +def hide_stack_trace(ex: Exception) -> str: + + if not os.getenv("HIDE_STACK_TRACE", ''): + return str(ex).strip() + + err_msg = str(ex).split("Stack trace")[0].strip() + return err_msg diff --git a/tests/unit/test_util.py b/tests/unit/test_util.py index d87d2e57..2d287921 100644 --- a/tests/unit/test_util.py +++ b/tests/unit/test_util.py @@ -1,4 +1,6 @@ -from dbt.adapters.clickhouse.util import compare_versions +from unittest.mock import patch + +from dbt.adapters.clickhouse.util import compare_versions, hide_stack_trace def test_is_before_version(): @@ -11,3 +13,19 @@ def test_is_before_version(): assert compare_versions('22.0.0', '21.0.0') == 1 assert compare_versions('21.0.1', '21.0.0') == 1 assert compare_versions('21.0.1', '21.0') == 0 + + +def test_hide_stack_trace_no_env_var(): + # Test when HIDE_STACK_TRACE is not set + with patch('os.getenv', return_value=''): + exception = Exception("Error occurred\nStack trace details follow...") + result = hide_stack_trace(exception) + assert result == "Error occurred\nStack trace details follow..." 
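The behavior is easy to reproduce outside the test suite; a self-contained sketch (the error text is invented for illustration, and the real helper is the `hide_stack_trace` added to `util.py` above):

```python
import os

# Standalone copy of the helper's logic, for illustration only; the
# adapter's version lives in dbt/adapters/clickhouse/util.py.
def hide_stack_trace(ex: Exception) -> str:
    if not os.getenv("HIDE_STACK_TRACE", ''):
        return str(ex).strip()
    return str(ex).split("Stack trace")[0].strip()

os.environ["HIDE_STACK_TRACE"] = "1"  # any non-empty value enables truncation

err = Exception("Code: 60. DB::Exception: Table demo.missing does not exist. Stack trace:\n0. ...")
print(hide_stack_trace(err))
# -> Code: 60. DB::Exception: Table demo.missing does not exist.
```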
+ + +def test_hide_stack_trace_env_var_set(): + # Test when HIDE_STACK_TRACE is set + with patch('os.getenv', return_value='1'): + exception = Exception("Error occurred\nStack trace details follow...") + result = hide_stack_trace(exception) + assert result == "Error occurred" From 7a1906ce6f99a9c05f99c34a616217efaa6aa0cc Mon Sep 17 00:00:00 2001 From: Vladimir Trifonov <3fonov@gmail.com> Date: Mon, 11 Nov 2024 12:29:29 +0300 Subject: [PATCH 26/96] Updated CHANGELOG --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 553fc840..c6c20661 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,7 @@ ### Unreleased ### Improvement * Added support for [range_hashed](https://clickhouse.com/docs/en/sql-reference/dictionaries#range_hashed) and [complex_key_range_hashed](https://clickhouse.com/docs/en/sql-reference/dictionaries#complex_key_range_hashed) layouts to the dictionary materialization. ([#361](https://github.com/ClickHouse/dbt-clickhouse/pull/361)) -* Truncated stack trace for database errors for cleaner output +* Truncated stack trace for database errors for cleaner output when HIDE_STACK_TRACE variable is set to any value. ### Release [1.8.4], 2024-09-17 ### Improvement From ba346f3d19dbea4bc9fa126460a2db63d59fb457 Mon Sep 17 00:00:00 2001 From: Vladimir Trifonov <3fonov@gmail.com> Date: Wed, 13 Nov 2024 11:52:43 +0300 Subject: [PATCH 27/96] Fix linting --- tests/integration/adapter/dictionary/test_dictionary.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/integration/adapter/dictionary/test_dictionary.py b/tests/integration/adapter/dictionary/test_dictionary.py index 563d6b1a..70c6ea8c 100644 --- a/tests/integration/adapter/dictionary/test_dictionary.py +++ b/tests/integration/adapter/dictionary/test_dictionary.py @@ -6,7 +6,6 @@ import os import pytest - from dbt.tests.util import run_dbt testing_s3 = os.environ.get('DBT_CH_TEST_INCLUDE_S3', '').lower() in ('1', 'true', 'yes') From 798aca989fa7d020ba0d84c2582ee04c3978acbf Mon Sep 17 00:00:00 2001 From: the4thamigo-uk <7022874+the4thamigo-uk@users.noreply.github.com> Date: Tue, 8 Oct 2024 13:58:03 +0100 Subject: [PATCH 28/96] Allow multiple materialized views to write to same target (#280) --- .../materializations/materialized_view.sql | 73 ++++++++---- .../test_materialized_view.py | 108 +++++++++++++++++- 2 files changed, 157 insertions(+), 24 deletions(-) diff --git a/dbt/include/clickhouse/macros/materializations/materialized_view.sql b/dbt/include/clickhouse/macros/materializations/materialized_view.sql index 28cb626d..513070ca 100644 --- a/dbt/include/clickhouse/macros/materializations/materialized_view.sql +++ b/dbt/include/clickhouse/macros/materializations/materialized_view.sql @@ -6,7 +6,6 @@ {%- materialization materialized_view, adapter='clickhouse' -%} {%- set target_relation = this.incorporate(type='table') -%} - {%- set mv_relation = target_relation.derivative('_mv', 'materialized_view') -%} {%- set cluster_clause = on_cluster_clause(target_relation) -%} {# look for an existing relation for the target table and create backup relations if necessary #} @@ -35,16 +34,26 @@ -- `BEGIN` happens here: {{ run_hooks(pre_hooks, inside_transaction=True) }} + -- extract the names of the materialized views from the sql + {% set view_names = modules.re.findall('--([^:]+):begin', sql) %} + + -- extract the sql for each of the materialized view into a map + {% set views = {} %} + {% if view_names %} + {% for view_name in view_names %} + {% set view_sql = 
modules.re.findall('--' + view_name + ':begin(.*)--' + view_name + ':end', sql, flags=modules.re.DOTALL)[0] %} + {%- set _ = views.update({view_name: view_sql}) -%} + {% endfor %} + {% else %} + {%- set _ = views.update({"mv": sql}) -%} + {% endif %} + {% if backup_relation is none %} {{ log('Creating new materialized view ' + target_relation.name )}} - {% call statement('main') -%} - {{ clickhouse__get_create_materialized_view_as_sql(target_relation, sql) }} - {%- endcall %} + {{ clickhouse__get_create_materialized_view_as_sql(target_relation, sql, views) }} {% elif existing_relation.can_exchange %} {{ log('Replacing existing materialized view ' + target_relation.name) }} - {% call statement('drop existing materialized view') %} - drop view if exists {{ mv_relation }} {{ cluster_clause }} - {% endcall %} + {{ clickhouse__drop_mvs(target_relation, cluster_clause, views) }} {% if should_full_refresh() %} {% call statement('main') -%} {{ get_create_table_as_sql(False, backup_relation, sql) }} @@ -56,12 +65,10 @@ select 1 {%- endcall %} {% endif %} - {% call statement('create new materialized view') %} - {{ clickhouse__create_mv_sql(mv_relation, existing_relation, cluster_clause, sql) }} - {% endcall %} + {{ clickhouse__create_mvs(existing_relation, cluster_clause, views) }} {% else %} {{ log('Replacing existing materialized view ' + target_relation.name) }} - {{ clickhouse__replace_mv(target_relation, existing_relation, intermediate_relation, backup_relation, sql) }} + {{ clickhouse__replace_mv(target_relation, existing_relation, intermediate_relation, backup_relation, sql, views) }} {% endif %} -- cleanup @@ -78,7 +85,12 @@ {{ run_hooks(post_hooks, inside_transaction=False) }} - {{ return({'relations': [target_relation, mv_relation]}) }} + {% set relations = [target_relation] %} + {% for view in views %} + {{ relations.append(target_relation.derivative('_' + view, 'materialized_view')) }} + {% endfor %} + + {{ return({'relations': relations}) }} {%- endmaterialization -%} @@ -89,30 +101,47 @@ 2. 
Create a materialized view using the SQL in the model that inserts data into the table creating during step 1 #} -{% macro clickhouse__get_create_materialized_view_as_sql(relation, sql) -%} - {% call statement('create_target_table') %} +{% macro clickhouse__get_create_materialized_view_as_sql(relation, sql, views) -%} + {% call statement('main') %} {{ get_create_table_as_sql(False, relation, sql) }} {% endcall %} {%- set cluster_clause = on_cluster_clause(relation) -%} {%- set mv_relation = relation.derivative('_mv', 'materialized_view') -%} - {{ clickhouse__create_mv_sql(mv_relation, relation, cluster_clause, sql) }} + {{ clickhouse__create_mvs(relation, cluster_clause, views) }} {%- endmacro %} +{% macro clickhouse__drop_mv(mv_relation, cluster_clause) -%} + drop view if exists {{ mv_relation }} {{ cluster_clause }} +{%- endmacro %}u -{% macro clickhouse__create_mv_sql(mv_relation, target_table, cluster_clause, sql) -%} +{% macro clickhouse__create_mv(mv_relation, target_table, cluster_clause, sql) -%} create materialized view if not exists {{ mv_relation }} {{ cluster_clause }} to {{ target_table }} as {{ sql }} {%- endmacro %} +{% macro clickhouse__drop_mvs(target_relation, cluster_clause, views) -%} + {% for view in views.keys() %} + {%- set mv_relation = target_relation.derivative('_' + view, 'materialized_view') -%} + {% call statement('drop existing mv: ' + view) -%} + {{ clickhouse__drop_mv(mv_relation, cluster_clause) }}; + {% endcall %} + {% endfor %} +{%- endmacro %} + +{% macro clickhouse__create_mvs(target_relation, cluster_clause, views) -%} + {% for view, view_sql in views.items() %} + {%- set mv_relation = target_relation.derivative('_' + view, 'materialized_view') -%} + {% call statement('create existing mv: ' + view) -%} + {{ clickhouse__create_mv(mv_relation, target_relation, cluster_clause, view_sql) }}; + {% endcall %} + {% endfor %} +{%- endmacro %} -{% macro clickhouse__replace_mv(target_relation, existing_relation, intermediate_relation, backup_relation, sql) %} +{% macro clickhouse__replace_mv(target_relation, existing_relation, intermediate_relation, backup_relation, sql, views) %} {# drop existing materialized view while we recreate the target table #} {%- set cluster_clause = on_cluster_clause(target_relation) -%} - {%- set mv_relation = target_relation.derivative('_mv', 'materialized_view') -%} - {% call statement('drop existing mv') -%} - drop view if exists {{ mv_relation }} {{ cluster_clause }} - {%- endcall %} + {{ clickhouse__drop_mvs(target_relation, cluster_clause, views) }} {# recreate the target table #} {% call statement('main') -%} @@ -122,5 +151,5 @@ {{ adapter.rename_relation(intermediate_relation, target_relation) }} {# now that the target table is recreated, we can finally create our new view #} - {{ clickhouse__create_mv_sql(mv_relation, target_relation, cluster_clause, sql) }} + {{ clickhouse__create_mvs(target_relation, cluster_clause, views) }} {% endmacro %} diff --git a/tests/integration/adapter/materialized_view/test_materialized_view.py b/tests/integration/adapter/materialized_view/test_materialized_view.py index ce651ff3..1e9cc99f 100644 --- a/tests/integration/adapter/materialized_view/test_materialized_view.py +++ b/tests/integration/adapter/materialized_view/test_materialized_view.py @@ -15,6 +15,9 @@ 1231,Dade,33,engineering 6666,Ksenia,48,engineering 8888,Kate,50,engineering +1000,Alfie,10,sales +2000,Bill,20,sales +3000,Charlie,30,sales """.lstrip() # This model is parameterized, in a way, by the "run_type" dbt project variable @@ 
-40,8 +43,7 @@ from {{ source('raw', 'people') }} where department = 'engineering' -{% else %} - +{% elif var('run_type', '') == 'extended_schema' %} select id, name, @@ -55,6 +57,33 @@ from {{ source('raw', 'people') }} where department = 'engineering' +{% elif var('run_type', '') == 'multiple_materialized_views' %} + +--mv1:begin +select + id, + name, + case + when name like 'Dade' then 'crash_override' + when name like 'Kate' then 'acid burn' + else 'N/A' + end as hacker_alias +from {{ source('raw', 'people') }} +where department = 'engineering' +--mv1:end + +union all + +--mv2:begin +select + id, + name, + -- sales people are not cool enough to have a hacker alias + 'N/A' as hacker_alias +from {{ source('raw', 'people') }} +where department = 'sales' +--mv2:end + {% endif %} """ @@ -191,3 +220,78 @@ def test_update_full_refresh(self, project): f"select distinct hacker_alias from {schema}.hackers where name = 'Dade'", fetch="all" ) assert len(result) == 2 + + +class TestMultipleMV: + @pytest.fixture(scope="class") + def seeds(self): + """ + we need a base table to pull from + """ + return { + "people.csv": PEOPLE_SEED_CSV, + "schema.yml": SEED_SCHEMA_YML, + } + + @pytest.fixture(scope="class") + def models(self): + return { + "hackers.sql": MV_MODEL, + } + + def test_create(self, project): + """ + 1. create a base table via dbt seed + 2. create a model as a materialized view, selecting from the table created in (1) + 3. insert data into the base table and make sure it's there in the target table created in (2) + """ + schema = quote_identifier(project.test_schema + "_custom_schema") + results = run_dbt(["seed"]) + assert len(results) == 1 + columns = project.run_sql("DESCRIBE TABLE people", fetch="all") + assert columns[0][1] == "Int32" + + # create the model + run_vars = {"run_type": "multiple_materialized_views"} + run_dbt(["run", "--vars", json.dumps(run_vars)]) + assert len(results) == 1 + + columns = project.run_sql(f"DESCRIBE TABLE {schema}.hackers", fetch="all") + assert columns[0][1] == "Int32" + + columns = project.run_sql(f"DESCRIBE {schema}.hackers_mv1", fetch="all") + assert columns[0][1] == "Int32" + + columns = project.run_sql(f"DESCRIBE {schema}.hackers_mv2", fetch="all") + assert columns[0][1] == "Int32" + + with pytest.raises(Exception): + columns = project.run_sql(f"DESCRIBE {schema}.hackers_mv", fetch="all") + + check_relation_types( + project.adapter, + { + "hackers_mv": "view", + "hackers": "table", + }, + ) + + # insert some data and make sure it reaches the target table + project.run_sql( + f""" + insert into {quote_identifier(project.test_schema)}.people ("id", "name", "age", "department") + values (4000,'Dave',40,'sales'), (9999,'Eugene',40,'engineering'); + """ + ) + + result = project.run_sql(f"select * from {schema}.hackers order by id", fetch="all") + assert result == [ + (1000, 'Alfie', 'N/A'), + (1231, 'Dade', 'crash_override'), + (2000, 'Bill', 'N/A'), + (3000, 'Charlie', 'N/A'), + (4000, 'Dave', 'N/A'), + (6666, 'Ksenia', 'N/A'), + (8888, 'Kate', 'acid burn'), + (9999, 'Eugene', 'N/A'), + ] From 5d2933ccc9424e673768865fb3d4b0fd94733ce7 Mon Sep 17 00:00:00 2001 From: the4thamigo-uk <7022874+the4thamigo-uk@users.noreply.github.com> Date: Wed, 13 Nov 2024 09:28:53 +0000 Subject: [PATCH 29/96] Update CHANGELOG.md (#280) --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 09ae63d4..648668de 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,9 @@ ### Improvement * Added support for 
[range_hashed](https://clickhouse.com/docs/en/sql-reference/dictionaries#range_hashed) and [complex_key_range_hashed](https://clickhouse.com/docs/en/sql-reference/dictionaries#complex_key_range_hashed) layouts to the dictionary materialization. ([#361](https://github.com/ClickHouse/dbt-clickhouse/pull/361)) +### New Features +* Added support for the creation of more than one materialized view inserting records into the same target table. ([#360](https://github.com/ClickHouse/dbt-clickhouse/pull/364)) + ### Release [1.8.4], 2024-09-17 ### Improvement * The S3 help macro now support a `role_arn` parameter as an alternative way to provide authentication for S3 based models. Thanks to From 064ed53456ee6870001dddd565ab7602585bc33c Mon Sep 17 00:00:00 2001 From: the4thamigo-uk <7022874+the4thamigo-uk@users.noreply.github.com> Date: Wed, 13 Nov 2024 09:42:53 +0000 Subject: [PATCH 30/96] Add note to README.md (#364) --- README.md | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/README.md b/README.md index 99da7ff6..d058f9c8 100644 --- a/README.md +++ b/README.md @@ -246,6 +246,21 @@ no corresponding REFRESH operation). Instead, it acts as an "insert trigger", a (https://github.com/ClickHouse/dbt-clickhouse/blob/main/tests/integration/adapter/materialized_view/test_materialized_view.py) for an introductory example of how to use this functionality. +ClickHouse provides the ability for more than one materialized view to write records to the same target table. To support this in dbt-clickhouse, you can construct a `UNION` in your model file, such that the SQL for each of your materialized views is wrapped with comments of the form `--my_mv_name:begin` and `--my_mv_name:end`. + +For example, the model SQL below will build two materialized views, both writing data to the same destination table of the model. 
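Before the SQL example, a rough Python equivalent of how the adapter pulls the named blocks out of a model (the macros run the same two regex patterns through dbt's `modules.re`; the model text here is a stand-in, not adapter code):

```python
import re

# Stand-in model body; in dbt this is the rendered model SQL.
model_sql = """
--mv1:begin
select a, b, c from raw.table_1
--mv1:end
union all
--mv2:begin
select a, b, c from raw.table_2
--mv2:end
"""

# Same patterns as the materialization macro: find the block names,
# then capture each block's SQL between its begin/end markers.
view_names = re.findall(r'--([^:]+):begin', model_sql)
views = {
    name: re.findall('--' + name + ':begin(.*)--' + name + ':end', model_sql, flags=re.DOTALL)[0]
    for name in view_names
}
assert sorted(views) == ['mv1', 'mv2']  # becomes <model_name>_mv1 and <model_name>_mv2
```

Each extracted block is then attached to the model's target table with `create materialized view if not exists ... to <target>`, as in the `clickhouse__create_mv` macro above.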
The names of the materialized views will take the form `<model_name>_mv1` and `<model_name>_mv2`: + +``` +--mv1:begin +select a,b,c from {{ source('raw', 'table_1') }} +--mv1:end +union all +--mv2:begin +select a,b,c from {{ source('raw', 'table_2') }} +--mv2:end +``` + + # Dictionary materializations (experimental) See the tests in https://github.com/ClickHouse/dbt-clickhouse/blob/main/tests/integration/adapter/dictionary/test_dictionary.py for examples of how to implement materializations for ClickHouse dictionaries From 210d53242353b4d7ad940276d81829de7d531d7f Mon Sep 17 00:00:00 2001 From: flytrap Date: Thu, 14 Nov 2024 15:37:30 +0800 Subject: [PATCH 31/96] fix: create materialized view on cluster --- dbt/adapters/clickhouse/relation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbt/adapters/clickhouse/relation.py b/dbt/adapters/clickhouse/relation.py index 8ad3b39a..a75a1e45 100644 --- a/dbt/adapters/clickhouse/relation.py +++ b/dbt/adapters/clickhouse/relation.py @@ -82,7 +82,7 @@ def get_on_cluster( ) -> bool: if cluster.strip(): return ( - materialized in ('view', 'dictionary') + materialized in ('materialized_view', 'view', 'dictionary') or 'distributed' in materialized or 'Replicated' in engine ) From f99d0d00d4c4b803c494d137a8d0fb56853c2797 Mon Sep 17 00:00:00 2001 From: bentsileviav Date: Tue, 19 Nov 2024 09:18:50 +0200 Subject: [PATCH 32/96] warn about potential hanging mvs that need to be dropped manually --- .../materializations/materialized_view.sql | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/dbt/include/clickhouse/macros/materializations/materialized_view.sql b/dbt/include/clickhouse/macros/materializations/materialized_view.sql index 513070ca..20711875 100644 --- a/dbt/include/clickhouse/macros/materializations/materialized_view.sql +++ b/dbt/include/clickhouse/macros/materializations/materialized_view.sql @@ -53,6 +53,37 @@ {{ clickhouse__get_create_materialized_view_as_sql(target_relation, sql, views) }} {% elif existing_relation.can_exchange %} {{ log('Replacing existing materialized view ' + target_relation.name) }} + -- in this section, we look for mvs that have the same pattern as this model, but for some reason, + -- are not listed in the model. This might happen when using multiple mvs and renaming one of the mvs in the model. + -- In case such an mv is found, we raise a warning to the user that they might need to drop the mv manually. + {{ log('Searching for existing materialized views with the pattern of ' + target_relation.name) }} + {{ log('Views dictionary contents: ' + views | string) }} + + {% set tables_query %} + select table_name + from information_schema.tables + where table_schema = '{{ existing_relation.schema }}' + and table_name like '%{{ target_relation.name }}%' + and table_type = 'VIEW' + {% endset %} + + {% set tables_result = run_query(tables_query) %} + {% if tables_result is not none %} + {% set tables = tables_result.columns[0].values() %} + {{ log('Current mvs found in ClickHouse are: ' + tables | join(', ')) }} + {% set mv_names = [] %} + {% for key in views.keys() %} + {% do mv_names.append(target_relation.name ~ "_" ~ key) %} + {% endfor %} + {{ log('Model mvs to replace ' + mv_names | string) }} + {% for table in tables %} + {% if table not in mv_names %} + {{ log('Warning - Table "' + table + '" was detected with the same pattern as model name "' + target_relation.name + '" but was not found this run. 
In case it is a renamed mv that was previously part of this model, drop it manually (!!!)') }} + {% endif %} + {% endfor %} + {% else %} + {{ log('No existing mvs found matching the pattern. continuing..', info=True) }} + {% endif %} {{ clickhouse__drop_mvs(target_relation, cluster_clause, views) }} {% if should_full_refresh() %} {% call statement('main') -%} @@ -153,3 +184,4 @@ {# now that the target table is recreated, we can finally create our new view #} {{ clickhouse__create_mvs(target_relation, cluster_clause, views) }} {% endmacro %} + From 15125f0aac0fdec4f887e8378d0c941d451befe9 Mon Sep 17 00:00:00 2001 From: bentsileviav Date: Tue, 19 Nov 2024 10:07:51 +0200 Subject: [PATCH 33/96] update warning message --- .../clickhouse/macros/materializations/materialized_view.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbt/include/clickhouse/macros/materializations/materialized_view.sql b/dbt/include/clickhouse/macros/materializations/materialized_view.sql index 20711875..4583918b 100644 --- a/dbt/include/clickhouse/macros/materializations/materialized_view.sql +++ b/dbt/include/clickhouse/macros/materializations/materialized_view.sql @@ -78,7 +78,7 @@ {{ log('Model mvs to replace ' + mv_names | string) }} {% for table in tables %} {% if table not in mv_names %} - {{ log('Warning - Table "' + table + '" was detected with the same pattern as model name "' + target_relation.name + '" but was not found this run. In case it is a renamed mv that was previously part of this model, drop it manually (!!!)') }} + {{ log('Warning - Table "' + table + '" was detected with the same pattern as model name "' + target_relation.name + '" but was not found in this run. In case it is a renamed mv that was previously part of this model, drop it manually (!!!)') }} {% endif %} {% endfor %} {% else %} From adbcb4eba5b0aa341b81d590b8c4a23cc974dd14 Mon Sep 17 00:00:00 2001 From: bentsileviav Date: Tue, 19 Nov 2024 10:08:15 +0200 Subject: [PATCH 34/96] update readme file regarding a potential hanging mv --- README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.md b/README.md index d058f9c8..7855fbb2 100644 --- a/README.md +++ b/README.md @@ -260,6 +260,12 @@ select a,b,c from {{ source('raw', 'table_2') }} --mv2:end ``` +> IMPORTANT! +> +> When updating a model with multiple materialized views (MVs), especially when renaming one of the MV names, dbt-clickhouse does not automatically drop the old MV. Instead, +> you will encounter the following warning: `Warning - Table was detected with the same pattern as model name but was not found in this run. In case it is a renamed mv that was previously part of this model, drop it manually (!!!) 
` + + # Dictionary materializations (experimental) See the tests in https://github.com/ClickHouse/dbt-clickhouse/blob/main/tests/integration/adapter/dictionary/test_dictionary.py for examples of how to From f7e6db125db1abcfb594050570c3c4bb199162d1 Mon Sep 17 00:00:00 2001 From: bentsileviav Date: Tue, 19 Nov 2024 10:08:49 +0200 Subject: [PATCH 35/96] add tests to cover update-schema situation and full refresh --- .../test_materialized_view.py | 118 +++++++++++++++++- 1 file changed, 113 insertions(+), 5 deletions(-) diff --git a/tests/integration/adapter/materialized_view/test_materialized_view.py b/tests/integration/adapter/materialized_view/test_materialized_view.py index 1e9cc99f..89c48427 100644 --- a/tests/integration/adapter/materialized_view/test_materialized_view.py +++ b/tests/integration/adapter/materialized_view/test_materialized_view.py @@ -57,7 +57,18 @@ from {{ source('raw', 'people') }} where department = 'engineering' -{% elif var('run_type', '') == 'multiple_materialized_views' %} +{% endif %} +""" + +MULTIPLE_MV_MODEL = """ +{{ config( + materialized='materialized_view', + engine='MergeTree()', + order_by='(id)', + schema='custom_schema_for_multiple_mv', +) }} + +{% if var('run_type', '') == '' %} --mv1:begin select @@ -84,6 +95,35 @@ where department = 'sales' --mv2:end +{% elif var('run_type', '') == 'extended_schema' %} + +--mv1:begin +select + id, + name, + case + -- Dade wasn't always known as 'crash override'! + when name like 'Dade' and age = 11 then 'zero cool' + when name like 'Dade' and age != 11 then 'crash override' + when name like 'Kate' then 'acid burn' + else 'N/A' + end as hacker_alias +from {{ source('raw', 'people') }} +where department = 'engineering' +--mv1:end + +union all + +--mv2:begin +select + id, + name, + -- sales people are not cool enough to have a hacker alias + 'N/A' as hacker_alias +from {{ source('raw', 'people') }} +where department = 'sales' +--mv2:end + {% endif %} """ @@ -236,7 +276,7 @@ def seeds(self): @pytest.fixture(scope="class") def models(self): return { - "hackers.sql": MV_MODEL, + "hackers.sql": MULTIPLE_MV_MODEL, } def test_create(self, project): @@ -245,15 +285,14 @@ def test_create(self, project): 2. create a model as a materialized view, selecting from the table created in (1) 3. 
insert data into the base table and make sure it's there in the target table created in (2) """ - schema = quote_identifier(project.test_schema + "_custom_schema") + schema = quote_identifier(project.test_schema + "_custom_schema_for_multiple_mv") results = run_dbt(["seed"]) assert len(results) == 1 columns = project.run_sql("DESCRIBE TABLE people", fetch="all") assert columns[0][1] == "Int32" # create the model - run_vars = {"run_type": "multiple_materialized_views"} - run_dbt(["run", "--vars", json.dumps(run_vars)]) + run_dbt(["run"]) assert len(results) == 1 columns = project.run_sql(f"DESCRIBE TABLE {schema}.hackers", fetch="all") @@ -295,3 +334,72 @@ def test_create(self, project): (8888, 'Kate', 'acid burn'), (9999, 'Eugene', 'N/A'), ] + + +class TestUpdateMultipleMV: + @pytest.fixture(scope="class") + def seeds(self): + """ + we need a base table to pull from + """ + return { + "people.csv": PEOPLE_SEED_CSV, + "schema.yml": SEED_SCHEMA_YML, + } + + @pytest.fixture(scope="class") + def models(self): + return { + "hackers.sql": MULTIPLE_MV_MODEL, + } + + def test_update_incremental(self, project): + schema = quote_identifier(project.test_schema + "_custom_schema_for_multiple_mv") + # create our initial materialized view + run_dbt(["seed"]) + run_dbt() + + # re-run dbt but this time with the new MV SQL + run_vars = {"run_type": "extended_schema"} + run_dbt(["run", "--vars", json.dumps(run_vars)]) + + project.run_sql( + f""" + insert into {quote_identifier(project.test_schema)}.people ("id", "name", "age", "department") + values (1232,'Dade',11,'engineering'), (9999,'eugene',40,'malware'); + """ + ) + + # assert that we now have both of Dade's aliases in our hackers table + result = project.run_sql( + f"select distinct hacker_alias from {schema}.hackers where name = 'Dade' order by hacker_alias", fetch="all" + ) + assert len(result) == 2 + assert result[0][0] == "crash_override" + assert result[1][0] == "zero cool" + + def test_update_full_refresh(self, project): + schema = quote_identifier(project.test_schema + "_custom_schema_for_multiple_mv") + # create our initial materialized view + run_dbt(["seed"]) + run_dbt() + + # re-run dbt but this time with the new MV SQL + run_vars = {"run_type": "extended_schema"} + run_dbt(["run", "--full-refresh", "--vars", json.dumps(run_vars)]) + + project.run_sql( + f""" + insert into {quote_identifier(project.test_schema)}.people ("id", "name", "age", "department") + values (1232,'Dade',11,'engineering'), (9999,'eugene',40,'malware'); + """ + ) + + # assert that we now have both of Dade's aliases in our hackers table + result = project.run_sql( + f"select distinct hacker_alias from {schema}.hackers where name = 'Dade' order by hacker_alias", fetch="all" + ) + print(result) + assert len(result) == 2 + assert result[0][0] == "crash override" + assert result[1][0] == "zero cool" From 0b54910a6f55d368e6db891c6c365b96dce00f41 Mon Sep 17 00:00:00 2001 From: bentsileviav Date: Tue, 19 Nov 2024 10:22:54 +0200 Subject: [PATCH 36/96] fix lint --- .../adapter/materialized_view/test_materialized_view.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/integration/adapter/materialized_view/test_materialized_view.py b/tests/integration/adapter/materialized_view/test_materialized_view.py index 89c48427..9c9ffdb6 100644 --- a/tests/integration/adapter/materialized_view/test_materialized_view.py +++ b/tests/integration/adapter/materialized_view/test_materialized_view.py @@ -372,7 +372,8 @@ def test_update_incremental(self, project): # 
assert that we now have both of Dade's aliases in our hackers table result = project.run_sql( - f"select distinct hacker_alias from {schema}.hackers where name = 'Dade' order by hacker_alias", fetch="all" + f"select distinct hacker_alias from {schema}.hackers where name = 'Dade' order by hacker_alias", + fetch="all", ) assert len(result) == 2 assert result[0][0] == "crash_override" @@ -397,7 +398,8 @@ def test_update_full_refresh(self, project): # assert that we now have both of Dade's aliases in our hackers table result = project.run_sql( - f"select distinct hacker_alias from {schema}.hackers where name = 'Dade' order by hacker_alias", fetch="all" + f"select distinct hacker_alias from {schema}.hackers where name = 'Dade' order by hacker_alias", + fetch="all", ) print(result) assert len(result) == 2 From 8eabe427c692a38e9901299a7e3f15678bd8b832 Mon Sep 17 00:00:00 2001 From: bentsileviav Date: Tue, 19 Nov 2024 13:37:06 +0200 Subject: [PATCH 37/96] update CHANGELOG.md for version 1.8.4 --- CHANGELOG.md | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5b6727a6..44a16649 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,15 +1,19 @@ -### Unreleased -### Improvement -* Added support for [range_hashed](https://clickhouse.com/docs/en/sql-reference/dictionaries#range_hashed) and [complex_key_range_hashed](https://clickhouse.com/docs/en/sql-reference/dictionaries#complex_key_range_hashed) layouts to the dictionary materialization. ([#361](https://github.com/ClickHouse/dbt-clickhouse/pull/361)) -* Truncated stack trace for database errors for cleaner output when HIDE_STACK_TRACE variable is set to any value. +### Release [1.8.4], 2024-11-19 ### New Features * Added support for the creation of more than one materialized view inserting records into the same target table. ([#360](https://github.com/ClickHouse/dbt-clickhouse/pull/364)) -### Release [1.8.4], 2024-09-17 ### Improvement * The S3 help macro now support a `role_arn` parameter as an alternative way to provide authentication for S3 based models. Thanks to -[Mitchell Bregman](https://github.com/mitchbregs) for the contribution! +[Mitchell Bregman](https://github.com/mitchbregs) for the contribution! ([#356](https://github.com/ClickHouse/dbt-clickhouse/pull/356)) +* Added support for [range_hashed](https://clickhouse.com/docs/en/sql-reference/dictionaries#range_hashed) and [complex_key_range_hashed](https://clickhouse.com/docs/en/sql-reference/dictionaries#complex_key_range_hashed) layouts to the dictionary materialization. ([#361](https://github.com/ClickHouse/dbt-clickhouse/pull/361)) +* Truncated stack trace for database errors for cleaner output when HIDE_STACK_TRACE variable is set to any value. ([#382](https://github.com/ClickHouse/dbt-clickhouse/pull/382)) +* It is now possible to pass query settings not only on table creation but also on query. ([#362](https://github.com/ClickHouse/dbt-clickhouse/pull/362)) + +### Bug Fixes +* Before this version, `split_part` macro used to add an extra quotation. 
that was fixed in ([#338](https://github.com/ClickHouse/dbt-clickhouse/pull/338)) + + ### Release [1.8.3], 2024-09-01 ### Bug Fixes From 9ca5578c7599fb515f65a0fad32679c2934f90ca Mon Sep 17 00:00:00 2001 From: bentsileviav Date: Tue, 19 Nov 2024 13:49:31 +0200 Subject: [PATCH 38/96] update CHANGELOG.md for version 1.8.5 --- CHANGELOG.md | 9 +++++---- dbt/adapters/clickhouse/__version__.py | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 44a16649..0e23f02c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,11 +1,9 @@ -### Release [1.8.4], 2024-11-19 +### Release [1.8.5], 2024-11-19 ### New Features * Added support for the creation of more than one materialized view inserting records into the same target table. ([#360](https://github.com/ClickHouse/dbt-clickhouse/pull/364)) ### Improvement -* The S3 help macro now support a `role_arn` parameter as an alternative way to provide authentication for S3 based models. Thanks to -[Mitchell Bregman](https://github.com/mitchbregs) for the contribution! ([#356](https://github.com/ClickHouse/dbt-clickhouse/pull/356)) * Added support for [range_hashed](https://clickhouse.com/docs/en/sql-reference/dictionaries#range_hashed) and [complex_key_range_hashed](https://clickhouse.com/docs/en/sql-reference/dictionaries#complex_key_range_hashed) layouts to the dictionary materialization. ([#361](https://github.com/ClickHouse/dbt-clickhouse/pull/361)) * Truncated stack trace for database errors for cleaner output when HIDE_STACK_TRACE variable is set to any value. ([#382](https://github.com/ClickHouse/dbt-clickhouse/pull/382)) * It is now possible to pass query settings not only on table creation but also on query. ([#362](https://github.com/ClickHouse/dbt-clickhouse/pull/362)) @@ -13,7 +11,10 @@ ### Bug Fixes * Before this version, `split_part` macro used to add an extra quotation. that was fixed in ([#338](https://github.com/ClickHouse/dbt-clickhouse/pull/338)) - +### Release [1.8.4], 2024-09-17 +### Improvement +* The S3 help macro now support a `role_arn` parameter as an alternative way to provide authentication for S3 based models. Thanks to +[Mitchell Bregman](https://github.com/mitchbregs) for the contribution! ### Release [1.8.3], 2024-09-01 ### Bug Fixes diff --git a/dbt/adapters/clickhouse/__version__.py b/dbt/adapters/clickhouse/__version__.py index be6c9703..61aaff6b 100644 --- a/dbt/adapters/clickhouse/__version__.py +++ b/dbt/adapters/clickhouse/__version__.py @@ -1 +1 @@ -version = '1.8.4' +version = '1.8.5' From 26eb7726da05ec54bcf6726a65d60c56bd3efe66 Mon Sep 17 00:00:00 2001 From: the4thamigo-uk <7022874+the4thamigo-uk@users.noreply.github.com> Date: Thu, 21 Nov 2024 12:18:06 +0000 Subject: [PATCH 39/96] Prevent data loss when updating materialized view (#383) --- CHANGELOG.md | 5 ++ .../materializations/materialized_view.sql | 67 +++++++++++++------ 2 files changed, 53 insertions(+), 19 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0e23f02c..c0f584c5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,8 @@ +### Unreleased + +### Improvement +* Materialized view now attempts to use `ALTER TABLE...MODIFY QUERY` to update existing materialized views. This is an atomic operation so data is not lost. 
([#390](https://github.com/ClickHouse/dbt-clickhouse/pull/390)) + ### Release [1.8.5], 2024-11-19 ### New Features diff --git a/dbt/include/clickhouse/macros/materializations/materialized_view.sql b/dbt/include/clickhouse/macros/materializations/materialized_view.sql index 4583918b..e219ec0e 100644 --- a/dbt/include/clickhouse/macros/materializations/materialized_view.sql +++ b/dbt/include/clickhouse/macros/materializations/materialized_view.sql @@ -84,19 +84,25 @@ {% else %} {{ log('No existing mvs found matching the pattern. continuing..', info=True) }} {% endif %} - {{ clickhouse__drop_mvs(target_relation, cluster_clause, views) }} {% if should_full_refresh() %} + {{ clickhouse__drop_mvs(target_relation, cluster_clause, views) }} + {% call statement('main') -%} {{ get_create_table_as_sql(False, backup_relation, sql) }} {%- endcall %} {% do exchange_tables_atomic(backup_relation, existing_relation) %} + + {{ clickhouse__create_mvs(existing_relation, cluster_clause, views) }} {% else %} -- we need to have a 'main' statement {% call statement('main') -%} select 1 {%- endcall %} + + -- try to alter view first to replace sql, else drop and create + {{ clickhouse__update_mvs(target_relation, cluster_clause, views) }} + {% endif %} - {{ clickhouse__create_mvs(existing_relation, cluster_clause, views) }} {% else %} {{ log('Replacing existing materialized view ' + target_relation.name) }} {{ clickhouse__replace_mv(target_relation, existing_relation, intermediate_relation, backup_relation, sql, views) }} @@ -142,31 +148,54 @@ {%- endmacro %} {% macro clickhouse__drop_mv(mv_relation, cluster_clause) -%} + {% call statement('drop existing mv: ' + mv_relation.name) -%} drop view if exists {{ mv_relation }} {{ cluster_clause }} -{%- endmacro %}u + {% endcall %} +{%- endmacro %} -{% macro clickhouse__create_mv(mv_relation, target_table, cluster_clause, sql) -%} - create materialized view if not exists {{ mv_relation }} {{ cluster_clause }} - to {{ target_table }} - as {{ sql }} +{% macro clickhouse__create_mv(mv_relation, target_relation, cluster_clause, view_sql) -%} + {% call statement('create existing mv: ' + mv_relation.name) -%} + create materialized view if not exists {{ mv_relation }} {{ cluster_clause }} + to {{ target_relation }} + as {{ view_sql }} + {% endcall %} +{%- endmacro %} + +{% macro clickhouse__modify_mv(mv_relation, cluster_clause, view_sql) -%} + {% call statement('modify existing mv: ' + mv_relation.name) -%} + alter table {{ mv_relation }} {{ cluster_clause }} modify query {{ view_sql }} + {% endcall %} +{%- endmacro %} + +{% macro clickhouse__update_mv(mv_relation, target_relation, cluster_clause, view_sql) -%} + {% set existing_relation = adapter.get_relation(database=mv_relation.database, schema=mv_relation.schema, identifier=mv_relation.identifier) %} + {% if existing_relation %} + {{ clickhouse__modify_mv(mv_relation, cluster_clause, view_sql) }}; + {% else %} + {{ clickhouse__drop_mv(mv_relation, cluster_clause) }}; + {{ clickhouse__create_mv(mv_relation, target_relation, cluster_clause, view_sql) }}; + {% endif %} {%- endmacro %} {% macro clickhouse__drop_mvs(target_relation, cluster_clause, views) -%} - {% for view in views.keys() %} - {%- set mv_relation = target_relation.derivative('_' + view, 'materialized_view') -%} - {% call statement('drop existing mv: ' + view) -%} - {{ clickhouse__drop_mv(mv_relation, cluster_clause) }}; - {% endcall %} - {% endfor %} + {% for view in views.keys() %} + {%- set mv_relation = target_relation.derivative('_' + view, 
'materialized_view') -%} + {{ clickhouse__drop_mv(mv_relation, cluster_clause) }}; + {% endfor %} {%- endmacro %} {% macro clickhouse__create_mvs(target_relation, cluster_clause, views) -%} - {% for view, view_sql in views.items() %} - {%- set mv_relation = target_relation.derivative('_' + view, 'materialized_view') -%} - {% call statement('create existing mv: ' + view) -%} - {{ clickhouse__create_mv(mv_relation, target_relation, cluster_clause, view_sql) }}; - {% endcall %} - {% endfor %} + {% for view, view_sql in views.items() %} + {%- set mv_relation = target_relation.derivative('_' + view, 'materialized_view') -%} + {{ clickhouse__create_mv(mv_relation, target_relation, cluster_clause, view_sql) }}; + {% endfor %} +{%- endmacro %} + +{% macro clickhouse__update_mvs(target_relation, cluster_clause, views) -%} + {% for view, view_sql in views.items() %} + {%- set mv_relation = target_relation.derivative('_' + view, 'materialized_view') -%} + {{ clickhouse__update_mv(mv_relation, target_relation, cluster_clause, view_sql) }}; + {% endfor %} {%- endmacro %} {% macro clickhouse__replace_mv(target_relation, existing_relation, intermediate_relation, backup_relation, sql, views) %} From 2a7a1381b9550ade2c2933ad1fed252e4c15794c Mon Sep 17 00:00:00 2001 From: the4thamigo-uk <7022874+the4thamigo-uk@users.noreply.github.com> Date: Tue, 26 Nov 2024 16:53:39 +0000 Subject: [PATCH 40/96] Use `CREATE OR REPLACE DICTIONARY` to avoid data loss (#392) --- CHANGELOG.md | 7 ++++++ .../macros/materializations/dictionary.sql | 24 +++---------------- 2 files changed, 10 insertions(+), 21 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0e23f02c..41668f42 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,10 @@ +### Unreleased + +### Improvement + +Avoid potential data loss by using `CREATE OR REPLACE DICTIONARY` to atomically update a dictionary (#393) + + ### Release [1.8.5], 2024-11-19 ### New Features diff --git a/dbt/include/clickhouse/macros/materializations/dictionary.sql b/dbt/include/clickhouse/macros/materializations/dictionary.sql index 80fd7982..a0a94c42 100644 --- a/dbt/include/clickhouse/macros/materializations/dictionary.sql +++ b/dbt/include/clickhouse/macros/materializations/dictionary.sql @@ -2,38 +2,21 @@ {%- set existing_relation = load_cached_relation(this) -%} {%- set target_relation = this.incorporate(type='dictionary') -%} - {%- set intermediate_relation = make_intermediate_relation(target_relation) -%} - {%- set existing_intermediate_relation = load_cached_relation(intermediate_relation) -%} - {%- set backup_relation_type = 'dictionary' if existing_relation is none else existing_relation.type -%} - {%- set backup_relation = make_backup_relation(target_relation, backup_relation_type) -%} - {%- set existing_backup_relation = load_cached_relation(backup_relation) -%} {%- set cluster_clause = on_cluster_clause(target_relation) -%} {%- set grant_config = config.get('grants') -%} {{ run_hooks(pre_hooks, inside_transaction=False) }} - {{ drop_dictionary_if_exists(existing_backup_relation, cluster_clause) }} - {{ drop_dictionary_if_exists(existing_intermediate_relation, cluster_clause) }} - {{ run_hooks(pre_hooks, inside_transaction=True) }} {# create our new dictionary #} {% call statement('main') -%} - {{ clickhouse__get_create_dictionary_as_sql(intermediate_relation, cluster_clause, sql) }} + {{ clickhouse__get_create_dictionary_as_sql(target_relation, cluster_clause, sql) }} {%- endcall %} - {# cleanup #} - {% if existing_relation is not none %} - {% set existing_relation 
= load_cached_relation(existing_relation) %} - {% if existing_relation is not none %} - {{ adapter.rename_relation(existing_relation, backup_relation) }} - {% endif %} - {% endif %} - {{ adapter.rename_relation(intermediate_relation, target_relation) }} - - {% set should_revoke = should_revoke(existing_relation, full_refresh_mode=True) %} + {% set should_revoke = should_revoke(target_relation, full_refresh_mode=True) %} {% do apply_grants(target_relation, grant_config, should_revoke=should_revoke) %} {% do persist_docs(target_relation, model) %} @@ -42,7 +25,6 @@ {{ adapter.commit() }} - {{ drop_dictionary_if_exists(backup_relation, cluster_clause) }} {{ run_hooks(post_hooks, inside_transaction=False) }} @@ -55,7 +37,7 @@ {%- set fields = config.get('fields') -%} {%- set source_type = config.get('source_type') -%} - CREATE DICTIONARY {{ relation }} {{ cluster_clause }} + CREATE OR REPLACE DICTIONARY {{ relation }} {{ cluster_clause }} ( {%- for (name, data_type) in fields -%} {{ name }} {{ data_type }}{%- if not loop.last -%},{%- endif -%} From 94e9242983f45364c424c9ef4c5278a839e929d9 Mon Sep 17 00:00:00 2001 From: "f.abapolov" Date: Thu, 28 Nov 2024 13:54:14 +0300 Subject: [PATCH 41/96] 1) Support of insert_overwrite on cluster 2) Test for insert_overwrite with "incremental" and RepicatedMergeTree 3) Test for insert_overwrite with "distributed_incremental" 4) Changed naming of tests to actual 5) Refactored old test to be more determinate --- .../incremental/distributed_incremental.sql | 11 ++- .../incremental/incremental.sql | 65 ++++++++++------- .../incremental/test_base_incremental.py | 69 +++++++++++++++++-- .../test_distributed_incremental.py | 57 +++++++++++++++ 4 files changed, 170 insertions(+), 32 deletions(-) diff --git a/dbt/include/clickhouse/macros/materializations/incremental/distributed_incremental.sql b/dbt/include/clickhouse/macros/materializations/incremental/distributed_incremental.sql index ef31a76c..0ed4dec9 100644 --- a/dbt/include/clickhouse/macros/materializations/incremental/distributed_incremental.sql +++ b/dbt/include/clickhouse/macros/materializations/incremental/distributed_incremental.sql @@ -65,7 +65,7 @@ {% do adapter.drop_relation(distributed_intermediate_relation) or '' %} {% set need_swap = true %} - {% elif inserts_only or unique_key is none -%} + {% elif inserts_only -%} -- There are no updates/deletes or duplicate keys are allowed. Simply add all of the new rows to the existing -- table. It is the user's responsibility to avoid duplicates. Note that "inserts_only" is a ClickHouse adapter -- specific configurable that is used to avoid creating an expensive intermediate table. @@ -91,6 +91,15 @@ {% set need_swap = true %} {% elif incremental_strategy == 'delete_insert' %} {% do clickhouse__incremental_delete_insert(existing_relation, unique_key, incremental_predicates, True) %} + {% elif incremental_strategy == 'insert_overwrite' %} + {%- set partition_by = config.get('partition_by') -%} + {% if partition_by is none or partition_by|length == 0 %} + {% do exceptions.raise_compiler_error(incremental_strategy + ' strategy requires nonempty partition_by. 
Current partition_by is ' ~ partition_by) %} + {% endif %} + {% if inserts_only or unique_key is not none or incremental_predicates is not none %} + {% do exceptions.raise_compiler_error(incremental_strategy + ' strategy does not support inserts_only, unique_key, and incremental predicates.') %} + {% endif %} + {% do clickhouse__incremental_insert_overwrite(existing_relation, partition_by, True) %} {% elif incremental_strategy == 'append' %} {% call statement('main') %} {{ clickhouse__insert_into(target_relation, sql) }} diff --git a/dbt/include/clickhouse/macros/materializations/incremental/incremental.sql b/dbt/include/clickhouse/macros/materializations/incremental/incremental.sql index c4dcb1b8..cd25e9db 100644 --- a/dbt/include/clickhouse/macros/materializations/incremental/incremental.sql +++ b/dbt/include/clickhouse/macros/materializations/incremental/incremental.sql @@ -76,7 +76,7 @@ {% call statement('main') %} {{ clickhouse__insert_into(target_relation, sql) }} {% endcall %} - {% elif incremental_strategy == 'insert_overwrite' %}#} + {% elif incremental_strategy == 'insert_overwrite' %} {%- set partition_by = config.get('partition_by') -%} {% if partition_by is none or partition_by|length == 0 %} {% do exceptions.raise_compiler_error(incremental_strategy + ' strategy requires nonempty partition_by. Current partition_by is ' ~ partition_by) %} @@ -84,7 +84,7 @@ {% if inserts_only or unique_key is not none or incremental_predicates is not none %} {% do exceptions.raise_compiler_error(incremental_strategy + ' strategy does not support inserts_only, unique_key, and incremental predicates.') %} {% endif %} - {% do clickhouse__incremental_insert_overwrite(existing_relation, intermediate_relation, partition_by) %} %} + {% do clickhouse__incremental_insert_overwrite(existing_relation, partition_by, False) %} {% endif %} {% endif %} @@ -246,41 +246,58 @@ {{ drop_relation_if_exists(distributed_new_data_relation) }} {% endmacro %} -{% macro clickhouse__incremental_insert_overwrite(existing_relation, intermediate_relation, partition_by) %} - {% set new_data_relation = existing_relation.incorporate(path={"identifier": model['name'] +{% macro clickhouse__incremental_insert_overwrite(existing_relation, partition_by, is_distributed=False) %} + {% set new_data_relation = existing_relation.incorporate(path={"identifier": existing_relation.identifier + '__dbt_new_data_' + invocation_id.replace('-', '_')}) %} {{ drop_relation_if_exists(new_data_relation) }} - {% call statement('create_new_data_temp') -%} - {{ get_create_table_as_sql(False, new_data_relation, sql) }} - {%- endcall %} - {% call statement('main') -%} - create table {{ intermediate_relation }} as {{ existing_relation }} - {%- endcall %} - {% call statement('insert_new_data') -%} - insert into {{ intermediate_relation }} select * from {{ new_data_relation }} - {%- endcall %} + {%- set distributed_new_data_relation = existing_relation.incorporate(path={"identifier": existing_relation.identifier + '__dbt_distributed_new_data'}) -%} + + + {%- set local_suffix = adapter.get_clickhouse_local_suffix() -%} + {%- set local_db_prefix = adapter.get_clickhouse_local_db_prefix() -%} + {% set existing_local = existing_relation.incorporate(path={"identifier": this.identifier + local_suffix, "schema": local_db_prefix + this.schema}) if existing_relation is not none else none %} + + {% if is_distributed %} + {{ create_distributed_local_table(distributed_new_data_relation, new_data_relation, existing_relation, sql) }} + {% else %} + {% call 
statement('main') %} + {{ get_create_table_as_sql(False, new_data_relation, sql) }} + {% endcall %} + {% endif %} + + {# Get the parts from the cluster table, since the partitions between shards may not overlap due to distribution #} {% if execute %} {% set select_changed_partitions %} SELECT DISTINCT partition_id - FROM system.parts + {% if is_distributed %} + FROM cluster({{ adapter.get_clickhouse_cluster_name() }}, system.parts) + {% else %} + FROM system.parts + {% endif %} WHERE active - AND database = '{{ intermediate_relation.schema }}' - AND table = '{{ intermediate_relation.identifier }}' + AND database = '{{ new_data_relation.schema }}' + AND table = '{{ new_data_relation.identifier }}' {% endset %} {% set changed_partitions = run_query(select_changed_partitions).rows %} {% else %} {% set changed_partitions = [] %} {% endif %} + {% if changed_partitions %} - {% call statement('replace_partitions') %} - alter table {{ existing_relation }} - {%- for partition in changed_partitions %} - replace partition id '{{ partition['partition_id'] }}' - from {{ intermediate_relation }} - {{- ', ' if not loop.last }} - {%- endfor %} + {% call statement('replace_partitions') %} + {% if is_distributed %} + alter table {{ existing_local }} {{ on_cluster_clause(existing_relation) }} + {% else %} + alter table {{ existing_relation }} + {% endif %} + {%- for partition in changed_partitions %} + replace partition id '{{ partition['partition_id'] }}' + from {{ new_data_relation }} + {{- ', ' if not loop.last }} + {%- endfor %} {% endcall %} {% endif %} - {% do adapter.drop_relation(intermediate_relation) %} + + {% do adapter.drop_relation(distributed_new_data_relation) %} {% do adapter.drop_relation(new_data_relation) %} {% endmacro %} diff --git a/tests/integration/adapter/incremental/test_base_incremental.py b/tests/integration/adapter/incremental/test_base_incremental.py index 0c522df7..a31cbb05 100644 --- a/tests/integration/adapter/incremental/test_base_incremental.py +++ b/tests/integration/adapter/incremental/test_base_incremental.py @@ -195,7 +195,7 @@ def models(self): SELECT partitionKey1, partitionKey2, orderKey, value FROM VALUES( 'partitionKey1 UInt8, partitionKey2 String, orderKey UInt8, value String', - (1, 'p1', 1, 'a'), (1, 'p1', 1, 'b'), (2, 'p1', 1, 'c'), (2, 'p2', 1, 'd') + (1, 'p1', 1, 'a'), (1, 'p1', 2, 'b'), (2, 'p1', 3, 'c'), (2, 'p2', 4, 'd') ) {% else %} SELECT partitionKey1, partitionKey2, orderKey, value @@ -207,7 +207,7 @@ def models(self): """ -class TestInsertReplaceIncremental: +class TestInsertOverwriteIncremental: @pytest.fixture(scope="class") def models(self): return {"insert_overwrite_inc.sql": insert_overwrite_inc} @@ -220,9 +220,9 @@ def test_insert_overwrite_incremental(self, project): ) assert result == [ (1, 'p1', 1, 'a'), - (1, 'p1', 1, 'b'), - (2, 'p1', 1, 'c'), - (2, 'p2', 1, 'd'), + (1, 'p1', 2, 'b'), + (2, 'p1', 3, 'c'), + (2, 'p2', 4, 'd'), ] run_dbt() result = project.run_sql( @@ -231,7 +231,62 @@ def test_insert_overwrite_incremental(self, project): ) assert result == [ (1, 'p1', 2, 'e'), - (2, 'p1', 1, 'c'), - (2, 'p2', 1, 'd'), + (2, 'p1', 3, 'c'), + (2, 'p2', 4, 'd'), + (3, 'p1', 2, 'f'), + ] + +# "ReplicatedMergeTree('/clickhouse/tables/{shard}/{database}/{table}/{uuid}/', '{replica}')" +insert_overwrite_replicated_inc = """ +{{ config( + materialized='incremental', + incremental_strategy='insert_overwrite', + partition_by=['partitionKey1', 'partitionKey2'], + order_by=['orderKey'], + 
engine="ReplicatedMergeTree('/clickhouse/tables/{uuid}/one_shard', '{server_index}')" + ) +}} +{% if not is_incremental() %} + SELECT partitionKey1, partitionKey2, orderKey, value + FROM VALUES( + 'partitionKey1 UInt8, partitionKey2 String, orderKey UInt8, value String', + (1, 'p1', 1, 'a'), (1, 'p1', 2, 'b'), (2, 'p1', 3, 'c'), (2, 'p2', 4, 'd') + ) +{% else %} + SELECT partitionKey1, partitionKey2, orderKey, value + FROM VALUES( + 'partitionKey1 UInt8, partitionKey2 String, orderKey UInt8, value String', + (1, 'p1', 2, 'e'), (3, 'p1', 2, 'f') + ) +{% endif %} +""" + + +class TestInsertOverwriteReplicatedIncremental: + @pytest.fixture(scope="class") + def models(self): + return {"insert_overwrite_replicated_inc.sql": insert_overwrite_replicated_inc} + + def test_insert_overwrite_replicated_incremental(self, project): + run_dbt() + result = project.run_sql( + "select * from insert_overwrite_replicated_inc order by partitionKey1, partitionKey2, orderKey", + fetch="all", + ) + assert result == [ + (1, 'p1', 1, 'a'), + (1, 'p1', 2, 'b'), + (2, 'p1', 3, 'c'), + (2, 'p2', 4, 'd'), + ] + run_dbt() + result = project.run_sql( + "select * from insert_overwrite_replicated_inc order by partitionKey1, partitionKey2, orderKey", + fetch="all", + ) + assert result == [ + (1, 'p1', 2, 'e'), + (2, 'p1', 3, 'c'), + (2, 'p2', 4, 'd'), (3, 'p1', 2, 'f'), ] diff --git a/tests/integration/adapter/incremental/test_distributed_incremental.py b/tests/integration/adapter/incremental/test_distributed_incremental.py index f132933d..e704dc4a 100644 --- a/tests/integration/adapter/incremental/test_distributed_incremental.py +++ b/tests/integration/adapter/incremental/test_distributed_incremental.py @@ -203,3 +203,60 @@ def models(self): ) def test_incremental_not_schema_change(self, project): super().test_incremental_not_schema_change(project) + + +insert_overwrite_dist_inc = """ +{{ config( + materialized='distributed_incremental', + incremental_strategy='insert_overwrite', + partition_by=['partitionKey'], + order_by=['orderKey'], + sharding_key='shardingKey' + ) +}} +{% if not is_incremental() %} + SELECT shardingKey, partitionKey, orderKey, value + FROM VALUES( + 'shardingKey UInt8, partitionKey String, orderKey UInt8, value String', + (1, 'p1', 1, 'a'), (1, 'p1', 2, 'b'), (2, 'p1', 3, 'c'), (2, 'p2', 4, 'd') + ) +{% else %} + SELECT shardingKey, partitionKey, orderKey, value + FROM VALUES( + 'shardingKey UInt8, partitionKey String, orderKey UInt8, value String', + (1, 'p1', 2, 'e'), (3, 'p1', 2, 'f') + ) +{% endif %} +""" + + +class TestInsertOverwriteDistributedIncremental: + @pytest.fixture(scope="class") + def models(self): + return {"insert_overwrite_dist_inc.sql": insert_overwrite_dist_inc} + + @pytest.mark.skipif( + os.environ.get('DBT_CH_TEST_CLUSTER', '').strip() == '', reason='Not on a cluster' + ) + def test_insert_overwrite_distributed_incremental(self, project): + run_dbt() + result = project.run_sql( + "select * from insert_overwrite_dist_inc order by shardingKey, partitionKey, orderKey", + fetch="all", + ) + assert result == [ + (1, 'p1', 1, 'a'), + (1, 'p1', 2, 'b'), + (2, 'p1', 3, 'c'), + (2, 'p2', 4, 'd'), + ] + run_dbt() + result = project.run_sql( + "select * from insert_overwrite_dist_inc order by shardingKey, partitionKey, orderKey", + fetch="all", + ) + assert result == [ + (1, 'p1', 2, 'e'), + (2, 'p2', 4, 'd'), + (3, 'p1', 2, 'f'), + ] From 89e865e8b7b3355d17fef1eb02a286736fb12227 Mon Sep 17 00:00:00 2001 From: the4thamigo-uk <7022874+the4thamigo-uk@users.noreply.github.com> Date: Fri, 29 
Nov 2024 11:52:22 +0000
Subject: [PATCH 42/96] Protect against no columns returned in materialized-view (#395)

---
 .../clickhouse/macros/materializations/materialized_view.sql | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dbt/include/clickhouse/macros/materializations/materialized_view.sql b/dbt/include/clickhouse/macros/materializations/materialized_view.sql
index 4583918b..452a3439 100644
--- a/dbt/include/clickhouse/macros/materializations/materialized_view.sql
+++ b/dbt/include/clickhouse/macros/materializations/materialized_view.sql
@@ -68,7 +68,7 @@
     {% endset %}
     {% set tables_result = run_query(tables_query) %}
-    {% if tables_result is not none %}
+    {% if tables_result is not none and tables_result.columns %}
       {% set tables = tables_result.columns[0].values() %}
       {{ log('Current mvs found in ClickHouse are: ' + tables | join(', ')) }}
       {% set mv_names = [] %}

From c262aeb4a10ce97e30032532ad874b849ae6b1d9 Mon Sep 17 00:00:00 2001
From: "f.abapolov"
Date: Mon, 2 Dec 2024 11:25:30 +0300
Subject: [PATCH 43/96] CHANGELOG.md updated

---
 CHANGELOG.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0e23f02c..4cfd196b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,8 @@
 * Added support for [range_hashed](https://clickhouse.com/docs/en/sql-reference/dictionaries#range_hashed) and [complex_key_range_hashed](https://clickhouse.com/docs/en/sql-reference/dictionaries#complex_key_range_hashed) layouts to the dictionary materialization. ([#361](https://github.com/ClickHouse/dbt-clickhouse/pull/361))
 * Truncated stack trace for database errors for cleaner output when HIDE_STACK_TRACE variable is set to any value. ([#382](https://github.com/ClickHouse/dbt-clickhouse/pull/382))
 * It is now possible to pass query settings not only on table creation but also on query. ([#362](https://github.com/ClickHouse/dbt-clickhouse/pull/362))
+* Added support for the `insert_overwrite` strategy in cluster setups with the `incremental` and `distributed_incremental` materializations (see the sketch below). ([#394](https://github.com/ClickHouse/dbt-clickhouse/pull/394))
+
 ### Bug Fixes
 * Before this version, the `split_part` macro used to add an extra quotation. That was fixed in ([#338](https://github.com/ClickHouse/dbt-clickhouse/pull/338))
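For the `insert_overwrite` entry above, a qualifying model looks roughly like the following; the column names are hypothetical, but the configuration keys match those used by the adapter's integration tests:

```sql
{{ config(
    materialized='distributed_incremental',
    incremental_strategy='insert_overwrite',
    partition_by=['event_date'],
    order_by=['event_date', 'event_id'],
    sharding_key='event_id'
) }}

select event_date, event_id, payload
from {{ source('raw', 'events') }}
```

On each incremental run, only the partitions present in the new data are replaced on every shard; untouched partitions keep their existing rows. Note that the strategy requires a non-empty `partition_by` and rejects `unique_key`, `inserts_only`, and `incremental_predicates`.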
From 29f2551780dc524d0880cb536efc6290e96307cd Mon Sep 17 00:00:00 2001
From: Stephen Nancekivell
Date: Tue, 3 Dec 2024 14:50:24 +1100
Subject: [PATCH 44/96] create indexes

---
 .../clickhouse/macros/materializations/table.sql | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/dbt/include/clickhouse/macros/materializations/table.sql b/dbt/include/clickhouse/macros/materializations/table.sql
index a7601552..faedba24 100644
--- a/dbt/include/clickhouse/macros/materializations/table.sql
+++ b/dbt/include/clickhouse/macros/materializations/table.sql
@@ -150,7 +150,9 @@
     {% if config.get('projections')%}
       {{ projection_statement(relation) }}
     {% endif %}
-
+    {% if config.get('indexes') %}
+      {{ indexes_statement(relation) }}
+    {% endif %}
     {{ clickhouse__insert_into(relation, sql, has_contract) }}
   {%- endif %}

@@ -169,6 +171,16 @@
   {%- endfor %}
 {%- endmacro %}

+{% macro indexes_statement(relation) %}
+  {%- set indexes = config.get('indexes', default=[]) -%}
+
+  {%- for index in indexes %}
+    {% call statement('add_indexes') %}
+      ALTER TABLE {{ relation }} ADD INDEX {{ index.get('name') }} {{ index.get('definition') }}
+    {% endcall %}
+  {%- endfor %}
+{%- endmacro %}
+
 {% macro create_table_or_empty(temporary, relation, sql, has_contract) -%}
   {%- set sql_header = config.get('sql_header', none) -%}

From 5848fcceaed646551abd2458e1ad8f1190ac086a Mon Sep 17 00:00:00 2001
From: Stephen Nancekivell
Date: Tue, 3 Dec 2024 14:53:48 +1100
Subject: [PATCH 45/96] update changelog

---
 CHANGELOG.md | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0e23f02c..e2d18e94 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,18 @@
+### Next Version
+
+#### New Features
+* [ClickHouse indexes](https://clickhouse.com/docs/en/optimize/sparse-primary-indexes) are now fully supported for the `table` materialization.
+The index config should be added to the model config. For instance:
+  ```python
+  {{ config(
+        materialized='table',
+        indexes=[{
+          'name': 'your_index_name',
+          'definition': 'your_column TYPE minmax GRANULARITY 2'
+        }]
+  ) }}
+  ```
+
 ### Release [1.8.5], 2024-11-19
 ### New Features

From 2575f77a9055511fdc96405cf4689797bc1deb3e Mon Sep 17 00:00:00 2001
From: bentsileviav
Date: Thu, 5 Dec 2024 17:13:24 +0700
Subject: [PATCH 46/96] make mv data catchup configurable

---
 .../macros/materializations/materialized_view.sql | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/dbt/include/clickhouse/macros/materializations/materialized_view.sql b/dbt/include/clickhouse/macros/materializations/materialized_view.sql
index 4583918b..8cb6ea04 100644
--- a/dbt/include/clickhouse/macros/materializations/materialized_view.sql
+++ b/dbt/include/clickhouse/macros/materializations/materialized_view.sql
@@ -50,7 +50,8 @@
   {% if backup_relation is none %}
     {{ log('Creating new materialized view ' + target_relation.name )}}
-    {{ clickhouse__get_create_materialized_view_as_sql(target_relation, sql, views) }}
+    {% set catchup_data = config.get("catchup", True) %}
+    {{ clickhouse__get_create_materialized_view_as_sql(target_relation, sql, views, catchup_data) }}
   {% elif existing_relation.can_exchange %}
     {{ log('Replacing existing materialized view ' + target_relation.name) }}
     -- in this section, we look for mvs that has the same pattern as this model, but for some reason,
@@ -132,9 +133,15 @@
  2. 
Create a materialized view using the SQL in the model that inserts data into the table creating during step 1 #} -{% macro clickhouse__get_create_materialized_view_as_sql(relation, sql, views) -%} +{% macro clickhouse__get_create_materialized_view_as_sql(relation, sql, views, catchup=True ) -%} {% call statement('main') %} + {% if catchup == True %} {{ get_create_table_as_sql(False, relation, sql) }} + {% else %} + {{ log('Catchup data config was set to false, skipping mv-target-table initial insertion ')}} + {% set has_contract = config.get('contract').enforced %} + {{ create_table_or_empty(False, relation, sql, has_contract) }} + {% endif %} {% endcall %} {%- set cluster_clause = on_cluster_clause(relation) -%} {%- set mv_relation = relation.derivative('_mv', 'materialized_view') -%} From 501e0a29dfba73eca8f81160390c8ba4f274db44 Mon Sep 17 00:00:00 2001 From: bentsileviav Date: Thu, 5 Dec 2024 17:13:54 +0700 Subject: [PATCH 47/96] split mv test file + add catchup test --- .../test_materialized_view.py | 215 ++------------- .../test_multiple_materialized_views.py | 247 ++++++++++++++++++ 2 files changed, 271 insertions(+), 191 deletions(-) create mode 100644 tests/integration/adapter/materialized_view/test_multiple_materialized_views.py diff --git a/tests/integration/adapter/materialized_view/test_materialized_view.py b/tests/integration/adapter/materialized_view/test_materialized_view.py index 9c9ffdb6..f4723998 100644 --- a/tests/integration/adapter/materialized_view/test_materialized_view.py +++ b/tests/integration/adapter/materialized_view/test_materialized_view.py @@ -28,10 +28,11 @@ materialized='materialized_view', engine='MergeTree()', order_by='(id)', - schema='custom_schema', + schema='catchup' if var('run_type', '') == 'catchup' else 'custom_schema', + **({'catchup': False} if var('run_type', '') == 'catchup' else {}) ) }} -{% if var('run_type', '') == '' %} +{% if var('run_type', '') in ['', 'catchup'] %} select id, name, @@ -60,74 +61,6 @@ {% endif %} """ -MULTIPLE_MV_MODEL = """ -{{ config( - materialized='materialized_view', - engine='MergeTree()', - order_by='(id)', - schema='custom_schema_for_multiple_mv', -) }} - -{% if var('run_type', '') == '' %} - ---mv1:begin -select - id, - name, - case - when name like 'Dade' then 'crash_override' - when name like 'Kate' then 'acid burn' - else 'N/A' - end as hacker_alias -from {{ source('raw', 'people') }} -where department = 'engineering' ---mv1:end - -union all - ---mv2:begin -select - id, - name, - -- sales people are not cool enough to have a hacker alias - 'N/A' as hacker_alias -from {{ source('raw', 'people') }} -where department = 'sales' ---mv2:end - -{% elif var('run_type', '') == 'extended_schema' %} - ---mv1:begin -select - id, - name, - case - -- Dade wasn't always known as 'crash override'! 
- when name like 'Dade' and age = 11 then 'zero cool' - when name like 'Dade' and age != 11 then 'crash override' - when name like 'Kate' then 'acid burn' - else 'N/A' - end as hacker_alias -from {{ source('raw', 'people') }} -where department = 'engineering' ---mv1:end - -union all - ---mv2:begin -select - id, - name, - -- sales people are not cool enough to have a hacker alias - 'N/A' as hacker_alias -from {{ source('raw', 'people') }} -where department = 'sales' ---mv2:end - -{% endif %} -""" - - SEED_SCHEMA_YML = """ version: 2 @@ -197,116 +130,30 @@ def test_create(self, project): result = project.run_sql(f"select count(*) from {schema}.hackers", fetch="all") assert result[0][0] == 4 - -class TestUpdateMV: - @pytest.fixture(scope="class") - def seeds(self): - """ - we need a base table to pull from - """ - return { - "people.csv": PEOPLE_SEED_CSV, - "schema.yml": SEED_SCHEMA_YML, - } - - @pytest.fixture(scope="class") - def models(self): - return { - "hackers.sql": MV_MODEL, - } - - def test_update_incremental(self, project): - schema = quote_identifier(project.test_schema + "_custom_schema") - # create our initial materialized view - run_dbt(["seed"]) - run_dbt() - - # re-run dbt but this time with the new MV SQL - run_vars = {"run_type": "extended_schema"} - run_dbt(["run", "--vars", json.dumps(run_vars)]) - - project.run_sql( - f""" - insert into {quote_identifier(project.test_schema)}.people ("id", "name", "age", "department") - values (1232,'Dade',11,'engineering'), (9999,'eugene',40,'malware'); - """ - ) - - # assert that we now have both of Dade's aliases in our hackers table - result = project.run_sql( - f"select distinct hacker_alias from {schema}.hackers where name = 'Dade'", fetch="all" - ) - assert len(result) == 2 - - def test_update_full_refresh(self, project): - schema = quote_identifier(project.test_schema + "_custom_schema") - # create our initial materialized view - run_dbt(["seed"]) - run_dbt() - - # re-run dbt but this time with the new MV SQL - run_vars = {"run_type": "extended_schema"} - run_dbt(["run", "--full-refresh", "--vars", json.dumps(run_vars)]) - - project.run_sql( - f""" - insert into {quote_identifier(project.test_schema)}.people ("id", "name", "age", "department") - values (1232,'Dade',11,'engineering'), (9999,'eugene',40,'malware'); - """ - ) - - # assert that we now have both of Dade's aliases in our hackers table - result = project.run_sql( - f"select distinct hacker_alias from {schema}.hackers where name = 'Dade'", fetch="all" - ) - assert len(result) == 2 - - -class TestMultipleMV: - @pytest.fixture(scope="class") - def seeds(self): - """ - we need a base table to pull from - """ - return { - "people.csv": PEOPLE_SEED_CSV, - "schema.yml": SEED_SCHEMA_YML, - } - - @pytest.fixture(scope="class") - def models(self): - return { - "hackers.sql": MULTIPLE_MV_MODEL, - } - - def test_create(self, project): + def test_disabled_catchup(self, project): """ 1. create a base table via dbt seed - 2. create a model as a materialized view, selecting from the table created in (1) + 2. create a model with catchup disabled as a materialized view, selecting from the table created in (1) 3. 
insert data into the base table and make sure it's there in the target table created in (2) """ - schema = quote_identifier(project.test_schema + "_custom_schema_for_multiple_mv") + schema = quote_identifier(project.test_schema + "_catchup") results = run_dbt(["seed"]) assert len(results) == 1 columns = project.run_sql("DESCRIBE TABLE people", fetch="all") assert columns[0][1] == "Int32" - # create the model - run_dbt(["run"]) + # create the model with catchup disabled + run_vars = {"run_type": "catchup"} + run_dbt(["run", "--vars", json.dumps(run_vars)]) + # check that we only have the new row, without the historical data assert len(results) == 1 columns = project.run_sql(f"DESCRIBE TABLE {schema}.hackers", fetch="all") assert columns[0][1] == "Int32" - columns = project.run_sql(f"DESCRIBE {schema}.hackers_mv1", fetch="all") - assert columns[0][1] == "Int32" - - columns = project.run_sql(f"DESCRIBE {schema}.hackers_mv2", fetch="all") + columns = project.run_sql(f"DESCRIBE {schema}.hackers_mv", fetch="all") assert columns[0][1] == "Int32" - with pytest.raises(Exception): - columns = project.run_sql(f"DESCRIBE {schema}.hackers_mv", fetch="all") - check_relation_types( project.adapter, { @@ -318,25 +165,16 @@ def test_create(self, project): # insert some data and make sure it reaches the target table project.run_sql( f""" - insert into {quote_identifier(project.test_schema)}.people ("id", "name", "age", "department") - values (4000,'Dave',40,'sales'), (9999,'Eugene',40,'engineering'); - """ + insert into {quote_identifier(project.test_schema)}.people ("id", "name", "age", "department") + values (1232,'Dade',16,'engineering'), (9999,'eugene',40,'malware'); + """ ) - result = project.run_sql(f"select * from {schema}.hackers order by id", fetch="all") - assert result == [ - (1000, 'Alfie', 'N/A'), - (1231, 'Dade', 'crash_override'), - (2000, 'Bill', 'N/A'), - (3000, 'Charlie', 'N/A'), - (4000, 'Dave', 'N/A'), - (6666, 'Ksenia', 'N/A'), - (8888, 'Kate', 'acid burn'), - (9999, 'Eugene', 'N/A'), - ] + result = project.run_sql(f"select count(*) from {schema}.hackers", fetch="all") + assert result[0][0] == 1 -class TestUpdateMultipleMV: +class TestUpdateMV: @pytest.fixture(scope="class") def seeds(self): """ @@ -350,11 +188,11 @@ def seeds(self): @pytest.fixture(scope="class") def models(self): return { - "hackers.sql": MULTIPLE_MV_MODEL, + "hackers.sql": MV_MODEL, } def test_update_incremental(self, project): - schema = quote_identifier(project.test_schema + "_custom_schema_for_multiple_mv") + schema = quote_identifier(project.test_schema + "_custom_schema") # create our initial materialized view run_dbt(["seed"]) run_dbt() @@ -372,15 +210,12 @@ def test_update_incremental(self, project): # assert that we now have both of Dade's aliases in our hackers table result = project.run_sql( - f"select distinct hacker_alias from {schema}.hackers where name = 'Dade' order by hacker_alias", - fetch="all", + f"select distinct hacker_alias from {schema}.hackers where name = 'Dade'", fetch="all" ) assert len(result) == 2 - assert result[0][0] == "crash_override" - assert result[1][0] == "zero cool" def test_update_full_refresh(self, project): - schema = quote_identifier(project.test_schema + "_custom_schema_for_multiple_mv") + schema = quote_identifier(project.test_schema + "_custom_schema") # create our initial materialized view run_dbt(["seed"]) run_dbt() @@ -398,10 +233,8 @@ def test_update_full_refresh(self, project): # assert that we now have both of Dade's aliases in our hackers table result = 
project.run_sql( - f"select distinct hacker_alias from {schema}.hackers where name = 'Dade' order by hacker_alias", - fetch="all", + f"select distinct hacker_alias from {schema}.hackers where name = 'Dade'", fetch="all" ) - print(result) assert len(result) == 2 - assert result[0][0] == "crash override" - assert result[1][0] == "zero cool" + + diff --git a/tests/integration/adapter/materialized_view/test_multiple_materialized_views.py b/tests/integration/adapter/materialized_view/test_multiple_materialized_views.py new file mode 100644 index 00000000..5db81d7f --- /dev/null +++ b/tests/integration/adapter/materialized_view/test_multiple_materialized_views.py @@ -0,0 +1,247 @@ +""" +test materialized view creation. This is ClickHouse specific, which has a significantly different implementation +of materialized views from PostgreSQL or Oracle +""" + +import json + +import pytest +from dbt.tests.util import check_relation_types, run_dbt + +from dbt.adapters.clickhouse.query import quote_identifier + +PEOPLE_SEED_CSV = """ +id,name,age,department +1231,Dade,33,engineering +6666,Ksenia,48,engineering +8888,Kate,50,engineering +1000,Alfie,10,sales +2000,Bill,20,sales +3000,Charlie,30,sales +""".lstrip() + +# This model is parameterized, in a way, by the "run_type" dbt project variable +# This is to be able to switch between different model definitions within +# the same test run and allow us to test the evolution of a materialized view + +MULTIPLE_MV_MODEL = """ +{{ config( + materialized='materialized_view', + engine='MergeTree()', + order_by='(id)', + schema='custom_schema_for_multiple_mv', +) }} + +{% if var('run_type', '') == '' %} + +--mv1:begin +select + id, + name, + case + when name like 'Dade' then 'crash_override' + when name like 'Kate' then 'acid burn' + else 'N/A' + end as hacker_alias +from {{ source('raw', 'people') }} +where department = 'engineering' +--mv1:end + +union all + +--mv2:begin +select + id, + name, + -- sales people are not cool enough to have a hacker alias + 'N/A' as hacker_alias +from {{ source('raw', 'people') }} +where department = 'sales' +--mv2:end + +{% elif var('run_type', '') == 'extended_schema' %} + +--mv1:begin +select + id, + name, + case + -- Dade wasn't always known as 'crash override'! + when name like 'Dade' and age = 11 then 'zero cool' + when name like 'Dade' and age != 11 then 'crash override' + when name like 'Kate' then 'acid burn' + else 'N/A' + end as hacker_alias +from {{ source('raw', 'people') }} +where department = 'engineering' +--mv1:end + +union all + +--mv2:begin +select + id, + name, + -- sales people are not cool enough to have a hacker alias + 'N/A' as hacker_alias +from {{ source('raw', 'people') }} +where department = 'sales' +--mv2:end + +{% endif %} +""" + + +SEED_SCHEMA_YML = """ +version: 2 + +sources: + - name: raw + schema: "{{ target.schema }}" + tables: + - name: people +""" + +class TestMultipleMV: + @pytest.fixture(scope="class") + def seeds(self): + """ + we need a base table to pull from + """ + return { + "people.csv": PEOPLE_SEED_CSV, + "schema.yml": SEED_SCHEMA_YML, + } + + @pytest.fixture(scope="class") + def models(self): + return { + "hackers.sql": MULTIPLE_MV_MODEL, + } + + def test_create(self, project): + """ + 1. create a base table via dbt seed + 2. create a model as a materialized view, selecting from the table created in (1) + 3. 
insert data into the base table and make sure it's there in the target table created in (2) + """ + schema = quote_identifier(project.test_schema + "_custom_schema_for_multiple_mv") + results = run_dbt(["seed"]) + assert len(results) == 1 + columns = project.run_sql("DESCRIBE TABLE people", fetch="all") + assert columns[0][1] == "Int32" + + # create the model + run_dbt(["run"]) + assert len(results) == 1 + + columns = project.run_sql(f"DESCRIBE TABLE {schema}.hackers", fetch="all") + assert columns[0][1] == "Int32" + + columns = project.run_sql(f"DESCRIBE {schema}.hackers_mv1", fetch="all") + assert columns[0][1] == "Int32" + + columns = project.run_sql(f"DESCRIBE {schema}.hackers_mv2", fetch="all") + assert columns[0][1] == "Int32" + + with pytest.raises(Exception): + columns = project.run_sql(f"DESCRIBE {schema}.hackers_mv", fetch="all") + + check_relation_types( + project.adapter, + { + "hackers_mv": "view", + "hackers": "table", + }, + ) + + # insert some data and make sure it reaches the target table + project.run_sql( + f""" + insert into {quote_identifier(project.test_schema)}.people ("id", "name", "age", "department") + values (4000,'Dave',40,'sales'), (9999,'Eugene',40,'engineering'); + """ + ) + + result = project.run_sql(f"select * from {schema}.hackers order by id", fetch="all") + assert result == [ + (1000, 'Alfie', 'N/A'), + (1231, 'Dade', 'crash_override'), + (2000, 'Bill', 'N/A'), + (3000, 'Charlie', 'N/A'), + (4000, 'Dave', 'N/A'), + (6666, 'Ksenia', 'N/A'), + (8888, 'Kate', 'acid burn'), + (9999, 'Eugene', 'N/A'), + ] + + +class TestUpdateMultipleMV: + @pytest.fixture(scope="class") + def seeds(self): + """ + we need a base table to pull from + """ + return { + "people.csv": PEOPLE_SEED_CSV, + "schema.yml": SEED_SCHEMA_YML, + } + + @pytest.fixture(scope="class") + def models(self): + return { + "hackers.sql": MULTIPLE_MV_MODEL, + } + + def test_update_incremental(self, project): + schema = quote_identifier(project.test_schema + "_custom_schema_for_multiple_mv") + # create our initial materialized view + run_dbt(["seed"]) + run_dbt() + + # re-run dbt but this time with the new MV SQL + run_vars = {"run_type": "extended_schema"} + run_dbt(["run", "--vars", json.dumps(run_vars)]) + + project.run_sql( + f""" + insert into {quote_identifier(project.test_schema)}.people ("id", "name", "age", "department") + values (1232,'Dade',11,'engineering'), (9999,'eugene',40,'malware'); + """ + ) + + # assert that we now have both of Dade's aliases in our hackers table + result = project.run_sql( + f"select distinct hacker_alias from {schema}.hackers where name = 'Dade' order by hacker_alias", + fetch="all", + ) + assert len(result) == 2 + assert result[0][0] == "crash_override" + assert result[1][0] == "zero cool" + + def test_update_full_refresh(self, project): + schema = quote_identifier(project.test_schema + "_custom_schema_for_multiple_mv") + # create our initial materialized view + run_dbt(["seed"]) + run_dbt() + + # re-run dbt but this time with the new MV SQL + run_vars = {"run_type": "extended_schema"} + run_dbt(["run", "--full-refresh", "--vars", json.dumps(run_vars)]) + + project.run_sql( + f""" + insert into {quote_identifier(project.test_schema)}.people ("id", "name", "age", "department") + values (1232,'Dade',11,'engineering'), (9999,'eugene',40,'malware'); + """ + ) + + # assert that we now have both of Dade's aliases in our hackers table + result = project.run_sql( + f"select distinct hacker_alias from {schema}.hackers where name = 'Dade' order by hacker_alias", + 
fetch="all", + ) + print(result) + assert len(result) == 2 + assert result[0][0] == "crash override" + assert result[1][0] == "zero cool" \ No newline at end of file From 69693eb3c2baaea887a56114973f272a0b943524 Mon Sep 17 00:00:00 2001 From: bentsileviav Date: Thu, 5 Dec 2024 17:22:06 +0700 Subject: [PATCH 48/96] fix lint --- .../adapter/materialized_view/test_materialized_view.py | 2 -- .../materialized_view/test_multiple_materialized_views.py | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/integration/adapter/materialized_view/test_materialized_view.py b/tests/integration/adapter/materialized_view/test_materialized_view.py index f4723998..b8cb8214 100644 --- a/tests/integration/adapter/materialized_view/test_materialized_view.py +++ b/tests/integration/adapter/materialized_view/test_materialized_view.py @@ -236,5 +236,3 @@ def test_update_full_refresh(self, project): f"select distinct hacker_alias from {schema}.hackers where name = 'Dade'", fetch="all" ) assert len(result) == 2 - - diff --git a/tests/integration/adapter/materialized_view/test_multiple_materialized_views.py b/tests/integration/adapter/materialized_view/test_multiple_materialized_views.py index 5db81d7f..f28d1784 100644 --- a/tests/integration/adapter/materialized_view/test_multiple_materialized_views.py +++ b/tests/integration/adapter/materialized_view/test_multiple_materialized_views.py @@ -244,4 +244,4 @@ def test_update_full_refresh(self, project): print(result) assert len(result) == 2 assert result[0][0] == "crash override" - assert result[1][0] == "zero cool" \ No newline at end of file + assert result[1][0] == "zero cool" From 4cbf8cbbd1621e863e7fe4ed0f377e5b96147bc4 Mon Sep 17 00:00:00 2001 From: bentsileviav Date: Thu, 5 Dec 2024 17:32:06 +0700 Subject: [PATCH 49/96] fix lint --- .../materialized_view/test_multiple_materialized_views.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/integration/adapter/materialized_view/test_multiple_materialized_views.py b/tests/integration/adapter/materialized_view/test_multiple_materialized_views.py index f28d1784..9a2d9850 100644 --- a/tests/integration/adapter/materialized_view/test_multiple_materialized_views.py +++ b/tests/integration/adapter/materialized_view/test_multiple_materialized_views.py @@ -102,6 +102,7 @@ - name: people """ + class TestMultipleMV: @pytest.fixture(scope="class") def seeds(self): From d31591cdf90a627609bf26832a06cb9fc85fbe66 Mon Sep 17 00:00:00 2001 From: bentsileviav Date: Thu, 5 Dec 2024 17:48:34 +0700 Subject: [PATCH 50/96] update readme file --- README.md | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 7855fbb2..eeb8d11e 100644 --- a/README.md +++ b/README.md @@ -264,7 +264,22 @@ select a,b,c from {{ source('raw', 'table_2') }} > > When updating a model with multiple materialized views (MVs), especially when renaming one of the MV names, dbt-clickhouse does not automatically drop the old MV. Instead, > you will encounter the following warning: `Warning - Table was detected with the same pattern as model name but was not found in this run. In case it is a renamed mv that was previously part of this model, drop it manually (!!!) ` - + +## Data catchup +Currently, when creating a materialized view (MV), the target table is first populated with historical data before the MV itself is created. + +In other words, dbt-clickhouse initially creates the target table and preloads it with historical data based on the query defined for the MV. 
Only after this step is the MV created.
+
+If you prefer not to preload historical data during MV creation, you can disable this behavior by setting the `catchup` config to `False`:
+
+```python
+{{config(
+    materialized='materialized_view',
+    engine='MergeTree()',
+    order_by='(id)',
+    catchup=False
+)}}
+```

 # Dictionary materializations (experimental)

From e2ab9184231a71ad14c29392b442491d32bb30d5 Mon Sep 17 00:00:00 2001
From: bentsileviav
Date: Thu, 5 Dec 2024 17:48:58 +0700
Subject: [PATCH 51/96] version 1.8.6 housekeeping

---
 CHANGELOG.md                           | 5 +++++
 dbt/adapters/clickhouse/__version__.py | 2 +-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0e23f02c..b99ab58e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,8 @@
+### Release [1.8.6], 2024-12-05
+
+### Improvement
+* Previously, on MV model creation, the target table was always populated with historical data based on the query provided in the MV definition. This catchup mechanism is now controlled by a config flag and remains enabled by default, preserving the existing behavior. ([#399](https://github.com/ClickHouse/dbt-clickhouse/pull/399))
+
 ### Release [1.8.5], 2024-11-19
 ### New Features

diff --git a/dbt/adapters/clickhouse/__version__.py b/dbt/adapters/clickhouse/__version__.py
index 61aaff6b..f22f38bf 100644
--- a/dbt/adapters/clickhouse/__version__.py
+++ b/dbt/adapters/clickhouse/__version__.py
@@ -1 +1 @@
-version = '1.8.5'
+version = '1.8.6'

From a8c79867c0a0f905d7bc4e1919604470db4450f0 Mon Sep 17 00:00:00 2001
From: Bentsi Leviav
Date: Sun, 15 Dec 2024 15:14:18 +0200
Subject: [PATCH 52/96] Revert "Add list support to `primary_key`"

---
 CHANGELOG.md                                       |  2 --
 .../clickhouse/macros/materializations/table.sql   | 14 +++-----------
 .../integration/adapter/basic/test_incremental.py  |  2 +-
 .../adapter/clickhouse/test_clickhouse_s3.py       |  1 -
 .../adapter/incremental/test_base_incremental.py   |  7 +------
 .../incremental/test_distributed_incremental.py    |  8 ++------
 .../adapter/incremental/test_schema_change.py      |  3 ---
 7 files changed, 7 insertions(+), 30 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2f0ff70c..2a775c05 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -16,8 +16,6 @@
 * Added support for the creation of more than one materialized view inserting records into the same target table. ([#360](https://github.com/ClickHouse/dbt-clickhouse/pull/364))
 ### Improvement
-
-* Enhance the `primary_key` macro to accept a list of columns, allowing for primary keys with multiple columns. ([#337](https://github.com/ClickHouse/dbt-clickhouse/pull/337))
 * Added support for [range_hashed](https://clickhouse.com/docs/en/sql-reference/dictionaries#range_hashed) and [complex_key_range_hashed](https://clickhouse.com/docs/en/sql-reference/dictionaries#complex_key_range_hashed) layouts to the dictionary materialization. ([#361](https://github.com/ClickHouse/dbt-clickhouse/pull/361))
 * Truncated stack trace for database errors for cleaner output when HIDE_STACK_TRACE variable is set to any value. ([#382](https://github.com/ClickHouse/dbt-clickhouse/pull/382))
 * It is now possible to pass query settings not only on table creation but also on query. 
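To make the query-settings entry above concrete, here is a sketch of a model config; it assumes the adapter's `settings` and `query_settings` model configs (the former applied at `CREATE TABLE` time, the latter to the insert queries), and the specific setting names are illustrative only:

```sql
{{ config(
    materialized='table',
    order_by='(id)',
    settings={'allow_nullable_key': 1},
    query_settings={'max_insert_threads': 4}
) }}

select id, name
from {{ source('raw', 'people') }}
```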
([#362](https://github.com/ClickHouse/dbt-clickhouse/pull/362)) diff --git a/dbt/include/clickhouse/macros/materializations/table.sql b/dbt/include/clickhouse/macros/materializations/table.sql index feb0130c..a7601552 100644 --- a/dbt/include/clickhouse/macros/materializations/table.sql +++ b/dbt/include/clickhouse/macros/materializations/table.sql @@ -86,18 +86,10 @@ {%- endmacro -%} {% macro primary_key_clause(label) %} - {%- set cols = config.get('primary_key', validator=validation.any[list, basestring]) -%} + {%- set primary_key = config.get('primary_key', validator=validation.any[basestring]) -%} - {%- if cols is not none %} - {%- if cols is string -%} - {%- set cols = [cols] -%} - {%- endif -%} - {{ label }} ( - {%- for item in cols -%} - {{ item }} - {%- if not loop.last -%},{%- endif -%} - {%- endfor -%} - ) + {%- if primary_key is not none %} + {{ label }} {{ primary_key }} {%- endif %} {%- endmacro -%} diff --git a/tests/integration/adapter/basic/test_incremental.py b/tests/integration/adapter/basic/test_incremental.py index 4e3df690..c50d477a 100644 --- a/tests/integration/adapter/basic/test_incremental.py +++ b/tests/integration/adapter/basic/test_incremental.py @@ -7,7 +7,7 @@ class TestIncremental(BaseIncremental): incremental_not_schema_change_sql = """ -{{ config(materialized="incremental", primary_key="user_id_current_time", unique_key="user_id_current_time",on_schema_change="append_new_columns") }} +{{ config(materialized="incremental", unique_key="user_id_current_time",on_schema_change="append_new_columns") }} select toString(1) || '-' || toString(now64()) as user_id_current_time, {% if is_incremental() %} diff --git a/tests/integration/adapter/clickhouse/test_clickhouse_s3.py b/tests/integration/adapter/clickhouse/test_clickhouse_s3.py index 25d70c50..10f1289e 100644 --- a/tests/integration/adapter/clickhouse/test_clickhouse_s3.py +++ b/tests/integration/adapter/clickhouse/test_clickhouse_s3.py @@ -36,7 +36,6 @@ materialized='incremental', order_by='pickup_datetime', incremental_strategy='delete+insert', - primary_key='trip_id', unique_key='trip_id', taxi_s3={"structure":['trip_id UInt32', 'pickup_datetime DateTime', 'passenger_count UInt8']} ) diff --git a/tests/integration/adapter/incremental/test_base_incremental.py b/tests/integration/adapter/incremental/test_base_incremental.py index a949d96f..0c522df7 100644 --- a/tests/integration/adapter/incremental/test_base_incremental.py +++ b/tests/integration/adapter/incremental/test_base_incremental.py @@ -20,7 +20,6 @@ materialized='table', engine='MergeTree()', order_by=['ts'], - primary_key=['impid'], unique_key=['impid'] ) }} @@ -34,7 +33,6 @@ materialized='incremental', engine='MergeTree()', order_by=['ts'], - primary_key=['impid'], unique_key=['impid'], settings={'allow_nullable_key':'1'} ) @@ -64,7 +62,6 @@ def test_simple_incremental(self, project): {{ config( materialized='incremental', order_by=['key1'], - primary_key='key1', unique_key='key1', incremental_strategy='delete+insert', settings={'allow_nullable_key':1} @@ -100,7 +97,6 @@ def test_lw_delete(self, project): {{ config( materialized='incremental', order_by=['key1'], - primary_key='key1', unique_key='key1', incremental_strategy='legacy', settings={'allow_nullable_key':1} @@ -144,7 +140,6 @@ def test_legacy(self, project): {{ config( materialized='incremental', order_by=['key1', 'key2'], - primary_key=['key1', 'key2'], unique_key='key1, key2', incremental_strategy='delete+insert' ) @@ -179,7 +174,7 @@ class 
TestInsertsOnlyIncrementalMaterialization(BaseIncremental): @pytest.fixture(scope="class") def models(self): config_materialized_incremental = """ - {{ config(order_by='(some_date, id, name)', inserts_only=True, materialized='incremental', primary_key='id', unique_key='id') }} + {{ config(order_by='(some_date, id, name)', inserts_only=True, materialized='incremental', unique_key='id') }} """ incremental_sql = config_materialized_incremental + model_incremental return { diff --git a/tests/integration/adapter/incremental/test_distributed_incremental.py b/tests/integration/adapter/incremental/test_distributed_incremental.py index 9c74b239..f132933d 100644 --- a/tests/integration/adapter/incremental/test_distributed_incremental.py +++ b/tests/integration/adapter/incremental/test_distributed_incremental.py @@ -17,7 +17,6 @@ materialized='distributed_table', engine='MergeTree()', order_by=['ts'], - primary_key=['impid'], unique_key=['impid'] ) }} @@ -30,7 +29,6 @@ config( materialized='distributed_incremental', engine='MergeTree()', - primary_key=['impid'], order_by=['ts'], unique_key=['impid'] ) @@ -71,7 +69,6 @@ def test_simple_incremental(self, project): {{ config( materialized='distributed_incremental', order_by=['key1'], - primary_key='key1', unique_key='key1', incremental_strategy='delete+insert' ) @@ -114,7 +111,6 @@ def test_lw_delete(self, project): {{ config( materialized='distributed_incremental', order_by=['key1', 'key2'], - primary_key=['key1', 'key2'], unique_key='key1, key2', incremental_strategy='delete+insert' ) @@ -162,7 +158,7 @@ class TestInsertsOnlyDistributedIncrementalMaterialization(BaseIncremental): @pytest.fixture(scope="class") def models(self): config_materialized_incremental = """ - {{ config(order_by='(some_date, id, name)', inserts_only=True, materialized='distributed_incremental', primary_key='id', unique_key='id') }} + {{ config(order_by='(some_date, id, name)', inserts_only=True, materialized='distributed_incremental', unique_key='id') }} """ incremental_sql = config_materialized_incremental + model_incremental return { @@ -186,7 +182,7 @@ def test_incremental(self, project): incremental_not_schema_change_sql = """ -{{ config(materialized="distributed_incremental", primary_key="user_id_current_time", unique_key="user_id_current_time", on_schema_change="sync_all_columns") }} +{{ config(materialized="distributed_incremental", unique_key="user_id_current_time",on_schema_change="sync_all_columns") }} select toString(1) || '-' || toString(now64()) as user_id_current_time, {% if is_incremental() %} diff --git a/tests/integration/adapter/incremental/test_schema_change.py b/tests/integration/adapter/incremental/test_schema_change.py index 9a8871b0..e3efcb5f 100644 --- a/tests/integration/adapter/incremental/test_schema_change.py +++ b/tests/integration/adapter/incremental/test_schema_change.py @@ -8,7 +8,6 @@ {{ config( materialized='%s', - primary_key='col_1', unique_key='col_1', on_schema_change='%s' ) @@ -102,7 +101,6 @@ def test_append(self, project, model): {{ config( materialized='%s', - primary_key='col_1', unique_key='col_1', on_schema_change='%s' ) @@ -191,7 +189,6 @@ def test_sync(self, project, model): {{ config( materialized='%s', - primary_key='col_1', unique_key='col_1', on_schema_change='fail' ) From 86ad380d399fb31868cccbcf7b30347b391cda76 Mon Sep 17 00:00:00 2001 From: bentsileviav Date: Thu, 26 Dec 2024 14:07:17 +0200 Subject: [PATCH 53/96] support refreshable materialized views --- .../materializations/materialized_view.sql | 120 +++++++++++++++++- 1 
file changed, 114 insertions(+), 6 deletions(-) diff --git a/dbt/include/clickhouse/macros/materializations/materialized_view.sql b/dbt/include/clickhouse/macros/materializations/materialized_view.sql index 854ee1c5..417e8247 100644 --- a/dbt/include/clickhouse/macros/materializations/materialized_view.sql +++ b/dbt/include/clickhouse/macros/materializations/materialized_view.sql @@ -7,6 +7,8 @@ {%- set target_relation = this.incorporate(type='table') -%} {%- set cluster_clause = on_cluster_clause(target_relation) -%} + {%- set refreshable_clause = refreshable_mv_clause() -%} + {# look for an existing relation for the target table and create backup relations if necessary #} {%- set existing_relation = load_cached_relation(this) -%} @@ -97,7 +99,7 @@ select 1 {%- endcall %} {% endif %} - {{ clickhouse__create_mvs(existing_relation, cluster_clause, views) }} + {{ clickhouse__create_mvs(existing_relation, cluster_clause, refreshable_clause, views) }} {% else %} {{ log('Replacing existing materialized view ' + target_relation.name) }} {{ clickhouse__replace_mv(target_relation, existing_relation, intermediate_relation, backup_relation, sql, views) }} @@ -144,16 +146,18 @@ {% endif %} {% endcall %} {%- set cluster_clause = on_cluster_clause(relation) -%} + {%- set refreshable_clause = refreshable_mv_clause() -%} {%- set mv_relation = relation.derivative('_mv', 'materialized_view') -%} - {{ clickhouse__create_mvs(relation, cluster_clause, views) }} + {{ clickhouse__create_mvs(relation, cluster_clause, refreshable_clause, views) }} {%- endmacro %} {% macro clickhouse__drop_mv(mv_relation, cluster_clause) -%} drop view if exists {{ mv_relation }} {{ cluster_clause }} {%- endmacro %}u -{% macro clickhouse__create_mv(mv_relation, target_table, cluster_clause, sql) -%} +{% macro clickhouse__create_mv(mv_relation, target_table, cluster_clause, refreshable_clause, sql) -%} create materialized view if not exists {{ mv_relation }} {{ cluster_clause }} + {{ refreshable_clause }} to {{ target_table }} as {{ sql }} {%- endmacro %} @@ -167,11 +171,11 @@ {% endfor %} {%- endmacro %} -{% macro clickhouse__create_mvs(target_relation, cluster_clause, views) -%} +{% macro clickhouse__create_mvs(target_relation, cluster_clause, refreshable_clause, views) -%} {% for view, view_sql in views.items() %} {%- set mv_relation = target_relation.derivative('_' + view, 'materialized_view') -%} {% call statement('create existing mv: ' + view) -%} - {{ clickhouse__create_mv(mv_relation, target_relation, cluster_clause, view_sql) }}; + {{ clickhouse__create_mv(mv_relation, target_relation, cluster_clause, refreshable_clause, view_sql) }}; {% endcall %} {% endfor %} {%- endmacro %} @@ -179,6 +183,7 @@ {% macro clickhouse__replace_mv(target_relation, existing_relation, intermediate_relation, backup_relation, sql, views) %} {# drop existing materialized view while we recreate the target table #} {%- set cluster_clause = on_cluster_clause(target_relation) -%} + {%- set refreshable_clause = refreshable_mv_clause() -%} {{ clickhouse__drop_mvs(target_relation, cluster_clause, views) }} {# recreate the target table #} @@ -189,6 +194,109 @@ {{ adapter.rename_relation(intermediate_relation, target_relation) }} {# now that the target table is recreated, we can finally create our new view #} - {{ clickhouse__create_mvs(target_relation, cluster_clause, views) }} + {{ clickhouse__create_mvs(target_relation, cluster_clause, refreshable_clause, views) }} {% endmacro %} +{% macro refreshable_mv_clause() %} + {%- if config.get('refreshable') is 
not none -%} + + {% set refreshable_config = config.get('refreshable') %} + {% if refreshable_config is not mapping %} + {% do exceptions.raise_compiler_error( + "The 'refreshable' configuration must be defined as a dictionary. Please review the docs for more details." + ) %} + {% endif %} + + {% set refresh_interval = refreshable_config.get('interval', none) %} + {% set refresh_randomize = refreshable_config.get('randomize', none) %} + {% set depends_on = refreshable_config.get('depends_on', none) %} + {% set depends_on_validation = refreshable_config.get('depends_on_validation', true) %} + {% set append = refreshable_config.get('append', false) %} + + {% if not refresh_interval %} + {% do exceptions.raise_compiler_error( + "The 'refreshable' configuration is defined, but 'interval' is missing. " + ~ "This is required to create a refreshable materialized view." + ) %} + {% endif %} + + {% if refresh_interval %} + REFRESH {{ refresh_interval }} + {%- if refresh_randomize -%} + RANDOMIZE FOR {{ refresh_randomize }} + {%- endif -%} + {% endif %} + + {% if depends_on %} + {% set depends_on_list = [] %} + + {% if depends_on is string %} + {% set depends_on_list = [depends_on] %} + {% elif depends_on is iterable %} + {% set temp_list = depends_on_list %} + {%- for dep in depends_on %} + {% if dep is string %} + {% do temp_list.append(dep) %} + {% else %} + {% do exceptions.raise_compiler_error( + "The 'depends_on' configuration must be either a string or a list of strings." + ) %} + {% endif %} + {% endfor %} + {% set depends_on_list = temp_list %} + {% else %} + {% do exceptions.raise_compiler_error( + "The 'depends_on' configuration must be either a string or a list of strings." + ) %} + {% endif %} + + {% if depends_on_validation and depends_on_list | length > 0 %} + {%- for dep in depends_on_list %} + {% do validate_refreshable_mv_existence(dep) %} + {%- endfor %} + {% endif %} + + DEPENDS ON {{ depends_on_list | join(', ') }} + {% endif %} + + {%- if append -%} + APPEND + {%- endif -%} + + {%- endif -%} +{% endmacro %} + + +{% macro validate_refreshable_mv_existence(mv) %} + {{ log(mv + ' was recognized as a refreshable mv dependency, checking its existence') }} + {% set default_database = "default" %} + + {%- set parts = mv.split('.') %} + {%- if parts | length == 2 %} + {%- set database = parts[0] %} + {%- set table = parts[1] %} + {%- else %} + {%- set database = default_database %} + {%- set table = parts[0] %} + {%- endif %} + + {%- set condition = "database='" + database + "' and view='" + table + "'" %} + + {% set query %} + select database, view + from system.view_refreshes + where {{ condition }} + {% endset %} + + {% set tables_result = run_query(query) %} + {% if tables_result is not none and tables_result.columns %} + {{ log('MV ' + mv + ' exists.') }} + {% else %} + {% do exceptions.raise_compiler_error( + 'No existing MV found matching MV: ' + mv + ) %} + {% endif %} +{% endmacro %} + + + From c43e2a3014dec549b9c71dfab681d13d8731a24e Mon Sep 17 00:00:00 2001 From: lkhagvadorj-amp Date: Fri, 27 Dec 2024 10:24:01 +0000 Subject: [PATCH 54/96] fix: remove python 3.8 from various places, #385 --- setup.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 43b2151e..b156c6a6 100644 --- a/setup.py +++ b/setup.py @@ -59,7 +59,7 @@ def _dbt_clickhouse_version(): 'clickhouse-driver>=0.2.6', 'setuptools>=0.69', ], - python_requires=">=3.8", + python_requires=">=3.9", platforms='any', classifiers=[ 'Development Status :: 5 - Production/Stable', 
@@ -67,7 +67,6 @@ def _dbt_clickhouse_version(): 'Operating System :: Microsoft :: Windows', 'Operating System :: MacOS :: MacOS X', 'Operating System :: POSIX :: Linux', - 'Programming Language :: Python :: 3.8', 'Programming Language :: Python :: 3.9', 'Programming Language :: Python :: 3.10', 'Programming Language :: Python :: 3.11', From 24a9714335ed5d3a95e77abab57a97bfd8829184 Mon Sep 17 00:00:00 2001 From: bentsileviav Date: Sun, 29 Dec 2024 13:16:43 +0200 Subject: [PATCH 55/96] Add basic refreshable mv tests --- .../test_refreshable_materialized_view.py | 122 ++++++++++++++++++ 1 file changed, 122 insertions(+) create mode 100644 tests/integration/adapter/materialized_view/test_refreshable_materialized_view.py diff --git a/tests/integration/adapter/materialized_view/test_refreshable_materialized_view.py b/tests/integration/adapter/materialized_view/test_refreshable_materialized_view.py new file mode 100644 index 00000000..c0221688 --- /dev/null +++ b/tests/integration/adapter/materialized_view/test_refreshable_materialized_view.py @@ -0,0 +1,122 @@ +""" +test refreshable materialized view creation. This is ClickHouse specific, which has a significantly different implementation +of materialized views from PostgreSQL or Oracle +""" + +import json + +import pytest +from dbt.tests.util import check_relation_types, run_dbt + +PEOPLE_SEED_CSV = """ +id,name,age,department +1231,Dade,33,engineering +6666,Ksenia,48,engineering +8888,Kate,50,engineering +1000,Alfie,10,sales +2000,Bill,20,sales +3000,Charlie,30,sales +""".lstrip() + +# This model is parameterized, in a way, by the "run_type" dbt project variable +# This is to be able to switch between different model definitions within +# the same test run and allow us to test the evolution of a materialized view +MV_MODEL = """ +{{ config( + materialized='materialized_view', + engine='MergeTree()', + order_by='(department)', + refreshable=( + { + "interval": "EVERY 2 MINUTE", + "depends_on": ['depend_on_model'], + "depends_on_validation": True + } if var('run_type', '') == 'validate_depends_on' else { + "interval": "EVERY 2 MINUTE" + } + ) + ) + }} +select + department, + avg(age) as average + from {{ source('raw', 'people') }} +group by department +""" + +SEED_SCHEMA_YML = """ +version: 2 + +sources: + - name: raw + schema: "{{ target.schema }}" + tables: + - name: people +""" + + +class TestBasicRefreshableMV: + @pytest.fixture(scope="class") + def seeds(self): + """ + we need a base table to pull from + """ + return { + "people.csv": PEOPLE_SEED_CSV, + "schema.yml": SEED_SCHEMA_YML, + } + + @pytest.fixture(scope="class") + def models(self): + return { + "hackers.sql": MV_MODEL, + } + + def test_create(self, project): + """ + 1. create a base table via dbt seed + 2. create a model as a refreshable materialized view, selecting from the table created in (1) + 3. 
check in system.view_refreshes for the table existence + """ + results = run_dbt(["seed"]) + assert len(results) == 1 + columns = project.run_sql(f"DESCRIBE TABLE {project.test_schema}.people", fetch="all") + assert columns[0][1] == "Int32" + + # create the model + results = run_dbt() + assert len(results) == 1 + + columns = project.run_sql(f"DESCRIBE TABLE hackers", fetch="all") + assert columns[0][1] == "String" + + columns = project.run_sql(f"DESCRIBE hackers_mv", fetch="all") + assert columns[0][1] == "String" + + check_relation_types( + project.adapter, + { + "hackers_mv": "view", + "hackers": "table", + }, + ) + + result = project.run_sql(f"select database, view, status from system.view_refreshes where database= '{project.test_schema}' and view='hackers_mv'", fetch="all") + assert result[0][2] == 'Scheduled' + + def test_validate_dependency(self, project): + """ + 1. create a base table via dbt seed + 2. create a refreshable mv model with non exist dependency and validation config, selecting from the table created in (1) + 3. make sure we get an error + """ + results = run_dbt(["seed"]) + assert len(results) == 1 + columns = project.run_sql(f"DESCRIBE TABLE {project.test_schema}.people", fetch="all") + assert columns[0][1] == "Int32" + + # re-run dbt but this time with the new MV SQL + run_vars = {"run_type": "validate_depends_on"} + result = run_dbt(["run", "--vars", json.dumps(run_vars)], False) + assert result[0].status == 'error' + assert 'No existing MV found matching MV' in result[0].message From 125da936baf07f96b2d2bbce79644208cc9ad8f5 Mon Sep 17 00:00:00 2001 From: bentsileviav Date: Sun, 29 Dec 2024 13:51:54 +0200 Subject: [PATCH 56/96] Add docs in README.md + indentation --- README.md | 355 ++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 249 insertions(+), 106 deletions(-) diff --git a/README.md b/README.md index eeb8d11e..830af44f 100644 --- a/README.md +++ b/README.md @@ -6,18 +6,21 @@ This plugin ports [dbt](https://getdbt.com) functionality to [Clickhouse](https://clickhouse.tech/). -The plugin uses syntax that requires ClickHouse version 22.1 or newer. We do not test older versions of Clickhouse. We also do not currently test +The plugin uses syntax that requires ClickHouse version 22.1 or newer. We do not test older versions of Clickhouse. We +also do not currently test Replicated tables or the related `ON CLUSTER` functionality. ## Installation Use your favorite Python package manager to install the app from PyPI, e.g. + ```bash pip install dbt-core dbt-clickhouse ``` -> **_NOTE:_** Beginning in v1.8, dbt-core and adapters are decoupled. Therefore, the installation mentioned above explicitly includes both dbt-core and the desired adapter.If you use a version prior to 1.8.0 the pip installation command should look like this: - +> **_NOTE:_** Beginning in v1.8, dbt-core and adapters are decoupled. 
Therefore, the installation mentioned above
+> explicitly includes both dbt-core and the desired adapter. If you use a version prior to 1.8.0, the pip installation
+> command should look like this:

```bash
pip install dbt-clickhouse
@@ -34,7 +37,7 @@ pip install dbt-clickhouse
- [x] Docs generate
- [x] Tests
- [x] Snapshots
-- [x] Most dbt-utils macros (now included in dbt-core) 
+- [x] Most dbt-utils macros (now included in dbt-core)
- [x] Ephemeral materialization
- [x] Distributed table materialization (experimental)
- [x] Distributed incremental materialization (experimental)
@@ -43,17 +46,20 @@ pip install dbt-clickhouse
# Usage Notes

## SET Statement Warning
+
In many environments, using the SET statement to persist a ClickHouse setting across all DBT queries is not reliable
-and can cause unexpected failures. This is particularly true when using HTTP connections through a load balancer that
+and can cause unexpected failures. This is particularly true when using HTTP connections through a load balancer that
distributes queries across multiple nodes (such as ClickHouse cloud), although in some circumstances this can also
-happen with native ClickHouse connections. Accordingly, we recommend configuring any required ClickHouse settings in the
+happen with native ClickHouse connections. Accordingly, we recommend configuring any required ClickHouse settings in the
"custom_settings" property of the DBT profile as a best practice, instead of relying on a prehook "SET" statement as
has been occasionally suggested.

## Database

-The dbt model relation identifier `database.schema.table` is not compatible with Clickhouse because Clickhouse does not support a `schema`.
-So we use a simplified approach `schema.table`, where `schema` is the Clickhouse database. Using the `default` database is not recommended.
+The dbt model relation identifier `database.schema.table` is not compatible with Clickhouse because Clickhouse does not
+support a `schema`.
+So we use a simplified approach `schema.table`, where `schema` is the Clickhouse database. Using the `default` database
+is not recommended.

## Example Profile

@@ -112,112 +118,150 @@ your_profile_name:
| query_settings | A map/dictionary of ClickHouse user level settings to be used with `INSERT` or `DELETE` statements in conjunction with this model | |
| ttl | A TTL expression to be used with the table. The TTL expression is a string that can be used to specify the TTL for the table. | |

-
## Column Configuration

| Option | Description | Default if any |
-| ------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------- | -------------- |
+|--------|------------------------------------------------------------------------------------------------------------------------------------------------------------|----------------|
| codec | A string consisting of arguments passed to `CODEC()` in the column's DDL. For example: `codec: "Delta, ZSTD"` will be interpreted as `CODEC(Delta, ZSTD)`. | |

-## ClickHouse Cluster 
+## ClickHouse Cluster

The `cluster` setting in profile enables dbt-clickhouse to run against a ClickHouse cluster.
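For orientation, a minimal profile sketch with the `cluster` key set (the target, host, schema, and cluster names below are illustrative placeholders, not taken from this repository):

```yaml
your_profile_name:
  target: clustered
  outputs:
    clustered:
      type: clickhouse
      schema: analytics              # ClickHouse database used as the dbt schema
      host: clickhouse-01.internal   # illustrative host
      port: 8123
      user: default
      password: ''
      cluster: my_cluster            # illustrative; must match a cluster defined in the server configuration
```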
### Effective Scope - if `cluster` is set in profile, `on_cluster_clause` now will return cluster info for: + - Database creation - View materialization - Distributed materializations - Models with Replicated engines -table and incremental materializations with non-replicated engine will not be affected by `cluster` setting (model would be created on the connected node only). +table and incremental materializations with non-replicated engine will not be affected by `cluster` setting (model would +be created on the connected node only). ### Compatibility - -If a model has been created without a `cluster` setting, dbt-clickhouse will detect the situation and run all DDL/DML without `on cluster` clause for this model. - +If a model has been created without a `cluster` setting, dbt-clickhouse will detect the situation and run all DDL/DML +without `on cluster` clause for this model. ## A Note on Model Settings -ClickHouse has several types/levels of "settings". In the model configuration above, two types of these are configurable. `settings` means the `SETTINGS` -clause used in `CREATE TABLE/VIEW` types of DDL statements, so this is generally settings that are specific to the specific ClickHouse table engine. The new -`query_settings` is use to add a `SETTINGS` clause to the `INSERT` and `DELETE` queries used for model materialization (including incremental materializations). -There are hundreds of ClickHouse settings, and it's not always clear which is a "table" setting and which is a "user" setting (although the latter are generally -available in the `system.settings` table.) In general the defaults are recommended, and any use of these properties should be carefully researched and tested. - +ClickHouse has several types/levels of "settings". In the model configuration above, two types of these are +configurable. `settings` means the `SETTINGS` +clause used in `CREATE TABLE/VIEW` types of DDL statements, so this is generally settings that are specific to the +specific ClickHouse table engine. The new +`query_settings` is use to add a `SETTINGS` clause to the `INSERT` and `DELETE` queries used for model materialization ( +including incremental materializations). +There are hundreds of ClickHouse settings, and it's not always clear which is a "table" setting and which is a "user" +setting (although the latter are generally +available in the `system.settings` table.) In general the defaults are recommended, and any use of these properties +should be carefully researched and tested. ## Known Limitations -* Ephemeral models/CTEs don't work if placed before the "INSERT INTO" in a ClickHouse insert statement, see https://github.com/ClickHouse/ClickHouse/issues/30323. This -should not affect most models, but care should be taken where an ephemeral model is placed in model definitions and other SQL statements. +* Ephemeral models/CTEs don't work if placed before the "INSERT INTO" in a ClickHouse insert statement, + see https://github.com/ClickHouse/ClickHouse/issues/30323. This + should not affect most models, but care should be taken where an ephemeral model is placed in model definitions and + other SQL statements. ## Incremental Model Strategies As of version 1.3.2, dbt-clickhouse supports three incremental model strategies. -### The Default (Legacy) Strategy +### The Default (Legacy) Strategy -Historically ClickHouse has had only limited support for updates and deletes, in the form of asynchronous "mutations." 
To emulate expected dbt behavior, -dbt-clickhouse by default creates a new temporary table containing all unaffected (not deleted, not changed) "old" records, plus any new or updated records, -and then swaps or exchanges this temporary table with the existing incremental model relation. This is the only strategy that preserves the original relation if something -goes wrong before the operation completes; however, since it involves a full copy of the original table, it can be quite expensive and slow to execute. +Historically ClickHouse has had only limited support for updates and deletes, in the form of asynchronous "mutations." +To emulate expected dbt behavior, +dbt-clickhouse by default creates a new temporary table containing all unaffected (not deleted, not changed) "old" +records, plus any new or updated records, +and then swaps or exchanges this temporary table with the existing incremental model relation. This is the only strategy +that preserves the original relation if something +goes wrong before the operation completes; however, since it involves a full copy of the original table, it can be quite +expensive and slow to execute. ### The Delete+Insert Strategy -ClickHouse added "lightweight deletes" as an experimental feature in version 22.8. Lightweight deletes are significantly faster than ALTER TABLE ... DELETE -operations, because they don't require rewriting ClickHouse data parts. The incremental strategy `delete+insert` utilizes lightweight deletes to implement -incremental materializations that perform significantly better than the "legacy" strategy. However, there are important caveats to using this strategy: -- Lightweight deletes must be enabled on your ClickHouse server using the setting `allow_experimental_lightweight_delete=1` or you -must set `use_lw_deletes=true` in your profile (which will enable that setting for your dbt sessions) -- Lightweight deletes are now production ready, but there may be performance and other problems on ClickHouse versions earlier than 23.3. -- This strategy operates directly on the affected table/relation (with creating any intermediate or temporary tables), so if there is an issue during the operation, the -data in the incremental model is likely to be in an invalid state -- When using lightweight deletes, dbt-clickhouse enabled the setting `allow_nondeterministic_mutations`. In some very rare cases using non-deterministic incremental_predicates -this could result in a race condition for the updated/deleted items (and related log messages in the ClickHouse logs). To ensure consistent results the -incremental predicates should only include sub-queries on data that will not be modified during the incremental materialization. +ClickHouse added "lightweight deletes" as an experimental feature in version 22.8. Lightweight deletes are significantly +faster than ALTER TABLE ... DELETE +operations, because they don't require rewriting ClickHouse data parts. The incremental strategy `delete+insert` +utilizes lightweight deletes to implement +incremental materializations that perform significantly better than the "legacy" strategy. 
However, there are important
+caveats to using this strategy:
+
+- Lightweight deletes must be enabled on your ClickHouse server using the setting
+  `allow_experimental_lightweight_delete=1` or you
+  must set `use_lw_deletes=true` in your profile (which will enable that setting for your dbt sessions)
+- Lightweight deletes are now production ready, but there may be performance and other problems on ClickHouse versions
+  earlier than 23.3.
+- This strategy operates directly on the affected table/relation (without creating any intermediate or temporary tables),
+  so if there is an issue during the operation, the
+  data in the incremental model is likely to be in an invalid state
+- When using lightweight deletes, dbt-clickhouse enables the setting `allow_nondeterministic_mutations`. In some very
+  rare cases using non-deterministic incremental_predicates
+  this could result in a race condition for the updated/deleted items (and related log messages in the ClickHouse logs).
+  To ensure consistent results, the
+  incremental predicates should only include sub-queries on data that will not be modified during the incremental
+  materialization.

### The Append Strategy

-This strategy replaces the `inserts_only` setting in previous versions of dbt-clickhouse. This approach simply appends new rows to the existing relation.
-As a result duplicate rows are not eliminated, and there is no temporary or intermediate table. It is the fastest approach if duplicates are either permitted
+This strategy replaces the `inserts_only` setting in previous versions of dbt-clickhouse. This approach simply appends
+new rows to the existing relation.
+As a result duplicate rows are not eliminated, and there is no temporary or intermediate table. It is the fastest
+approach if duplicates are either permitted
 in the data or excluded by the incremental query WHERE clause/filter.

### The insert_overwrite Strategy (Experimental)
+
> [IMPORTANT]
-> Currently, the insert_overwrite strategy is not fully functional with distributed materializations. 
+> Currently, the insert_overwrite strategy is not fully functional with distributed materializations.
+
Performs the following steps:
-1. Create a staging (temporary) table with the same structure as the incremental model relation: `CREATE TABLE  AS `.
+
+1. Create a staging (temporary) table with the same structure as the incremental model relation:
+   `CREATE TABLE  AS `.
2. Insert only new records (produced by `SELECT`) into the staging table.
3. Replace only new partitions (present in the staging table) into the target table.

This approach has the following advantages:
+
- It is faster than the default strategy because it doesn't copy the entire table.
-- It is safer than other strategies because it doesn't modify the original table until the INSERT operation completes successfully: in case of intermediate failure, the original table is not modified.
-- It implements "partitions immutability" data engineering best practice. Which simplifies incremental and parallel data processing, rollbacks, etc.
+- It is safer than other strategies because it doesn't modify the original table until the INSERT operation completes
+  successfully: in case of intermediate failure, the original table is not modified.
+- It implements the "partitions immutability" data engineering best practice, which simplifies incremental and parallel
+  data processing, rollbacks, etc.

-The strategy requires `partition_by` to be set in the model configuration.
Ignores all other strategies-specific parameters of the model config. +The strategy requires `partition_by` to be set in the model configuration. Ignores all other strategies-specific +parameters of the model config. ## Additional ClickHouse Macros ### Model Materialization Utility Macros The following macros are included to facilitate creating ClickHouse specific tables and views: -- `engine_clause` -- Uses the `engine` model configuration property to assign a ClickHouse table engine. dbt-clickhouse uses the `MergeTree` engine by default. -- `partition_cols` -- Uses the `partition_by` model configuration property to assign a ClickHouse partition key. No partition key is assigned by default. -- `order_cols` -- Uses the `order_by` model configuration to assign a ClickHouse order by/sorting key. If not specified ClickHouse will use an empty tuple() and the table will be unsorted -- `primary_key_clause` -- Uses the `primary_key` model configuration property to assign a ClickHouse primary key. By default, primary key is set and ClickHouse will use the order by clause as the primary key. -- `on_cluster_clause` -- Uses the `cluster` profile property to add an `ON CLUSTER` clause to certain dbt-operations: distributed materializations, views creation, database creation. -- `ttl_config` -- Uses the `ttl` model configuration property to assign a ClickHouse table TTL expression. No TTL is assigned by default. + +- `engine_clause` -- Uses the `engine` model configuration property to assign a ClickHouse table engine. dbt-clickhouse + uses the `MergeTree` engine by default. +- `partition_cols` -- Uses the `partition_by` model configuration property to assign a ClickHouse partition key. No + partition key is assigned by default. +- `order_cols` -- Uses the `order_by` model configuration to assign a ClickHouse order by/sorting key. If not specified + ClickHouse will use an empty tuple() and the table will be unsorted +- `primary_key_clause` -- Uses the `primary_key` model configuration property to assign a ClickHouse primary key. By + default, primary key is set and ClickHouse will use the order by clause as the primary key. +- `on_cluster_clause` -- Uses the `cluster` profile property to add an `ON CLUSTER` clause to certain dbt-operations: + distributed materializations, views creation, database creation. +- `ttl_config` -- Uses the `ttl` model configuration property to assign a ClickHouse table TTL expression. No TTL is + assigned by default. ### s3Source Helper Macro -The `s3source` macro simplifies the process of selecting ClickHouse data directly from S3 using the ClickHouse S3 table function. It works by -populating the S3 table function parameters from a named configuration dictionary (the name of the dictionary must end in `s3`). The macro -first looks for the dictionary in the profile `vars`, and then in the model configuration. The dictionary can contain any of the following +The `s3source` macro simplifies the process of selecting ClickHouse data directly from S3 using the ClickHouse S3 table +function. It works by +populating the S3 table function parameters from a named configuration dictionary (the name of the dictionary must end +in `s3`). The macro +first looks for the dictionary in the profile `vars`, and then in the model configuration. 
The dictionary can contain
+any of the following
keys used to populate the parameters of the S3 table function:

| Argument Name | Description |
@@ -230,25 +274,38 @@ keys used to populate the parameters of the S3 table function:
| aws_secret_access_key | The S3 secret key. |
| role_arn | The ARN of a ClickhouseAccess IAM role to use to securely access the S3 objects. See this [documentation](https://clickhouse.com/docs/en/cloud/security/secure-s3) for more information. |
| compression | The compression method used with the S3 objects. If not provided ClickHouse will attempt to determine compression based on the file name. |
-See the [S3 test file](https://github.com/ClickHouse/dbt-clickhouse/blob/main/tests/integration/adapter/clickhouse/test_clickhouse_s3.py) for examples of how to use this macro.
+
+See
+the [S3 test file](https://github.com/ClickHouse/dbt-clickhouse/blob/main/tests/integration/adapter/clickhouse/test_clickhouse_s3.py)
+for examples of how to use this macro.

# Contracts and Constraints

-Only exact column type contracts are supported. For example, a contract with a UInt32 column type will fail if the model returns a UInt64 or other integer type.
-ClickHouse also support _only_ `CHECK` constraints on the entire table/model. Primary key, foreign key, unique, and column level CHECK constraints are not supported.
+Only exact column type contracts are supported. For example, a contract with a UInt32 column type will fail if the model
+returns a UInt64 or other integer type.
+ClickHouse also supports _only_ `CHECK` constraints on the entire table/model. Primary key, foreign key, unique, and
+column level CHECK constraints are not supported.
(See ClickHouse documentation on primary/order by keys.)

# Materialized Views (Experimental)

-A `materialized_view` materialization should be a `SELECT` from an existing (source) table. The adapter will create a target table with the model name
-and a ClickHouse MATERIALIZED VIEW with the name `_mv`. Unlike PostgreSQL, a ClickHouse materialized view is not "static" (and has
-no corresponding REFRESH operation). Instead, it acts as an "insert trigger", and will insert new rows into the target table using the defined `SELECT`
-"transformation" in the view definition on rows inserted into the source table. See the [test file]
-(https://github.com/ClickHouse/dbt-clickhouse/blob/main/tests/integration/adapter/materialized_view/test_materialized_view.py) for an introductory example
+A `materialized_view` materialization should be a `SELECT` from an existing (source) table. The adapter will create a
+target table with the model name
+and a ClickHouse MATERIALIZED VIEW with the name `_mv`. Unlike PostgreSQL, a ClickHouse materialized view is
+not "static" (and has
+no corresponding REFRESH operation). Instead, it acts as an "insert trigger", and will insert new rows into the target
+table using the defined `SELECT`
+"transformation" in the view definition on rows inserted into the source table. See the [test file]
+(https://github.com/ClickHouse/dbt-clickhouse/blob/main/tests/integration/adapter/materialized_view/test_materialized_view.py)
+for an introductory example
of how to use this functionality.

-Clickhouse provides the ability for more than one materialized view to write records to the same target table. To support this in dbt-clickhouse, you can construct a `UNION` in your model file, such that the SQL for each of your materialized views is wrapped with comments of the form `--my_mv_name:begin` and `--my_mv_name:end`.
+Clickhouse provides the ability for more than one materialized view to write records to the same target table. To +support this in dbt-clickhouse, you can construct a `UNION` in your model file, such that the SQL for each of your +materialized views is wrapped with comments of the form `--my_mv_name:begin` and `--my_mv_name:end`. -For example the following will build two materialized views both writing data to the same destination table of the model. The names of the materialized views will take the form `_mv1` and `_mv2` : +For example the following will build two materialized views both writing data to the same destination table of the +model. The names of the materialized views will take the form `_mv1` and `_mv2` : ``` --mv1:begin @@ -260,17 +317,23 @@ select a,b,c from {{ source('raw', 'table_2') }} --mv2:end ``` -> IMPORTANT! -> -> When updating a model with multiple materialized views (MVs), especially when renaming one of the MV names, dbt-clickhouse does not automatically drop the old MV. Instead, -> you will encounter the following warning: `Warning - Table was detected with the same pattern as model name but was not found in this run. In case it is a renamed mv that was previously part of this model, drop it manually (!!!) ` +> IMPORTANT! +> +> When updating a model with multiple materialized views (MVs), especially when renaming one of the MV names, +> dbt-clickhouse does not automatically drop the old MV. Instead, +> you will encounter the following warning: +`Warning - Table was detected with the same pattern as model name but was not found in this run. In case it is a renamed mv that was previously part of this model, drop it manually (!!!) ` ## Data catchup -Currently, when creating a materialized view (MV), the target table is first populated with historical data before the MV itself is created. -In other words, dbt-clickhouse initially creates the target table and preloads it with historical data based on the query defined for the MV. Only after this step is the MV created. +Currently, when creating a materialized view (MV), the target table is first populated with historical data before the +MV itself is created. -If you prefer not to preload historical data during MV creation, you can disable this behavior by setting the catchup config to False: +In other words, dbt-clickhouse initially creates the target table and preloads it with historical data based on the +query defined for the MV. Only after this step is the MV created. 
+
+If you prefer not to preload historical data during MV creation, you can disable this behavior by setting the catchup
+config to False:

```python
{{config(
@@ -281,27 +344,66 @@ If you prefer not to preload historical data during MV creation, you can disable
)}}
```

+## Refreshable Materialized Views
+
+To use [Refreshable Materialized View](https://clickhouse.com/docs/en/materialized-view/refreshable-materialized-view),
+please adjust the following configs as needed in your MV model (all these configs are supposed to be set inside a
+refreshable config object):
+
+| Option                | Description                                                                                                                                                              | Required | Default Value |
+|-----------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------|---------------|
+| interval              | The interval clause (required)                                                                                                                                           | Yes      |               |
+| randomize             | The randomization clause, will appear after `RANDOMIZE FOR`                                                                                                              |          |               |
+| append                | If set to `True`, each refresh inserts rows into the table without deleting existing rows. The insert is not atomic, just like a regular INSERT SELECT.                  |          | False         |
+| depends_on            | A dependencies list for the refreshable mv. Please provide the dependencies in the following format `{schema}.{view_name}`                                              |          |               |
+| depends_on_validation | Whether to validate the existence of the dependencies provided in `depends_on`. In case a dependency doesn't contain a schema, the validation occurs on schema `default` |          | False         |
+
+A config example for refreshable materialized view:
+```python
+{{
+    config(
+        materialized='materialized_view',
+        refreshable={
+            "interval": "EVERY 5 MINUTE",
+            "randomize": "1 MINUTE",
+            "append": True,
+            "depends_on": ['schema.depend_on_model'],
+            "depends_on_validation": True
+        }
+    )
+}}
+```
+
+> [!IMPORTANT]
+> The refreshable feature was not tested with multiple mvs directing to the same target model.

# Dictionary materializations (experimental)
+
-See the tests in https://github.com/ClickHouse/dbt-clickhouse/blob/main/tests/integration/adapter/dictionary/test_dictionary.py for examples of how to
-implement materializations for ClickHouse dictionaries
+See the tests
+in https://github.com/ClickHouse/dbt-clickhouse/blob/main/tests/integration/adapter/dictionary/test_dictionary.py for
+examples of how to
+implement materializations for ClickHouse dictionaries

# Distributed materializations

Notes:

-- dbt-clickhouse queries now automatically include the setting `insert_distributed_sync = 1` in order to ensure that downstream incremental
-materialization operations execute correctly. This could cause some distributed table inserts to run more slowly than expected.
+- dbt-clickhouse queries now automatically include the setting `insert_distributed_sync = 1` in order to ensure that
+  downstream incremental
+  materialization operations execute correctly. This could cause some distributed table inserts to run more slowly than
+  expected.

## Distributed table materialization

Distributed table created with following steps:
+
1. Creates temp view with sql query to get right structure
-2. Create empty local tables based on view
-3. Create distributed table based on local tables.
+2. Create empty local tables based on view
+3. Create distributed table based on local tables.
4. Data inserts into distributed table, so it is distributed across shards without duplicating.
### Distributed table model example + ```sql {{ config( @@ -312,7 +414,8 @@ Distributed table created with following steps: ) }} -select id, created_at, item from {{ source('db', 'table') }} +select id, created_at, item +from {{ source('db', 'table') }} ``` ### Generated migrations @@ -320,36 +423,56 @@ select id, created_at, item from {{ source('db', 'table') }} ```sql CREATE TABLE db.table_local on cluster cluster ( - `id` UInt64, - `created_at` DateTime, - `item` String + `id` + UInt64, + `created_at` + DateTime, + `item` + String ) -ENGINE = ReplacingMergeTree -ORDER BY (id, created_at) -SETTINGS index_granularity = 8192; + ENGINE = ReplacingMergeTree + ORDER BY +( + id, + created_at +) + SETTINGS index_granularity = 8192; CREATE TABLE db.table on cluster cluster ( - `id` UInt64, - `created_at` DateTime, - `item` String + `id` + UInt64, + `created_at` + DateTime, + `item` + String ) -ENGINE = Distributed('cluster', 'db', 'table_local', cityHash64(id)); + ENGINE = Distributed +( + 'cluster', + 'db', + 'table_local', + cityHash64 +( + id +)); ``` ## Distributed incremental materialization -Incremental model based on the same idea as distributed table, the main difficulty is to process all incremental strategies correctly. +Incremental model based on the same idea as distributed table, the main difficulty is to process all incremental +strategies correctly. 1. _The Append Strategy_ just insert data into distributed table. 2. _The Delete+Insert_ Strategy creates distributed temp table to work with all data on every shard. 3. _The Default (Legacy) Strategy_ creates distributed temp and intermediate tables for the same reason. -Only shard tables are replacing, because distributed table does not keep data. +Only shard tables are replacing, because distributed table does not keep data. The distributed table reloads only when the full_refresh mode is enabled or the table structure may have changed. ### Distributed incremental model example + ```sql {{ config( @@ -360,7 +483,8 @@ The distributed table reloads only when the full_refresh mode is enabled or the ) }} -select id, created_at, item from {{ source('db', 'table') }} +select id, created_at, item +from {{ source('db', 'table') }} ``` ### Generated migrations @@ -368,27 +492,46 @@ select id, created_at, item from {{ source('db', 'table') }} ```sql CREATE TABLE db.table_local on cluster cluster ( - `id` UInt64, - `created_at` DateTime, - `item` String + `id` + UInt64, + `created_at` + DateTime, + `item` + String ) -ENGINE = MergeTree -SETTINGS index_granularity = 8192; + ENGINE = MergeTree + SETTINGS index_granularity = 8192; CREATE TABLE db.table on cluster cluster ( - `id` UInt64, - `created_at` DateTime, - `item` String + `id` + UInt64, + `created_at` + DateTime, + `item` + String ) -ENGINE = Distributed('cluster', 'db', 'table_local', cityHash64(id)); + ENGINE = Distributed +( + 'cluster', + 'db', + 'table_local', + cityHash64 +( + id +)); ``` ## Contributing -We welcome contributions from the community to help improve the dbt-ClickHouse adapter. Whether you’re fixing a bug, adding a new feature, or enhancing documentation, your efforts are greatly appreciated! -Please take a moment to read our [Contribution Guide](CONTRIBUTING.md) to get started. The guide provides detailed instructions on setting up your environment, running tests, and submitting pull requests. +We welcome contributions from the community to help improve the dbt-ClickHouse adapter. 
Whether you’re fixing a bug, +adding a new feature, or enhancing documentation, your efforts are greatly appreciated! + +Please take a moment to read our [Contribution Guide](CONTRIBUTING.md) to get started. The guide provides detailed +instructions on setting up your environment, running tests, and submitting pull requests. ## Original Author -ClickHouse wants to thank @[silentsokolov](https://github.com/silentsokolov) for creating this connector and for their valuable contributions. + +ClickHouse wants to thank @[silentsokolov](https://github.com/silentsokolov) for creating this connector and for their +valuable contributions. From f8c43a46cd7447643cbd4d4c5ae4052c0c2d4c24 Mon Sep 17 00:00:00 2001 From: bentsileviav Date: Sun, 29 Dec 2024 13:54:49 +0200 Subject: [PATCH 57/96] update CHANGELOG.md --- CHANGELOG.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2a775c05..9aba795d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,8 +1,9 @@ ### Unreleased +### New Features +* Added support for [refreshable materialized view](https://clickhouse.com/docs/en/materialized-view/refreshable-materialized-view) ([#401](https://github.com/ClickHouse/dbt-clickhouse/pull/401)) ### Improvement - -Avoid potential data loss by using `CREATE OR REPLACE DICTIONARY` to atomically update a dictionary (#393) +* Avoid potential data loss by using `CREATE OR REPLACE DICTIONARY` to atomically update a dictionary ([#393](https://github.com/ClickHouse/dbt-clickhouse/pull/393)) ### Release [1.8.6], 2024-12-05 From 7ceed417533abded38753193c0973adc5b98346d Mon Sep 17 00:00:00 2001 From: bentsileviav Date: Sun, 29 Dec 2024 13:59:39 +0200 Subject: [PATCH 58/96] disable the dependencies validation by default --- .../clickhouse/macros/materializations/materialized_view.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbt/include/clickhouse/macros/materializations/materialized_view.sql b/dbt/include/clickhouse/macros/materializations/materialized_view.sql index 417e8247..209a04b3 100644 --- a/dbt/include/clickhouse/macros/materializations/materialized_view.sql +++ b/dbt/include/clickhouse/macros/materializations/materialized_view.sql @@ -210,7 +210,7 @@ {% set refresh_interval = refreshable_config.get('interval', none) %} {% set refresh_randomize = refreshable_config.get('randomize', none) %} {% set depends_on = refreshable_config.get('depends_on', none) %} - {% set depends_on_validation = refreshable_config.get('depends_on_validation', true) %} + {% set depends_on_validation = refreshable_config.get('depends_on_validation', false) %} {% set append = refreshable_config.get('append', false) %} {% if not refresh_interval %} From fa1d9ac5cc97585b3fd4517c202ae4391f175fb1 Mon Sep 17 00:00:00 2001 From: bentsileviav Date: Sun, 29 Dec 2024 14:01:19 +0200 Subject: [PATCH 59/96] fix lint --- .../materialized_view/test_refreshable_materialized_view.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/integration/adapter/materialized_view/test_refreshable_materialized_view.py b/tests/integration/adapter/materialized_view/test_refreshable_materialized_view.py index c0221688..3bc1e1db 100644 --- a/tests/integration/adapter/materialized_view/test_refreshable_materialized_view.py +++ b/tests/integration/adapter/materialized_view/test_refreshable_materialized_view.py @@ -101,7 +101,10 @@ def test_create(self, project): }, ) - result = project.run_sql(f"select database, view, status from system.view_refreshes where database= 
'{project.test_schema}' and view='hackers_mv'", fetch="all") + result = project.run_sql( + f"select database, view, status from system.view_refreshes where database= '{project.test_schema}' and view='hackers_mv'", + fetch="all" + ) assert result[0][2] == 'Scheduled' def test_validate_dependency(self, project): From bd2ae04741b11b64b756640d8d6c0e0dcbedc834 Mon Sep 17 00:00:00 2001 From: bentsileviav Date: Sun, 29 Dec 2024 14:11:33 +0200 Subject: [PATCH 60/96] add limitation section to refreshable mv --- README.md | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 830af44f..11c1f728 100644 --- a/README.md +++ b/README.md @@ -374,8 +374,11 @@ A config example for refreshable materialized view: }} ``` -> [!IMPORTANT] -> The refreshable feature was not tested with multiple mvs directing to the same target model. +### Limitations +* When creating a refreshable materialized view (MV) in ClickHouse that has a dependency, ClickHouse does not throw an error if the specified dependency does not exist at the time of creation. Instead, the refreshable MV remains in an inactive state, waiting for the dependency to be satisfied before it starts processing updates or refreshing. +This behavior is by design, but it may lead to delays in data availability if the required dependency is not addressed promptly. Users are advised to ensure all dependencies are correctly defined and exist before creating a refreshable materialized view. +* As of today, there is no actual "dbt linkage" between the mv and its dependencies, therefore the creation order is not guaranteed. +* The refreshable feature was not tested with multiple mvs directing to the same target model. # Dictionary materializations (experimental) From e222d83d5c4fa4b27a4c80de7cd9fdf7bf5d81e6 Mon Sep 17 00:00:00 2001 From: bentsileviav Date: Sun, 29 Dec 2024 14:11:52 +0200 Subject: [PATCH 61/96] fix lint --- .../materialized_view/test_refreshable_materialized_view.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/adapter/materialized_view/test_refreshable_materialized_view.py b/tests/integration/adapter/materialized_view/test_refreshable_materialized_view.py index 3bc1e1db..dac51f4d 100644 --- a/tests/integration/adapter/materialized_view/test_refreshable_materialized_view.py +++ b/tests/integration/adapter/materialized_view/test_refreshable_materialized_view.py @@ -103,7 +103,7 @@ def test_create(self, project): result = project.run_sql( f"select database, view, status from system.view_refreshes where database= '{project.test_schema}' and view='hackers_mv'", - fetch="all" + fetch="all", ) assert result[0][2] == 'Scheduled' From 63b8d7950751afbeb77019d79cc541160c6bac29 Mon Sep 17 00:00:00 2001 From: bentsileviav Date: Mon, 30 Dec 2024 11:06:23 +0200 Subject: [PATCH 62/96] use count due to differences between native and http when getting empty dataset --- .../clickhouse/macros/materializations/materialized_view.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbt/include/clickhouse/macros/materializations/materialized_view.sql b/dbt/include/clickhouse/macros/materializations/materialized_view.sql index 209a04b3..684e77ab 100644 --- a/dbt/include/clickhouse/macros/materializations/materialized_view.sql +++ b/dbt/include/clickhouse/macros/materializations/materialized_view.sql @@ -283,7 +283,7 @@ {%- set condition = "database='" + database + "' and view='" + table + "'" %} {% set query %} - select database, view + select count(*) from 
system.view_refreshes where {{ condition }} {% endset %} From 34a7280ed2561fdf800a6bec0586dc27a3b0d8df Mon Sep 17 00:00:00 2001 From: bentsileviav Date: Mon, 30 Dec 2024 11:06:29 +0200 Subject: [PATCH 63/96] use count due to differences between native and http when getting empty dataset --- .../clickhouse/macros/materializations/materialized_view.sql | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dbt/include/clickhouse/macros/materializations/materialized_view.sql b/dbt/include/clickhouse/macros/materializations/materialized_view.sql index 684e77ab..e8847c44 100644 --- a/dbt/include/clickhouse/macros/materializations/materialized_view.sql +++ b/dbt/include/clickhouse/macros/materializations/materialized_view.sql @@ -289,7 +289,8 @@ {% endset %} {% set tables_result = run_query(query) %} - {% if tables_result is not none and tables_result.columns %} + {{ log(tables_result.columns[0].values()[0]) }} + {% if tables_result.columns[0].values()[0] > 0 %} {{ log('MV ' + mv + ' exists.') }} {% else %} {% do exceptions.raise_compiler_error( From 289eb896ba3d7381a02a5f1ecc676346b7a348f8 Mon Sep 17 00:00:00 2001 From: bentsileviav Date: Mon, 30 Dec 2024 11:06:54 +0200 Subject: [PATCH 64/96] add a new line between refresh and randomize clauses --- .../clickhouse/macros/materializations/materialized_view.sql | 1 + 1 file changed, 1 insertion(+) diff --git a/dbt/include/clickhouse/macros/materializations/materialized_view.sql b/dbt/include/clickhouse/macros/materializations/materialized_view.sql index e8847c44..741ec631 100644 --- a/dbt/include/clickhouse/macros/materializations/materialized_view.sql +++ b/dbt/include/clickhouse/macros/materializations/materialized_view.sql @@ -222,6 +222,7 @@ {% if refresh_interval %} REFRESH {{ refresh_interval }} + {# This is a comment to force a new line between REFRESH and RANDOMIZE clauses #} {%- if refresh_randomize -%} RANDOMIZE FOR {{ refresh_randomize }} {%- endif -%} From 3ad5d828ebd2957676149a7dbbcbe726ba22a37e Mon Sep 17 00:00:00 2001 From: bentsileviav Date: Sun, 5 Jan 2025 09:51:12 +0100 Subject: [PATCH 65/96] organize the CHANGELOG.md release section of version 1.8.7 --- CHANGELOG.md | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9aba795d..10805031 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,9 +1,15 @@ -### Unreleased +### Release [1.8.7], 2025-01-05 ### New Features * Added support for [refreshable materialized view](https://clickhouse.com/docs/en/materialized-view/refreshable-materialized-view) ([#401](https://github.com/ClickHouse/dbt-clickhouse/pull/401)) + ### Improvement * Avoid potential data loss by using `CREATE OR REPLACE DICTIONARY` to atomically update a dictionary ([#393](https://github.com/ClickHouse/dbt-clickhouse/pull/393)) +* Removed support in python 3.8 as it is no longer supported by dbt ([#402](https://github.com/ClickHouse/dbt-clickhouse/pull/402) + +### Bug Fixes +* Remove python 3.8 leftovers from the project ([#402](https://github.com/ClickHouse/dbt-clickhouse/pull/402)) +* Fix a minor bug related to validating existence of an old hanging mv ([#396]()) ### Release [1.8.6], 2024-12-05 From 35b70af33c3fb88a0924c867604cce2ae9120fe8 Mon Sep 17 00:00:00 2001 From: bentsileviav Date: Sun, 5 Jan 2025 09:54:30 +0100 Subject: [PATCH 66/96] increase version --- dbt/adapters/clickhouse/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbt/adapters/clickhouse/__version__.py b/dbt/adapters/clickhouse/__version__.py index 
f22f38bf..8d30f618 100644 --- a/dbt/adapters/clickhouse/__version__.py +++ b/dbt/adapters/clickhouse/__version__.py @@ -1 +1 @@ -version = '1.8.6' +version = '1.8.7' From d2a63b06f4e6b53b80fb2288dd86d46b74f9bd15 Mon Sep 17 00:00:00 2001 From: bentsileviav Date: Sun, 5 Jan 2025 10:18:56 +0100 Subject: [PATCH 67/96] update CHANGELOG.md --- CHANGELOG.md | 1 - 1 file changed, 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 10805031..d4cbeae5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,7 +8,6 @@ * Removed support in python 3.8 as it is no longer supported by dbt ([#402](https://github.com/ClickHouse/dbt-clickhouse/pull/402) ### Bug Fixes -* Remove python 3.8 leftovers from the project ([#402](https://github.com/ClickHouse/dbt-clickhouse/pull/402)) * Fix a minor bug related to validating existence of an old hanging mv ([#396]()) ### Release [1.8.6], 2024-12-05 From 0a6cc4d7134660afb145f82aaa4e05c8ef9fd110 Mon Sep 17 00:00:00 2001 From: bentsileviav Date: Mon, 6 Jan 2025 08:31:37 +0100 Subject: [PATCH 68/96] add indication for indexes in the README.md file --- README.md | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 11c1f728..59aba516 100644 --- a/README.md +++ b/README.md @@ -117,6 +117,7 @@ your_profile_name: | settings | A map/dictionary of "TABLE" settings to be used to DDL statements like 'CREATE TABLE' with this model | | | query_settings | A map/dictionary of ClickHouse user level settings to be used with `INSERT` or `DELETE` statements in conjunction with this model | | | ttl | A TTL expression to be used with the table. The TTL expression is a string that can be used to specify the TTL for the table. | | +| indexes | A list of indexes to create, available only for `table` materialization. For examples look at ([#397](https://github.com/ClickHouse/dbt-clickhouse/pull/397)) | | ## Column Configuration @@ -359,25 +360,32 @@ refreshable config object): | depends_on_validation | Whether to validate the existence of the dependencies provided in `depends_on`. In case a dependency doesn't contain a schema, the validation occurs on schema `default` | | False | A config example for refreshable materialized view: + ```python {{ config( materialized='materialized_view', refreshable={ - "interval": "EVERY 5 MINUTE", - "randomize": "1 MINUTE", - "append": True, - "depends_on": ['schema.depend_on_model'], - "depends_on_validation": True + "interval": "EVERY 5 MINUTE", + "randomize": "1 MINUTE", + "append": True, + "depends_on": ['schema.depend_on_model'], + "depends_on_validation": True } - ) + ) }} ``` ### Limitations -* When creating a refreshable materialized view (MV) in ClickHouse that has a dependency, ClickHouse does not throw an error if the specified dependency does not exist at the time of creation. Instead, the refreshable MV remains in an inactive state, waiting for the dependency to be satisfied before it starts processing updates or refreshing. -This behavior is by design, but it may lead to delays in data availability if the required dependency is not addressed promptly. Users are advised to ensure all dependencies are correctly defined and exist before creating a refreshable materialized view. -* As of today, there is no actual "dbt linkage" between the mv and its dependencies, therefore the creation order is not guaranteed. 
+ +* When creating a refreshable materialized view (MV) in ClickHouse that has a dependency, ClickHouse does not throw an + error if the specified dependency does not exist at the time of creation. Instead, the refreshable MV remains in an + inactive state, waiting for the dependency to be satisfied before it starts processing updates or refreshing. + This behavior is by design, but it may lead to delays in data availability if the required dependency is not addressed + promptly. Users are advised to ensure all dependencies are correctly defined and exist before creating a refreshable + materialized view. +* As of today, there is no actual "dbt linkage" between the mv and its dependencies, therefore the creation order is not + guaranteed. * The refreshable feature was not tested with multiple mvs directing to the same target model. # Dictionary materializations (experimental) From a4305252500a8213f8cbe054a9f0b84cfb4420ab Mon Sep 17 00:00:00 2001 From: Shachi Bista Date: Tue, 7 Jan 2025 00:10:15 +0100 Subject: [PATCH 69/96] Change white-list to black-list --- dbt/adapters/clickhouse/impl.py | 167 ++------------------------------ 1 file changed, 10 insertions(+), 157 deletions(-) diff --git a/dbt/adapters/clickhouse/impl.py b/dbt/adapters/clickhouse/impl.py index 65df492f..d39a9166 100644 --- a/dbt/adapters/clickhouse/impl.py +++ b/dbt/adapters/clickhouse/impl.py @@ -47,156 +47,9 @@ GET_CATALOG_MACRO_NAME = 'get_catalog' LIST_SCHEMAS_MACRO_NAME = 'list_schemas' -ENGINE_SETTINGS = { - 'MergeTree': [ - "index_granularity", - "index_granularity_bytes", - "min_index_granularity_bytes", - "enable_mixed_granularity_parts", - "use_minimalistic_part_header_in_zookeeper", - "min_merge_bytes_to_use_direct_io", - "merge_with_ttl_timeout", - "merge_with_recompression_ttl_timeout", - "write_final_mark", - "storage_policy", - "min_bytes_for_wide_part", - "max_compress_block_size", - "min_compress_block_size", - "max_suspicious_broken_parts", - "parts_to_throw_insert", - "parts_to_delay_insert", - "inactive_parts_to_throw_insert", - "inactive_parts_to_delay_insert", - "max_delay_to_insert", - "max_parts_in_total", - "simultaneous_parts_removal_limit", - "replicated_deduplication_window", - "non_replicated_deduplication_window", - "replicated_deduplication_window_seconds", - "replicated_deduplication_window_for_async_inserts", - "replicated_deduplication_window_seconds_for_async_inserts", - "use_async_block_ids_cache", - "async_block_ids_cache_min_update_interval_ms", - "max_replicated_logs_to_keep", - "min_replicated_logs_to_keep", - "prefer_fetch_merged_part_time_threshold", - "prefer_fetch_merged_part_size_threshold", - "execute_merges_on_single_replica_time_threshold", - "remote_fs_execute_merges_on_single_replica_time_threshold", - "try_fetch_recompressed_part_timeout", - "always_fetch_merged_part", - "max_suspicious_broken_parts", - "max_suspicious_broken_parts_bytes", - "max_files_to_modify_in_alter_columns", - "max_files_to_remove_in_alter_columns", - "replicated_max_ratio_of_wrong_parts", - "replicated_max_parallel_fetches_for_host", - "replicated_fetches_http_connection_timeout", - "replicated_can_become_leader", - "zookeeper_session_expiration_check_period", - "detach_old_local_parts_when_cloning_replica", - "replicated_fetches_http_connection_timeout", - "replicated_fetches_http_send_timeout", - "replicated_fetches_http_receive_timeout", - "max_replicated_fetches_network_bandwidth", - "max_replicated_sends_network_bandwidth", - "old_parts_lifetime", - "max_bytes_to_merge_at_max_space_in_pool", - 
"max_bytes_to_merge_at_min_space_in_pool", - "merge_max_block_size", - "number_of_free_entries_in_pool_to_lower_max_size_of_merge", - "number_of_free_entries_in_pool_to_execute_mutation", - "max_part_loading_threads", - "max_partitions_to_read", - "min_age_to_force_merge_seconds", - "min_age_to_force_merge_on_partition_only", - "number_of_free_entries_in_pool_to_execute_optimize_entire_partition", - "allow_floating_point_partition_key", - "check_sample_column_is_correct", - "min_bytes_to_rebalance_partition_over_jbod", - "detach_not_byte_identical_parts", - "merge_tree_clear_old_temporary_directories_interval_seconds", - "merge_tree_clear_old_parts_interval_seconds", - "max_concurrent_queries", - "min_marks_to_honor_max_concurrent_queries", - "ratio_of_defaults_for_sparse_serialization", - "replace_long_file_name_to_hash", - "max_file_name_length", - "allow_experimental_block_number_column", - "exclude_deleted_rows_for_part_size_in_merge", - "load_existing_rows_count_for_old_parts", - "use_compact_variant_discriminators_serialization", - "merge_workload", - "mutation_workload", - "lightweight_mutation_projection_mode", - "deduplicate_merge_projection_mode", - "min_free_disk_bytes_to_perform_insert", - "min_free_disk_ratio_to_perform_insert", - ], - 'Memory': ['min_bytes_to_keep', 'max_bytes_to_keep', 'min_rows_to_keep', 'max_rows_to_keep'], - 'URL': ['engine_url_skip_empty_files', 'enable_url_encoding'], - 'File': [ - 'engine_file_empty_if_not_exists', - 'engine_file_truncate_on_insert', - 'engine_file_allow_create_multiple_files', - 'engine_file_skip_empty_files', - 'storage_file_read_method', - ], - 'Distributed': [ - "fsync_after_insert", - "fsync_directories", - "skip_unavailable_shards", - "bytes_to_throw_insert", - "bytes_to_delay_insert", - "max_delay_to_insert", - "background_insert_batch", - "background_insert_split_batch_on_failure", - "background_insert_sleep_time_ms", - "background_insert_max_sleep_time_ms", - "flush_on_detach", - ], - 'MySQL': [ - 'connection_pool_size', - 'connection_max_tries', - 'connection_wait_timeout', - 'connection_auto_close', - 'connection_timeout', - 'read_write_timeout', - ], - 'S3': [ - 's3_truncate_on_insert', - 's3_create_new_file_on_insert', - 's3_skip_empty_files', - 's3_max_single_part_upload_size', - 's3_min_upload_part_size', - 's3_max_redirects', - 's3_single_read_retries', - 's3_max_put_rps', - 's3_max_put_burst', - 's3_max_get_rps', - 's3_max_get_burst', - 's3_upload_part_size_multiply_factor', - 's3_upload_part_size_multiply_parts_count_threshold', - 's3_max_inflight_parts_for_one_file', - 'endpoint', - 'access_key_id', - 'secret_access_key', - 'use_environment_credentials', - 'region', - 'use_insecure_imds_request', - 'expiration_window_seconds', - 'no_sign_request', - 'header', - 'server_side_encryption_customer_key_base64', - 'server_side_encryption_kms_key_id', - 'server_side_encryption_kms_encryption_context', - 'server_side_encryption_kms_bucket_key_enabled', - 'max_single_read_retries', - 'max_put_rps', - 'max_put_burst', - 'max_get_rps', - 'max_get_burst', - ], +IGNORED_SETTINGS = { + 'Memory': ['replicated_deduplication_window'], + 'S3': ['replicated_deduplication_window'], } @@ -626,21 +479,21 @@ def get_model_settings(self, model, engine='MergeTree'): def filter_settings_by_engine(self, settings, engine): filtered_settings = {} - if engine not in ENGINE_SETTINGS: - # If the engine has no settings it will not be in the ENGINE_SETTINGS map. 
+ if engine not in IGNORED_SETTINGS: + # If the engine has no settings it will not be in the IGNORED_SETTINGS map. return filtered_settings if engine.endswith('MergeTree'): # Special case for MergeTree due to all its variations. - allowed_settings = ENGINE_SETTINGS['MergeTree'] + ignored_settings = IGNORED_SETTINGS.get('MergeTree', []) else: - allowed_settings = ENGINE_SETTINGS[engine] + ignored_settings = IGNORED_SETTINGS.get(engine, []) for key, value in settings.items(): - if key in allowed_settings: - filtered_settings[key] = value - else: + if key in ignored_settings: logger.warning(f"Setting {key} not available for engine {engine}, ignoring.") + else: + filtered_settings[key] = value return filtered_settings From e299ecd2b193c448464b6a60bef63c698cdb775a Mon Sep 17 00:00:00 2001 From: Shachi Bista Date: Wed, 8 Jan 2025 17:19:27 +0100 Subject: [PATCH 70/96] Fixes from review --- dbt/adapters/clickhouse/impl.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/dbt/adapters/clickhouse/impl.py b/dbt/adapters/clickhouse/impl.py index 783f6efb..91238ddf 100644 --- a/dbt/adapters/clickhouse/impl.py +++ b/dbt/adapters/clickhouse/impl.py @@ -480,10 +480,6 @@ def get_model_settings(self, model, engine='MergeTree'): def filter_settings_by_engine(self, settings, engine): filtered_settings = {} - if engine not in IGNORED_SETTINGS: - # If the engine has no settings it will not be in the IGNORED_SETTINGS map. - return filtered_settings - if engine.endswith('MergeTree'): # Special case for MergeTree due to all its variations. ignored_settings = IGNORED_SETTINGS.get('MergeTree', []) From 53b20a2e745554aa909e4c404239e0ee794da4c4 Mon Sep 17 00:00:00 2001 From: Benjamin Dornel Date: Fri, 17 Jan 2025 18:33:01 +0800 Subject: [PATCH 71/96] chore: allow optional whitespace in multi mv implementation --- .../clickhouse/macros/materializations/materialized_view.sql | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dbt/include/clickhouse/macros/materializations/materialized_view.sql b/dbt/include/clickhouse/macros/materializations/materialized_view.sql index 741ec631..d1167432 100644 --- a/dbt/include/clickhouse/macros/materializations/materialized_view.sql +++ b/dbt/include/clickhouse/macros/materializations/materialized_view.sql @@ -37,13 +37,13 @@ {{ run_hooks(pre_hooks, inside_transaction=True) }} -- extract the names of the materialized views from the sql - {% set view_names = modules.re.findall('--([^:]+):begin', sql) %} + {% set view_names = modules.re.findall('--(?:\s)?([^:]+):begin', sql) %} -- extract the sql for each of the materialized view into a map {% set views = {} %} {% if view_names %} {% for view_name in view_names %} - {% set view_sql = modules.re.findall('--' + view_name + ':begin(.*)--' + view_name + ':end', sql, flags=modules.re.DOTALL)[0] %} + {% set view_sql = modules.re.findall('--(?:\s)?' + view_name + ':begin(.*)--(?:\s)?' + view_name + ':end', sql, flags=modules.re.DOTALL)[0] %} {%- set _ = views.update({view_name: view_sql}) -%} {% endfor %} {% else %} From 47dc632e1b3ce79a52bc1250ee4dd2bfe2f94d7c Mon Sep 17 00:00:00 2001 From: Avrora Date: Tue, 21 Jan 2025 10:19:28 +0200 Subject: [PATCH 72/96] docs: readme fixed a typo in the documentation --- README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/README.md b/README.md index 0105f0a1..a5bf2543 100644 --- a/README.md +++ b/README.md @@ -310,8 +310,7 @@ and a ClickHouse MATERIALIZED VIEW with the name `_mv`. Unlike Postg not "static" (and has no corresponding REFRESH operation). 
Instead, it acts as an "insert trigger", and will insert new rows into the target table using the defined `SELECT` -"transformation" in the view definition on rows inserted into the source table. See the [test file] -(https://github.com/ClickHouse/dbt-clickhouse/blob/main/tests/integration/adapter/materialized_view/test_materialized_view.py) +"transformation" in the view definition on rows inserted into the source table. See the [test file](https://github.com/ClickHouse/dbt-clickhouse/blob/main/tests/integration/adapter/materialized_view/test_materialized_view.py) for an introductory example of how to use this functionality. From d5ba1255c2467f03ab0a95acc02c70508754b6e7 Mon Sep 17 00:00:00 2001 From: bentsileviav Date: Thu, 30 Jan 2025 10:40:56 +0100 Subject: [PATCH 73/96] fix merge - refreshable mvs --- .../materializations/materialized_view.sql | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/dbt/include/clickhouse/macros/materializations/materialized_view.sql b/dbt/include/clickhouse/macros/materializations/materialized_view.sql index 4ce8d860..7479d425 100644 --- a/dbt/include/clickhouse/macros/materializations/materialized_view.sql +++ b/dbt/include/clickhouse/macros/materializations/materialized_view.sql @@ -95,7 +95,7 @@ {%- endcall %} {% do exchange_tables_atomic(backup_relation, existing_relation) %} - {{ clickhouse__create_mvs(existing_relation, cluster_clause, views) }} + {{ clickhouse__create_mvs(existing_relation, cluster_clause, refreshable_clause, views) }} {% else %} -- we need to have a 'main' statement {% call statement('main') -%} @@ -103,7 +103,7 @@ {%- endcall %} -- try to alter view first to replace sql, else drop and create - {{ clickhouse__update_mvs(target_relation, cluster_clause, views) }} + {{ clickhouse__update_mvs(target_relation, cluster_clause, refreshable_clause, views) }} {% endif %} {% else %} @@ -163,9 +163,10 @@ {% endcall %} {%- endmacro %} -{% macro clickhouse__create_mv(mv_relation, target_relation, cluster_clause, view_sql) -%} +{% macro clickhouse__create_mv(mv_relation, target_relation, cluster_clause, refreshable_clause, view_sql) -%} {% call statement('create existing mv: ' + mv_relation.name) -%} create materialized view if not exists {{ mv_relation }} {{ cluster_clause }} + {{ refreshable_clause }} to {{ target_relation }} as {{ view_sql }} {% endcall %} @@ -177,13 +178,13 @@ {% endcall %} {%- endmacro %} -{% macro clickhouse__update_mv(mv_relation, target_relation, cluster_clause, view_sql) -%} +{% macro clickhouse__update_mv(mv_relation, target_relation, cluster_clause, refreshable_clause, view_sql) -%} {% set existing_relation = adapter.get_relation(database=mv_relation.database, schema=mv_relation.schema, identifier=mv_relation.identifier) %} {% if existing_relation %} {{ clickhouse__modify_mv(mv_relation, cluster_clause, view_sql) }}; {% else %} {{ clickhouse__drop_mv(mv_relation, cluster_clause) }}; - {{ clickhouse__create_mv(mv_relation, target_relation, cluster_clause, view_sql) }}; + {{ clickhouse__create_mv(mv_relation, target_relation, cluster_clause, refreshable_clause, view_sql) }}; {% endif %} {%- endmacro %} @@ -195,17 +196,17 @@ {% endfor %} {%- endmacro %} -{% macro clickhouse__create_mvs(target_relation, cluster_clause, views) -%} +{% macro clickhouse__create_mvs(target_relation, cluster_clause, refreshable_clause, views) -%} {% for view, view_sql in views.items() %} {%- set mv_relation = target_relation.derivative('_' + view, 'materialized_view') -%} - {{ 
clickhouse__create_mv(mv_relation, target_relation, cluster_clause, view_sql) }}; + {{ clickhouse__create_mv(mv_relation, target_relation, cluster_clause, refreshable_clause, view_sql) }}; {% endfor %} {%- endmacro %} -{% macro clickhouse__update_mvs(target_relation, cluster_clause, views) -%} +{% macro clickhouse__update_mvs(target_relation, cluster_clause, refreshable_clause, views) -%} {% for view, view_sql in views.items() %} {%- set mv_relation = target_relation.derivative('_' + view, 'materialized_view') -%} - {{ clickhouse__update_mv(mv_relation, target_relation, cluster_clause, view_sql) }}; + {{ clickhouse__update_mv(mv_relation, target_relation, cluster_clause, refreshable_clause, view_sql) }}; {% endfor %} {%- endmacro %} From dc6948d43cbfa22b41b66008c7128eb9576e4a71 Mon Sep 17 00:00:00 2001 From: bentsileviav Date: Tue, 4 Feb 2025 14:53:55 +0200 Subject: [PATCH 74/96] remove unreachable exchange code + support replace view --- .../macros/materializations/view.sql | 44 +++---------------- 1 file changed, 7 insertions(+), 37 deletions(-) diff --git a/dbt/include/clickhouse/macros/materializations/view.sql b/dbt/include/clickhouse/macros/materializations/view.sql index 8b4fde5f..42e51d77 100644 --- a/dbt/include/clickhouse/macros/materializations/view.sql +++ b/dbt/include/clickhouse/macros/materializations/view.sql @@ -2,52 +2,24 @@ {%- set existing_relation = load_cached_relation(this) -%} {%- set target_relation = this.incorporate(type='view') -%} - {%- set backup_relation = none -%} - {%- set preexisting_backup_relation = none -%} - {%- set preexisting_intermediate_relation = none -%} - - {% if existing_relation is not none %} - {%- set backup_relation_type = existing_relation.type -%} - {%- set backup_relation = make_backup_relation(target_relation, backup_relation_type) -%} - {%- set preexisting_backup_relation = load_cached_relation(backup_relation) -%} - {% if not existing_relation.can_exchange %} - {%- set intermediate_relation = make_intermediate_relation(target_relation) -%} - {%- set preexisting_intermediate_relation = load_cached_relation(intermediate_relation) -%} - {% endif %} - {% endif %} {% set grant_config = config.get('grants') %} {{ run_hooks(pre_hooks, inside_transaction=False) }} - -- drop the temp relations if they exist already in the database - {{ drop_relation_if_exists(preexisting_intermediate_relation) }} - {{ drop_relation_if_exists(preexisting_backup_relation) }} - -- `BEGIN` happens here: {{ run_hooks(pre_hooks, inside_transaction=True) }} - {% if backup_relation is none %} + {% if existing_relation is none %} {{ log('Creating new relation ' + target_relation.name )}} - -- There is not existing relation, so we can just create - {% call statement('main') -%} - {{ get_create_view_as_sql(target_relation, sql) }} - {%- endcall %} - {% elif existing_relation.can_exchange %} - -- We can do an atomic exchange, so no need for an intermediate - {% call statement('main') -%} - {{ get_create_view_as_sql(backup_relation, sql) }} - {%- endcall %} - {% do exchange_tables_atomic(backup_relation, existing_relation) %} {% else %} - -- We have to use an intermediate and rename accordingly - {% call statement('main') -%} - {{ get_create_view_as_sql(intermediate_relation, sql) }} - {%- endcall %} - {{ adapter.rename_relation(existing_relation, backup_relation) }} - {{ adapter.rename_relation(intermediate_relation, target_relation) }} + {{ log('Relation ' + target_relation.name + ' already exists, replacing it' )}} {% endif %} + {% call statement('main') -%} + {{ 
get_create_view_as_sql(target_relation, sql) }} + {%- endcall %} + -- cleanup {% set should_revoke = should_revoke(existing_relation, full_refresh_mode=True) %} {% do apply_grants(target_relation, grant_config, should_revoke=should_revoke) %} @@ -58,8 +30,6 @@ {{ adapter.commit() }} - {{ drop_relation_if_exists(backup_relation) }} - {{ run_hooks(post_hooks, inside_transaction=False) }} {{ return({'relations': [target_relation]}) }} @@ -71,7 +41,7 @@ {%- set sql_header = config.get('sql_header', none) -%} {{ sql_header if sql_header is not none }} - create view {{ relation.include(database=False) }} {{ on_cluster_clause(relation)}} + create or replace view {{ relation.include(database=False) }} {{ on_cluster_clause(relation)}} {% set contract_config = config.get('contract') %} {% if contract_config.enforced %} {{ get_assert_columns_equivalent(sql) }} From 14fd1316895ed2549d656dae5f0bda88b2fa8ce5 Mon Sep 17 00:00:00 2001 From: bentsileviav Date: Tue, 4 Feb 2025 14:54:23 +0200 Subject: [PATCH 75/96] add view tests for creat and update --- tests/integration/adapter/view/test_view.py | 71 +++++++++++++++++++++ 1 file changed, 71 insertions(+) create mode 100644 tests/integration/adapter/view/test_view.py diff --git a/tests/integration/adapter/view/test_view.py b/tests/integration/adapter/view/test_view.py new file mode 100644 index 00000000..cb6f203b --- /dev/null +++ b/tests/integration/adapter/view/test_view.py @@ -0,0 +1,71 @@ +""" +Test ClickHouse view materialization in dbt-clickhouse +""" + +import json +import pytest +from dbt.tests.util import run_dbt + +PEOPLE_SEED_CSV = """ +id,name,age,department +1231,Dade,33,engineering +6666,Ksenia,48,engineering +8888,Kate,50,engineering +""".lstrip() + +PEOPLE_VIEW_MODEL = """ +{{ config( + materialized='view' +) }} + +{% if var('run_type', '') == '' %} + select id, name, age from {{ source('raw', 'people') }} +{% elif var('run_type', '') == 'update_view' %} + select id, name, age, department from {{ source('raw', 'people') }} +{% endif %} +""" + + +SEED_SCHEMA_YML = """ +version: 2 + +sources: + - name: raw + schema: "{{ target.schema }}" + tables: + - name: people +""" + + +class TestClickHouseView: + @pytest.fixture(scope="class") + def seeds(self): + return { + "people.csv": PEOPLE_SEED_CSV, + "schema.yml": SEED_SCHEMA_YML, + } + + @pytest.fixture(scope="class") + def models(self): + return {"people_view.sql": PEOPLE_VIEW_MODEL} + + def test_create_view(self, project): + # Load seed data + run_dbt(["seed"]) + + # Run dbt to create the view + run_dbt() + + # Query the view and check if it returns expected data + result = project.run_sql("SELECT COUNT(*) FROM people_view", fetch="one") + assert result[0] == 3 # 3 records in the seed data + + # Run dbt again to apply the update + run_dbt(["run", "--vars", json.dumps({"run_type": "update_view"})]) + + # Verify the new column is present + result = project.run_sql("DESCRIBE TABLE people_view", fetch="all") + columns = {row[0] for row in result} + assert "department" in columns # New column should be present + + From 8224eee8ff00adb0082fd4d2081f5b2d9f319eb4 Mon Sep 17 00:00:00 2001 From: bentsileviav Date: Tue, 4 Feb 2025 15:38:14 +0200 Subject: [PATCH 76/96] fix lint --- tests/integration/adapter/view/test_view.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/integration/adapter/view/test_view.py b/tests/integration/adapter/view/test_view.py index cb6f203b..1ea1d945 100644 --- a/tests/integration/adapter/view/test_view.py +++ b/tests/integration/adapter/view/test_view.py @@ -67,5 +67,3 
@@ def test_create_view(self, project): result = project.run_sql("DESCRIBE TABLE people_view", fetch="all") columns = {row[0] for row in result} assert "department" in columns # New column should be present - - From ff5b6b4da7fb34bc0a0470e5c76606f887a1faa9 Mon Sep 17 00:00:00 2001 From: bentsileviav Date: Tue, 4 Feb 2025 15:57:51 +0200 Subject: [PATCH 77/96] fix lint --- tests/integration/adapter/view/test_view.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/integration/adapter/view/test_view.py b/tests/integration/adapter/view/test_view.py index 1ea1d945..fe47a335 100644 --- a/tests/integration/adapter/view/test_view.py +++ b/tests/integration/adapter/view/test_view.py @@ -3,6 +3,7 @@ """ import json + import pytest from dbt.tests.util import run_dbt From 9a4b435f1ccd9aa2027af69be920086b8f7d3cc1 Mon Sep 17 00:00:00 2001 From: bentsileviav Date: Tue, 4 Feb 2025 16:26:13 +0200 Subject: [PATCH 78/96] update CHANGELOG.md --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 322ded95..7d32987d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,7 @@ ### Improvements * Ignores incompatible settings based on the configured Engine. * Materialized view now attempts to use `ALTER TABLE...MODIFY QUERY` to update existing materialized views. This is an atomic operation so data is not lost. ([#390](https://github.com/ClickHouse/dbt-clickhouse/pull/390)) +* Make view materialization updates atomic. ([#412](https://github.com/ClickHouse/dbt-clickhouse/pull/412)) #### New Features From 41affa4be68c732acf2ebc38edc6ade544f1609e Mon Sep 17 00:00:00 2001 From: bentsileviav Date: Wed, 5 Feb 2025 11:17:01 +0200 Subject: [PATCH 79/96] Fix docker user setup due to ClickHouse/ClickHouse/pull/75259 --- .github/workflows/test_matrix.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/test_matrix.yml b/.github/workflows/test_matrix.yml index b6942287..2d4f2226 100644 --- a/.github/workflows/test_matrix.yml +++ b/.github/workflows/test_matrix.yml @@ -19,6 +19,7 @@ jobs: runs-on: ubuntu-latest env: TEST_SETTINGS_FILE: latest + CLICKHOUSE_SKIP_USER_SETUP: 1 strategy: matrix: From 001047bf5f8031d49b3c70631e8b156e2fd94c67 Mon Sep 17 00:00:00 2001 From: bentsileviav Date: Wed, 5 Feb 2025 11:34:48 +0200 Subject: [PATCH 80/96] Fix docker user setup due to ClickHouse/ClickHouse/pull/75259 --- .github/workflows/test_matrix.yml | 1 - tests/integration/docker-compose.yml | 3 +++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test_matrix.yml b/.github/workflows/test_matrix.yml index 2d4f2226..b6942287 100644 --- a/.github/workflows/test_matrix.yml +++ b/.github/workflows/test_matrix.yml @@ -19,7 +19,6 @@ jobs: runs-on: ubuntu-latest env: TEST_SETTINGS_FILE: latest - CLICKHOUSE_SKIP_USER_SETUP: 1 strategy: matrix: diff --git a/tests/integration/docker-compose.yml b/tests/integration/docker-compose.yml index e7810f0f..578995a3 100644 --- a/tests/integration/docker-compose.yml +++ b/tests/integration/docker-compose.yml @@ -22,6 +22,7 @@ services: - SERVER_INDEX=1 - SHARD_NUM=${SHARD_NUM:-1} - REPLICA_NUM=${REPLICA_NUM:-1} + - CLICKHOUSE_SKIP_USER_SETUP=1 ports: - "8123:8123" - "8443:8443" @@ -37,6 +38,7 @@ services: - SERVER_INDEX=2 - SHARD_NUM=${SHARD_NUM:-2} - REPLICA_NUM=${REPLICA_NUM:-2} + - CLICKHOUSE_SKIP_USER_SETUP=1 <<: *ch-common ch2: image: clickhouse/clickhouse-server:${DBT_CH_TEST_CH_VERSION:-latest} @@ -44,6 +46,7 @@ services: - SERVER_INDEX=3 - SHARD_NUM=${SHARD_NUM:-3} - REPLICA_NUM=${REPLICA_NUM:-3} + - 
CLICKHOUSE_SKIP_USER_SETUP=1 <<: *ch-common networks: From e4ae3ec1313f00d564d496eab6edbf1aaa52824b Mon Sep 17 00:00:00 2001 From: bentsileviav Date: Wed, 5 Feb 2025 12:36:56 +0200 Subject: [PATCH 81/96] updates for release 1.8.8 --- CHANGELOG.md | 3 +-- dbt/adapters/clickhouse/__version__.py | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7d32987d..88d616f9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,5 @@ -### Release [x.x.x] +### Release [1.8.8], 2025-02-05 ### Improvements -* Ignores incompatible settings based on the configured Engine. * Materialized view now attempts to use `ALTER TABLE...MODIFY QUERY` to update existing materialized views. This is an atomic operation so data is not lost. ([#390](https://github.com/ClickHouse/dbt-clickhouse/pull/390)) * Make view materialization updates atomic. ([#412](https://github.com/ClickHouse/dbt-clickhouse/pull/412)) diff --git a/dbt/adapters/clickhouse/__version__.py b/dbt/adapters/clickhouse/__version__.py index 8d30f618..e144cca5 100644 --- a/dbt/adapters/clickhouse/__version__.py +++ b/dbt/adapters/clickhouse/__version__.py @@ -1 +1 @@ -version = '1.8.7' +version = '1.8.8' From 7b959ae3f29849e4aa7a7de29e41de4ec0a140cc Mon Sep 17 00:00:00 2001 From: bentsileviav Date: Wed, 5 Feb 2025 14:27:47 +0200 Subject: [PATCH 82/96] revert readme changes --- CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 88d616f9..dfd45ee6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,10 @@ ### Improvements * Materialized view now attempts to use `ALTER TABLE...MODIFY QUERY` to update existing materialized views. This is an atomic operation so data is not lost. ([#390](https://github.com/ClickHouse/dbt-clickhouse/pull/390)) * Make view materialization updates atomic. ([#412](https://github.com/ClickHouse/dbt-clickhouse/pull/412)) +* Create a black list settings to ignore based on the configured Engine. ([#367](https://github.com/ClickHouse/dbt-clickhouse/pull/367)) + +#### New Features +* [ClickHouse indexes](https://clickhouse.com/docs/en/optimize/sparse-primary-indexes) are now fully supported for `table` materialization. 
#### New Features From 984e0a0cd59872bcc23eb79c931d087f0111d73e Mon Sep 17 00:00:00 2001 From: Bentsi Leviav Date: Thu, 6 Feb 2025 11:46:12 +0200 Subject: [PATCH 83/96] Revert "fix: create materialized view on cluster" --- dbt/adapters/clickhouse/relation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbt/adapters/clickhouse/relation.py b/dbt/adapters/clickhouse/relation.py index e94c79d5..87134c5a 100644 --- a/dbt/adapters/clickhouse/relation.py +++ b/dbt/adapters/clickhouse/relation.py @@ -82,7 +82,7 @@ def get_on_cluster( ) -> bool: if cluster.strip(): return ( - materialized in ('materialized_view', 'view', 'dictionary') + materialized in ('view', 'dictionary') or 'distributed' in materialized or 'Replicated' in engine ) From bc33015e02c69a17bd0854477af394aee95623b9 Mon Sep 17 00:00:00 2001 From: Bartosz Wojno Date: Tue, 4 Feb 2025 22:46:05 +0100 Subject: [PATCH 84/96] feat: allow using TLS client certificates --- README.md | 2 ++ dbt/adapters/clickhouse/credentials.py | 4 ++++ dbt/adapters/clickhouse/httpclient.py | 2 ++ dbt/adapters/clickhouse/nativeclient.py | 2 ++ 4 files changed, 10 insertions(+) diff --git a/README.md b/README.md index a5bf2543..232f5ee5 100644 --- a/README.md +++ b/README.md @@ -82,6 +82,8 @@ your_profile_name: cluster: [] # If set, certain DDL/table operations will be executed with the `ON CLUSTER` clause using this cluster. Distributed materializations require this setting to work. See the following ClickHouse Cluster section for more details. verify: [True] # Validate TLS certificate if using TLS/SSL secure: [False] # Use TLS (native protocol) or HTTPS (http protocol) + client_cert: [null] # Path to a TLS client certificate in .pem format + client_cert_key: [null] # Path to the private key for the TLS client certificate retries: [1] # Number of times to retry a "retriable" database exception (such as a 503 'Service Unavailable' error) compression: [] # Use gzip compression if truthy (http), or compression type for a native connection connect_timeout: [10] # Timeout in seconds to establish a connection to ClickHouse diff --git a/dbt/adapters/clickhouse/credentials.py b/dbt/adapters/clickhouse/credentials.py index 47d04b8a..640080f6 100644 --- a/dbt/adapters/clickhouse/credentials.py +++ b/dbt/adapters/clickhouse/credentials.py @@ -24,6 +24,8 @@ class ClickHouseCredentials(Credentials): cluster_mode: bool = False secure: bool = False verify: bool = True + client_cert: Optional[str] = None + client_cert_key: Optional[str] = None connect_timeout: int = 10 send_receive_timeout: int = 300 sync_request_timeout: int = 5 @@ -72,6 +74,8 @@ def _connection_keys(self): 'cluster_mode', 'secure', 'verify', + 'client_cert', + 'client_cert_key', 'connect_timeout', 'send_receive_timeout', 'sync_request_timeout', diff --git a/dbt/adapters/clickhouse/httpclient.py b/dbt/adapters/clickhouse/httpclient.py index bd707e5f..3f2dbaf9 100644 --- a/dbt/adapters/clickhouse/httpclient.py +++ b/dbt/adapters/clickhouse/httpclient.py @@ -66,6 +66,8 @@ def _create_client(self, credentials): send_receive_timeout=credentials.send_receive_timeout, client_name=f'dbt-adapters/{dbt_adapters_version} dbt-clickhouse/{dbt_clickhouse_version}', verify=credentials.verify, + client_cert=credentials.client_cert, + client_cert_key=credentials.client_cert_key, query_limit=0, settings=self._conn_settings, ) diff --git a/dbt/adapters/clickhouse/nativeclient.py b/dbt/adapters/clickhouse/nativeclient.py index 676b48a4..dfa4c5bd 100644 --- a/dbt/adapters/clickhouse/nativeclient.py +++ 
b/dbt/adapters/clickhouse/nativeclient.py @@ -68,6 +68,8 @@ def _create_client(self, credentials: ClickHouseCredentials): client_name=f'dbt-adapters/{dbt_adapters_version} dbt-clickhouse/{dbt_clickhouse_version} clickhouse-driver/{driver_version}', secure=credentials.secure, verify=credentials.verify, + certfile=credentials.client_cert, + keyfile=credentials.client_cert_key, connect_timeout=credentials.connect_timeout, send_receive_timeout=credentials.send_receive_timeout, sync_request_timeout=credentials.sync_request_timeout, From 15a4e8c190088b1f655320ebf608fd05b15df61e Mon Sep 17 00:00:00 2001 From: Bartosz Wojno Date: Tue, 4 Feb 2025 23:07:08 +0100 Subject: [PATCH 85/96] Update CHANGELOG.md --- CHANGELOG.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index dfd45ee6..fa1c6651 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,8 @@ +### Release [x.x.x] + +#### Improvements +* It is now possible to configure a TLS client certificate using `client_cert` and `client_cert_key` profile parameters. ([#413](https://github.com/ClickHouse/dbt-clickhouse/pull/413)) + ### Release [1.8.8], 2025-02-05 ### Improvements * Materialized view now attempts to use `ALTER TABLE...MODIFY QUERY` to update existing materialized views. This is an atomic operation so data is not lost. ([#390](https://github.com/ClickHouse/dbt-clickhouse/pull/390)) From c36c49f10443e09c1640a69cb4668c1ed9a8be25 Mon Sep 17 00:00:00 2001 From: bentsileviav Date: Wed, 12 Feb 2025 12:28:40 +0200 Subject: [PATCH 86/96] Add workflow dispatch --- .github/workflows/test_matrix.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/test_matrix.yml b/.github/workflows/test_matrix.yml index b6942287..149f0bec 100644 --- a/.github/workflows/test_matrix.yml +++ b/.github/workflows/test_matrix.yml @@ -12,6 +12,7 @@ on: # yamllint disable-line rule:truthy paths-ignore: - '**.md' - 'LICENSE' + workflow_dispatch: jobs: tests: From 256f2380dba0be6ce602f4a1ed8228829af0c28a Mon Sep 17 00:00:00 2001 From: Stephen Nancekivell Date: Fri, 14 Feb 2025 14:05:27 +1100 Subject: [PATCH 87/96] optimise adding projections and indexes in one statement, to avoid err 517 --- .../macros/materializations/table.sql | 59 +++++++++++-------- 1 file changed, 33 insertions(+), 26 deletions(-) diff --git a/dbt/include/clickhouse/macros/materializations/table.sql b/dbt/include/clickhouse/macros/materializations/table.sql index 0a53fb56..0d99d11a 100644 --- a/dbt/include/clickhouse/macros/materializations/table.sql +++ b/dbt/include/clickhouse/macros/materializations/table.sql @@ -147,39 +147,46 @@ {% call statement('create_table_empty') %} {{ create_table }} {% endcall %} - {% if config.get('projections')%} - {{ projection_statement(relation) }} - {% endif %} - {% if config.get('indexes') %} - {{ indexes_statement(relation) }} - {% endif %} + {{ add_index_and_projections(relation) }} {{ clickhouse__insert_into(relation, sql, has_contract) }} {%- endif %} {%- endmacro %} -{% macro projection_statement(relation) %} +{# + A macro that adds any configured projections or indexes at the same time. + We optimise to reduce the number of ALTER TABLE statements that are run to avoid + Code: 517. + DB::Exception: Metadata on replica is not up to date with common metadata in Zookeeper. + It means that this replica still not applied some of previous alters. Probably too many + alters executing concurrently (highly not recommended). 
+#} +{% macro add_index_and_projections(relation) %} {%- set projections = config.get('projections', default=[]) -%} - - {%- for projection in projections %} - {% call statement('add_projections') %} - ALTER TABLE {{ relation }} ADD PROJECTION {{ projection.get('name') }} - ( - {{ projection.get('query') }} - ) - {%endcall %} - {%- endfor %} -{%- endmacro %} - -{% macro indexes_statement(relation) %} {%- set indexes = config.get('indexes', default=[]) -%} - - {%- for index in indexes %} - {% call statement('add_indexes') %} - ALTER TABLE {{ relation }} ADD INDEX {{ index.get('name') }} {{ index.get('definition') }} - {%endcall %} - {%- endfor %} -{%- endmacro %} + + {% if projections | length > 0 or indexes | length > 0 %} + {% call statement('add_projections_and_indexes') %} + ALTER TABLE {{ relation }} + {%- if projections %} + {%- for projection in projections %} + ADD PROJECTION {{ projection.get('name') }} ({{ projection.get('query') }}) + {%- if not loop.last or indexes | length > 0 -%} + , + {% endif %} + {%- endfor %} + {%- endif %} + {%- if indexes %} + {%- for index in indexes %} + ADD INDEX {{ index.get('name') }} {{ index.get('definition') }} + {%- if not loop.last -%} + , + {% endif %} + {% endfor %} + {% endif %} + {% endcall %} + {% endif %} +{% endmacro %} {% macro create_table_or_empty(temporary, relation, sql, has_contract) -%} {%- set sql_header = config.get('sql_header', none) -%} From e8e65b4b759fbdaf0b5f3d1ea6ce9a08c329cc9c Mon Sep 17 00:00:00 2001 From: bentsileviav Date: Sun, 16 Feb 2025 16:33:58 +0200 Subject: [PATCH 88/96] Fix lint --- tests/integration/adapter/incremental/test_base_incremental.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/integration/adapter/incremental/test_base_incremental.py b/tests/integration/adapter/incremental/test_base_incremental.py index a31cbb05..ea372b9b 100644 --- a/tests/integration/adapter/incremental/test_base_incremental.py +++ b/tests/integration/adapter/incremental/test_base_incremental.py @@ -236,6 +236,7 @@ def test_insert_overwrite_incremental(self, project): (3, 'p1', 2, 'f'), ] + # "ReplicatedMergeTree('/clickhouse/tables/{shard}/{database}/{table}/{uuid}/', '{replica}')" insert_overwrite_replicated_inc = """ {{ config( From 175500dab38b7f29aefa176da76729fb17561e58 Mon Sep 17 00:00:00 2001 From: bentsileviav Date: Sun, 16 Feb 2025 17:39:55 +0200 Subject: [PATCH 89/96] increase version --- dbt/adapters/clickhouse/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbt/adapters/clickhouse/__version__.py b/dbt/adapters/clickhouse/__version__.py index e144cca5..49ac8b03 100644 --- a/dbt/adapters/clickhouse/__version__.py +++ b/dbt/adapters/clickhouse/__version__.py @@ -1 +1 @@ -version = '1.8.8' +version = '1.8.9' From dff943d3d02be4db12f34a1cf74cb02331b0081e Mon Sep 17 00:00:00 2001 From: bentsileviav Date: Sun, 16 Feb 2025 17:40:46 +0200 Subject: [PATCH 90/96] Update CHANGELOG.md for version v1.8.9 --- CHANGELOG.md | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 585744c6..8062cf83 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,14 @@ -### Release [x.x.x] +### Release [1.8.9], 2025-02-16 #### Improvements * It is now possible to configure a TLS client certificate using `client_cert` and `client_cert_key` profile parameters. 
([#413](https://github.com/ClickHouse/dbt-clickhouse/pull/413)) +* Added Support of insert_overwrite in cluster setup with incremental and distributed_incremental materializations ([#394](https://github.com/ClickHouse/dbt-clickhouse/pull/394)) +* Improve index and projections creation process ([#421](https://github.com/ClickHouse/dbt-clickhouse/pull/421)) + +#### Bugs +* Reverted breaking changes in MV materialization ([#416](https://github.com/ClickHouse/dbt-clickhouse/pull/416)) +* A fix was introduced for distributed tables, where an incremental local table could have been dropped if the distributed table was missing. ([#363](https://github.com/ClickHouse/dbt-clickhouse/pull/363)) + ### Release [1.8.8], 2025-02-05 ### Improvements @@ -53,7 +60,6 @@ The index config should be added to the model config. for instance: * Added support for [range_hashed](https://clickhouse.com/docs/en/sql-reference/dictionaries#range_hashed) and [complex_key_range_hashed](https://clickhouse.com/docs/en/sql-reference/dictionaries#complex_key_range_hashed) layouts to the dictionary materialization. ([#361](https://github.com/ClickHouse/dbt-clickhouse/pull/361)) * Truncated stack trace for database errors for cleaner output when HIDE_STACK_TRACE variable is set to any value. ([#382](https://github.com/ClickHouse/dbt-clickhouse/pull/382)) * It is now possible to pass query settings not only on table creation but also on query. ([#362](https://github.com/ClickHouse/dbt-clickhouse/pull/362)) -* Added Support of insert_overwrite in cluster setup with incremental and distributed_incremental materializations ([#394](https://github.com/ClickHouse/dbt-clickhouse/pull/394)) ### Bug Fixes From 1a9aed52056bfba032100ef0fdf59357730381fe Mon Sep 17 00:00:00 2001 From: Xiatong Zheng Date: Thu, 20 Feb 2025 16:52:13 +0100 Subject: [PATCH 91/96] chore: https://github.com/ClickHouse/dbt-clickhouse/commit/67cf9db5285b9d1c41ce321bd902ece577b6cc34 --- dbt/include/clickhouse/macros/materializations/table.sql | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/dbt/include/clickhouse/macros/materializations/table.sql b/dbt/include/clickhouse/macros/materializations/table.sql index 7f0ee72e..3c5e4cc8 100644 --- a/dbt/include/clickhouse/macros/materializations/table.sql +++ b/dbt/include/clickhouse/macros/materializations/table.sql @@ -130,10 +130,11 @@ {% macro on_cluster_clause(relation, force_sync) %} {% set active_cluster = adapter.get_clickhouse_cluster_name() %} {%- if active_cluster is not none and relation.should_on_cluster %} - ON CLUSTER '{{ active_cluster }}' - {% if force_sync %} + {# Add trailing whitespace to avoid problems when this clause is not last #} + ON CLUSTER '{{ active_cluster + ' ' }}' + {%- if force_sync %} SYNC - {% endif %} + {%- endif %} {%- endif %} {%- endmacro -%} From 7f30e322b1c395eec5d2850ad91bba01e4c0b2e5 Mon Sep 17 00:00:00 2001 From: Xiatong Zheng Date: Fri, 21 Feb 2025 09:28:55 +0100 Subject: [PATCH 92/96] feat: change test branch --- .github/workflows/test_matrix.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test_matrix.yml b/.github/workflows/test_matrix.yml index 149f0bec..675447b5 100644 --- a/.github/workflows/test_matrix.yml +++ b/.github/workflows/test_matrix.yml @@ -3,7 +3,7 @@ name: "test_matrix" on: # yamllint disable-line rule:truthy pull_request: - branches: main + branches: dap-main push: branches-ignore: - '*_test' From ff977b7dbc9d4834f3aa051b02b7bbdbe454e5ed Mon Sep 17 00:00:00 2001 From: Xiatong Zheng Date: Fri, 21 Feb 
2025 10:16:02 +0100 Subject: [PATCH 93/96] ci: reduce test senarios --- .github/workflows/test_matrix.yml | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/.github/workflows/test_matrix.yml b/.github/workflows/test_matrix.yml index 675447b5..82342c60 100644 --- a/.github/workflows/test_matrix.yml +++ b/.github/workflows/test_matrix.yml @@ -24,16 +24,9 @@ jobs: strategy: matrix: python-version: - - '3.9' - - '3.10' - '3.11' - - '3.12' clickhouse-version: - - '23.8' - - '24.1' - - '24.2' - - '24.3' - - latest + - '24.12' steps: - name: Checkout From 5f974e76b78ff0d58dda588aa1d377b120bce342 Mon Sep 17 00:00:00 2001 From: Xiatong Zheng Date: Fri, 21 Feb 2025 10:51:15 +0100 Subject: [PATCH 94/96] fix: remove quotes around cluster --- dbt/include/clickhouse/macros/materializations/table.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbt/include/clickhouse/macros/materializations/table.sql b/dbt/include/clickhouse/macros/materializations/table.sql index 3c5e4cc8..c521b614 100644 --- a/dbt/include/clickhouse/macros/materializations/table.sql +++ b/dbt/include/clickhouse/macros/materializations/table.sql @@ -131,7 +131,7 @@ {% set active_cluster = adapter.get_clickhouse_cluster_name() %} {%- if active_cluster is not none and relation.should_on_cluster %} {# Add trailing whitespace to avoid problems when this clause is not last #} - ON CLUSTER '{{ active_cluster + ' ' }}' + ON CLUSTER {{ active_cluster + ' ' }} {%- if force_sync %} SYNC {%- endif %} From 3e4cc125b97d2af3c93b6519a4547c3d1b753c91 Mon Sep 17 00:00:00 2001 From: Xiatong Zheng Date: Fri, 21 Feb 2025 10:57:49 +0100 Subject: [PATCH 95/96] fix: view --- dbt/include/clickhouse/macros/materializations/view.sql | 1 + 1 file changed, 1 insertion(+) diff --git a/dbt/include/clickhouse/macros/materializations/view.sql b/dbt/include/clickhouse/macros/materializations/view.sql index 494e03b4..94a7494c 100644 --- a/dbt/include/clickhouse/macros/materializations/view.sql +++ b/dbt/include/clickhouse/macros/materializations/view.sql @@ -46,6 +46,7 @@ {% if contract_config.enforced %} {{ get_assert_columns_equivalent(sql) }} {%- endif %} + as ( {% if sql is none %} {{clickhouse__create_select_query_from_schema()}} {%- else -%} From fb00685c8c3e2bc3237db8606bd789eb1531a33b Mon Sep 17 00:00:00 2001 From: Xiatong Zheng Date: Fri, 21 Feb 2025 14:29:54 +0100 Subject: [PATCH 96/96] fix: insert overwrite signature --- .../macros/materializations/incremental/incremental.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbt/include/clickhouse/macros/materializations/incremental/incremental.sql b/dbt/include/clickhouse/macros/materializations/incremental/incremental.sql index 26b96d41..a193819c 100644 --- a/dbt/include/clickhouse/macros/materializations/incremental/incremental.sql +++ b/dbt/include/clickhouse/macros/materializations/incremental/incremental.sql @@ -266,7 +266,7 @@ {{ drop_relation_if_exists(distributed_new_data_relation) }} {% endmacro %} -{% macro clickhouse__incremental_insert_overwrite(existing_relation, intermediate_relation, partition_by) %} +{% macro clickhouse__incremental_insert_overwrite(existing_relation, partition_by, is_distributed=False) %} {%- set new_data_relation = make_intermediate_relation(existing_relation, '__dbt_new_data') -%} {{ drop_relation_if_exists(new_data_relation) }} {%- set distributed_new_data_relation = existing_relation.incorporate(path={"identifier": existing_relation.identifier + '__dbt_distributed_new_data'}) -%}
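

For readers following the regex change in PATCH 71/96: the `--<name>:begin` / `--<name>:end` markers it parses are written directly in the body of a multi-materialized-view model. The sketch below is illustrative only — the `raw.events` / `raw.other_events` sources, the column names, and the `union all` layout are assumptions, not taken from the patches; only the marker convention and the now-optional whitespace after `--` come from the macro itself.

```sql
-- hypothetical model file; sources and columns are invented for illustration
{{ config(
    materialized='materialized_view',
    engine='MergeTree()',
    order_by='id'
) }}

--mv1:begin
select id, value from {{ source('raw', 'events') }}
--mv1:end
union all
-- mv2:begin
select id, value from {{ source('raw', 'other_events') }}
-- mv2:end
```

With `(?:\s)?` added to both `findall` calls, the spaced `-- mv2:begin` form is captured the same way as the unspaced `--mv1:begin`, so each marked block becomes its own materialized view feeding the model's target table.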
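Similarly, the combined `ALTER TABLE ... ADD PROJECTION ..., ADD INDEX ...` statement built by `add_index_and_projections` in PATCH 87/96 is driven by model config shaped roughly like the sketch below. Only the `projections`/`indexes` keys and their `name`/`query`/`definition` fields come from the macro; the table columns, the projection query, and the `bloom_filter` index are invented for illustration.

```sql
-- hypothetical table model; column names and index/projection contents are assumed
{{ config(
    materialized='table',
    engine='MergeTree()',
    order_by='id',
    projections=[
        {'name': 'daily_totals', 'query': 'select toDate(ts) as day, sum(amount) group by day'}
    ],
    indexes=[
        {'name': 'idx_user', 'definition': 'user_id TYPE bloom_filter GRANULARITY 4'}
    ]
) }}

select id, ts, user_id, amount
from {{ source('raw', 'payments') }}
```

Because both lists are folded into a single `ALTER TABLE`, replicas apply one metadata change instead of one per projection or index, which is what avoids the Code 517 "replica is not up to date" error quoted in the macro comment.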
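Finally, the `clickhouse__incremental_insert_overwrite(existing_relation, partition_by, is_distributed=False)` signature fixed in PATCH 96/96 is reached from incremental models configured along the lines below — a hedged sketch in which the partition expression, columns, source, and incremental filter are assumptions; `incremental_strategy='insert_overwrite'` and `partition_by` are the settings the strategy actually reads.

```sql
-- hypothetical incremental model; partition expression and filter are illustrative
{{ config(
    materialized='incremental',
    incremental_strategy='insert_overwrite',
    engine='MergeTree()',
    order_by='id',
    partition_by='toYYYYMM(event_date)'
) }}

select id, event_date, payload
from {{ source('raw', 'events') }}
{% if is_incremental() %}
where event_date >= toStartOfMonth(today())
{% endif %}
```

On an incremental run the strategy stages fresh rows in a `__dbt_new_data` intermediate relation and then replaces only the matching partitions of the existing table, as the intermediate-relation names visible in the macro above suggest.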