diff --git a/dags.yaml b/dags.yaml
index 79e4d03e430..27f5e08537e 100644
--- a/dags.yaml
+++ b/dags.yaml
@@ -2323,6 +2323,20 @@ bqetl_monitoring_hourly:
     - impact/tier_1
     - repo/bigquery-etl
 
+bqetl_content_ml_daily:
+  schedule_interval: 0 4 * * *
+  description: |
+    Daily extracts for corpus item data to be evaluated for new tab presentation.
+  default_args:
+    owner: skamath@mozilla.com
+    start_date: "2025-05-05" # TODO: Change to merge date
+    email: ["skamath@mozilla.com", "mlcooper@mozilla.com"]
+    retries: 2
+    retry_delay: 30m
+  tags:
+    - impact/tier_2
+    - repo/bigquery-etl
+
 bqetl_ech_adoption_rate:
   default_args:
     depends_on_past: false
diff --git a/sql/moz-fx-data-shared-prod/snowflake_migration/dataset_metadata.yaml b/sql/moz-fx-data-shared-prod/snowflake_migration/dataset_metadata.yaml
new file mode 100644
index 00000000000..31ea0e5e63f
--- /dev/null
+++ b/sql/moz-fx-data-shared-prod/snowflake_migration/dataset_metadata.yaml
@@ -0,0 +1,12 @@
+friendly_name: Snowflake Migration
+description: |-
+  Queries for modeling raw snowplow data about prospects
+  (candidate links to be curated by ML models and the editorial team for New Tab).
+
+user_facing: false
+dataset_base_acl: derived
+labels: {}
+workgroup_access:
+  - role: roles/bigquery.dataViewer
+    members:
+      - workgroup:mozilla-confidential
diff --git a/sql/moz-fx-data-shared-prod/snowflake_migration/prospects_v1/metadata.yaml b/sql/moz-fx-data-shared-prod/snowflake_migration/prospects_v1/metadata.yaml
new file mode 100644
index 00000000000..cee41256b11
--- /dev/null
+++ b/sql/moz-fx-data-shared-prod/snowflake_migration/prospects_v1/metadata.yaml
@@ -0,0 +1,28 @@
+friendly_name: Prospects v1
+description: Model prospective article options for the Fx New Tab from raw Snowplow data
+owners:
+  - skamath@mozilla.com
+  - mlcooper@mozilla.com
+labels:
+  schedule: daily
+  incremental: true
+  owner1: skamath
+  owner2: mlcooper
+scheduling:
+  dag_name: bqetl_content_ml_daily
+  # destination is the whole table, not a single partition,
+  # so don't use date_partition_parameter
+  date_partition_parameter: null
+bigquery:
+  time_partitioning:
+    type: day
+    # TODO: confirm
+    field: happened_at
+    require_partition_filter: true
+    expiration_days: 775
+  clustering:
+    fields:
+      - prospect_source
+      - language
+      - topic
+      - publisher
diff --git a/sql/moz-fx-data-shared-prod/snowflake_migration/prospects_v1/query.sql b/sql/moz-fx-data-shared-prod/snowflake_migration/prospects_v1/query.sql
new file mode 100644
index 00000000000..c4e2db6122b
--- /dev/null
+++ b/sql/moz-fx-data-shared-prod/snowflake_migration/prospects_v1/query.sql
@@ -0,0 +1,93 @@
+-- Model New Tab prospect events from raw Snowplow data, keeping the most recent event
+-- for each (prospect_id, object_update_trigger) pair.
+WITH stg_prospects AS (
+  -- Flatten the first prospect context attached to each prospect `object_update` event.
+  SELECT
+    event_id,
+    unstruct_event_com_pocket_object_update_1.trigger AS object_update_trigger,
+    -- prospect info
+    contexts_com_pocket_prospect_1[0].prospect_id AS prospect_id,
+    contexts_com_pocket_prospect_1[0].url AS url,
+    contexts_com_pocket_prospect_1[0].scheduled_surface_id AS scheduled_surface_id,
+    contexts_com_pocket_prospect_1[0].prospect_source AS prospect_source,
+    -- created_at is treated as epoch seconds
+    TIMESTAMP_SECONDS(contexts_com_pocket_prospect_1[0].created_at) AS created_at,
+    -- reviewed_at is in epoch milliseconds, so convert it to seconds first
+    TIMESTAMP_SECONDS(DIV(contexts_com_pocket_prospect_1[0].reviewed_at, 1000)) AS reviewed_at,
+    contexts_com_pocket_prospect_1[0].prospect_review_status AS prospect_review_status,
+    ARRAY_TO_STRING(contexts_com_pocket_prospect_1[0].status_reasons, ",") AS status_reasons,
+    contexts_com_pocket_prospect_1[0].status_reason_comment AS status_reason_comment,
+    contexts_com_pocket_prospect_1[0].reviewed_by AS reviewed_by,
+    contexts_com_pocket_prospect_1[0].title AS title,
+    contexts_com_pocket_prospect_1[0].excerpt AS excerpt,
+    contexts_com_pocket_prospect_1[0].image_url AS image_url,
+    contexts_com_pocket_prospect_1[0].language AS language,
+    contexts_com_pocket_prospect_1[0].topic AS topic,
+    contexts_com_pocket_prospect_1[0].is_collection AS is_collection,
+    contexts_com_pocket_prospect_1[0].is_syndicated AS is_syndicated,
+    ARRAY_TO_STRING(contexts_com_pocket_prospect_1[0].authors, ",") AS authors,
+    contexts_com_pocket_prospect_1[0].publisher AS publisher,
+    contexts_com_pocket_prospect_1[0].domain AS domain,
+    TO_JSON(contexts_com_pocket_prospect_1[0].features) AS features,
+    TO_JSON(contexts_com_pocket_prospect_1[0].run_details) AS run_details,
+    contexts_com_pocket_prospect_1[0]._schema_version AS schema_version,
+    -- event info
+    derived_tstamp AS happened_at,
+  FROM
+    `moz-fx-data-shared-prod.snowplow_external.events`
+  WHERE
+    event_name = 'object_update'
+    AND unstruct_event_com_pocket_object_update_1.object = 'prospect'
+    -- Keep created_at and reviewed_at between Jan 1, 2000 (946684800 epoch seconds) and the
+    -- current time so TIMESTAMP_SECONDS stays within BQ limits for timestamps. The compared
+    -- values are epoch seconds, so the upper bound uses UNIX_SECONDS.
+    AND SAFE_CAST(contexts_com_pocket_prospect_1[0].created_at AS INT64) IS NOT NULL
+    AND SAFE_CAST(contexts_com_pocket_prospect_1[0].created_at AS INT64)
+      BETWEEN 946684800
+      AND UNIX_SECONDS(CURRENT_TIMESTAMP())
+    AND SAFE_CAST(contexts_com_pocket_prospect_1[0].reviewed_at AS INT64) IS NOT NULL
+    -- reviewed_at is in milliseconds, so divide by 1000 before comparing against seconds
+    AND SAFE_CAST(DIV(contexts_com_pocket_prospect_1[0].reviewed_at, 1000) AS INT64)
+      BETWEEN 946684800
+      AND UNIX_SECONDS(CURRENT_TIMESTAMP())
+  QUALIFY
+    -- Drop repeated deliveries of the same event. NOTE(review): assumes event_id is unique
+    -- per Snowplow event (confirm); distinct events can share a derived_tstamp.
+    ROW_NUMBER() OVER (PARTITION BY event_id ORDER BY happened_at) = 1
+)
+SELECT
+  p.prospect_id,
+  p.object_update_trigger,
+  p.url,
+  p.scheduled_surface_id,
+  p.prospect_source,
+  p.created_at,
+  p.reviewed_at,
+  p.prospect_review_status,
+  p.status_reasons,
+  p.status_reason_comment,
+  p.reviewed_by,
+  p.title,
+  p.excerpt,
+  p.image_url,
+  p.language,
+  p.topic,
+  p.authors,
+  p.publisher,
+  p.domain,
+  p.is_collection,
+  p.is_syndicated,
+  p.happened_at,
+  p.features,
+  p.run_details,
+  p.schema_version,
+  -- Surrogate key: base64-encoded SHA-256 of prospect_id and object_update_trigger concatenated.
+  -- CONCAT returns NULL when either input is NULL, so the key is NULL for such rows.
+  TO_BASE64(
+    SHA256(CONCAT(p.prospect_id, p.object_update_trigger))
+  ) AS prospect_id_object_update_trigger_key
+FROM
+  stg_prospects AS p
+QUALIFY
+  -- Keep only the latest event per prospect and trigger.
+  ROW_NUMBER() OVER (PARTITION BY prospect_id, object_update_trigger ORDER BY happened_at DESC) = 1;
diff --git a/sql/moz-fx-data-shared-prod/snowflake_migration/prospects_v1/schema.yaml b/sql/moz-fx-data-shared-prod/snowflake_migration/prospects_v1/schema.yaml
new file mode 100644
index 00000000000..aa33f434876
--- /dev/null
+++ b/sql/moz-fx-data-shared-prod/snowflake_migration/prospects_v1/schema.yaml
@@ -0,0 +1,118 @@
+fields:
+- mode: NULLABLE
+  type: STRING
+  name: prospect_id
+  description: Unique identifier for prospects
+- mode: NULLABLE
+  type: STRING
+  name: url
+  description: The url of the prospects item
+- mode: NULLABLE
+  type: STRING
+  name: scheduled_surface_id
+  description: Recommended destination where the prospect item is expected to appear
+    (NEW_TAB_EN_INTL, NEW_TAB_EN_US, NEW_TAB_DE_DE, NEW_TAB_EN_GB).
+- mode: NULLABLE
+  type: STRING
+  name: prospect_source
+  description: Source identified by the ML process for the prospect (SYNDICATED, ORGANIC_TIMESPENT,
+    GLOBAL).
+- mode: NULLABLE
+  type: TIMESTAMP
+  name: created_at
+  description: timestamp when the prospect was first created
+- mode: NULLABLE
+  type: TIMESTAMP
+  name: reviewed_at
+  description: timestamp when the prospect was reviewed by the curator. It is also
+    the timestamp that caused this event action
+- mode: NULLABLE
+  type: STRING
+  name: prospect_review_status
+  description: The curator's review status for the prospect
+- mode: NULLABLE
+  type: STRING
+  name: status_reasons
+  description: The list of curator review status reasons
+- mode: NULLABLE
+  type: STRING
+  name: status_reason_comment
+  description: Curator review status reason comment
+- mode: NULLABLE
+  type: STRING
+  name: reviewed_by
+  description: The curator who reviewed the prospect
+- mode: NULLABLE
+  type: STRING
+  name: title
+  description: The title of the reviewed corpus item
+- mode: NULLABLE
+  type: STRING
+  name: excerpt
+  description: The excerpt for the reviewed corpus item
+- mode: NULLABLE
+  type: STRING
+  name: image_url
+  description: The url of the main image of the reviewed corpus item
+- mode: NULLABLE
+  type: STRING
+  name: language
+  description: The language of the reviewed_corpus_item
+- mode: NULLABLE
+  type: STRING
+  name: topic
+  description: The topic of the reviewed_corpus_item
+- mode: NULLABLE
+  type: STRING
+  name: authors
+  description: The list of authors of the reviewed_corpus_item
+- mode: NULLABLE
+  type: STRING
+  name: publisher
+  description: The name of the online publication that published this story.
+- mode: NULLABLE
+  type: BOOLEAN
+  name: is_collection
+  description: Indicates whether the reviewed_corpus_item is a collection
+- mode: NULLABLE
+  type: BOOLEAN
+  name: is_syndicated
+  description: Indicates whether the reviewed_corpus_item is a syndicated article
+- mode: NULLABLE
+  type: JSON
+  name: features
+  description: 'ML features for the prospect item. Note: These features are being
+    passed through for ML use and will not be engineered in the warehouse
+
+    '
+- mode: NULLABLE
+  type: JSON
+  name: run_details
+  description: 'Details about the run, including the flow name and run ID. Note: The
+    ML team could include additional debug properties about the run, but these properties
+    will not be engineered in the warehouse
+
+    '
+- mode: NULLABLE
+  type: STRING
+  name: schema_version
+  description: Snowplow version identifier.
+- mode: NULLABLE
+  type: TIMESTAMP
+  name: happened_at
+  description: Event creation timestamp
+- mode: NULLABLE
+  type: STRING
+  name: prospect_id_object_update_trigger_key
+  description: 'A combination of the prospect_id and object_update_trigger columns
+    to create a unique event identifier
+
+    '
+- mode: NULLABLE
+  type: STRING
+  name: object_update_trigger
+  description: The trigger reported on the object_update event for the prospect
+- mode: NULLABLE
+  type: STRING
+  name: domain
+  description: The domain reported for the prospect