Commit 66b7b66

Make response_format a first class citizen and update endpoints to support it
1 parent e34413b

18 files changed: +320 −74 lines
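At a glance: response_format is promoted from an extra_model_params passenger to a named argument on DiscourseAi::Completions::Llm#generate, and each endpoint translates it into whatever its API actually supports. A minimal sketch of the intended call site, assuming the reply schema used by fold_content.rb below (the prompt, user, and block body are illustrative):

schema = {
  type: "json_schema",
  json_schema: {
    name: "reply",
    schema: {
      type: "object",
      properties: { summary: { type: "string" } },
      required: ["summary"],
      additionalProperties: false,
    },
    strict: true,
  },
}

llm.generate(prompt, user: Discourse.system_user, response_format: schema) do |partial, cancel|
  # partials stream in as before; the final reply is constrained to the schema
end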

lib/completions/dialects/nova.rb (+1)

@@ -42,6 +42,7 @@ def to_payload(options = nil)
         result = { system: system, messages: messages }
         result[:inferenceConfig] = inference_config if inference_config.present?
         result[:toolConfig] = tool_config if tool_config.present?
+        result[:response_format] = { type: "json_object" } if options[:response_format].present?

         result
       end
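On Nova this surfaces as generic JSON mode rather than a full schema: whatever schema the caller supplied, the payload only asserts { type: "json_object" }. A rough sketch of the resulting payload (system and message content are illustrative, not the dialect's real output):

{
  system: [{ text: "You are a summarizer." }],
  messages: [{ role: "user", content: [{ text: "Summarize this topic." }] }],
  response_format: { type: "json_object" },
}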

lib/completions/endpoints/anthropic.rb (+18 −5)

@@ -88,10 +88,16 @@ def xml_tools_enabled?
       def prepare_payload(prompt, model_params, dialect)
         @native_tool_support = dialect.native_tool_support?

-        payload = default_options(dialect).merge(model_params).merge(messages: prompt.messages)
+        payload =
+          default_options(dialect).merge(model_params.except(:response_format)).merge(
+            messages: prompt.messages,
+          )

         payload[:system] = prompt.system_prompt if prompt.system_prompt.present?
         payload[:stream] = true if @streaming_mode
+
+        prefilled_message = +""
+
         if prompt.has_tools?
           payload[:tools] = prompt.tools
           if dialect.tool_choice.present?
@@ -100,16 +106,23 @@ def prepare_payload(prompt, model_params, dialect)

              # prefill prompt to nudge the LLM to generate a response that is useful.
              # without this the LLM (even 3.7) can get confused and start text preambles for tool calls.
-             payload[:messages] << {
-               role: "assistant",
-               content: dialect.no_more_tool_calls_text,
-             }
+             prefilled_message << dialect.no_more_tool_calls_text
            else
              payload[:tool_choice] = { type: "tool", name: prompt.tool_choice }
            end
          end
        end

+        # Prefill prompt to force JSON output.
+        if model_params[:response_format].present?
+          prefilled_message << " " if !prefilled_message.empty?
+          prefilled_message << "{"
+        end
+
+        if !prefilled_message.empty?
+          payload[:messages] << { role: "assistant", content: prefilled_message }
+        end
+
         payload
       end
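Anthropic's Messages API has no response_format parameter, so the endpoint leans on assistant prefill: the request ends with a partial assistant turn and Claude continues from it. A sketch of the last message when both the no-more-tools nudge and JSON forcing apply (the nudge text is illustrative; the real string comes from dialect.no_more_tool_calls_text):

payload[:messages].last
# => { role: "assistant", content: "No more tool calls. {" }
#
# Claude then resumes mid-object, e.g. `"summary":"..."}`, which is why the
# downstream parser in fold_content.rb treats the opening brace as optional.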

lib/completions/endpoints/aws_bedrock.rb (+17 −5)

@@ -116,9 +116,14 @@ def prepare_payload(prompt, model_params, dialect)
         payload = nil

         if dialect.is_a?(DiscourseAi::Completions::Dialects::Claude)
-          payload = default_options(dialect).merge(model_params).merge(messages: prompt.messages)
+          payload =
+            default_options(dialect).merge(model_params.except(:response_format)).merge(
+              messages: prompt.messages,
+            )
           payload[:system] = prompt.system_prompt if prompt.system_prompt.present?

+          prefilled_message = +""
+
           if prompt.has_tools?
             payload[:tools] = prompt.tools
             if dialect.tool_choice.present?
@@ -128,15 +133,22 @@ def prepare_payload(prompt, model_params, dialect)
                # payload[:tool_choice] = { type: "none" }

                # prefill prompt to nudge the LLM to generate a response that is useful, instead of trying to call a tool
-               payload[:messages] << {
-                 role: "assistant",
-                 content: dialect.no_more_tool_calls_text,
-               }
+               prefilled_message << dialect.no_more_tool_calls_text
             else
               payload[:tool_choice] = { type: "tool", name: prompt.tool_choice }
             end
           end
+
+          # Prefill prompt to force JSON output.
+          if model_params[:response_format].present?
+            prefilled_message << " " if !prefilled_message.empty?
+            prefilled_message << "{"
+          end
+
+          if !prefilled_message.empty?
+            payload[:messages] << { role: "assistant", content: prefilled_message }
+          end
         elsif dialect.is_a?(DiscourseAi::Completions::Dialects::Nova)
           payload = prompt.to_payload(default_options(dialect).merge(model_params))
         else
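Bedrock routes by dialect: Claude models get the same prefill treatment as the native Anthropic endpoint, while Nova models pick the flag up inside Nova#to_payload (the dialect change above). Condensed, the dispatch is roughly:

# Sketch of the branching, not the literal code:
if dialect.is_a?(DiscourseAi::Completions::Dialects::Claude)
  # strip :response_format from model_params, then prefill "{" as a partial assistant turn
elsif dialect.is_a?(DiscourseAi::Completions::Dialects::Nova)
  # pass model_params through; to_payload emits response_format: { type: "json_object" }
end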

lib/completions/endpoints/canned_response.rb (+9)

@@ -40,6 +40,8 @@ def perform_completion!(
             "The number of completions you requested exceed the number of canned responses"
         end

+        response = transform_from_schema(response) if model_params[:response_format].present?
+
         raise response if response.is_a?(StandardError)

         @completions += 1
@@ -80,6 +82,13 @@ def is_thinking?(response)
       def is_tool?(response)
         response.is_a?(DiscourseAi::Completions::ToolCall)
       end
+
+      def transform_from_schema(response)
+        key = model_params[:response_format].dig(:json_schema, :schema, :properties)&.keys&.first
+        return response if key.nil?
+
+        { key => response }.to_json
+      end
     end
   end
 end
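transform_from_schema is what lets the specs below drop their in_json_format helpers: the canned endpoint reads the first property name out of the requested schema and wraps the plain canned string to match. With the reply schema from fold_content.rb:

# model_params[:response_format] declares a single "summary" property, so:
transform_from_schema("dummy")
# => "{\"summary\":\"dummy\"}"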

lib/completions/endpoints/gemini.rb (+10 −1)

@@ -84,7 +84,16 @@ def prepare_payload(prompt, model_params, dialect)

           payload[:tool_config] = { function_calling_config: function_calling_config }
         end
-        payload[:generationConfig].merge!(model_params) if model_params.present?
+        if model_params.present?
+          payload[:generationConfig].merge!(model_params.except(:response_format))
+
+          if model_params[:response_format].present?
+            # https://ai.google.dev/api/generate-content#generationconfig
+            payload[:generationConfig][:responseSchema] = model_params[:response_format]
+            payload[:generationConfig][:responseMimeType] = "application/json"
+          end
+        end
+
         payload
       end
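Gemini supports structured output natively, so no prefill is needed; the schema and MIME type ride along in generationConfig next to the sampling params. The merged config would look roughly like this (temperature illustrative):

payload[:generationConfig]
# => {
#      temperature: 0.7,
#      responseSchema: model_params[:response_format],
#      responseMimeType: "application/json",
#    }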

lib/completions/endpoints/samba_nova.rb (+6 −1)

@@ -34,7 +34,12 @@ def model_uri
       end

       def prepare_payload(prompt, model_params, dialect)
-        payload = default_options.merge(model_params).merge(messages: prompt)
+        payload =
+          default_options.merge(model_params.except(:response_format)).merge(messages: prompt)
+
+        if model_params[:response_format].present?
+          payload[:response_format] = { type: "json_object" }
+        end

         payload[:stream] = true if @streaming_mode
lib/completions/llm.rb (+5 −1)

@@ -241,7 +241,8 @@ def initialize(dialect_klass, gateway_klass, llm_model, gateway: nil)
     # @param feature_context { Hash - Optional } - The feature context to use for the completion.
     # @param partial_tool_calls { Boolean - Optional } - If true, the completion will return partial tool calls.
     # @param output_thinking { Boolean - Optional } - If true, the completion will return the thinking output for thinking models.
-    # @param extra_model_params { Hash - Optional } - Other params that are not available accross models. e.g. response_format JSON schema.
+    # @param response_format { Hash - Optional } - JSON schema passed to the API as the desired structured output.
+    # @param [Experimental] extra_model_params { Hash - Optional } - Other params that are not available across models, e.g. response_format JSON schema.
     #
     # @param &on_partial_blk { Block - Optional } - The passed block will get called with the LLM partial response alongside a cancel function.
     #
@@ -259,6 +260,7 @@ def generate(
       feature_context: nil,
       partial_tool_calls: false,
       output_thinking: false,
+      response_format: nil,
       extra_model_params: nil,
       &partial_read_blk
     )
@@ -274,6 +276,7 @@ def generate(
           feature_context: feature_context,
           partial_tool_calls: partial_tool_calls,
           output_thinking: output_thinking,
+          response_format: response_format,
           extra_model_params: extra_model_params,
         },
       )
@@ -282,6 +285,7 @@ def generate(

       model_params[:temperature] = temperature if temperature
       model_params[:top_p] = top_p if top_p
+      model_params[:response_format] = response_format if response_format
       model_params.merge!(extra_model_params) if extra_model_params

       if prompt.is_a?(String)
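One ordering detail worth noting: response_format is written into model_params before extra_model_params is merged, so the older escape hatch still wins on a key collision. An illustrative call (schema_a and schema_b are hypothetical):

llm.generate(
  prompt,
  user: user,
  response_format: schema_a,
  extra_model_params: { response_format: schema_b }, # merged last, so schema_b wins
)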

lib/summarization/fold_content.rb (+14 −16)

@@ -115,7 +115,7 @@ def fold(items, user, &on_partial_blk)
       # Auxiliary variables to get the summary content from the JSON response.
       raw_buffer = +""
       json_start_found = false
-      json_reply_start_regex = /\{\s*"summary"\s*:\s*"/
+      json_reply_start_regex = /\{?\s*"summary"\s*:\s*"/ # { is optional because Claude uses prefill, so it's not included.
       unescape_regex = %r{\\(["/bfnrt])}
       json_reply_end = "\"}"

@@ -143,7 +143,7 @@ def fold(items, user, &on_partial_blk)
         end
       end

-      bot.reply(context, llm_args: { extra_model_params: response_format }, &buffer_blk)
+      bot.reply(context, llm_args: { response_format: response_format_schema }, &buffer_blk)

      summary.chomp(json_reply_end)
    end
@@ -172,24 +172,22 @@ def truncate(item)
      item
    end

-    def response_format
+    def response_format_schema
      {
-        response_format: {
-          type: "json_schema",
-          json_schema: {
-            name: "reply",
-            schema: {
-              type: "object",
-              properties: {
-                summary: {
-                  type: "string",
-                },
-              },
-              required: ["summary"],
-              additionalProperties: false,
-            },
-            strict: true,
-          },
-        },
+        type: "json_schema",
+        json_schema: {
+          name: "reply",
+          schema: {
+            type: "object",
+            properties: {
+              summary: {
+                type: "string",
+              },
+            },
+            required: ["summary"],
+            additionalProperties: false,
+          },
+          strict: true,
+        },
      }
    end
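A worked example of why the opening brace became optional in json_reply_start_regex: with the Claude prefill, the "{" is part of the request, not the streamed reply. Against the updated pattern:

json_reply_start_regex = /\{?\s*"summary"\s*:\s*"/

# OpenAI-style reply (a complete JSON object):
'{"summary":"A short gist"}'.match?(json_reply_start_regex) # => true
# Claude reply after the "{" prefill (the brace was sent by us):
'"summary":"A short gist"}'.match?(json_reply_start_regex)  # => true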

spec/jobs/regular/fast_track_topic_gist_spec.rb (+3 −7)

@@ -24,13 +24,9 @@

   let(:updated_gist) { "They updated me :(" }

-  def in_json_format(summary)
-    "{\"summary\":\"#{summary}\"}"
-  end
-
   context "when it's up to date" do
     it "does nothing" do
-      DiscourseAi::Completions::Llm.with_prepared_responses([in_json_format(updated_gist)]) do
+      DiscourseAi::Completions::Llm.with_prepared_responses([updated_gist]) do
         subject.execute(topic_id: topic_1.id)
       end

@@ -44,7 +40,7 @@ def in_json_format(summary)
     before { Fabricate(:post, topic: topic_1, post_number: 3) }

     it "regenerates the gist using the latest data" do
-      DiscourseAi::Completions::Llm.with_prepared_responses([in_json_format(updated_gist)]) do
+      DiscourseAi::Completions::Llm.with_prepared_responses([updated_gist]) do
         subject.execute(topic_id: topic_1.id)
       end

@@ -57,7 +53,7 @@ def in_json_format(summary)
     it "does nothing if the gist was created less than 5 minutes ago" do
       ai_gist.update!(created_at: 2.minutes.ago)

-      DiscourseAi::Completions::Llm.with_prepared_responses([in_json_format(updated_gist)]) do
+      DiscourseAi::Completions::Llm.with_prepared_responses([updated_gist]) do
        subject.execute(topic_id: topic_1.id)
      end

spec/jobs/regular/stream_topic_ai_summary_spec.rb (+2 −6)

@@ -50,14 +50,10 @@ def with_responses(responses)
     end
   end

-  def in_json_format(summary)
-    "{\"summary\":\"#{summary}\"}"
-  end
-
   it "publishes updates with a partial summary" do
     summary = "dummy"

-    with_responses([in_json_format(summary)]) do
+    with_responses([summary]) do
       messages =
         MessageBus.track_publish("/discourse-ai/summaries/topic/#{topic.id}") do
           job.execute(topic_id: topic.id, user_id: user.id)
@@ -74,7 +70,7 @@ def in_json_format(summary)
   it "publishes a final update to signal we're done and provide metadata" do
     summary = "dummy"

-    with_responses([in_json_format(summary)]) do
+    with_responses([summary]) do
       messages =
         MessageBus.track_publish("/discourse-ai/summaries/topic/#{topic.id}") do
           job.execute(topic_id: topic.id, user_id: user.id)

spec/jobs/scheduled/summaries_backfill_spec.rb (+1 −5)

@@ -84,10 +84,6 @@
     end
   end

-  def in_json_format(summary)
-    "{\"summary\":\"#{summary}\"}"
-  end
-
   describe "#execute" do
     it "backfills a batch" do
       topic_2 =
@@ -102,7 +98,7 @@ def in_json_format(summary)
       gist_2 = "Updated gist of topic"

       DiscourseAi::Completions::Llm.with_prepared_responses(
-        [gist_1, gist_2, summary_1, summary_2].map { |s| in_json_format(s) },
+        [gist_1, gist_2, summary_1, summary_2],
       ) { subject.execute({}) }

       expect(AiSummary.complete.find_by(target: topic_2).summarized_text).to eq(summary_1)
