
Commit 10c38c1

server: Implement /v1/responses (text generation only)

Authored and committed by openingnow
1 parent 10b4f82

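Not part of the commit: a quick way to exercise the new endpoint once a server is running. This sketch assumes a llama-server instance listening on localhost:8080 (the /v1/responses path comes from the commit title) and uses libcurl purely for convenience.

// Hypothetical smoke test for the new endpoint; not from this commit.
// Build with: g++ responses_client.cpp -lcurl
#include <curl/curl.h>
#include <cstdio>

int main() {
    curl_global_init(CURL_GLOBAL_DEFAULT);
    CURL * curl = curl_easy_init();
    if (!curl) {
        return 1;
    }

    // Responses API body: 'input' is required; 'instructions' is optional.
    // 'previous_response_id' would be rejected by this implementation.
    const char * body =
        "{\"input\": \"Write a haiku about llamas.\","
        " \"instructions\": \"Answer tersely.\"}";

    struct curl_slist * headers = nullptr;
    headers = curl_slist_append(headers, "Content-Type: application/json");

    curl_easy_setopt(curl, CURLOPT_URL, "http://localhost:8080/v1/responses");
    curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers);
    curl_easy_setopt(curl, CURLOPT_POSTFIELDS, body);

    // libcurl's default write callback prints the response body to stdout.
    CURLcode rc = curl_easy_perform(curl);
    if (rc != CURLE_OK) {
        fprintf(stderr, "request failed: %s\n", curl_easy_strerror(rc));
    }

    curl_slist_free_all(headers);
    curl_easy_cleanup(curl);
    curl_global_cleanup();
    return rc == CURLE_OK ? 0 : 1;
}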
7 files changed (+296, −7 lines)


tools/server/server-common.cpp

Lines changed: 55 additions & 0 deletions
@@ -1082,6 +1082,43 @@ json oaicompat_chat_params_parse(
     return llama_params;
 }
 
+json convert_responses_to_chatcmpl(const json & body) {
+    if (!body.contains("input")) {
+        throw std::invalid_argument("'input' is required");
+    }
+    if (!json_value(body, "previous_response_id", std::string{}).empty()) {
+        throw std::invalid_argument("llama.cpp does not support 'previous_response_id'.");
+    }
+
+    const json input_value = body.at("input");
+    json chatcmpl_messages = json::array();
+
+    if (input_value.is_array()) {
+        chatcmpl_messages = input_value;
+    } else if (input_value.is_string()) {
+        chatcmpl_messages.push_back({
+            {"role", "user"},
+            {"content", input_value},
+        });
+    } else {
+        throw std::invalid_argument("'input' must be a string or array of objects");
+    }
+
+    const std::string instructions = json_value(body, "instructions", std::string{});
+    if (instructions != "") {
+        chatcmpl_messages.push_back({
+            {"role", "system"},
+            {"content", instructions},
+        });
+    }
+
+    json chatcmpl_body = body;
+    chatcmpl_body.erase("input");
+    chatcmpl_body["messages"] = chatcmpl_messages;
+
+    return chatcmpl_body;
+}
+
 json convert_anthropic_to_oai(const json & body) {
     json oai_body;
@@ -1485,6 +1522,24 @@ std::string format_oai_sse(const json & data) {
     return ss.str();
 }
 
+std::string format_oai_resp_sse(const json & data) {
+    std::ostringstream ss;
+    auto send_single = [&ss](const json & event_obj) {
+        ss << "event: " << event_obj.at("event").get<std::string>() << "\n";
+        ss << "data: " << safe_json_to_str(event_obj.at("data")) << "\n\n";
+    };
+
+    if (data.is_array()) {
+        for (const auto & item : data) {
+            send_single(item);
+        }
+    } else {
+        send_single(data);
+    }
+
+    return ss.str();
+}
+
 std::string format_anthropic_sse(const json & data) {
     std::ostringstream ss;

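To make the mapping above concrete, here is a minimal standalone sketch (not from the commit) of what convert_responses_to_chatcmpl produces for a string-valued 'input', using only nlohmann::json; the server's json_value helper is replaced with plain lookups.

// Standalone illustration of the Responses -> Chat Completions mapping above.
// Requires nlohmann/json (https://github.com/nlohmann/json).
#include <nlohmann/json.hpp>
#include <iostream>

using json = nlohmann::json;

int main() {
    json body = {
        {"model", "test-model"},
        {"input", "Write a haiku about llamas."},
        {"instructions", "Answer tersely."},
    };

    // A string 'input' becomes a single user message; 'instructions', when
    // present, is appended as a system message (note: appended after the
    // user message, mirroring the push order in the diff above).
    json messages = json::array();
    messages.push_back({{"role", "user"}, {"content", body.at("input")}});
    if (body.contains("instructions")) {
        messages.push_back({{"role", "system"}, {"content", body.at("instructions")}});
    }

    json chatcmpl = body;
    chatcmpl.erase("input");         // 'input' is consumed...
    chatcmpl["messages"] = messages; // ...and replaced by 'messages'

    // The 'instructions' key stays in the body, as in the diff.
    std::cout << chatcmpl.dump(2) << std::endl;
}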
tools/server/server-common.h

Lines changed: 5 additions & 0 deletions
@@ -297,6 +297,9 @@ json oaicompat_chat_params_parse(
     const oaicompat_parser_options & opt,
     std::vector<raw_buffer> & out_files);
 
+// convert OpenAI Responses API format to OpenAI Chat Completions API format
+json convert_responses_to_chatcmpl(const json & body);
+
 // convert Anthropic Messages API format to OpenAI Chat Completions API format
 json convert_anthropic_to_oai(const json & body);
 
@@ -333,6 +336,8 @@ std::string tokens_to_output_formatted_string(const llama_context * ctx, const l
 // note: if data is a json array, it will be sent as multiple events, one per item
 std::string format_oai_sse(const json & data);
 
+std::string format_oai_resp_sse(const json & data);
+
 // format Anthropic-style SSE with event types
 std::string format_anthropic_sse(const json & data);

tools/server/server-context.cpp

Lines changed: 80 additions & 7 deletions
@@ -2842,6 +2842,58 @@ static std::unique_ptr<server_res_generator> handle_completions_impl(
     json first_result_json = first_result->to_json();
     if (res_type == TASK_RESPONSE_TYPE_ANTHROPIC) {
         res->data = format_anthropic_sse(first_result_json);
+    } else if (res_type == TASK_RESPONSE_TYPE_OAI_RESP) {
+        const json created = {
+            {"event", "response.created"},
+            {"data", json {
+                {"type", "response.created"},
+                {"response", json {
+                    {"object", "response"},
+                    {"status", "in_progress"}
+                }}
+            }}
+        };
+        const json in_progress = {
+            {"event", "response.in_progress"},
+            {"data", json {
+                {"type", "response.in_progress"},
+                {"response", json {
+                    {"object", "response"},
+                    {"status", "in_progress"}
+                }}
+            }}
+        };
+        const json output_item_added = {
+            {"event", "response.output_item.added"},
+            {"data", json {
+                {"type", "response.output_item.added"},
+                {"item", json {
+                    {"type", "message"},
+                    {"status", "in_progress"},
+                    {"content", json::array()},
+                    {"role", "assistant"}
+                }}
+            }}
+        };
+        const json content_part_added = {
+            {"event", "response.content_part.added"},
+            {"data", json {
+                {"type", "response.content_part.added"},
+                {"part", json {
+                    {"type", "output_text"},
+                    {"text", ""}
+                }}
+            }}
+        };
+
+        std::string res_data;
+        res_data += format_oai_resp_sse(created);
+        res_data += format_oai_resp_sse(in_progress);
+        res_data += format_oai_resp_sse(output_item_added);
+        res_data += format_oai_resp_sse(content_part_added);
+        res_data += format_oai_resp_sse(first_result_json);
+
+        res->data = res_data;
     } else {
         res->data = format_oai_sse(first_result_json);
    }
@@ -2876,13 +2928,16 @@ static std::unique_ptr<server_res_generator> handle_completions_impl(
 
     // check if there is more data
     if (!rd.has_next()) {
-        if (res_type == TASK_RESPONSE_TYPE_ANTHROPIC) {
-            // Anthropic doesn't send [DONE], message_stop was already sent
-            output = "";
-        } else if (res_type != TASK_RESPONSE_TYPE_NONE) {
-            output = "data: [DONE]\n\n";
-        } else {
-            output = "";
+        switch (res_type) {
+            case TASK_RESPONSE_TYPE_NONE:
+            case TASK_RESPONSE_TYPE_OAI_RESP:
+            case TASK_RESPONSE_TYPE_ANTHROPIC:
+                output = "";
+                break;
+
+            default:
+                output = "data: [DONE]\n\n";
+                break;
         }
         SRV_DBG("%s", "all results received, terminating stream\n");
         return false; // no more data, terminate
@@ -2909,6 +2964,8 @@ static std::unique_ptr<server_res_generator> handle_completions_impl(
     json res_json = result->to_json();
     if (res_type == TASK_RESPONSE_TYPE_ANTHROPIC) {
         output = format_anthropic_sse(res_json);
+    } else if (res_type == TASK_RESPONSE_TYPE_OAI_RESP) {
+        output = format_oai_resp_sse(res_json);
     } else {
         output = format_oai_sse(res_json);
     }
@@ -3320,6 +3377,22 @@ void server_routes::init_routes() {
             TASK_RESPONSE_TYPE_OAI_CHAT);
     };
 
+    this->post_responses_oai = [this](const server_http_req & req) {
+        std::vector<raw_buffer> files;
+        json body = convert_responses_to_chatcmpl(json::parse(req.body));
+        json body_parsed = oaicompat_chat_params_parse(
+            body,
+            ctx_server.oai_parser_opt,
+            files);
+        return handle_completions_impl(
+            ctx_server,
+            SERVER_TASK_TYPE_COMPLETION,
+            body_parsed,
+            files,
+            req.should_stop,
+            TASK_RESPONSE_TYPE_OAI_RESP);
+    };
+
     this->post_anthropic_messages = [this](const server_http_req & req) {
         std::vector<raw_buffer> files;
         json body = convert_anthropic_to_oai(json::parse(req.body));

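The preamble above yields four typed SSE events before the first token delta. Here is a rough stand-in (not from the commit) for format_oai_resp_sse showing the wire framing, with the event payloads trimmed to their 'type' field for brevity; the real events carry the full response/item/part objects shown in the diff.

// Simplified stand-in for format_oai_resp_sse, illustrating the SSE framing
// of the stream preamble. dump() replaces safe_json_to_str here.
#include <nlohmann/json.hpp>
#include <iostream>
#include <sstream>

using json = nlohmann::json;

static std::string to_sse(const json & event_obj) {
    std::ostringstream ss;
    ss << "event: " << event_obj.at("event").get<std::string>() << "\n";
    ss << "data: "  << event_obj.at("data").dump() << "\n\n";
    return ss.str();
}

int main() {
    // The four preamble events emitted before the first token delta.
    const char * types[] = {
        "response.created",
        "response.in_progress",
        "response.output_item.added",
        "response.content_part.added",
    };
    for (const char * type : types) {
        std::cout << to_sse({{"event", type}, {"data", {{"type", type}}}});
    }
    // Each event arrives as:
    //   event: response.created
    //   data: {"type":"response.created"}
    // followed by a blank line, matching the framing in format_oai_resp_sse.
}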
tools/server/server-context.h

Lines changed: 1 addition & 0 deletions
@@ -69,6 +69,7 @@ struct server_routes {
     server_http_context::handler_t post_completions;
     server_http_context::handler_t post_completions_oai;
     server_http_context::handler_t post_chat_completions;
+    server_http_context::handler_t post_responses_oai;
     server_http_context::handler_t post_anthropic_messages;
     server_http_context::handler_t post_anthropic_count_tokens;
     server_http_context::handler_t post_apply_template;

tools/server/server-task.cpp

Lines changed: 147 additions & 0 deletions
@@ -580,6 +580,8 @@ json server_task_result_cmpl_final::to_json() {
            return to_json_oaicompat();
        case TASK_RESPONSE_TYPE_OAI_CHAT:
            return stream ? to_json_oaicompat_chat_stream() : to_json_oaicompat_chat();
+       case TASK_RESPONSE_TYPE_OAI_RESP:
+           return stream ? to_json_oaicompat_resp_stream() : to_json_oaicompat_resp();
        case TASK_RESPONSE_TYPE_ANTHROPIC:
            return stream ? to_json_anthropic_stream() : to_json_anthropic();
        default:
@@ -797,6 +799,122 @@ json server_task_result_cmpl_final::to_json_oaicompat_chat_stream() {
     return deltas;
 }
 
+json server_task_result_cmpl_final::to_json_oaicompat_resp() {
+    common_chat_msg msg;
+    if (!oaicompat_msg.empty()) {
+        msg = oaicompat_msg;
+    } else {
+        msg.role = "assistant";
+        msg.content = content;
+    }
+
+    const json reasoning = {
+        {"type", "reasoning"},
+        {"summary", json::array({json {
+            {"type", "summary_text"},
+            {"text", msg.reasoning_content}
+        }})}
+    };
+    const json message = {
+        {"type", "message"},
+        {"status", "completed"},
+        {"content", json::array({json {
+            {"type", "output_text"},
+            {"annotations", json::array()},
+            {"logprobs", json::array()},
+            {"text", msg.content}
+        }})},
+        {"role", msg.role}
+    };
+
+    std::time_t t = std::time(0);
+    json res = {
+        {"object", "response"},
+        {"created_at", t},
+        {"status", "completed"},
+        {"model", oaicompat_model},
+        {"output", json::array({reasoning, message})},
+        {"usage", json {
+            {"input_tokens", n_prompt_tokens},
+            {"output_tokens", n_decoded},
+            {"total_tokens", n_decoded + n_prompt_tokens}
+        }},
+    };
+
+    if (verbose) {
+        res["__verbose"] = to_json_non_oaicompat();
+    }
+    if (timings.prompt_n >= 0) {
+        res.push_back({"timings", timings.to_json()});
+    }
+
+    return res;
+}
+
+json server_task_result_cmpl_final::to_json_oaicompat_resp_stream() {
+    json server_sent_events = json::array();
+
+    server_sent_events.push_back(json {
+        {"event", "response.output_text.done"},
+        {"data", json {
+            {"type", "response.output_text.done"},
+            {"text", oaicompat_msg.content}
+        }}
+    });
+
+    const json part = {
+        {"type", "output_text"},
+        {"annotations", json::array()},
+        {"logprobs", json::array()},
+        {"text", oaicompat_msg.content}
+    };
+
+    server_sent_events.push_back(json {
+        {"event", "response.content_part.done"},
+        {"data", json {
+            {"type", "response.content_part.done"},
+            {"part", part}
+        }}
+    });
+
+    const json item = {
+        {"type", "message"},
+        {"status", "completed"},
+        {"content", json::array({part})},
+        {"role", "assistant"}
+    };
+
+    server_sent_events.push_back(json {
+        {"event", "response.output_item.done"},
+        {"data", json {
+            {"type", "response.output_item.done"},
+            {"item", item}
+        }}
+    });
+
+    std::time_t t = std::time(0);
+    server_sent_events.push_back(json {
+        {"event", "response.completed"},
+        {"data", json {
+            {"type", "response.completed"},
+            {"response", json {
+                {"object", "response"},
+                {"created_at", t},
+                {"status", "completed"},
+                {"model", oaicompat_model},
+                {"output", json::array({item})},
+                {"usage", json {
+                    {"input_tokens", n_prompt_tokens},
+                    {"output_tokens", n_decoded},
+                    {"total_tokens", n_decoded + n_prompt_tokens}
+                }}
+            }},
+        }}
+    });
+
+    return server_sent_events;
+}
+
 json server_task_result_cmpl_final::to_json_anthropic() {
     std::string stop_reason = "max_tokens";
     if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) {
@@ -991,6 +1109,8 @@ json server_task_result_cmpl_partial::to_json() {
            return to_json_oaicompat();
        case TASK_RESPONSE_TYPE_OAI_CHAT:
            return to_json_oaicompat_chat();
+       case TASK_RESPONSE_TYPE_OAI_RESP:
+           return to_json_oaicompat_resp();
        case TASK_RESPONSE_TYPE_ANTHROPIC:
            return to_json_anthropic();
        default:
@@ -1115,6 +1235,33 @@ json server_task_result_cmpl_partial::to_json_oaicompat_chat() {
     return deltas;
 }
 
+json server_task_result_cmpl_partial::to_json_oaicompat_resp() {
+    std::vector<json> deltas;
+
+    for (const auto & diff : oaicompat_msg_diffs) {
+        if (!diff.reasoning_content_delta.empty()) {
+            deltas.push_back(json {
+                {"event", "response.reasoning_text.delta"},
+                {"data", json {
+                    {"type", "response.reasoning_text.delta"},
+                    {"delta", diff.reasoning_content_delta}
+                }}
+            });
+        }
+        if (!diff.content_delta.empty()) {
+            deltas.push_back(json {
+                {"event", "response.output_text.delta"},
+                {"data", json {
+                    {"type", "response.output_text.delta"},
+                    {"delta", diff.content_delta}
+                }}
+            });
+        }
+    }
+
+    return deltas;
+}
+
 //
 // server_task_result_embd
 //

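For the non-streaming path, to_json_oaicompat_resp always emits a two-item 'output' array: a 'reasoning' item (present even when reasoning_content is empty) followed by the assistant message. A sketch (not from the commit) of the resulting body shape, with illustrative values:

// Shape of the non-streaming /v1/responses body built by
// to_json_oaicompat_resp above; all values here are illustrative.
#include <nlohmann/json.hpp>
#include <iostream>
#include <ctime>

using json = nlohmann::json;

int main() {
    const std::string reasoning_content = "";             // msg.reasoning_content
    const std::string content = "Soft wool on the hill";  // msg.content

    json response = {
        {"object", "response"},
        {"created_at", std::time(nullptr)},
        {"status", "completed"},
        {"model", "test-model"},
        {"output", json::array({
            // First output item: reasoning summary (emitted even if empty).
            {
                {"type", "reasoning"},
                {"summary", json::array({{
                    {"type", "summary_text"},
                    {"text", reasoning_content}
                }})}
            },
            // Second output item: the assistant message itself.
            {
                {"type", "message"},
                {"status", "completed"},
                {"content", json::array({{
                    {"type", "output_text"},
                    {"annotations", json::array()},
                    {"logprobs", json::array()},
                    {"text", content}
                }})},
                {"role", "assistant"}
            }
        })},
        // usage mirrors n_prompt_tokens / n_decoded from the task result.
        {"usage", {
            {"input_tokens", 12},
            {"output_tokens", 7},
            {"total_tokens", 19}
        }},
    };

    std::cout << response.dump(2) << std::endl;
}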