
Commit 10c38c1

server: Implement /v1/responses (text generation only)

Authored and committed by openingnow
1 parent 10b4f82

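Not part of the commit: a quick way to exercise the new endpoint once a server is running. This sketch assumes a llama-server instance listening on localhost:8080 (the /v1/responses path comes from the commit title) and uses libcurl purely for convenience.

// Hypothetical smoke test for the new endpoint; not from this commit.
// Build with: g++ responses_client.cpp -lcurl
#include <curl/curl.h>
#include <cstdio>

int main() {
    curl_global_init(CURL_GLOBAL_DEFAULT);
    CURL * curl = curl_easy_init();
    if (!curl) {
        return 1;
    }

    // Responses API body: 'input' is required; 'instructions' is optional.
    // 'previous_response_id' would be rejected by this implementation.
    const char * body =
        "{\"input\": \"Write a haiku about llamas.\","
        " \"instructions\": \"Answer tersely.\"}";

    struct curl_slist * headers = nullptr;
    headers = curl_slist_append(headers, "Content-Type: application/json");

    curl_easy_setopt(curl, CURLOPT_URL, "http://localhost:8080/v1/responses");
    curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers);
    curl_easy_setopt(curl, CURLOPT_POSTFIELDS, body);

    // libcurl's default write callback prints the response body to stdout.
    CURLcode rc = curl_easy_perform(curl);
    if (rc != CURLE_OK) {
        fprintf(stderr, "request failed: %s\n", curl_easy_strerror(rc));
    }

    curl_slist_free_all(headers);
    curl_easy_cleanup(curl);
    curl_global_cleanup();
    return rc == CURLE_OK ? 0 : 1;
}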
7 files changed (+296, −7 lines)


tools/server/server-common.cpp

Lines changed: 55 additions & 0 deletions
@@ -1082,6 +1082,43 @@ json oaicompat_chat_params_parse(
     return llama_params;
 }
 
+json convert_responses_to_chatcmpl(const json & body) {
+    if (!body.contains("input")) {
+        throw std::invalid_argument("'input' is required");
+    }
+    if (!json_value(body, "previous_response_id", std::string{}).empty()) {
+        throw std::invalid_argument("llama.cpp does not support 'previous_response_id'.");
+    }
+
+    const json input_value = body.at("input");
+    json chatcmpl_messages = json::array();
+
+    if (input_value.is_array()) {
+        chatcmpl_messages = input_value;
+    } else if (input_value.is_string()) {
+        chatcmpl_messages.push_back({
+            {"role", "user"},
+            {"content", input_value},
+        });
+    } else {
+        throw std::invalid_argument("'input' must be a string or array of objects");
+    }
+
+    const std::string instructions = json_value(body, "instructions", std::string{});
+    if (instructions != "") {
+        chatcmpl_messages.push_back({
+            {"role", "system"},
+            {"content", instructions},
+        });
+    }
+
+    json chatcmpl_body = body;
+    chatcmpl_body.erase("input");
+    chatcmpl_body["messages"] = chatcmpl_messages;
+
+    return chatcmpl_body;
+}
+
 json convert_anthropic_to_oai(const json & body) {
     json oai_body;
@@ -1485,6 +1522,24 @@ std::string format_oai_sse(const json & data) {
     return ss.str();
 }
 
+std::string format_oai_resp_sse(const json & data) {
+    std::ostringstream ss;
+    auto send_single = [&ss](const json & event_obj) {
+        ss << "event: " << event_obj.at("event").get<std::string>() << "\n";
+        ss << "data: " << safe_json_to_str(event_obj.at("data")) << "\n\n";
+    };
+
+    if (data.is_array()) {
+        for (const auto & item : data) {
+            send_single(item);
+        }
+    } else {
+        send_single(data);
+    }
+
+    return ss.str();
+}
+
 std::string format_anthropic_sse(const json & data) {
     std::ostringstream ss;

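To make the mapping above concrete, here is a minimal standalone sketch (not from the commit) of what convert_responses_to_chatcmpl produces for a string-valued 'input', using only nlohmann::json; the server's json_value helper is replaced with plain lookups.

// Standalone illustration of the Responses -> Chat Completions mapping above.
// Requires nlohmann/json (https://github.com/nlohmann/json).
#include <nlohmann/json.hpp>
#include <iostream>

using json = nlohmann::json;

int main() {
    json body = {
        {"model", "test-model"},
        {"input", "Write a haiku about llamas."},
        {"instructions", "Answer tersely."},
    };

    // A string 'input' becomes a single user message; 'instructions', when
    // present, is appended as a system message (note: appended after the
    // user message, mirroring the push order in the diff above).
    json messages = json::array();
    messages.push_back({{"role", "user"}, {"content", body.at("input")}});
    if (body.contains("instructions")) {
        messages.push_back({{"role", "system"}, {"content", body.at("instructions")}});
    }

    json chatcmpl = body;
    chatcmpl.erase("input");         // 'input' is consumed...
    chatcmpl["messages"] = messages; // ...and replaced by 'messages'

    // The 'instructions' key stays in the body, as in the diff.
    std::cout << chatcmpl.dump(2) << std::endl;
}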
tools/server/server-common.h

Lines changed: 5 additions & 0 deletions
@@ -297,6 +297,9 @@ json oaicompat_chat_params_parse(
     const oaicompat_parser_options & opt,
     std::vector<raw_buffer> & out_files);
 
+// convert OpenAI Responses API format to OpenAI Chat Completions API format
+json convert_responses_to_chatcmpl(const json & body);
+
 // convert Anthropic Messages API format to OpenAI Chat Completions API format
 json convert_anthropic_to_oai(const json & body);
 
@@ -333,6 +336,8 @@ std::string tokens_to_output_formatted_string(const llama_context * ctx, const l
 // note: if data is a json array, it will be sent as multiple events, one per item
 std::string format_oai_sse(const json & data);
 
+std::string format_oai_resp_sse(const json & data);
+
 // format Anthropic-style SSE with event types
 std::string format_anthropic_sse(const json & data);

tools/server/server-context.cpp

Lines changed: 80 additions & 7 deletions
@@ -2842,6 +2842,58 @@ static std::unique_ptr<server_res_generator> handle_completions_impl(
     json first_result_json = first_result->to_json();
     if (res_type == TASK_RESPONSE_TYPE_ANTHROPIC) {
         res->data = format_anthropic_sse(first_result_json);
+    } else if (res_type == TASK_RESPONSE_TYPE_OAI_RESP) {
+        const json created = {
+            {"event", "response.created"},
+            {"data", json {
+                {"type", "response.created"},
+                {"response", json {
+                    {"object", "response"},
+                    {"status", "in_progress"}
+                }}
+            }}
+        };
+        const json in_progress = {
+            {"event", "response.in_progress"},
+            {"data", json {
+                {"type", "response.in_progress"},
+                {"response", json {
+                    {"object", "response"},
+                    {"status", "in_progress"}
+                }}
+            }}
+        };
+        const json output_item_added = {
+            {"event", "response.output_item.added"},
+            {"data", json {
+                {"type", "response.output_item.added"},
+                {"item", json {
+                    {"type", "message"},
+                    {"status", "in_progress"},
+                    {"content", json::array()},
+                    {"role", "assistant"}
+                }}
+            }}
+        };
+        const json content_part_added = {
+            {"event", "response.content_part.added"},
+            {"data", json {
+                {"type", "response.content_part.added"},
+                {"part", json {
+                    {"type", "output_text"},
+                    {"text", ""}
+                }}
+            }}
+        };
+
+        std::string res_data;
+        res_data += format_oai_resp_sse(created);
+        res_data += format_oai_resp_sse(in_progress);
+        res_data += format_oai_resp_sse(output_item_added);
+        res_data += format_oai_resp_sse(content_part_added);
+        res_data += format_oai_resp_sse(first_result_json);
+
+        res->data = res_data;
     } else {
         res->data = format_oai_sse(first_result_json);
    }
@@ -2876,13 +2928,16 @@ static std::unique_ptr<server_res_generator> handle_completions_impl(
 
     // check if there is more data
     if (!rd.has_next()) {
-        if (res_type == TASK_RESPONSE_TYPE_ANTHROPIC) {
-            // Anthropic doesn't send [DONE], message_stop was already sent
-            output = "";
-        } else if (res_type != TASK_RESPONSE_TYPE_NONE) {
-            output = "data: [DONE]\n\n";
-        } else {
-            output = "";
+        switch (res_type) {
+            case TASK_RESPONSE_TYPE_NONE:
+            case TASK_RESPONSE_TYPE_OAI_RESP:
+            case TASK_RESPONSE_TYPE_ANTHROPIC:
+                output = "";
+                break;
+
+            default:
+                output = "data: [DONE]\n\n";
+                break;
         }
         SRV_DBG("%s", "all results received, terminating stream\n");
         return false; // no more data, terminate
@@ -2909,6 +2964,8 @@ static std::unique_ptr<server_res_generator> handle_completions_impl(
     json res_json = result->to_json();
     if (res_type == TASK_RESPONSE_TYPE_ANTHROPIC) {
         output = format_anthropic_sse(res_json);
+    } else if (res_type == TASK_RESPONSE_TYPE_OAI_RESP) {
+        output = format_oai_resp_sse(res_json);
     } else {
         output = format_oai_sse(res_json);
     }
@@ -3320,6 +3377,22 @@ void server_routes::init_routes() {
             TASK_RESPONSE_TYPE_OAI_CHAT);
     };
 
+    this->post_responses_oai = [this](const server_http_req & req) {
+        std::vector<raw_buffer> files;
+        json body = convert_responses_to_chatcmpl(json::parse(req.body));
+        json body_parsed = oaicompat_chat_params_parse(
+            body,
+            ctx_server.oai_parser_opt,
+            files);
+        return handle_completions_impl(
+            ctx_server,
+            SERVER_TASK_TYPE_COMPLETION,
+            body_parsed,
+            files,
+            req.should_stop,
+            TASK_RESPONSE_TYPE_OAI_RESP);
+    };
+
     this->post_anthropic_messages = [this](const server_http_req & req) {
         std::vector<raw_buffer> files;
         json body = convert_anthropic_to_oai(json::parse(req.body));

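The preamble above yields four typed SSE events before the first token delta. Here is a rough stand-in (not from the commit) for format_oai_resp_sse showing the wire framing, with the event payloads trimmed to their 'type' field for brevity; the real events carry the full response/item/part objects shown in the diff.

// Simplified stand-in for format_oai_resp_sse, illustrating the SSE framing
// of the stream preamble. dump() replaces safe_json_to_str here.
#include <nlohmann/json.hpp>
#include <iostream>
#include <sstream>

using json = nlohmann::json;

static std::string to_sse(const json & event_obj) {
    std::ostringstream ss;
    ss << "event: " << event_obj.at("event").get<std::string>() << "\n";
    ss << "data: "  << event_obj.at("data").dump() << "\n\n";
    return ss.str();
}

int main() {
    // The four preamble events emitted before the first token delta.
    const char * types[] = {
        "response.created",
        "response.in_progress",
        "response.output_item.added",
        "response.content_part.added",
    };
    for (const char * type : types) {
        std::cout << to_sse({{"event", type}, {"data", {{"type", type}}}});
    }
    // Each event arrives as:
    //   event: response.created
    //   data: {"type":"response.created"}
    // followed by a blank line, matching the framing in format_oai_resp_sse.
}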
tools/server/server-context.h

Lines changed: 1 addition & 0 deletions
@@ -69,6 +69,7 @@ struct server_routes {
     server_http_context::handler_t post_completions;
     server_http_context::handler_t post_completions_oai;
     server_http_context::handler_t post_chat_completions;
+    server_http_context::handler_t post_responses_oai;
     server_http_context::handler_t post_anthropic_messages;
     server_http_context::handler_t post_anthropic_count_tokens;
     server_http_context::handler_t post_apply_template;

tools/server/server-task.cpp

Lines changed: 147 additions & 0 deletions
@@ -580,6 +580,8 @@ json server_task_result_cmpl_final::to_json() {
            return to_json_oaicompat();
        case TASK_RESPONSE_TYPE_OAI_CHAT:
            return stream ? to_json_oaicompat_chat_stream() : to_json_oaicompat_chat();
+       case TASK_RESPONSE_TYPE_OAI_RESP:
+           return stream ? to_json_oaicompat_resp_stream() : to_json_oaicompat_resp();
        case TASK_RESPONSE_TYPE_ANTHROPIC:
            return stream ? to_json_anthropic_stream() : to_json_anthropic();
        default:
@@ -797,6 +799,122 @@ json server_task_result_cmpl_final::to_json_oaicompat_chat_stream() {
     return deltas;
 }
 
+json server_task_result_cmpl_final::to_json_oaicompat_resp() {
+    common_chat_msg msg;
+    if (!oaicompat_msg.empty()) {
+        msg = oaicompat_msg;
+    } else {
+        msg.role = "assistant";
+        msg.content = content;
+    }
+
+    const json reasoning = {
+        {"type", "reasoning"},
+        {"summary", json::array({json {
+            {"type", "summary_text"},
+            {"text", msg.reasoning_content}
+        }})}
+    };
+    const json message = {
+        {"type", "message"},
+        {"status", "completed"},
+        {"content", json::array({json {
+            {"type", "output_text"},
+            {"annotations", json::array()},
+            {"logprobs", json::array()},
+            {"text", msg.content}
+        }})},
+        {"role", msg.role}
+    };
+
+    std::time_t t = std::time(0);
+    json res = {
+        {"object", "response"},
+        {"created_at", t},
+        {"status", "completed"},
+        {"model", oaicompat_model},
+        {"output", json::array({reasoning, message})},
+        {"usage", json {
+            {"input_tokens", n_prompt_tokens},
+            {"output_tokens", n_decoded},
+            {"total_tokens", n_decoded + n_prompt_tokens}
+        }},
+    };
+
+    if (verbose) {
+        res["__verbose"] = to_json_non_oaicompat();
+    }
+    if (timings.prompt_n >= 0) {
+        res.push_back({"timings", timings.to_json()});
+    }
+
+    return res;
+}
+
+json server_task_result_cmpl_final::to_json_oaicompat_resp_stream() {
+    json server_sent_events = json::array();
+
+    server_sent_events.push_back(json {
+        {"event", "response.output_text.done"},
+        {"data", json {
+            {"type", "response.output_text.done"},
+            {"text", oaicompat_msg.content}
+        }}
+    });
+
+    const json part = {
+        {"type", "output_text"},
+        {"annotations", json::array()},
+        {"logprobs", json::array()},
+        {"text", oaicompat_msg.content}
+    };
+
+    server_sent_events.push_back(json {
+        {"event", "response.content_part.done"},
+        {"data", json {
+            {"type", "response.content_part.done"},
+            {"part", part}
+        }}
+    });
+
+    const json item = {
+        {"type", "message"},
+        {"status", "completed"},
+        {"content", json::array({part})},
+        {"role", "assistant"}
+    };
+
+    server_sent_events.push_back(json {
+        {"event", "response.output_item.done"},
+        {"data", json {
+            {"type", "response.output_item.done"},
+            {"item", item}
+        }}
+    });
+
+    std::time_t t = std::time(0);
+    server_sent_events.push_back(json {
+        {"event", "response.completed"},
+        {"data", json {
+            {"type", "response.completed"},
+            {"response", json {
+                {"object", "response"},
+                {"created_at", t},
+                {"status", "completed"},
+                {"model", oaicompat_model},
+                {"output", json::array({item})},
+                {"usage", json {
+                    {"input_tokens", n_prompt_tokens},
+                    {"output_tokens", n_decoded},
+                    {"total_tokens", n_decoded + n_prompt_tokens}
+                }}
+            }},
+        }}
+    });
+
+    return server_sent_events;
+}
+
 json server_task_result_cmpl_final::to_json_anthropic() {
     std::string stop_reason = "max_tokens";
     if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) {
@@ -991,6 +1109,8 @@ json server_task_result_cmpl_partial::to_json() {
            return to_json_oaicompat();
        case TASK_RESPONSE_TYPE_OAI_CHAT:
            return to_json_oaicompat_chat();
+       case TASK_RESPONSE_TYPE_OAI_RESP:
+           return to_json_oaicompat_resp();
        case TASK_RESPONSE_TYPE_ANTHROPIC:
            return to_json_anthropic();
        default:
@@ -1115,6 +1235,33 @@ json server_task_result_cmpl_partial::to_json_oaicompat_chat() {
     return deltas;
 }
 
+json server_task_result_cmpl_partial::to_json_oaicompat_resp() {
+    std::vector<json> deltas;
+
+    for (const auto & diff : oaicompat_msg_diffs) {
+        if (!diff.reasoning_content_delta.empty()) {
+            deltas.push_back(json {
+                {"event", "response.reasoning_text.delta"},
+                {"data", json {
+                    {"type", "response.reasoning_text.delta"},
+                    {"delta", diff.reasoning_content_delta}
+                }}
+            });
+        }
+        if (!diff.content_delta.empty()) {
+            deltas.push_back(json {
+                {"event", "response.output_text.delta"},
+                {"data", json {
+                    {"type", "response.output_text.delta"},
+                    {"delta", diff.content_delta}
+                }}
+            });
+        }
+    }
+
+    return deltas;
+}
+
 //
 // server_task_result_embd
 //

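For the non-streaming path, to_json_oaicompat_resp always emits a two-item 'output' array: a 'reasoning' item (present even when reasoning_content is empty) followed by the assistant message. A sketch (not from the commit) of the resulting body shape, with illustrative values:

// Shape of the non-streaming /v1/responses body built by
// to_json_oaicompat_resp above; all values here are illustrative.
#include <nlohmann/json.hpp>
#include <iostream>
#include <ctime>

using json = nlohmann::json;

int main() {
    const std::string reasoning_content = "";             // msg.reasoning_content
    const std::string content = "Soft wool on the hill";  // msg.content

    json response = {
        {"object", "response"},
        {"created_at", std::time(nullptr)},
        {"status", "completed"},
        {"model", "test-model"},
        {"output", json::array({
            // First output item: reasoning summary (emitted even if empty).
            {
                {"type", "reasoning"},
                {"summary", json::array({{
                    {"type", "summary_text"},
                    {"text", reasoning_content}
                }})}
            },
            // Second output item: the assistant message itself.
            {
                {"type", "message"},
                {"status", "completed"},
                {"content", json::array({{
                    {"type", "output_text"},
                    {"annotations", json::array()},
                    {"logprobs", json::array()},
                    {"text", content}
                }})},
                {"role", "assistant"}
            }
        })},
        // usage mirrors n_prompt_tokens / n_decoded from the task result.
        {"usage", {
            {"input_tokens", 12},
            {"output_tokens", 7},
            {"total_tokens", 19}
        }},
    };

    std::cout << response.dump(2) << std::endl;
}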