2 changes: 1 addition & 1 deletion requirements/requirements-tool_bench.txt
@@ -3,7 +3,7 @@ pytest~=8.3.3
huggingface_hub>=0.34.0,<1.0
matplotlib~=3.10.0
numpy~=1.26.4
-openai~=1.55.3
+openai~=2.14.0
pandas~=2.2.3
prometheus-client~=0.20.0
requests~=2.32.3
60 changes: 60 additions & 0 deletions tools/server/server-common.cpp
@@ -1082,6 +1082,48 @@ json oaicompat_chat_params_parse(
return llama_params;
}

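// Illustrative example (hypothetical request, for documentation only):
//   {"input": "hi", "max_output_tokens": 64}
// is converted to:
//   {"messages": [{"role": "user", "content": "hi"}], "max_tokens": 64}
// 'instructions', when present, is mapped to a system message.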
json convert_responses_to_chatcmpl(const json & body) {
if (!body.contains("input")) {
throw std::invalid_argument("'input' is required");
}
if (!json_value(body, "previous_response_id", std::string{}).empty()) {
throw std::invalid_argument("llama.cpp does not support 'previous_response_id'.");
}

const json input_value = body.at("input");
json chatcmpl_messages = json::array();

if (input_value.is_array()) {
chatcmpl_messages = input_value;
} else if (input_value.is_string()) {
chatcmpl_messages.push_back({
{"role", "user"},
{"content", input_value},
});
} else {
        throw std::invalid_argument("'input' must be a string or array of objects");
}

    // the Responses API treats 'instructions' as a system-level prompt,
    // so prepend it ahead of the user input
    const std::string instructions = json_value(body, "instructions", std::string{});
    if (!instructions.empty()) {
        chatcmpl_messages.insert(chatcmpl_messages.begin(), json {
            {"role", "system"},
            {"content", instructions},
        });
    }

json chatcmpl_body = body;
chatcmpl_body.erase("input");
chatcmpl_body["messages"] = chatcmpl_messages;

if (body.contains("max_output_tokens")) {
chatcmpl_body.erase("max_output_tokens");
chatcmpl_body["max_tokens"] = body["max_output_tokens"];
}

return chatcmpl_body;
}

json convert_anthropic_to_oai(const json & body) {
json oai_body;

@@ -1485,6 +1527,24 @@ std::string format_oai_sse(const json & data) {
return ss.str();
}

std::string format_oai_resp_sse(const json & data) {
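    // each event is serialized per the SSE wire format:
    //   event: <event type>
    //   data: <JSON payload>
    // followed by a blank line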
std::ostringstream ss;
auto send_single = [&ss](const json & event_obj) {
ss << "event: " << event_obj.at("event").get<std::string>() << "\n";
ss << "data: " << safe_json_to_str(event_obj.at("data")) << "\n\n";
};

if (data.is_array()) {
for (const auto & item : data) {
send_single(item);
}
} else {
send_single(data);
}

return ss.str();
}

std::string format_anthropic_sse(const json & data) {
std::ostringstream ss;

5 changes: 5 additions & 0 deletions tools/server/server-common.h
@@ -297,6 +297,9 @@ json oaicompat_chat_params_parse(
const oaicompat_parser_options & opt,
std::vector<raw_buffer> & out_files);

// convert OpenAI Responses API format to OpenAI Chat Completions API format
json convert_responses_to_chatcmpl(const json & body);

// convert Anthropic Messages API format to OpenAI Chat Completions API format
json convert_anthropic_to_oai(const json & body);

@@ -333,6 +336,8 @@ std::string tokens_to_output_formatted_string(const llama_context * ctx, const l
// note: if data is a json array, it will be sent as multiple events, one per item
std::string format_oai_sse(const json & data);

// format OpenAI Responses API-style SSE with event types; a json array is sent as one event per item
std::string format_oai_resp_sse(const json & data);

// format Anthropic-style SSE with event types
std::string format_anthropic_sse(const json & data);

89 changes: 82 additions & 7 deletions tools/server/server-context.cpp
@@ -2945,6 +2945,58 @@ static std::unique_ptr<server_res_generator> handle_completions_impl(
json first_result_json = first_result->to_json();
if (res_type == TASK_RESPONSE_TYPE_ANTHROPIC) {
res->data = format_anthropic_sse(first_result_json);
} else if (res_type == TASK_RESPONSE_TYPE_OAI_RESP) {
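        // the Responses API streams typed events: before the first delta,
        // emit the standard opening sequence (response.created,
        // response.in_progress, response.output_item.added,
        // response.content_part.added)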
const json created = {
{"event", "response.created"},
{"data", json {
{"type", "response.created"},
{"response", json {
{"object", "response"},
{"status", "in_progress"}
}}
}}
};
const json in_progress = {
{"event", "response.in_progress"},
{"data", json {
{"type", "response.in_progress"},
{"response", json {
{"object", "response"},
{"status", "in_progress"}
}}
}}
};
const json output_item_added = {
{"event", "response.output_item.added"},
{"data", json {
{"type", "response.output_item.added"},
{"item", json {
{"type", "message"},
{"status", "in_progress"},
{"content", json::array()},
{"role", "assistant"}
}}
}}
};
const json content_part_added = {
{"event", "response.content_part.added"},
{"data", json {
{"type", "response.content_part.added"},
{"part", json {
{"type", "output_text"},
{"text", ""}
}}
}}
};

const json initial_events = json::array({
created,
in_progress,
output_item_added,
content_part_added
});

res->data = format_oai_resp_sse(initial_events) + format_oai_resp_sse(first_result_json);
} else {
res->data = format_oai_sse(first_result_json);
}
@@ -2979,13 +3031,16 @@ static std::unique_ptr<server_res_generator> handle_completions_impl(

// check if there is more data
if (!rd.has_next()) {
-                if (res_type == TASK_RESPONSE_TYPE_ANTHROPIC) {
-                    // Anthropic doesn't send [DONE], message_stop was already sent
-                    output = "";
-                } else if (res_type != TASK_RESPONSE_TYPE_NONE) {
-                    output = "data: [DONE]\n\n";
-                } else {
-                    output = "";
+                switch (res_type) {
+                    case TASK_RESPONSE_TYPE_NONE:
+                    case TASK_RESPONSE_TYPE_OAI_RESP:
+                    case TASK_RESPONSE_TYPE_ANTHROPIC:
+                        // these formats terminate via their own final events
+                        // rather than an OpenAI-style [DONE] sentinel
+                        output = "";
+                        break;
+
+                    default:
+                        output = "data: [DONE]\n\n";
+                        break;
}
SRV_DBG("%s", "all results received, terminating stream\n");
return false; // no more data, terminate
@@ -3012,6 +3067,8 @@ static std::unique_ptr<server_res_generator> handle_completions_impl(
json res_json = result->to_json();
if (res_type == TASK_RESPONSE_TYPE_ANTHROPIC) {
output = format_anthropic_sse(res_json);
} else if (res_type == TASK_RESPONSE_TYPE_OAI_RESP) {
output = format_oai_resp_sse(res_json);
} else {
output = format_oai_sse(res_json);
}
@@ -3397,6 +3454,24 @@ void server_routes::init_routes() {
TASK_RESPONSE_TYPE_OAI_CHAT);
};

this->post_responses_oai = [this](const server_http_req & req) {
auto res = std::make_unique<server_res_generator>(ctx_server);
std::vector<raw_buffer> files;
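        // translate the Responses API body into Chat Completions form, then
        // reuse the existing OAI-compat parsing and completion path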
json body = convert_responses_to_chatcmpl(json::parse(req.body));
json body_parsed = oaicompat_chat_params_parse(
body,
ctx_server.oai_parser_opt,
files);
return handle_completions_impl(
std::move(res),
ctx_server,
SERVER_TASK_TYPE_COMPLETION,
body_parsed,
files,
req.should_stop,
TASK_RESPONSE_TYPE_OAI_RESP);
};

this->post_anthropic_messages = [this](const server_http_req & req) {
auto res = std::make_unique<server_res_generator>(ctx_server);
std::vector<raw_buffer> files;
1 change: 1 addition & 0 deletions tools/server/server-context.h
@@ -66,6 +66,7 @@ struct server_routes {
server_http_context::handler_t post_completions;
server_http_context::handler_t post_completions_oai;
server_http_context::handler_t post_chat_completions;
server_http_context::handler_t post_responses_oai;
server_http_context::handler_t post_anthropic_messages;
server_http_context::handler_t post_anthropic_count_tokens;
server_http_context::handler_t post_apply_template;