Skip to content

Commit cddf872

Browse files
authored
Merge branch 'ggml-org:master' into master
2 parents 4244093 + 9496bbb commit cddf872

File tree

13 files changed

+359
-125
lines changed

13 files changed

+359
-125
lines changed

common/CMakeLists.txt

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,9 @@ add_library(${TARGET} STATIC
8585
unicode.h
8686
)
8787

88+
target_include_directories(${TARGET} PUBLIC . ../vendor)
89+
target_compile_features (${TARGET} PUBLIC cxx_std_17)
90+
8891
if (BUILD_SHARED_LIBS)
8992
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
9093
endif()
@@ -151,9 +154,7 @@ if (LLAMA_LLGUIDANCE)
151154
set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} llguidance ${LLGUIDANCE_PLATFORM_LIBS})
152155
endif ()
153156

154-
target_include_directories(${TARGET} PUBLIC . ../vendor)
155-
target_compile_features (${TARGET} PUBLIC cxx_std_17)
156-
target_link_libraries (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads)
157+
target_link_libraries(${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads)
157158

158159

159160
#

common/arg.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2887,6 +2887,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
28872887
params.lora_init_without_apply = true;
28882888
}
28892889
).set_examples({LLAMA_EXAMPLE_SERVER}));
2890+
add_opt(common_arg(
2891+
{"--sleep-idle-seconds"}, "SECONDS",
2892+
string_format("number of seconds of idleness after which the server will sleep (default: %d; -1 = disabled)", params.sleep_idle_seconds),
2893+
[](common_params & params, int value) {
2894+
if (value == 0 || value < -1) {
2895+
throw std::invalid_argument("invalid value: cannot be 0 or less than -1");
2896+
}
2897+
params.sleep_idle_seconds = value;
2898+
}
2899+
).set_examples({LLAMA_EXAMPLE_SERVER}));
28902900
add_opt(common_arg(
28912901
{"--simple-io"},
28922902
"use basic IO for better compatibility in subprocesses and limited consoles",

common/common.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -475,7 +475,8 @@ struct common_params {
475475
bool enable_chat_template = true;
476476
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
477477
int reasoning_budget = -1;
478-
bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
478+
bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
479+
int sleep_idle_seconds = -1; // if >0, server will sleep after this many seconds of idle time
479480

480481
std::vector<std::string> api_keys;
481482

tools/cli/cli.cpp

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -209,8 +209,6 @@ int main(int argc, char ** argv) {
209209
return 1;
210210
}
211211

212-
ctx_cli.ctx_server.init();
213-
214212
console::spinner::stop();
215213
console::log("\n");
216214

tools/server/README-dev.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,8 @@ For detailed instructions, see the [test documentation](./tests/README.md).
107107
- Large-scale code base split into smaller files: https://github.com/ggml-org/llama.cpp/pull/17362
108108
- Introduction of router mode: https://github.com/ggml-org/llama.cpp/pull/17470
109109
- Speculative decoding: https://github.com/ggml-org/llama.cpp/pull/17808 and rework in https://github.com/ggml-org/llama.cpp/pull/17808
110+
- INI presets: https://github.com/ggml-org/llama.cpp/pull/17859 (+ refactoring: https://github.com/ggml-org/llama.cpp/pull/18169)
111+
- Sleeping mode: https://github.com/ggml-org/llama.cpp/pull/18228
110112

111113

112114

tools/server/README.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1621,6 +1621,16 @@ Example of an error:
16211621
}
16221622
```
16231623

1624+
## Sleeping on Idle
1625+
1626+
The server supports an automatic sleep mode that activates after a specified period of inactivity (no incoming tasks). This feature, introduced in [PR #18228](https://github.com/ggml-org/llama.cpp/pull/18228), is enabled with the `--sleep-idle-seconds` command-line argument (for example, `--sleep-idle-seconds 300`); it is disabled by default (`-1`). It works in both single-model and multi-model configurations.
1627+
1628+
When the server enters sleep mode, the model and its associated memory (including the KV cache) are unloaded from RAM to conserve resources. Any new incoming task will automatically trigger the model to reload.
1629+
1630+
Note that requests to the following endpoints are not counted as incoming tasks; they neither trigger model reloading nor reset the idle timer:
1631+
- `GET /health`
1632+
- `GET /props`
1633+
16241634
## More examples
16251635

16261636
### Interactive mode

0 commit comments

Comments
 (0)