12 changes: 1 addition & 11 deletions src/main/cpp/jllama.cpp
@@ -452,16 +452,6 @@ JNIEXPORT void JNICALL Java_de_kherud_llama_LlamaModel_loadModel(JNIEnv *env, jo
llama_init_dft.context.reset();
}

ctx_server->chat_templates = common_chat_templates_init(ctx_server->model, params.chat_template);
try {
common_chat_format_example(ctx_server->chat_templates.get(), params.use_jinja);
} catch (const std::exception &e) {
SRV_WRN("%s: The chat template that comes with this model is not yet supported, falling back to chatml. This "
"may cause the model to output suboptimal responses\n",
__func__);
ctx_server->chat_templates = common_chat_templates_init(ctx_server->model, "chatml");
}

// print sample chat example to make it clear which template is used
LOG_INF("%s: chat template, chat_template: %s, example_format: '%s'\n", __func__,
common_chat_templates_source(ctx_server->chat_templates.get()),
@@ -860,4 +850,4 @@ JNIEXPORT jbyteArray JNICALL Java_de_kherud_llama_LlamaModel_jsonSchemaToGrammar
nlohmann::ordered_json c_schema_json = nlohmann::ordered_json::parse(c_schema);
const std::string c_grammar = json_schema_to_grammar(c_schema_json);
return parse_jbytes(env, c_grammar);
}
}
148 changes: 0 additions & 148 deletions src/main/cpp/server.hpp
@@ -3269,151 +3269,3 @@ struct server_context {
};
}
};

static void common_params_handle_model_default(std::string &model, const std::string &model_url, std::string &hf_repo,
std::string &hf_file, const std::string &hf_token) {
if (!hf_repo.empty()) {
// short-hand to avoid specifying --hf-file -> default it to --model
if (hf_file.empty()) {
if (model.empty()) {
auto auto_detected = common_get_hf_file(hf_repo, hf_token);
if (auto_detected.first.empty() || auto_detected.second.empty()) {
exit(1); // built without CURL, error message already printed
}
hf_repo = auto_detected.first;
hf_file = auto_detected.second;
} else {
hf_file = model;
}
}
// make sure model path is present (for caching purposes)
if (model.empty()) {
// this avoids different repos having the same file name, or the same file name in different subdirs
std::string filename = hf_repo + "_" + hf_file;
// to make sure we don't have any slashes in the filename
string_replace_all(filename, "/", "_");
model = fs_get_cache_file(filename);
}
} else if (!model_url.empty()) {
if (model.empty()) {
auto f = string_split<std::string>(model_url, '#').front();
f = string_split<std::string>(f, '?').front();
model = fs_get_cache_file(string_split<std::string>(f, '/').back());
}
} else if (model.empty()) {
model = DEFAULT_MODEL_PATH;
}
}
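
For reference, a minimal standalone sketch of the cache-filename derivation performed above; the repo and file names are hypothetical, and the project helpers string_replace_all/fs_get_cache_file are replaced with standard-library equivalents:

#include <algorithm>
#include <iostream>
#include <string>

int main() {
    std::string hf_repo = "TheOrg/some-model";        // hypothetical repo
    std::string hf_file = "some-model-Q4_K_M.gguf";   // hypothetical file
    // same scheme as above: "<repo>_<file>" with '/' replaced by '_'
    std::string filename = hf_repo + "_" + hf_file;
    std::replace(filename.begin(), filename.end(), '/', '_');
    std::cout << filename << '\n';                    // TheOrg_some-model_some-model-Q4_K_M.gguf
    return 0;
}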

// parse the given jparams (see de.kherud.llama.args.ModelParameters#toString()) from JSON to the required C++ struct.
static void server_params_parse(json jparams, common_params &params) {
common_params default_params;

params.sampling.seed = json_value(jparams, "seed", default_params.sampling.seed);
params.cpuparams.n_threads = json_value(jparams, "n_threads", default_params.cpuparams.n_threads);
params.speculative.cpuparams.n_threads =
json_value(jparams, "n_threads_draft", default_params.speculative.cpuparams.n_threads);
params.cpuparams_batch.n_threads = json_value(jparams, "n_threads_batch", default_params.cpuparams_batch.n_threads);
params.speculative.cpuparams_batch.n_threads =
json_value(jparams, "n_threads_batch_draft", default_params.speculative.cpuparams_batch.n_threads);
params.n_predict = json_value(jparams, "n_predict", default_params.n_predict);
params.n_ctx = json_value(jparams, "n_ctx", default_params.n_ctx);
params.n_batch = json_value(jparams, "n_batch", default_params.n_batch);
params.n_ubatch = json_value(jparams, "n_ubatch", default_params.n_ubatch);
params.n_keep = json_value(jparams, "n_keep", default_params.n_keep);

params.speculative.n_max = json_value(jparams, "n_draft", default_params.speculative.n_max);
params.speculative.n_min = json_value(jparams, "n_draft_min", default_params.speculative.n_min);

params.n_chunks = json_value(jparams, "n_chunks", default_params.n_chunks);
params.n_parallel = json_value(jparams, "n_parallel", default_params.n_parallel);
params.n_sequences = json_value(jparams, "n_sequences", default_params.n_sequences);
params.speculative.p_split = json_value(jparams, "p_split", default_params.speculative.p_split);
params.grp_attn_n = json_value(jparams, "grp_attn_n", default_params.grp_attn_n);
params.grp_attn_w = json_value(jparams, "grp_attn_w", default_params.grp_attn_w);
params.n_print = json_value(jparams, "n_print", default_params.n_print);
params.rope_freq_base = json_value(jparams, "rope_freq_base", default_params.rope_freq_base);
params.rope_freq_scale = json_value(jparams, "rope_freq_scale", default_params.rope_freq_scale);
params.yarn_ext_factor = json_value(jparams, "yarn_ext_factor", default_params.yarn_ext_factor);
params.yarn_attn_factor = json_value(jparams, "yarn_attn_factor", default_params.yarn_attn_factor);
params.yarn_beta_fast = json_value(jparams, "yarn_beta_fast", default_params.yarn_beta_fast);
params.yarn_beta_slow = json_value(jparams, "yarn_beta_slow", default_params.yarn_beta_slow);
params.yarn_orig_ctx = json_value(jparams, "yarn_orig_ctx", default_params.yarn_orig_ctx);
params.defrag_thold = json_value(jparams, "defrag_thold", default_params.defrag_thold);
params.numa = json_value(jparams, "numa", default_params.numa);
params.rope_scaling_type = json_value(jparams, "rope_scaling_type", default_params.rope_scaling_type);
params.pooling_type = json_value(jparams, "pooling_type", default_params.pooling_type);
params.model = json_value(jparams, "model", default_params.model);
params.speculative.model = json_value(jparams, "model_draft", default_params.speculative.model);
params.model_alias = json_value(jparams, "model_alias", default_params.model_alias);
params.model_url = json_value(jparams, "model_url", default_params.model_url);
params.hf_repo = json_value(jparams, "hf_repo", default_params.hf_repo);
params.hf_file = json_value(jparams, "hf_file", default_params.hf_file);
params.prompt = json_value(jparams, "prompt", default_params.prompt);
params.prompt_file = json_value(jparams, "prompt_file", default_params.prompt_file);
params.path_prompt_cache = json_value(jparams, "path_prompt_cache", default_params.path_prompt_cache);
params.input_prefix = json_value(jparams, "input_prefix", default_params.input_prefix);
params.input_suffix = json_value(jparams, "input_suffix", default_params.input_suffix);
params.antiprompt = json_value(jparams, "antiprompt", default_params.antiprompt);
params.lookup_cache_static = json_value(jparams, "lookup_cache_static", default_params.lookup_cache_static);
params.lookup_cache_dynamic = json_value(jparams, "lookup_cache_dynamic", default_params.lookup_cache_dynamic);
params.logits_file = json_value(jparams, "logits_file", default_params.logits_file);
// params.lora_adapters = json_value(jparams, "lora_adapter", default_params.lora_adapters);
params.embedding = json_value(jparams, "embedding", default_params.embedding);
params.escape = json_value(jparams, "escape", default_params.escape);
params.cont_batching = json_value(jparams, "cont_batching", default_params.cont_batching);
params.flash_attn = json_value(jparams, "flash_attn", default_params.flash_attn);
params.input_prefix_bos = json_value(jparams, "input_prefix_bos", default_params.input_prefix_bos);
params.sampling.ignore_eos = json_value(jparams, "ignore_eos", default_params.sampling.ignore_eos);
params.use_mmap = json_value(jparams, "use_mmap", default_params.use_mmap);
params.use_mlock = json_value(jparams, "use_mlock", default_params.use_mlock);
params.no_kv_offload = json_value(jparams, "no_kv_offload", default_params.no_kv_offload);
params.chat_template = json_value(jparams, "chat_template", default_params.chat_template);

if (jparams.contains("n_gpu_layers")) {
if (llama_supports_gpu_offload()) {
params.n_gpu_layers = json_value(jparams, "n_gpu_layers", default_params.n_gpu_layers);
params.speculative.n_gpu_layers =
json_value(jparams, "n_gpu_layers_draft", default_params.speculative.n_gpu_layers);
} else {
SRV_WRN("Not compiled with GPU offload support, --n-gpu-layers option will be ignored. "
"See main README.md for information on enabling GPU BLAS support: %s = %d",
"n_gpu_layers", params.n_gpu_layers);
}
}

if (jparams.contains("split_mode")) {
params.split_mode = json_value(jparams, "split_mode", default_params.split_mode);
// todo: the definition checks here currently don't work due to cmake visibility reasons
#ifndef GGML_USE_CUDA
fprintf(stderr, "warning: llama.cpp was compiled without CUDA. Setting the split mode has no effect.\n");
#endif
}

if (jparams.contains("tensor_split")) {
#if defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)
std::vector<float> tensor_split = jparams["tensor_split"].get<std::vector<float>>();
GGML_ASSERT(tensor_split.size() <= llama_max_devices());

for (size_t i_device = 0; i_device < llama_max_devices(); ++i_device) {
if (i_device < tensor_split.size()) {
params.tensor_split[i_device] = tensor_split.at(i_device);
} else {
params.tensor_split[i_device] = 0.0f;
}
}
#else
SRV_WRN("%s", "llama.cpp was compiled without CUDA. It is not possible to set a tensor split.\n");
#endif // GGML_USE_CUDA
}

if (jparams.contains("main_gpu")) {
#if defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)
params.main_gpu = json_value(jparams, "main_gpu", default_params.main_gpu);
#else
SRV_WRN("%s", "llama.cpp was compiled without CUDA. It is not possible to set a main GPU.");
#endif
}

common_params_handle_model_default(params.model, params.model_url, params.hf_repo, params.hf_file, params.hf_token);
}
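
For reference, a minimal sketch of the jparams JSON shape that the removed server_params_parse consumed; the key names come from the json_value() calls above, while the values and the model path are hypothetical examples only:

#include <nlohmann/json.hpp>

int main() {
    nlohmann::json jparams = {
        {"n_ctx", 2048},
        {"n_batch", 512},
        {"n_gpu_layers", 99},
        {"use_mmap", true},
        {"embedding", false},
        {"model", "models/example-7b-Q4_K_M.gguf"},   // hypothetical path
        {"chat_template", "chatml"}
    };
    // common_params params;
    // server_params_parse(jparams, params);  // as defined in the removed block above
    return 0;
}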