Skip to content

Commit f486f6e

Browse files
bmtwl, root, cebtenzzre, ggerganov
authored
ggml : add numa options (ggml-org#5377)
* Added numa options to allow finer grained control as well as plumbing for a new mirror mode that will require numa.h * Reverted Makefile * Fixed include * Removed sched.h from ggml.h, moved ggml_get_numa_affinity into ggml.c, removed trailing whitespace and fixed up a few inconsistent variables * removed trailing whitespace * Added numa options to allow finer grained control as well as plumbing for a new mirror mode that will require numa.h * Reverting Makefile * Fixed a number of issues with the move from BOOL to ggml_numa_strategies. Added a note about mirror mode not being implemented yet * Removing MIRROR_MODE code for this PR * Removing last bit of MIRROR_MODE code for this PR * Removing unneeded branch in server.cpp example and moving get_numa_affinity and making it static * Fixed lingering init_llama_backend() bool calls in tests and examples * Remove enum llama_numa_strategies * Revert bad merge with dynatemp flags * add missing enum ggml_numa_strategies declaration and revert sync problem with master * add missing enum ggml_numa_strategies declaration * fixed ggml_init_numa variable * Update ggml.h Co-authored-by: Jared Van Bortel <cebtenzzre@gmail.com> * Update READMEs with info about numa flags, change INTERLEAVE strategy name to DISTRIBUTE everywhere, implement the improved distribution strategy from @rankaiyx, fix a spelling mistake and un-merge some bad merges * split numa init out from llama_backend_init and created llama_numa_init. 
Updated all code paths and samples * Fix up some boolean vs enum comparisons * Added #ifdefs for non-Linux OS that don't have cpu_set_t datatype * Update ggml.h Align enum values Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> * Update ggml.c Remove whitespace Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> * Update ggml.c align parameters Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> * Update examples/server/server.cpp remove whitespace and align brace Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> * Update common/common.cpp Remove whitespace and align brace Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> * unified ggml_numa_strategy enum and fixed text alignment in server.cpp example * Update ggml.c simplified return for platforms without NUMA support Co-authored-by: Jared Van Bortel <cebtenzzre@gmail.com> * removed redundant else from cli argument processing of --numa * whitespace --------- Co-authored-by: root <root@nenya.lothlorien.ca> Co-authored-by: Jared Van Bortel <cebtenzzre@gmail.com> Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> Co-authored-by: Jared Van Bortel <jared@nomic.ai>
1 parent 60ed04c commit f486f6e

36 files changed

Lines changed: 178 additions & 62 deletions

File tree

common/common.cpp

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -671,7 +671,15 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
671671
} else if (arg == "--no-mmap") {
672672
params.use_mmap = false;
673673
} else if (arg == "--numa") {
674-
params.numa = true;
674+
if (++i >= argc) {
675+
invalid_param = true;
676+
break;
677+
}
678+
std::string value(argv[i]);
679+
/**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; }
680+
else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
681+
else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
682+
else { invalid_param = true; break; }
675683
} else if (arg == "--verbose-prompt") {
676684
params.verbose_prompt = true;
677685
} else if (arg == "--no-display-prompt") {
@@ -935,7 +943,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
935943
printf(" -tb N, --threads-batch N\n");
936944
printf(" number of threads to use during batch and prompt processing (default: same as --threads)\n");
937945
printf(" -td N, --threads-draft N");
938-
printf(" number of threads to use during generation (default: same as --threads)");
946+
printf(" number of threads to use during generation (default: same as --threads)\n");
939947
printf(" -tbd N, --threads-batch-draft N\n");
940948
printf(" number of threads to use during batch and prompt processing (default: same as --threads-draft)\n");
941949
printf(" -p PROMPT, --prompt PROMPT\n");
@@ -1005,7 +1013,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
10051013
printf(" --winogrande-tasks N number of tasks to use when computing the Winogrande score (default: %zu)\n", params.winogrande_tasks);
10061014
printf(" --multiple-choice compute multiple choice score over random tasks from datafile supplied with -f\n");
10071015
printf(" --multiple-choice-tasks N number of tasks to use when computing the multiple choice score (default: %zu)\n", params.winogrande_tasks);
1008-
printf(" --kl-divergence computes KL-divergence to logits provided via --kl-divergence-base");
1016+
printf(" --kl-divergence computes KL-divergence to logits provided via --kl-divergence-base\n");
10091017
printf(" --keep N number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
10101018
printf(" --draft N number of tokens to draft for speculative decoding (default: %d)\n", params.n_draft);
10111019
printf(" --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
@@ -1022,7 +1030,10 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
10221030
if (llama_supports_mmap()) {
10231031
printf(" --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
10241032
}
1025-
printf(" --numa attempt optimizations that help on some NUMA systems\n");
1033+
printf(" --numa TYPE attempt optimizations that help on some NUMA systems\n");
1034+
printf(" - distribute: spread execution evenly over all nodes\n");
1035+
printf(" - isolate: only spawn threads on CPUs on the node that execution started on\n");
1036+
printf(" - numactl: use the CPU map provided by numactl\n");
10261037
printf(" if run without this previously, it is recommended to drop the system page cache before using this\n");
10271038
printf(" see https://github.com/ggerganov/llama.cpp/issues/1437\n");
10281039
if (llama_supports_gpu_offload()) {
@@ -1689,7 +1700,6 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
16891700
fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false");
16901701
fprintf(stream, "no_mul_mat_q: %s # default: false\n", !params.mul_mat_q ? "true" : "false");
16911702
fprintf(stream, "no_penalize_nl: %s # default: false\n", !sparams.penalize_nl ? "true" : "false");
1692-
fprintf(stream, "numa: %s # default: false\n", params.numa ? "true" : "false");
16931703
fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type);
16941704
fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride);
16951705
fprintf(stream, "presence_penalty: %f # default: 0.0\n", sparams.penalty_present);

common/common.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,7 @@ struct gpt_params {
7676
float yarn_beta_slow = 1.0f; // YaRN high correction dim
7777
int32_t yarn_orig_ctx = 0; // YaRN original context length
7878
int32_t rope_scaling_type = LLAMA_ROPE_SCALING_UNSPECIFIED;
79+
ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;
7980

8081
// // sampling parameters
8182
struct llama_sampling_params sparams;
@@ -134,7 +135,6 @@ struct gpt_params {
134135
bool logits_all = false; // return logits for all tokens in the batch
135136
bool use_mmap = true; // use mmap for faster loads
136137
bool use_mlock = false; // use mlock to keep model in memory
137-
bool numa = false; // attempt optimizations that help on some NUMA systems
138138
bool verbose_prompt = false; // print prompt tokens before generation
139139
bool display_prompt = true; // print prompt before generation
140140
bool infill = false; // use infill mode

examples/batched-bench/batched-bench.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,8 @@ int main(int argc, char ** argv) {
8282

8383
// init LLM
8484

85-
llama_backend_init(params.numa);
85+
llama_backend_init();
86+
llama_numa_init(params.numa);
8687

8788
// initialize the model
8889

examples/batched.swift/Sources/main.swift

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ let n_parallel: Int = arguments.count > 3 && Int(arguments[3]) != nil ? Int(argu
1717
let n_len: Int = 32
1818

1919
// init LLM
20-
llama_backend_init(false)
20+
llama_backend_init()
2121
defer {
2222
llama_backend_free()
2323
}

examples/batched/batched.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,8 @@ int main(int argc, char ** argv) {
5050

5151
// init LLM
5252

53-
llama_backend_init(params.numa);
53+
llama_backend_init();
54+
llama_numa_init(params.numa);
5455

5556
// initialize the model
5657

examples/beam-search/beam-search.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,8 @@ int main(int argc, char ** argv)
119119
// Init LLM :
120120
//---------------------------------
121121

122-
llama_backend_init(params.numa);
122+
llama_backend_init();
123+
llama_numa_init(params.numa);
123124

124125
llama_model * model;
125126
llama_context * ctx;

examples/embedding/embedding.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,8 @@ int main(int argc, char ** argv) {
7474
params.prompt = gpt_random_prompt(rng);
7575
}
7676

77-
llama_backend_init(params.numa);
77+
llama_backend_init();
78+
llama_numa_init(params.numa);
7879

7980
llama_model * model;
8081
llama_context * ctx;

examples/imatrix/imatrix.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -568,7 +568,8 @@ int main(int argc, char ** argv) {
568568
params.prompt = gpt_random_prompt(rng);
569569
}
570570

571-
llama_backend_init(params.numa);
571+
llama_backend_init();
572+
llama_numa_init(params.numa);
572573

573574
llama_model_params mparams = llama_model_params_from_gpt_params(params);
574575

examples/infill/infill.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -202,7 +202,8 @@ int main(int argc, char ** argv) {
202202
std::mt19937 rng(params.seed);
203203

204204
LOG("%s: llama backend init\n", __func__);
205-
llama_backend_init(params.numa);
205+
llama_backend_init();
206+
llama_numa_init(params.numa);
206207

207208
llama_model * model;
208209
llama_context * ctx;

examples/llama-bench/llama-bench.cpp

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1151,8 +1151,7 @@ int main(int argc, char ** argv) {
11511151
if (!params.verbose) {
11521152
llama_log_set(llama_null_log_callback, NULL);
11531153
}
1154-
bool numa = false;
1155-
llama_backend_init(numa);
1154+
llama_backend_init();
11561155

11571156
// initialize printer
11581157
std::unique_ptr<printer> p;

0 commit comments

Comments
 (0)