This one simple trick allows D programmers to use llama.cpp, Rust programmers hate him!

evilrat evilrat666 at gmail.com
Thu Mar 21 16:45:09 UTC 2024


Just like the title says: there is no need to write bindings anymore. Just 
make a dummy C file with a single include and start building your D 
app powered by llama.cpp. That's it, no extra work needed 
beyond that one dummy file.

Tested with dmd v2.107 on Windows.
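
For anyone who has not used ImportC before: dmd compiles any `.c` file passed on its command line with its built-in C compiler, and the resulting translation unit becomes a regular D module named after the file. The llama.cpp setup below is just one instance of that pattern. A minimal sketch with a hypothetical `somelib.h`, paired with a one-line `somelib_c.c` that contains nothing but `#include "somelib.h"`:

```d
// app.d -- hypothetical example of the ImportC pattern
import somelib_c;       // module generated from somelib_c.c by ImportC

void main()
{
    // call the C API directly, no hand-written bindings involved
    some_c_function();  // hypothetical function declared in somelib.h
}
```

Compile it by listing the C file next to the D file, e.g. `dmd app.d somelib_c.c`, just like in the Build section below.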

## Code

__llamad.c__:

```c
#include "llama.h"
```
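
ImportC exposes everything pulled in from `llama.h` as the D module `llamad`. To sanity check what actually came through, you can dump the imported symbols at compile time; this is the same pragma left commented out in the listing below:

```d
// membercheck.d -- minimal sketch, compile together with llamad.c
import llamad;

// prints every symbol ImportC generated from llama.h while compiling
pragma(msg, __traits(allMembers, llamad));

void main() {}
```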

A D port of the `simple` example from llama.cpp:

__llama-d.d__ (named differently from `llamad.c` to avoid a name clash):

```d
module llama_d;

import std.string;
import std.stdio;

import llamad; // imports llamad.c

// pragma(msg, __traits(allMembers, llamad));

void main(string[] args)
{
     if (args.length < 3) {
         writeln("LLAMA D DEMO USAGE: llama-d <path_to_model> 
<your_prompt>");
         return;
     }

     llama_backend_init();
     llama_numa_init(GGML_NUMA_STRATEGY_DISABLED);

     auto mparams = llama_model_default_params();
     // mparams.n_gpu_layers = 30; // offload layers to the GPU to accelerate inference

     auto ctx_params = llama_context_default_params();
     ctx_params.n_ctx = 2048;

     import std.parallelism;
     ctx_params.n_threads = totalCPUs-1;
     ctx_params.n_threads_batch = ctx_params.n_threads_batch == -1 ? ctx_params.n_threads : ctx_params.n_threads_batch;

     llama_model* model = llama_load_model_from_file(toStringz(args[1]), mparams);
     if (model is null) {
         writeln("error: unable to load model from ", args[1]);
         return;
     }

     llama_context* ctx = llama_new_context_with_model(model, ctx_params);

     const bool add_bos = llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM;
     const bool allow_special = false;

     string prompt = args[2];

     if (!prompt.length)
         return;

     // convert the prompt to tokens
     llama_token[] embd_inp;
     embd_inp.length = prompt.length + 1; // worst case is one token per character, plus room for the BOS token

     writeln("tokenizing...");

     auto n_of_tok = llama_tokenize(llama_get_model(ctx), prompt.ptr, cast(int) prompt.length, embd_inp.ptr, cast(int) embd_inp.length, add_bos, allow_special);

     if (n_of_tok <= 0) {
         writeln("no tokens generated, something went wrong");
         return;
     }
     embd_inp.length = n_of_tok;

     writeln("input has ", n_of_tok, " tokens");

     foreach (id; embd_inp) {
         write(llama_token_to_piece(ctx, id));
     }
     writeln();

     // total length of the sequence including the prompt
     const int n_len = 128;

     const int n_ctx = llama_n_ctx(ctx);
     const int n_kv_req = cast(int)(embd_inp.length + (n_len - embd_inp.length));

     if (n_kv_req > n_ctx) {
         writeln("error: prompt is too long");
         return;
     }

     writeln("building batch");

     // create a llama_batch with size 512
     // we use this object to submit token data for decoding
     llama_batch batch = llama_batch_init(512, 0, 1);

     // evaluate the initial prompt
     for (size_t i = 0; i < embd_inp.length; i++) {
         // all prompt tokens go into sequence 0; llama_batch_add needs at least one seq_id
         llama_batch_add(batch, embd_inp[i], cast(int) i, [0], false);
     }

     // llama_decode will output logits only for the last token of the prompt
     batch.logits[batch.n_tokens - 1] = true;

     writeln("decoding batch");

     if (llama_decode(ctx, batch) != 0) {
         writeln("llama_decode() failed");
         return;
     }

     // main loop

     int n_cur    = batch.n_tokens;
     int n_decode = 0;

     const auto t_main_start = ggml_time_us();

     while (n_cur <= n_len) {
         // sample the next token
         {
             auto n_vocab = llama_n_vocab(model);
             auto logits  = llama_get_logits_ith(ctx, batch.n_tokens - 1);

             llama_token_data[] candidates;
             candidates.reserve(n_vocab);

             for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
                 candidates ~= llama_token_data(token_id, logits[token_id], 0.0f);
             }

             llama_token_data_array candidates_p = { candidates.ptr, cast(int) candidates.length, false };

             // sample the most likely token
             const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);

             // is it an end of stream?
             if (new_token_id == llama_token_eos(model) || n_cur == n_len) {
                 writeln();

                 break;
             }

             writef("%s", llama_token_to_piece(ctx, new_token_id));

             // prepare the next batch
             llama_batch_clear(batch);

             // push this new token for next evaluation
             llama_batch_add(batch, new_token_id, n_cur, [0], true);

             n_decode += 1;
         }

         n_cur += 1;

         // evaluate the current batch with the transformer model
         if (auto ret = llama_decode(ctx, batch)) {
             writefln("%s : failed to eval, return code %d", __FUNCTION__, ret);
             return;
         }
     }

     const auto t_main_end = ggml_time_us();
     llama_print_timings(ctx);
     writeln();

     // cleanup
     llama_batch_free(batch);
     llama_free(ctx);
     llama_free_model(model);
     llama_backend_free();
}


// helper: append one token (with its position and sequence ids) to the batch
void llama_batch_add(
                     ref llama_batch batch,
                     llama_token id,
                     llama_pos pos,
                     const llama_seq_id[] seq_ids,
                     bool logits) {
     batch.token   [batch.n_tokens] = id;
     batch.pos     [batch.n_tokens] = pos;
     batch.n_seq_id[batch.n_tokens] = cast(int) seq_ids.length;
     for (size_t i = 0; i < seq_ids.length; ++i) {
         batch.seq_id[batch.n_tokens][i] = seq_ids[i];
     }
     batch.logits  [batch.n_tokens] = logits;

     batch.n_tokens++;
}

// convert a single token id back to text, growing the buffer when needed
string llama_token_to_piece(llama_context* ctx, llama_token token) {
     char[] result;
     result.length = 8;
     const int n_tokens = llamad.llama_token_to_piece(llama_get_model(ctx), token, result.ptr, cast(int) result.length);
     if (n_tokens < 0) {
         // a negative result is the required buffer size, so resize and retry
         result.length = -n_tokens;
         int check = llamad.llama_token_to_piece(llama_get_model(ctx), token, result.ptr, cast(int) result.length);
         assert(check == -n_tokens);
     } else {
         result.length = n_tokens;
     }

     return cast(string) result;
}

// helper: reset the batch so it can be refilled for the next llama_decode call
void llama_batch_clear(ref llama_batch batch) {
     batch.n_tokens = 0;
}

```

## Build

Build inside the llama.cpp folder with the command below. I've been 
using the CUDA build, but it works without it too; a CPU-only variant 
is sketched after the command.

```bat
dmd llama-d.d llamad.c -m64 build/ggml_static.lib build/llama.lib ^
  -L/LIBPATH:"C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.3/lib/x64" ^
  cuda.lib cudart.lib cufft.lib cublas.lib ^
  ucrtd.lib -L/NODEFAULTLIB:libucrt.lib -L/NODEFAULTLIB:libcmt.lib msvcprtd.lib
```
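
For a llama.cpp build compiled without CUDA support, the CUDA import libraries should simply not be needed. A rough, untested sketch of the same command (assuming the static library names match the CUDA build above):

```bat
dmd llama-d.d llamad.c -m64 build/ggml_static.lib build/llama.lib ^
  ucrtd.lib -L/NODEFAULTLIB:libucrt.lib -L/NODEFAULTLIB:libcmt.lib msvcprtd.lib
```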

## Run

And **run** it:

```bat
llama-d "E:\ML\pretrained\speechless-llama2-hermes-orca-platypus-wizardlm-13b.Q5_K_M.gguf" "How to quit vim?"
```


