API Server #47

Merged: 17 commits (May 19, 2024)
3 changes: 2 additions & 1 deletion .github/workflows/main.yml
@@ -28,7 +28,8 @@ jobs:
- name: Build
id: build
run: |
make main
make dllama
make dllama-api
make funcs-test
make quants-test
make transformer-test
3 changes: 3 additions & 0 deletions .gitignore
@@ -9,3 +9,6 @@ __pycache__
*-test
main
run.sh
server
/dllama
/dllama-*
8 changes: 6 additions & 2 deletions Makefile
@@ -23,9 +23,13 @@ mixtral-tasks: src/mixtral-tasks.cpp
$(CXX) $(CXXFLAGS) -c src/mixtral-tasks.cpp -o mixtral-tasks.o
tokenizer: src/tokenizer.cpp
$(CXX) $(CXXFLAGS) -c src/tokenizer.cpp -o tokenizer.o
app: src/app.cpp
$(CXX) $(CXXFLAGS) -c src/app.cpp -o app.o

main: src/main.cpp utils quants funcs socket transformer tasks llama2-tasks grok1-tasks mixtral-tasks tokenizer
$(CXX) $(CXXFLAGS) src/main.cpp -o main utils.o quants.o funcs.o socket.o transformer.o tasks.o llama2-tasks.o grok1-tasks.o mixtral-tasks.o tokenizer.o -lpthread
dllama: src/apps/dllama/dllama.cpp utils quants funcs socket transformer tasks llama2-tasks grok1-tasks mixtral-tasks tokenizer app
$(CXX) $(CXXFLAGS) src/apps/dllama/dllama.cpp -o dllama utils.o quants.o funcs.o socket.o transformer.o tasks.o llama2-tasks.o grok1-tasks.o mixtral-tasks.o tokenizer.o app.o -lpthread
dllama-api: src/apps/dllama-api/dllama-api.cpp utils quants funcs socket transformer tasks llama2-tasks grok1-tasks mixtral-tasks tokenizer app
$(CXX) $(CXXFLAGS) src/apps/dllama-api/dllama-api.cpp -o dllama-api utils.o quants.o funcs.o socket.o transformer.o tasks.o llama2-tasks.o grok1-tasks.o mixtral-tasks.o tokenizer.o app.o -lpthread
funcs-test: src/funcs-test.cpp funcs utils quants
$(CXX) $(CXXFLAGS) src/funcs-test.cpp -o funcs-test funcs.o utils.o quants.o -lpthread
quants-test: src/quants.cpp utils quants
16 changes: 8 additions & 8 deletions README.md
@@ -127,7 +127,7 @@ git clone https://github.com/b4rtaz/distributed-llama.git
```
6. Compile Distributed Llama:
```sh
make main
make dllama
```
7. Transfer weights and the tokenizer file to the root device.
8. Optional: assign static IP addresses.
@@ -137,17 +137,17 @@ sudo ip addr add 10.0.0.2/24 dev eth0 # 2nd device
```
9. Run worker nodes on worker devices:
```sh
sudo nice -n -20 ./main worker --port 9998 --nthreads 4
sudo nice -n -20 ./dllama worker --port 9998 --nthreads 4
```
10. Run root node on the root device:
```sh
sudo nice -n -20 ./main inference --model ../dllama_llama-2-7b_q40.bin --tokenizer ../dllama-llama2-tokenizer.t --weights-float-type q40 --buffer-float-type q80 --prompt "Hello world" --steps 16 --nthreads 4 --workers 10.0.0.2:9998
sudo nice -n -20 ./dllama inference --model ../dllama_llama-2-7b_q40.bin --tokenizer ../dllama-llama2-tokenizer.t --weights-float-type q40 --buffer-float-type q80 --prompt "Hello world" --steps 16 --nthreads 4 --workers 10.0.0.2:9998
```

To add more worker nodes, just add more addresses to the `--workers` argument.

```
./main inference ... --workers 10.0.0.2:9998 10.0.0.3:9998 10.0.0.4:9998
./dllama inference ... --workers 10.0.0.2:9998 10.0.0.3:9998 10.0.0.4:9998
```

[Share your results](https://github.com/b4rtaz/distributed-llama/discussions)!
@@ -166,20 +166,20 @@ git clone https://github.com/b4rtaz/distributed-llama.git
```
3. Compile Distributed Llama:
```sh
make main
make dllama
```
4. Transfer weights and the tokenizer file to the root node.
5. Run worker nodes on worker devices:
```sh
sudo nice -n -20 ./main worker --port 9998 --nthreads 4
sudo nice -n -20 ./dllama worker --port 9998 --nthreads 4
```
6. Run root node on the root device:
```sh
sudo nice -n -20 ./main inference --model ../dllama_llama-2-7b_q40.bin --tokenizer ../dllama-llama2-tokenizer.t --weights-float-type q40 --buffer-float-type q80 --prompt "Hello world" --steps 16 --nthreads 4 --workers 192.168.0.1:9998
sudo nice -n -20 ./dllama inference --model ../dllama_llama-2-7b_q40.bin --tokenizer ../dllama-llama2-tokenizer.t --weights-float-type q40 --buffer-float-type q80 --prompt "Hello world" --steps 16 --nthreads 4 --workers 192.168.0.1:9998
```
7. To run the root node in the chat mode:
```sh
sudo nice -n -20 ./main chat --model ../dllama_llama-2-7b-chat_q40.bin --tokenizer ../dllama-llama2-tokenizer.t --weights-float-type q40 --buffer-float-type q80 --nthreads 4 --workers 192.168.0.1:9998
sudo nice -n -20 ./dllama chat --model ../dllama_llama-2-7b-chat_q40.bin --tokenizer ../dllama-llama2-tokenizer.t --weights-float-type q40 --buffer-float-type q80 --nthreads 4 --workers 192.168.0.1:9998
```

[Share your results](https://github.com/b4rtaz/distributed-llama/discussions)!
4 changes: 2 additions & 2 deletions docs/GROK.md
@@ -11,9 +11,9 @@ wget https://huggingface.co/b4rtaz/grok-1-distributed-llama/resolve/main/dllama-
```
4. Build the project:
```bash
make main
make dllama
```
5. Run the model:
```bash
./main inference --weights-float-type q40 --buffer-float-type q80 --prompt "Hello" --steps 128 --nthreads 8 --model dllama-grok-1-q40-final.bin --tokenizer dllama-grok1-tokenizer.t
./dllama inference --weights-float-type q40 --buffer-float-type q80 --prompt "Hello" --steps 128 --nthreads 8 --model dllama-grok-1-q40-final.bin --tokenizer dllama-grok1-tokenizer.t
```
8 changes: 4 additions & 4 deletions docs/LLAMA.md
@@ -20,11 +20,11 @@ wget https://huggingface.co/b4rtaz/llama-2-distributed-llama/resolve/main/dllama
```
6. Build the project:
```bash
make main
make dllama
```
7. Run:
```bash
./main inference --model dllama_llama-2-7b_q40.bin --tokenizer dllama-llama2-tokenizer.t --weights-float-type q40 --buffer-float-type q80 --prompt "Hello world" --steps 16 --nthreads 4
./dllama inference --model dllama_llama-2-7b_q40.bin --tokenizer dllama-llama2-tokenizer.t --weights-float-type q40 --buffer-float-type q80 --prompt "Hello world" --steps 16 --nthreads 4
```

In the table below, you can find the expected size of the converted weights with different floating-point types.
@@ -60,9 +60,9 @@ python converter/convert-tokenizer-llama3.py path/to/tokenizer.model
```
10. Build the project:
```bash
make main
make dllama
```
11. Run the Distributed Llama:
```bash
./main inference --weights-float-type q40 --buffer-float-type q80 --prompt "My name is" --steps 128 --nthreads 8 --model dllama_meta-llama-3-8b_q40.bin --tokenizer llama3-tokenizer.t
./dllama inference --weights-float-type q40 --buffer-float-type q80 --prompt "My name is" --steps 128 --nthreads 8 --model dllama_meta-llama-3-8b_q40.bin --tokenizer llama3-tokenizer.t
```
48 changes: 48 additions & 0 deletions examples/chat-api-client.js
@@ -0,0 +1,48 @@
// This is a simple client for dllama-api.
//
// Usage:
//
// 1. Start the server as described in `src/apps/dllama-api/README.md`.
// 2. Run this script: `node examples/chat-api-client.js`

const HOST = '127.0.0.1';
const PORT = 9990;

async function chat(messages, maxTokens) {
const response = await fetch(`http://${HOST}:${PORT}/v1/chat/completions`, {
method: 'POST',
headers: {
'Content-Type': 'application/json',
},
body: JSON.stringify({
messages,
temperature: 0.7,
stop: ['<|eot_id|>'],
max_tokens: maxTokens
}),
});
return await response.json();
}

async function ask(system, user, maxTokens) {
console.log(`> system: ${system}`);
console.log(`> user: ${user}`);
const response = await chat([
{
role: 'system',
content: system
},
{
role: 'user',
content: user
}
], maxTokens);
console.log(`${response.choices[0].message.content}`);
}

async function main() {
await ask('You are an excellent math teacher.', 'What is 1 + 2?', 64);
await ask('You are a romantic.', 'Where is Europe?', 64);
}

main();
2 changes: 1 addition & 1 deletion examples/macbeth.sh
@@ -189,7 +189,7 @@ Macbeth. Thou seest the moon"

echo "Generating, it can take a while..."

OUTPUT=$(( ./main generate --seed 12345 --temperature 0.9 --topp 0.9 --prompt "$PROMPT" --weights-float-type q40 --buffer-float-type f32 --nthreads 8 --steps 2048 --model converter/dllama_meta-llama-3-8b_q40.bin --tokenizer converter/dllama_meta-llama3-tokenizer.t ) 2>&1)
OUTPUT=$(( ./dllama generate --seed 12345 --temperature 0.9 --topp 0.9 --prompt "$PROMPT" --weights-float-type q40 --buffer-float-type f32 --nthreads 8 --steps 2048 --model converter/dllama_meta-llama-3-8b_q40.bin --tokenizer converter/dllama_meta-llama3-tokenizer.t ) 2>&1)

echo "$OUTPUT"

2 changes: 1 addition & 1 deletion examples/nodejs-example.cjs
@@ -2,7 +2,7 @@ const { Socket } = require('net');

// Run Distributed Llama server:
//
// `./main simple-server --weights-float-type q40 --buffer-float-type q80 --nthreads 4 --model converter/dllama_meta-llama-3-8b_q40.bin --tokenizer converter/llama3-tokenizer.t --workers 10.0.0.1:9999`
// `./dllama simple-server --weights-float-type q40 --buffer-float-type q80 --nthreads 4 --model converter/dllama_meta-llama-3-8b_q40.bin --tokenizer converter/llama3-tokenizer.t --workers 10.0.0.1:9999`
//
// Then run this script:
//
129 changes: 129 additions & 0 deletions src/app.cpp
@@ -0,0 +1,129 @@
#include <cstring>
#include <cstdlib>
#include <cstdint>
#include <cstdio>
#include <cassert>
#include <stdexcept>
#include <ctime>
#include "app.hpp"

FloatType parseFloatType(char* val) {
if (strcmp(val, "f32") == 0) return F32;
if (strcmp(val, "f16") == 0) return F16;
if (strcmp(val, "q40") == 0) return Q40;
if (strcmp(val, "q80") == 0) return Q80;
printf("Invalid float type %s\n", val);
exit(EXIT_FAILURE);
}

AppArgs AppArgs::parse(int argc, char** argv, bool hasMode) {
AppArgs args;
args.mode = NULL;
args.nThreads = 4;
args.modelPath = NULL;
args.tokenizerPath = NULL;
args.prompt = NULL;
args.weightsFloatType = F32;
args.bufferFloatType = F32;
args.nWorkers = 0;
args.port = 9990;
args.temperature = 0.8f;
args.topp = 0.9f;
args.steps = 0;
args.seed = (unsigned long long)time(NULL);

int i = 1;
if (hasMode && argc > 1) {
args.mode = argv[1];
i++;
}
for (; i + 1 < argc; i += 2) {
if (strcmp(argv[i], "--model") == 0) {
args.modelPath = argv[i + 1];
} else if (strcmp(argv[i], "--tokenizer") == 0) {
args.tokenizerPath = argv[i + 1];
} else if (strcmp(argv[i], "--prompt") == 0) {
args.prompt = argv[i + 1];
} else if (strcmp(argv[i], "--weights-float-type") == 0) {
args.weightsFloatType = parseFloatType(argv[i + 1]);
} else if (strcmp(argv[i], "--buffer-float-type") == 0) {
args.bufferFloatType = parseFloatType(argv[i + 1]);
} else if (strcmp(argv[i], "--workers") == 0) {
int j = i + 1;
for (; j < argc && argv[j][0] != '-'; j++);
int count = j - i - 1;

args.nWorkers = count;
args.workerHosts = new char*[count];
args.workerPorts = new int[count];

for (int s = 0; s < count; s++) {
char* v = argv[i + 1 + s];
char* sep = strstr(v, ":");
if (sep == NULL) {
printf("Invalid address %s\n", v);
exit(EXIT_FAILURE);
}
int hostLen = sep - v;
args.workerHosts[s] = new char[hostLen + 1];
memcpy(args.workerHosts[s], v, hostLen);
args.workerHosts[s][hostLen] = '\0';
args.workerPorts[s] = atoi(sep + 1);
}

i += count - 1;
} else if (strcmp(argv[i], "--port") == 0) {
args.port = atoi(argv[i + 1]);
} else if (strcmp(argv[i], "--nthreads") == 0) {
args.nThreads = atoi(argv[i + 1]);
} else if (strcmp(argv[i], "--steps") == 0) {
args.steps = atoi(argv[i + 1]);
} else if (strcmp(argv[i], "--temperature") == 0) {
args.temperature = atof(argv[i + 1]);
} else if (strcmp(argv[i], "--topp") == 0) {
args.topp = atof(argv[i + 1]);
} else if (strcmp(argv[i], "--seed") == 0) {
args.seed = atoll(argv[i + 1]);
} else {
printf("Unknown option %s\n", argv[i]);
exit(EXIT_FAILURE);
}
}
return args;
}

TransformerArch TransformerArchFactory::create(TransformerSpec* spec) {
if (spec->archType == LLAMA2) return buildLlama2Arch(spec);
if (spec->archType == GROK1) return buildGrok1Arch(spec);
if (spec->archType == MIXTRAL) return buildMixtralArch(spec);
printf("Unsupported arch type: %d\n", spec->archType);
exit(EXIT_FAILURE);
}

void App::run(AppArgs* args, void (*program)(Inference* inference, SocketPool* socketPool, Tokenizer* tokenizer, Sampler* sampler, AppArgs* args, TransformerSpec* spec)) {
if (args->modelPath == NULL) {
throw std::runtime_error("Model is required");
}
if (args->tokenizerPath == NULL) {
throw std::runtime_error("Tokenizer is required");
}

SocketPool* socketPool = SocketPool::connect(args->nWorkers, args->workerHosts, args->workerPorts);
unsigned int nSlices = args->nWorkers + 1;

TransformerSpec spec = Transformer::loadSpecFromFile(args->modelPath, nSlices, args->weightsFloatType, args->bufferFloatType);
TransformerArch arch = TransformerArchFactory::create(&spec);

if (args->steps == 0 || args->steps > spec.seqLen) {
args->steps = spec.seqLen;
}

Tokenizer tokenizer(args->tokenizerPath, spec.vocabSize);
Transformer transformer = Transformer::loadRootFromFile(args->modelPath, &spec, socketPool);
Inference inference = Inference(&arch, args->nThreads, &transformer, socketPool);

Sampler sampler(spec.vocabSize, args->temperature, args->topp, args->seed);

program(&inference, socketPool, &tokenizer, &sampler, args, &spec);

delete socketPool;
}
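A note on the argument parser added above: the `--workers` option consumes every following argument up to the next one that starts with `-`, splitting each value on `:` into a host and a port. Below is a minimal sketch of that behavior (not part of the PR; it assumes `app.hpp` from this change is on the include path, and the model/tokenizer paths are placeholders):

```cpp
#include <cassert>
#include <cstring>
#include "app.hpp"

int main() {
    // Mirrors the README invocation: ./dllama inference ... --workers host:port host:port
    char* argv[] = {
        (char*)"dllama", (char*)"inference",
        (char*)"--model", (char*)"model.bin",        // placeholder path
        (char*)"--tokenizer", (char*)"tokenizer.t",  // placeholder path
        (char*)"--workers", (char*)"10.0.0.2:9998", (char*)"10.0.0.3:9998",
        (char*)"--nthreads", (char*)"4"
    };
    AppArgs args = AppArgs::parse(sizeof(argv) / sizeof(argv[0]), argv, true);

    assert(args.nWorkers == 2);                      // both host:port pairs were collected
    assert(strcmp(args.workerHosts[0], "10.0.0.2") == 0);
    assert(args.workerPorts[1] == 9998);
    assert(args.nThreads == 4);
    return 0;
}
```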
52 changes: 52 additions & 0 deletions src/app.hpp
@@ -0,0 +1,52 @@
#ifndef APP_HPP
#define APP_HPP

#include "quants.hpp"
#include "transformer.hpp"
#include "utils.hpp"
#include "socket.hpp"
#include "app.hpp"
#include "transformer.hpp"
#include "tasks.hpp"
#include "llama2-tasks.hpp"
#include "grok1-tasks.hpp"
#include "mixtral-tasks.hpp"
#include "tokenizer.hpp"

class AppArgs {
public:
char* mode;
int nThreads;

// inference
char* modelPath;
char* tokenizerPath;
char* prompt;
FloatType weightsFloatType;
FloatType bufferFloatType;
int nWorkers;
char** workerHosts;
int* workerPorts;
float temperature;
float topp;
pos_t steps;
bool benchmark;
unsigned long long seed;

// worker
int port;

static AppArgs parse(int argc, char** argv, bool hasMode);
};

class TransformerArchFactory {
public:
static TransformerArch create(TransformerSpec* spec);
};

class App {
public:
static void run(AppArgs* args, void (*program)(Inference* inference, SocketPool* socketPool, Tokenizer* tokenizer, Sampler* sampler, AppArgs* args, TransformerSpec* spec));
};

#endif
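`app.hpp` exposes the small surface the new apps build on: `AppArgs::parse` for the CLI and `App::run`, which constructs the socket pool, transformer, tokenizer, and sampler before handing control to a `program` callback. A minimal, hypothetical callback wired through `App::run` might look like the sketch below (for illustration only; the real entry points are `src/apps/dllama/dllama.cpp` and `src/apps/dllama-api/dllama-api.cpp`):

```cpp
#include <cstdio>
#include "app.hpp"

// Hypothetical callback; the real programs live in src/apps/dllama/dllama.cpp
// and src/apps/dllama-api/dllama-api.cpp.
void demoProgram(Inference* inference, SocketPool* socketPool, Tokenizer* tokenizer,
                 Sampler* sampler, AppArgs* args, TransformerSpec* spec) {
    // Everything is already constructed here; a real app would tokenize
    // args->prompt and drive inference, or start an HTTP server.
    printf("Model loaded: %d worker(s), %d thread(s)\n", args->nWorkers, args->nThreads);
}

int main(int argc, char** argv) {
    // Requires --model and --tokenizer; App::run throws if either is missing.
    AppArgs args = AppArgs::parse(argc, argv, false);
    App::run(&args, demoProgram);
    return 0;
}
```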
11 changes: 11 additions & 0 deletions src/apps/dllama-api/README.md
@@ -0,0 +1,11 @@
# Distributed Llama API

This is an early version of a server compatible with the OpenAI API. It supports only the `/v1/chat/completions` endpoint. Currently it works with Llama 3 8B Instruct only.

How to run?

1. Download the model and the tokenizer from [here](https://huggingface.co/Azamorn/Meta-Llama-3-8B-Instruct-Distributed).
2. Run the server with the following command:
```bash
./dllama-api --model converter/dllama_original_q40.bin --tokenizer converter/dllama-llama3-tokenizer.t --weights-float-type q40 --buffer-float-type q80 --nthreads 4
```
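For a concrete request, see `examples/chat-api-client.js` added in this PR: it posts `messages`, `temperature`, `stop`, and `max_tokens` to `/v1/chat/completions` and reads `choices[0].message.content` from the response.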