Merge pull request #1937 from orionpapadakis/fix/gpu-resources-utilization

geoand · web-flow · commit ab2b900e302b · 2025-11-13T16:17:28.000+02:00
Fix GPU resources utilization
diff --git a/docs/modules/ROOT/pages/gpullama3-chat-model.adoc b/docs/modules/ROOT/pages/gpullama3-chat-model.adoc
@@ -29,8 +29,7 @@ The above steps:
 
 - Set the `TORNADOVM_SDK` environment variable to the TornadoVM SDK path.
 - Create a `tornado-argfile` under `~/TornadoVM` containing the JVM arguments required to enable TornadoVM.
-- The `tornado-argfile` is automatically used in Quarkus *dev mode*.
-- For *production mode*, you must manually pass the argfile to the JVM (see step 3).
+- ⚠️ The `tornado-argfile` needs to be passed to the JVM when *building* and *running* the Quarkus application (see the _Building & Running the Quarkus Application_ section below).
 
 == Using GPULlama3.java
 
@@ -130,6 +129,66 @@ quarkus.langchain4j.gpu-llama3.chat-model.max-tokens=1024
 
 Model files are automatically downloaded from https://huggingface.co/beehive-lab[Beehive Lab HuggingFace] if not available locally.
 
+== Building & Running the Quarkus Application
+
+=== Dev Mode
+
+To run your Quarkus application in **dev mode** with TornadoVM:
+
+1. Ensure your `pom.xml` contains the `quarkus-langchain4j-gpu-llama3` dependency (shown earlier).
+
+2. Add the TornadoVM argfile as a Maven property:
+
+[source,xml]
+----
+<properties>
+    <tornado.argfile>/path/to/tornado-argfile</tornado.argfile>
+</properties>
+----
+
+3. Pass the argfile to the JVM in the plugin configuration for dev mode:
+
+[source,xml]
+----
+<plugin>
+    <groupId>io.quarkus</groupId>
+    <artifactId>quarkus-maven-plugin</artifactId>
+    <configuration>
+        <jvmArgs>@${tornado.argfile}</jvmArgs>
+    </configuration>
+</plugin>
+----
+
+4. Launch dev mode explicitly:
+
+[source,shell]
+----
+mvn quarkus:dev
+----
+
+---
+
+=== Production Mode
+
+To build and run your application in **production mode**:
+
+1. Build the Quarkus application:
+
+[source,shell]
+----
+mvn clean package
+----
+
+2. Run the generated jar with the TornadoVM argfile:
+
+[source,shell]
+----
+java @/path/to/tornado-argfile -jar target/quarkus-app/quarkus-run.jar
+----
+
+⚠️ **Important:** Ensure the `TORNADOVM_SDK` environment variable is set and the `tornado-argfile` path is correct.
+
+
 == Supported Models and Quantizations
 
 The following models have been tested with GPULlama3.java and can be found in link:https://huggingface.co/beehive-lab/collections[Beehive Lab's HuggingFace Collections].
diff --git a/model-providers/gpu-llama3/runtime/src/main/java/io/quarkiverse/langchain4j/gpullama3/GPULlama3ChatModel.java b/model-providers/gpu-llama3/runtime/src/main/java/io/quarkiverse/langchain4j/gpullama3/GPULlama3ChatModel.java
@@ -7,6 +7,8 @@
 import java.nio.file.Path;
 import java.util.Optional;
 
+import org.jboss.logging.Logger;
+
 import dev.langchain4j.data.message.AiMessage;
 import dev.langchain4j.internal.ChatRequestValidationUtils;
 import dev.langchain4j.model.chat.ChatModel;
@@ -16,19 +18,83 @@
 
 public class GPULlama3ChatModel extends GPULlama3BaseModel implements ChatModel {
 
-    // @formatter:off
+    private static final Logger LOG = Logger.getLogger(GPULlama3ChatModel.class);
+
+    private final Builder builderConfig;
+    private volatile boolean initialized = false;
+
+    /**
+     * Creates a model on the non-lazy path by delegating with {@code lazy = false}.
+     *
+     * @param builder the builder used to configure the model.
+     */
     private GPULlama3ChatModel(Builder builder) {
+        this(builder, false);
+    }
+
+    /**
+     * Creates the model, either deferring initialization or performing it right away.
+     *
+     * @param builder the builder holding the model configuration.
+     * @param lazy when true, the builder is kept and initialization is postponed until
+     *        {@code ensureInitialized()} runs; when false, the model is initialized inside
+     *        this constructor and no builder reference is retained.
+     */
+    private GPULlama3ChatModel(Builder builder, boolean lazy) {
+        // Only the lazy path needs to remember the builder for later.
+        this.builderConfig = lazy ? builder : null;
+        if (!lazy) {
+            // original immediate initialization
+            doInitialization(builder);
+        }
+    }
+
+    /**
+     * Factory method for a lazily initialized model: the builder is stored and no initialization runs here.
+     *
+     * @param builder the builder used to configure the model.
+     * @return the model, constructed with {@code lazy = true}.
+     */
+    public static GPULlama3ChatModel createLazy(Builder builder) {
+        return new GPULlama3ChatModel(builder, true);
+    }
+
+    /**
+     * Ensures that a lazily created model is initialized before it is used.
+     * <p>
+     * Does nothing when no builder was retained ({@code builderConfig} is null, i.e. the
+     * non-lazy path) or when initialization has already completed. The {@code initialized}
+     * flag is re-checked while holding the instance lock so that concurrent first calls
+     * run {@code doInitialization} only once.
+     */
+    private void ensureInitialized() {
+        // Fast path: volatile read only, no locking once initialized.
+        if (initialized || builderConfig == null) {
+            return;
+        }
+        synchronized (this) {
+            // Second check: another thread may have finished initialization while we waited.
+            if (!initialized) {
+                doInitialization(builderConfig);
+                initialized = true;
+            }
+        }
+    }
+
+    // @formatter:off
+    /**
+     * Performs the actual initialization.
+     */
+    private void doInitialization(Builder builder) {
         GPULlama3ModelRegistry gpuLlama3ModelRegistry = GPULlama3ModelRegistry.getOrCreate(builder.modelCachePath);
         try {
             Path modelPath = gpuLlama3ModelRegistry.downloadModel(builder.modelName, builder.quantization,
                     Optional.empty(), Optional.empty());
-            init(
-                    modelPath,
-                    getOrDefault(builder.temperature, 0.1),
-                    getOrDefault(builder.topP, 1.0),
-                    getOrDefault(builder.seed, 12345),
-                    getOrDefault(builder.maxTokens, 512),
-                    getOrDefault(builder.onGPU, Boolean.TRUE));
+            Double temp = getOrDefault(builder.temperature, 0.1);
+            Double topP = getOrDefault(builder.topP, 1.0);
+            Integer seed = getOrDefault(builder.seed, 12345);
+            Integer maxTokens = getOrDefault(builder.maxTokens, 512);
+            Boolean onGPU = getOrDefault(builder.onGPU, Boolean.TRUE);
+
+            LOG.info("GPULlama3ChatModel Instantiation {modelPath=" + modelPath +
+                    ", temperature=" + temp +
+                    ", topP=" + topP +
+                    ", seed=" + seed +
+                    ", maxTokens=" + maxTokens +
+                    ", onGPU=" + onGPU + "}...");
+
+            init(modelPath, temp, topP, seed, maxTokens, onGPU);
+            LOG.info("GPULlama3ChatModel Instantiation Complete!");
         } catch (IOException e) {
             throw new UncheckedIOException(e);
         } catch (InterruptedException e) {
@@ -43,6 +109,8 @@ public static Builder builder() {
 
     @Override
     public ChatResponse doChat(ChatRequest chatRequest) {
+        ensureInitialized(); // If in lazy path, init model
+
         ChatRequestValidationUtils.validateMessages(chatRequest.messages());
         ChatRequestParameters parameters = chatRequest.parameters();
         ChatRequestValidationUtils.validateParameters(parameters);
diff --git a/model-providers/gpu-llama3/runtime/src/main/java/io/quarkiverse/langchain4j/gpullama3/GPULlama3StreamingChatModel.java b/model-providers/gpu-llama3/runtime/src/main/java/io/quarkiverse/langchain4j/gpullama3/GPULlama3StreamingChatModel.java
@@ -7,8 +7,6 @@
 import java.io.UncheckedIOException;
 import java.nio.file.Path;
 import java.util.Optional;
-import java.util.concurrent.CompletableFuture;
-import java.util.concurrent.atomic.AtomicBoolean;
 
 import org.jboss.logging.Logger;
 
@@ -50,20 +48,68 @@ public class GPULlama3StreamingChatModel extends GPULlama3BaseModel implements S
 
     private static final Logger LOG = Logger.getLogger(GPULlama3StreamingChatModel.class);
 
-    // Fields to track initialization state
-    private final CompletableFuture<Void> initializationFuture = new CompletableFuture<>();
-    private final AtomicBoolean initialized = new AtomicBoolean(false);
+    private final Builder builderConfig;
+    private volatile boolean initialized = false;
+
+    /**
+     * Creates the streaming model, either deferring initialization or scheduling it right away.
+     *
+     * @param builder the builder holding the model configuration.
+     * @param lazy when true, the builder is kept and nothing is initialized here; when false,
+     *        initialization is handed to {@code runOutEventLoop} and {@code initialized} is
+     *        set once it finishes.
+     */
+    private GPULlama3StreamingChatModel(Builder builder, boolean lazy) {
+        // Only the lazy path needs to remember the builder for later.
+        this.builderConfig = lazy ? builder : null;
+        if (lazy) {
+            return; // Don't initialize yet!
+        }
+        // Original background initialization
+        runOutEventLoop(() -> {
+            LOG.debug("Starting GPULlama3 StreamingChatModel initialization on worker thread");
+            doInitialization(builder);
+            initialized = true;
+        });
+    }
 
     private GPULlama3StreamingChatModel(Builder builder) {
-        // Schedule the initialization to happen on a background thread
-        runOutEventLoop(() -> {
-            LOG.debug("Starting GPULlama3 model initialization on worker thread");
-            coreInit(builder);
-        });
+        this(builder, false); // Non-lazy path: the delegated constructor schedules initialization via runOutEventLoop
     }
 
-    public static Builder builder() {
-        return new Builder();
+    // Factory method for lazy initialization: stores the builder and defers initialization until ensureInitialized() runs
+    public static GPULlama3StreamingChatModel createLazy(Builder builder) {
+        return new GPULlama3StreamingChatModel(builder, true);
+    }
+
+    /**
+     * Ensures that a lazily created model is initialized before it is used.
+     * <p>
+     * Does nothing when no builder was retained ({@code builderConfig} is null) or when
+     * initialization has already completed. The {@code initialized} flag is re-checked while
+     * holding the instance lock so that concurrent first calls run {@code doInitialization}
+     * only once.
+     * <p>
+     * NOTE(review): on the non-lazy path {@code builderConfig} is null, so this method returns
+     * immediately and does not wait for the initialization scheduled by the constructor.
+     */
+    private void ensureInitialized() {
+        // Fast path: volatile read only, no locking once initialized.
+        if (initialized || builderConfig == null) {
+            return;
+        }
+        synchronized (this) {
+            // Second check: another thread may have finished initialization while we waited.
+            if (!initialized) {
+                LOG.debug("Lazy initialization of GPULlama3StreamingChatModel");
+                doInitialization(builderConfig);
+                initialized = true;
+            }
+        }
+    }
+
+    /**
+     * Performs the actual initialization: obtains the model path from the registry and
+     * initializes the model with the configured values, falling back to the defaults below
+     * for any value the builder does not supply.
+     *
+     * @param builder the builder holding the model configuration.
+     * @throws UncheckedIOException if an {@link IOException} is raised while obtaining or loading the model.
+     * @throws RuntimeException if the thread is interrupted; the interrupt status is restored first.
+     */
+    private void doInitialization(Builder builder) {
+        GPULlama3ModelRegistry gpuLlama3ModelRegistry = GPULlama3ModelRegistry.getOrCreate(builder.modelCachePath);
+        try {
+            Path modelPath = gpuLlama3ModelRegistry.downloadModel(builder.modelName, builder.quantization,
+                    Optional.empty(), Optional.empty());
+            Double temp = getOrDefault(builder.temperature, 0.1);
+            Double topP = getOrDefault(builder.topP, 1.0);
+            Integer seed = getOrDefault(builder.seed, 12345);
+            Integer maxTokens = getOrDefault(builder.maxTokens, 512);
+            Boolean onGPU = getOrDefault(builder.onGPU, Boolean.TRUE);
+
+            LOG.info("GPULlama3StreamingChatModel Instantiation {modelPath=" + modelPath +
+                    ", temperature=" + temp +
+                    ", topP=" + topP +
+                    ", seed=" + seed +
+                    ", maxTokens=" + maxTokens +
+                    ", onGPU=" + onGPU + "}...");
+
+            init(modelPath, temp, topP, seed, maxTokens, onGPU);
+            LOG.info("GPULlama3StreamingChatModel Instantiation Complete!");
+        } catch (IOException e) {
+            throw new UncheckedIOException(e);
+        } catch (InterruptedException e) {
+            // Restore the interrupt status so code further up the stack can still observe the interruption.
+            Thread.currentThread().interrupt();
+            throw new RuntimeException(e);
+        }
     }
 
     @Override
@@ -75,59 +121,18 @@ public void doChat(ChatRequest chatRequest, StreamingChatResponseHandler handler
         ChatRequestValidationUtils.validate(parameters.responseFormat());
 
         // Run the GPU operations on a worker thread using runOutEventLoop
-        runOutEventLoop(new Runnable() {
-            @Override
-            public void run() {
-                // Wait for initialization to complete if it hasn't yet
-                if (!initialized.get()) {
-                    LOG.debug("Waiting for model initialization to complete");
-                    try {
-                        initializationFuture.get();
-                    } catch (Exception e) {
-                        LOG.error("Failed to initialize model", e);
-                        handler.onError(e);
-                        return;
-                    }
-                }
+        runOutEventLoop(() -> {
+            try {
+                ensureInitialized(); // Build happens HERE on first call!
                 LOG.debug("Executing GPU Llama inference on worker thread");
                 coreDoChat(chatRequest, handler);
-                LOG.debug("GPULlama3 model initialization completed");
+            } catch (Exception e) {
+                LOG.error("Failed during lazy initialization or inference", e);
+                handler.onError(e);
             }
         });
     }
 
-    /**
-     * The actual initialization logic.
-     * It is called by a worker thread in a non-blocking manner.
-     */
-    private void coreInit(Builder builder) {
-        GPULlama3ModelRegistry gpuLlama3ModelRegistry = GPULlama3ModelRegistry.getOrCreate(builder.modelCachePath);
-        try {
-            Path modelPath = gpuLlama3ModelRegistry.downloadModel(builder.modelName, builder.quantization,
-                    Optional.empty(), Optional.empty());
-            init(
-                    modelPath,
-                    getOrDefault(builder.temperature, 0.1),
-                    getOrDefault(builder.topP, 1.0),
-                    getOrDefault(builder.seed, 12345),
-                    getOrDefault(builder.maxTokens, 512),
-                    getOrDefault(builder.onGPU, Boolean.TRUE));
-
-            // Mark initialization as complete
-            initialized.set(true);
-            initializationFuture.complete(null);
-        } catch (IOException e) {
-            initializationFuture.completeExceptionally(new UncheckedIOException(e));
-            throw new UncheckedIOException(e);
-        } catch (InterruptedException e) {
-            initializationFuture.completeExceptionally(e);
-            throw new RuntimeException(e);
-        } catch (Exception e) {
-            initializationFuture.completeExceptionally(e);
-            throw e;
-        }
-    }
-
     /**
      * The actual doChat logic.
      * It is called by a worker thread in a non-blocking manner.
@@ -152,11 +157,15 @@ private void coreDoChat(ChatRequest chatRequest, StreamingChatResponseHandler ha
 
             handler.onCompleteResponse(chatResponse);
         } catch (Exception e) {
-            LOG.error("Error in GPULlama3 asyncDoChat", e);
+            LOG.error("Error in GPULlama3 coreDoChat", e);
             handler.onError(e);
         }
     }
 
+    public static Builder builder() {
+        return new Builder(); // a fresh Builder on every call
+    }
+
     public static class Builder {
 
         private Optional<Path> modelCachePath;
diff --git a/model-providers/gpu-llama3/runtime/src/main/java/io/quarkiverse/langchain4j/gpullama3/runtime/GPULlama3Recorder.java b/model-providers/gpu-llama3/runtime/src/main/java/io/quarkiverse/langchain4j/gpullama3/runtime/GPULlama3Recorder.java