Skip to content

Commit dad659e

Browse files
authored
Refactor timeout and batch inference (#20)
* Refactor timeout implementation * Refactor batch inference * Add unittest action
1 parent 84de66f commit dad659e

23 files changed

Lines changed: 16037 additions & 391 deletions

.github/workflows/unittest.yml

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
name: Unittest
2+
on:
3+
push:
4+
branches: [master, main]
5+
permissions:
6+
contents: read
7+
8+
jobs:
9+
unittest:
10+
runs-on: ubuntu-22.04
11+
steps:
12+
- uses: actions/checkout@v4
13+
with:
14+
fetch-depth: 0
15+
- uses: actions/setup-java@v4
16+
with:
17+
distribution: temurin
18+
java-version: 17
19+
cache: sbt
20+
- run: sbt test

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,3 +9,6 @@
99
/src/test/resources/response_0.pb
1010
/.bloop/
1111
/.metals/
12+
/.bsp/
13+
/examples/.ipynb_checkpoints
14+
/examples/__pycache__/

build.sbt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ name := (sys.props.getOrElse("gpu", "false") match {
33
case _ => "ai-serving"
44
})
55

6-
version := "2.1.0"
6+
version := "2.2.0"
77

88
organization := "com.autodeployai"
99

examples/models/xgb-iris.pmml

Lines changed: 7631 additions & 227 deletions
Large diffs are not rendered by default.

src/main/resources/application.conf

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,11 @@ service {
4242
}
4343

4444
home = "/opt/ai-serving"
45+
46+
logging {
47+
request-timing-enabled = false
48+
request-timing-level = "DEBUG" // one of "DEBUG", "INFO", "WARNING", "ERROR"
49+
}
4550
}
4651

4752
onnxruntime {
@@ -53,4 +58,3 @@ onnxruntime {
5358
logger-id = "onnxruntime"
5459
logging-level = 3 // 0: VERBOSE, 1: INFO, 2: WARNING, 3: ERROR, 4: FATAL
5560
}
56-

src/main/resources/logback.xml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,10 @@
1111
<appender-ref ref="STDOUT"/>
1212
</logger>
1313

14+
<logger name="akka" level="${LOG_LEVEL:-INFO}" additivity="false">
15+
<appender-ref ref="STDOUT"/>
16+
</logger>
17+
1418
<logger name="io.netty" level="${LOG_LEVEL:-INFO}" additivity="false">
1519
<appender-ref ref="STDOUT"/>
1620
</logger>

src/main/scala/com/autodeployai/serving/AIServer.scala

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,12 @@ object AIServer extends Endpoints with EndpointsV2 {
3939
if (config.hasPath(defaultFixedPoolSizePath)) {
4040
var numCores = config.getInt(defaultFixedPoolSizePath)
4141
if (numCores == -1) {
42+
val onnxBackend = if (config.hasPath("onnxruntime.backend")) config.getString("onnxruntime.backend").toLowerCase else "cpu"
43+
val onnxThreads = if (config.hasPath("onnxruntime.cpu-num-threads")) config.getInt("onnxruntime.cpu-num-threads") else -1
44+
45+
if (onnxBackend == "cpu" && onnxThreads == -1) {
46+
log.warn("Please reserve sufficient CPU capacity for ONNX Runtime to prevent oversubscription when serving ONNX models on CPU.")
47+
}
4248
numCores = Utils.getNumCores
4349
}
4450
config = config.withValue(defaultFixedPoolSizePath, ConfigValueFactory.fromAnyRef(numCores))

src/main/scala/com/autodeployai/serving/deploy/BatchProcessor.scala

Lines changed: 39 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -21,26 +21,27 @@ import com.autodeployai.serving.utils.{DataUtils, Utils}
2121
import org.slf4j.{Logger, LoggerFactory}
2222

2323
import java.nio.{ByteBuffer, ByteOrder, DoubleBuffer, FloatBuffer, IntBuffer, LongBuffer, ShortBuffer}
24+
import java.util.concurrent.atomic.{AtomicBoolean, AtomicInteger}
2425
import java.util.concurrent.{ConcurrentLinkedQueue, Executors, TimeUnit}
2526
import scala.concurrent.{ExecutionContext, Future, Promise}
2627

2728
trait BatchRequest[Request, Response] {
2829
def request: Request
2930
def promise: Promise[Response]
30-
def options: RunOptions
31+
def options: Option[RunOptions]
3132
def timestamp: Long
3233
}
3334

3435
case class BatchRequestV2(
3536
request: InferenceRequest,
3637
promise: Promise[InferenceResponse],
37-
options: RunOptions,
38+
options: Option[RunOptions],
3839
timestamp: Long = System.currentTimeMillis(),
3940
) extends BatchRequest[InferenceRequest, InferenceResponse]
4041

4142
trait BatchProcessor[Request, Response] extends AutoCloseable {
4243

43-
def predict(request: Request, options: RunOptions): Future[Response]
44+
def predict(request: Request, options: Option[RunOptions]): Future[Response]
4445

4546
def merge(requests: Array[Request]): Request
4647

@@ -53,6 +54,8 @@ class BatchProcessorV2(model: PredictModel,
5354

5455
val log: Logger = LoggerFactory.getLogger(this.getClass)
5556
private val queue = new ConcurrentLinkedQueue[BatchRequestV2]()
57+
private val queueLength = new AtomicInteger(0)
58+
private val processing = new AtomicBoolean(false)
5659
private val enabled: Boolean = maxBatchSize > 1
5760
private val checkInterval: Long = Math.max(maxBatchDelayMs / 2, 1L)
5861

@@ -76,16 +79,15 @@ class BatchProcessorV2(model: PredictModel,
7679

7780
log.info(s"BatchProcessor for model ${model.modelName}:${model.modelVersion} initialized: max-batch-size=$maxBatchSize, max-batch-delay-ms=$maxBatchDelayMs")
7881

79-
override def predict(request: InferenceRequest, options: RunOptions): Future[InferenceResponse] = {
82+
override def predict(request: InferenceRequest, options: Option[RunOptions]): Future[InferenceResponse] = {
8083
if (enabled) {
8184
val promise = Promise[InferenceResponse]()
8285
val batchRequest = BatchRequestV2(request=request, promise=promise, options=options)
8386
queue.offer(batchRequest)
87+
val currentLength = queueLength.incrementAndGet()
8488

85-
if (queue.size() >= maxBatchSize) {
86-
Future{
87-
processBatch()
88-
}
89+
if (currentLength >= maxBatchSize) {
90+
processBatchAsync()
8991
}
9092
promise.future
9193
} else {
@@ -99,18 +101,38 @@ class BatchProcessorV2(model: PredictModel,
99101
if (!queue.isEmpty) {
100102
val oldestRequest = queue.peek()
101103
if (oldestRequest != null && (timestamp - oldestRequest.timestamp) >= maxBatchDelayMs) {
102-
processBatch()
104+
processBatchAsync()
105+
}
106+
}
107+
}
108+
109+
private def processBatchAsync(): Unit = {
110+
if (processing.compareAndSet(false, true)) {
111+
Future {
112+
try {
113+
var keepRunning = true
114+
while (keepRunning) {
115+
val processed = processBatch()
116+
keepRunning = processed && queueLength.get() >= maxBatchSize
117+
}
118+
} finally {
119+
processing.set(false)
120+
if (queueLength.get() >= maxBatchSize) {
121+
processBatchAsync()
122+
}
123+
}
103124
}
104125
}
105126
}
106127

107-
private def processBatch(): Unit = this.synchronized {
128+
private def processBatch(): Boolean = this.synchronized {
108129
val builder = Array.newBuilder[BatchRequestV2]
109130
builder.sizeHint(maxBatchSize)
110131
var exit = false
111132
while (builder.length < maxBatchSize && !exit) {
112133
val item = queue.poll()
113134
if (item != null) {
135+
queueLength.decrementAndGet()
114136
builder += item
115137
} else {
116138
exit = true
@@ -128,7 +150,7 @@ class BatchProcessorV2(model: PredictModel,
128150

129151
val startTime = System.currentTimeMillis()
130152
val batchResponse = model.predict(mergedRequest, options)
131-
log.info(s"Batched ${batch.length} requests elapsed time: ${System.currentTimeMillis() - startTime}")
153+
log.debug(s"Batched ${batch.length} requests elapsed time: ${System.currentTimeMillis() - startTime}")
132154

133155
val recordCounts = requests.map(req => {
134156
if (req.inputs.nonEmpty) {
@@ -147,8 +169,11 @@ class BatchProcessorV2(model: PredictModel,
147169
batch.foreach(_.promise.failure(ex))
148170
} finally {
149171
// Close options of other requests
150-
batch.tail.foreach(x => Utils.safeClose(x.options))
172+
batch.tail.foreach(x => Utils.safeClose(x.options.orNull))
151173
}
174+
true
175+
} else {
176+
false
152177
}
153178
}
154179

@@ -313,7 +338,7 @@ class BatchProcessorV2(model: PredictModel,
313338
scheduler.foreach(s =>{
314339
s.shutdown()
315340
try {
316-
if (s.awaitTermination(5, TimeUnit.SECONDS)) {
341+
if (!s.awaitTermination(5, TimeUnit.SECONDS)) {
317342
s.shutdownNow()
318343
}
319344
} catch {
@@ -323,7 +348,7 @@ class BatchProcessorV2(model: PredictModel,
323348
})
324349

325350
if (!queue.isEmpty) {
326-
processBatch()
351+
while (processBatch()) {}
327352
}
328353
}
329354
}

src/main/scala/com/autodeployai/serving/deploy/InferenceService.scala

Lines changed: 59 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -24,10 +24,11 @@ import com.autodeployai.serving.utils.Utils.toOption
2424
import com.typesafe.config.{Config, ConfigFactory, ConfigRenderOptions}
2525
import org.slf4j.{Logger, LoggerFactory}
2626

27-
import java.util.{Timer, TimerTask}
27+
import java.util.concurrent.{Executors, ScheduledExecutorService, ThreadFactory, TimeUnit}
2828
import scala.collection.concurrent.TrieMap
2929
import scala.collection.mutable.ArrayBuffer
3030
import scala.concurrent.{ExecutionContext, Future, Promise}
31+
import scala.util.Using
3132

3233
/**
3334
* Main entry of models validation, management and deployment.
@@ -49,6 +50,14 @@ object InferenceService extends JsonSupport {
4950

5051
private val repositories: TrieMap[String, ModelRepository] = TrieMap.empty
5152

53+
// Timeout scheduler
54+
private val timeoutScheduler: ScheduledExecutorService = Executors.newSingleThreadScheduledExecutor(
55+
(r: Runnable) => {
56+
val thread = new Thread(r, "ai-serving-timeout-scheduler")
57+
thread.setDaemon(true)
58+
thread
59+
})
60+
5261
// A flag if the inference service is ready to response requests
5362
var isReady = false
5463

@@ -82,16 +91,16 @@ object InferenceService extends JsonSupport {
8291
def loadModels()(implicit ec: ExecutionContext): Unit = {
8392
log.info(s"Loading models under the directory: $modelsPath")
8493

85-
val files = Files.list(modelsPath)
86-
val it = files.iterator()
87-
while (it.hasNext) {
88-
val path = it.next()
89-
if (Files.isDirectory(path)) {
90-
val modelRepository = loadModel(path)
91-
modelRepository.foreach(x => repositories.put(x.modelName, x))
94+
Using(Files.list(modelsPath)) { files =>
95+
val it = files.iterator()
96+
while (it.hasNext) {
97+
val path = it.next()
98+
if (Files.isDirectory(path)) {
99+
val modelRepository = loadModel(path)
100+
modelRepository.foreach(x => repositories.put(x.modelName, x))
101+
}
92102
}
93103
}
94-
95104
isReady = true
96105
}
97106

@@ -204,7 +213,7 @@ object InferenceService extends JsonSupport {
204213
model.predict(request, runOptions)
205214
}
206215

207-
withTimeout(futureResult, modelName, modelVersion, runOptions)
216+
withTimeout(futureResult, model, runOptions)
208217
}
209218

210219
/**
@@ -229,7 +238,7 @@ object InferenceService extends JsonSupport {
229238
}
230239
}
231240

232-
withTimeout(futureResult, modelName, modelVersion, runOptions)
241+
withTimeout(futureResult, model, runOptions)
233242
}
234243

235244
/**
@@ -519,26 +528,27 @@ object InferenceService extends JsonSupport {
519528
val modelRepository = new ModelRepository(modelName, modelConfig)
520529

521530
val versions = ArrayBuffer.empty[String]
522-
val files = Files.list(modelPath)
523-
val it = files.iterator()
524-
while (it.hasNext) {
525-
val path = it.next()
526-
if (Files.isDirectory(path)) {
527-
val modelVersion = path.getFileName.toString
528-
versions += modelVersion
529-
val (modelObjectPath, modelType) = getModelObjectPath(path)
530-
if (modelObjectPath != null) {
531-
try {
532-
log.info(s"Loading model: $modelName with the version $modelVersion")
533-
534-
// Load the version config
535-
val versionConfig = getModelConfig(path)
536-
537-
val model = PredictModel.load(modelObjectPath, modelType, modelName, modelVersion, versionConfig.orElse(modelConfig))
538-
modelRepository.put(modelVersion, model)
539-
} catch {
540-
case e: Exception =>
541-
log.error(s"Failed to load model $modelName with the version $modelVersion caused: $e")
531+
Using(Files.list(modelPath)) { files =>
532+
val it = files.iterator()
533+
while (it.hasNext) {
534+
val path = it.next()
535+
if (Files.isDirectory(path)) {
536+
val modelVersion = path.getFileName.toString
537+
versions += modelVersion
538+
val (modelObjectPath, modelType) = getModelObjectPath(path)
539+
if (modelObjectPath != null) {
540+
try {
541+
log.info(s"Loading model: $modelName with the version $modelVersion")
542+
543+
// Load the version config
544+
val versionConfig = getModelConfig(path)
545+
546+
val model = PredictModel.load(modelObjectPath, modelType, modelName, modelVersion, versionConfig.orElse(modelConfig))
547+
modelRepository.put(modelVersion, model)
548+
} catch {
549+
case e: Exception =>
550+
log.error(s"Failed to load model $modelName with the version $modelVersion caused: $e")
551+
}
542552
}
543553
}
544554
}
@@ -590,29 +600,27 @@ object InferenceService extends JsonSupport {
590600
* @tparam T
591601
* @return
592602
*/
593-
private def withTimeout[T](future: Future[T], modelName: String, modelVersion: Option[String], runOptions: RunOptions)(implicit ec: ExecutionContext): Future[T] = {
594-
val repository = repositories.get(modelName)
595-
repository.flatMap(_.getTimeoutDuration(modelVersion)) match {
596-
case Some(timeout) =>
597-
val promise = Promise[T]()
598-
val task = new TimerTask {
603+
private def withTimeout[T](future: Future[T], model: PredictModel, runOptions: Option[RunOptions])(implicit ec: ExecutionContext): Future[T] = {
604+
val timeout = model.timeout
605+
if (timeout > 0) {
606+
val promise = Promise[T]()
607+
val timeoutFuture = timeoutScheduler.schedule(
608+
new Runnable {
599609
override def run(): Unit = {
600610
if (!promise.isCompleted) {
601-
runOptions.terminate()
602-
promise.tryFailure(InferTimeoutException(modelName, modelVersion, timeout.toMillis))
611+
runOptions.foreach(_.terminate())
612+
promise.tryFailure(InferTimeoutException(model.modelName, Option(model.modelVersion), timeout))
603613
}
604614
}
605-
}
606-
val timer = new Timer(true)
607-
timer.schedule(task, timeout.toMillis)
608-
future.onComplete { result =>
609-
timer.cancel()
610-
promise.tryComplete(result)
611-
}
612-
promise.future
613-
case _ => future
614-
}
615+
},
616+
timeout,
617+
TimeUnit.MILLISECONDS
618+
)
619+
future.onComplete { result =>
620+
timeoutFuture.cancel(false)
621+
promise.tryComplete(result)
622+
}
623+
promise.future
624+
} else future
615625
}
616-
617626
}
618-

0 commit comments

Comments (0)