diff --git a/expose.h b/expose.h index 1ae4fad0692..b6511de09fd 100644 --- a/expose.h +++ b/expose.h @@ -190,7 +190,7 @@ struct sd_load_model_inputs const char * clip2_filename = nullptr; const char * vae_filename = nullptr; const char * lora_filenames[lora_filenames_max] = {}; - const float lora_multiplier = 1.0f; + const float lora_multiplier[lora_filenames_max] = {}; const int lora_apply_mode = 0; const char * photomaker_filename = nullptr; const char * upscaler_filename = nullptr; @@ -226,6 +226,7 @@ struct sd_generation_inputs const bool circular_x = false; const bool circular_y = false; const bool upscale = false; + const float lora_multipliers[lora_filenames_max] = {}; }; struct sd_generation_outputs { diff --git a/koboldcpp.py b/koboldcpp.py index 8b31b09e5b8..7960060a7a2 100755 --- a/koboldcpp.py +++ b/koboldcpp.py @@ -88,6 +88,7 @@ ttsmodelpath = "" #if empty, not initialized embeddingsmodelpath = "" #if empty, not initialized musicdiffusionmodelpath = "" #if empty, not initialized +imglorainfo = [] maxctx = 8192 maxhordectx = 0 #set to whatever maxctx is if 0 maxhordelen = 1024 @@ -320,7 +321,7 @@ class sd_load_model_inputs(ctypes.Structure): ("clip2_filename", ctypes.c_char_p), ("vae_filename", ctypes.c_char_p), ("lora_filenames", ctypes.c_char_p * lora_filenames_max), - ("lora_multiplier", ctypes.c_float), + ("lora_multipliers", ctypes.c_float * lora_filenames_max), ("lora_apply_mode", ctypes.c_int), ("photomaker_filename", ctypes.c_char_p), ("upscaler_filename", ctypes.c_char_p), @@ -354,7 +355,8 @@ class sd_generation_inputs(ctypes.Structure): ("remove_limits", ctypes.c_bool), ("circular_x", ctypes.c_bool), ("circular_y", ctypes.c_bool), - ("upscale", ctypes.c_bool)] + ("upscale", ctypes.c_bool), + ("lora_multipliers", ctypes.c_float * lora_filenames_max)] class sd_generation_outputs(ctypes.Structure): _fields_ = [("status", ctypes.c_int), @@ -1995,8 +1997,11 @@ def sd_load_model(model_filename,vae_filename,lora_filenames,t5xxl_filename,clip 
inputs.lora_filenames[n] = "".encode("UTF-8") else: inputs.lora_filenames[n] = lora_filenames[n].encode("UTF-8") + if not args.sdloramult or n >= len(args.sdloramult): + inputs.lora_multipliers[n] = 0. + else: + inputs.lora_multipliers[n] = args.sdloramult[n] - inputs.lora_multiplier = args.sdloramult inputs.t5xxl_filename = t5xxl_filename.encode("UTF-8") inputs.clip1_filename = clip1_filename.encode("UTF-8") inputs.clip2_filename = clip2_filename.encode("UTF-8") @@ -2199,6 +2204,16 @@ def sd_generate(genparams): inputs.circular_x = tryparseint(adapter_obj.get("circular_x", genparams.get("circular_x",0)),0) inputs.circular_y = tryparseint(adapter_obj.get("circular_y", genparams.get("circular_y",0)),0) inputs.upscale = (True if tryparseint(genparams.get("enable_hr", 0),0) else False) + + lora_multipliers = prepare_lora_multipliers(genparams.get("lora", [])) + for i in range(lora_filenames_max): + if i < len(lora_multipliers): + inputs.lora_multipliers[i] = lora_multipliers[i] + else: + inputs.lora_multipliers[i] = 0. 
+ print('lora multipliers Python:', lora_multipliers) + print('lora multipliers C:', [x for x in inputs.lora_multipliers]) + ret = handle.sd_generate(inputs) data_main = "" data_extra = "" @@ -4072,6 +4087,9 @@ def do_GET(self): elif clean_path.endswith('/v1/models') or clean_path=='/models': response_body = (json.dumps({"object":"list","data":[{"id":friendlymodelname,"object":"model","created":int(time.time()),"owned_by":"koboldcpp","permission":[],"root":"koboldcpp"}]}).encode()) + elif clean_path.endswith('/sdapi/v1/loras'): + response_body = (json.dumps([{'name': name, 'path': path} for _, name, path, multiplier in imglorainfo if multiplier == 0.])).encode() + elif clean_path.endswith('/sdapi/v1/upscalers'): if args.sdupscaler: response_body = (json.dumps([{"name":"ESRGAN_4x","model_name":"ESRGAN_4x","model_path":"upscaler_model.gguf","model_url":None,"scale":4}]).encode()) @@ -5068,6 +5086,12 @@ def do_POST(self): genparams = sd_comfyui_tranform_params(genparams) elif is_oai_imggen: genparams = sd_oai_tranform_params(genparams) + if not genparams.get('lora'): + # XXX hack to help testing with the stable-ui + prompt, loras = extract_loras_from_prompt(genparams['prompt']) + if loras: + genparams['prompt'] = prompt + genparams['lora'] = loras gen = sd_generate(genparams) gendat = gen["data"] genanim = gen["animated"] @@ -6897,9 +6921,10 @@ def export_vars(): args.sdquant = sd_quant_option(sd_quant_var.get()) if sd_lora_var.get() != "": args.sdlora = [item.strip() for item in sd_lora_var.get().split("|") if item] - args.sdloramult = float(sd_loramult_var.get()) + args.sdloramult = [float(item) for item in sd_loramult_var.get().split("|") if item] else: args.sdlora = None + args.sdloramult = None if gen_defaults_var.get() != "": args.gendefaults = gen_defaults_var.get() @@ -7158,7 +7183,13 @@ def import_vars(dict): sd_lora_var.set(dict["sdlora"] if ("sdlora" in dict and dict["sdlora"]) else "") else: sd_lora_var.set("") - sd_loramult_var.set(str(dict["sdloramult"]) 
if ("sdloramult" in dict and dict["sdloramult"]) else "1.0") + if "sdloramult" in dict and dict["sdloramult"]: + sdloramult = dict["sdloramult"] + if not isinstance(sdloramult, list): + sdloramult = [sdloramult] + sd_loramult_var.set("|".join([str(tryparsefloat(n, 0.)) for n in sdloramult])) + else: + sd_loramult_var.set("") gen_defaults_var.set(dict["gendefaults"] if ("gendefaults" in dict and dict["gendefaults"]) else "") gen_defaults_overwrite_var.set(1 if "gendefaultsoverwrite" in dict and dict["gendefaultsoverwrite"] else 0) @@ -7602,6 +7633,8 @@ def convert_invalid_args(args): dict["noflashattention"] = not dict["flashattention"] if "sdlora" in dict and isinstance(dict["sdlora"], str): dict["sdlora"] = ([dict["sdlora"]] if dict["sdlora"] else None) + if "sdloramult" in dict and isinstance(dict["sdloramult"], float): + dict["sdloramult"] = ([dict["sdloramult"]] if dict["sdloramult"] else None) return args def setuptunnel(global_memory, has_sd): @@ -8280,6 +8313,88 @@ def main(launch_args, default_args): print("Press ENTER key to exit.", flush=True) input() + +def mk_lora_info(imgloras, multipliers): + # (full path, name, name+extension, can change multiplier) + # XXX for each LoRA, sdapi needs a name and a path; we could use + # the full filename as a path, but we don't know if we can expose it + used_lora_names = set() + result = [] + for i, lora_path in enumerate(imgloras): + multiplier = 0. 
if not multipliers or i >= len(multipliers) else multipliers[i] + lora_file = os.path.basename(lora_path) + lora_name, lora_ext = os.path.splitext(lora_file) + # ensure unique names + i = 1 + mapped_name = lora_name + while True: + if mapped_name not in used_lora_names: + result.append((lora_path, mapped_name, mapped_name + lora_ext, multiplier)) + used_lora_names.add(mapped_name) + break + i += 1 + mapped_name = lora_name + '_' + str(i) + return result + +def extract_loras_from_prompt(prompt): + + pattern = r'<lora:([^>]+):([^>]+)>' + lora_data = [] + + matches = list(re.finditer(pattern, prompt)) + + for match in matches: + raw_path = match.group(1) + raw_mul = match.group(2) + try: + mul = float(raw_mul) + except ValueError: + continue + + is_high_noise = False + prefix = "|high_noise|" + if raw_path.startswith(prefix): + raw_path = raw_path[len(prefix):] + is_high_noise = True + + lora_data.append({ + 'name': raw_path, + 'multiplier': mul, + 'is_high_noise': is_high_noise, + }) + + prompt = prompt.replace(match.group(0), "", 1) + + return prompt, lora_data + + +def prepare_lora_multipliers(request_list): + result = [0.] * len(imglorainfo) + map_lora_path = {} + map_lora_name = {} + for i, (fullpath, name, path, origmul) in enumerate(imglorainfo): + map_lora_path[path] = i + map_lora_name[name] = i + if not isinstance(request_list, list): + request_list = [request_list] + for r in request_list: + if not isinstance(r, dict): + continue + multiplier = tryparsefloat(r.get('multiplier'), None) + if multiplier is None: + continue + idx = None + if 'path' in r: + idx = map_lora_path.get(r['path']) + elif 'name' in r: + # XXX this is convenient for the prompt, + # but is the API supposed to support it? 
+ idx = map_lora_name.get(r['name']) + if idx is not None: + result[idx] += multiplier + return result + + def kcpp_main_process(launch_args, g_memory=None, gui_launcher=False): global embedded_kailite, embedded_kcpp_docs, embedded_kcpp_sdui, embedded_kailite_gz, embedded_kcpp_docs_gz, embedded_kcpp_sdui_gz, embedded_lcpp_ui_gz, embedded_musicui, embedded_musicui_gz, start_time, exitcounter, global_memory, using_gui_launcher global libname, args, friendlymodelname, friendlysdmodelname, fullsdmodelpath, password, fullwhispermodelpath, ttsmodelpath, embeddingsmodelpath, musicdiffusionmodelpath, friendlyembeddingsmodelname, has_audio_support, has_vision_support, cached_chat_template @@ -8725,6 +8840,8 @@ def kcpp_main_process(launch_args, g_memory=None, gui_launcher=False): imgloras.append(os.path.abspath(curr)) else: print(f"Missing SD LORA model file {curr}...") + global imglorainfo + imglorainfo = mk_lora_info(imgloras, args.sdloramult) if args.sdvae: if os.path.exists(args.sdvae): imgvae = os.path.abspath(args.sdvae) @@ -9291,7 +9408,7 @@ def range_checker(arg: str): sdparsergrouplora = sdparsergroup.add_mutually_exclusive_group() sdparsergrouplora.add_argument("--sdquant", metavar=('[quantization level 0/1/2]'), help="If specified, loads the model quantized to save memory. 0=off, 1=q8, 2=q4", type=int, choices=[0,1,2], nargs="?", const=2, default=0) sdparsergrouplora.add_argument("--sdlora", metavar=('[filename]'), help="Specify image generation LoRAs safetensors models to be applied. 
Multiple LoRAs are accepted.", nargs='+') - sdparsergroup.add_argument("--sdloramult", metavar=('[amount]'), help="Multiplier for the image LoRA model to be applied.", type=float, default=1.0) + sdparsergroup.add_argument("--sdloramult", metavar=('[amount]'), help="Multiplier for the image LoRA model to be applied.", type=float, nargs='+', default=[1.0]) sdparsergroup.add_argument("--sdtiledvae", metavar=('[maxres]'), help="Adjust the automatic VAE tiling trigger for images above this size. 0 disables vae tiling.", type=int, default=default_vae_tile_threshold) whisperparsergroup = parser.add_argument_group('Whisper Transcription Commands') whisperparsergroup.add_argument("--whispermodel", metavar=('[filename]'), help="Specify a Whisper .bin model to enable Speech-To-Text transcription.", default="") diff --git a/otherarch/sdcpp/sdtype_adapter.cpp b/otherarch/sdcpp/sdtype_adapter.cpp index a65811a6cab..914cdee3630 100644 --- a/otherarch/sdcpp/sdtype_adapter.cpp +++ b/otherarch/sdcpp/sdtype_adapter.cpp @@ -79,8 +79,8 @@ struct SDParams { bool chroma_use_dit_mask = true; std::vector<std::string> lora_paths; - std::vector<sd_lora_t> lora_specs; - uint32_t lora_count; + + float lora_multipliers[lora_filenames_max] = {}; }; //shared @@ -380,6 +380,17 @@ bool sdtype_load_model(const sd_load_model_inputs inputs) { params.chroma_use_dit_mask = false; } + // check initial LoRA multipliers + for(size_t i=0;i<sd_params->lora_paths.size();++i) + { + float multiplier = inputs.lora_multiplier[i]; + if (params.lora_apply_mode != LORA_APPLY_AT_RUNTIME && multiplier == 0.) 
{ + printf("\nForcing LoRA at_runtime mode to allow runtime customization\n"); + params.lora_apply_mode = LORA_APPLY_AT_RUNTIME; + } + sd_params->lora_multipliers[i] = multiplier; + } + if(inputs.debugmode==1) { char* buf = sd_ctx_params_to_str(&params); @@ -416,21 +427,20 @@ bool sdtype_load_model(const sd_load_model_inputs inputs) { std::filesystem::path mpath(inputs.model_filename); sdmodelfilename = mpath.filename().string(); - sd_params->lora_specs.clear(); - sd_params->lora_specs.reserve(lora_filenames_max*2); + // preload the LoRAs with the initial multipliers + std::vector<sd_lora_t> lora_specs; for(int i=0;i<sd_params->lora_paths.size();++i) { sd_lora_t spec = {}; spec.path = sd_params->lora_paths[i].c_str(); - spec.multiplier = inputs.lora_multiplier; - sd_params->lora_specs.push_back(spec); + spec.multiplier = sd_params->lora_multipliers[i]; + lora_specs.push_back(spec); } - if(sd_params->lora_specs.size()>0 && inputs.lora_multiplier>0) + if(lora_specs.size()>0) { - printf("\nApply %d LoRAs...\n",sd_params->lora_specs.size()); - sd_params->lora_count = sd_params->lora_specs.size(); - sd_ctx->sd->apply_loras(sd_params->lora_specs.data(), sd_params->lora_count); + printf("\nApply %d LoRAs...\n", (int)lora_specs.size()); + sd_ctx->sd->apply_loras(lora_specs.data(), lora_specs.size()); } input_extraimage_buffers.reserve(max_extra_images); @@ -1027,10 +1037,25 @@ sd_generation_outputs sdtype_generate(const sd_generation_inputs inputs) params.vae_tiling_params.enabled = dotile; params.batch_count = 1; - // needs to be "reapplied" because sdcpp tracks previously applied LoRAs - // and weights, and apply/unapply the differences at each gen - params.loras = sd_params->lora_specs.data(); - params.lora_count = sd_params->lora_count; + // prepare the LoRA multipliers - similar to the preload code, + // but omit if the multiplier is zero + std::vector<sd_lora_t> lora_specs; + for(size_t i=0;i<sd_params->lora_paths.size();++i) + { + float multiplier = sd_params->lora_multipliers[i] == 0.f + ? 
inputs.lora_multipliers[i] + : sd_params->lora_multipliers[i]; + if (multiplier != 0.f) { + sd_lora_t spec = {}; + spec.path = sd_params->lora_paths[i].c_str(); + spec.multiplier = multiplier; + lora_specs.push_back(spec); + } + } + // note sdcpp tracks previously applied LoRAs and weights, + // and apply/unapply the differences at each gen + params.loras = lora_specs.data(); + params.lora_count = lora_specs.size(); params.ref_images = reference_imgs.data(); params.ref_images_count = reference_imgs.size(); diff --git a/otherarch/sdcpp/stable-diffusion.cpp b/otherarch/sdcpp/stable-diffusion.cpp index 8176e1dca74..c9259669a0c 100644 --- a/otherarch/sdcpp/stable-diffusion.cpp +++ b/otherarch/sdcpp/stable-diffusion.cpp @@ -132,6 +132,7 @@ class StableDiffusionGGML { std::vector<std::shared_ptr<LoraModel>> diffusion_lora_models; std::vector<std::shared_ptr<LoraModel>> first_stage_lora_models; bool apply_lora_immediately = false; + std::map<std::string, std::shared_ptr<LoraModel>> kcpp_lora_cache; std::string taesd_path; bool use_tiny_autoencoder = false; @@ -1170,7 +1171,23 @@ class StableDiffusionGGML { std::shared_ptr<LoraModel> load_lora_model_from_file(const std::string& lora_id, float multiplier, ggml_backend_t backend, + std::string stage = "", LoraModel::filter_t lora_tensor_filter = nullptr) { + // kcpp + // first check the cache + bool kcpp_at_runtime = (stage != ""); + std::string lora_key = "|" + stage + "|" + lora_id; + if (kcpp_at_runtime) { + auto it = kcpp_lora_cache.find(lora_key); + if (it != kcpp_lora_cache.end()) { + if (it->second) { + it->second->multiplier = multiplier; + } + return it->second; + } + } + // by construction, kcpp will always find the preloaded LoRAs on the cache + std::string lora_path = lora_id; static std::string high_noise_tag = "|high_noise|"; bool is_high_noise = false; @@ -1182,10 +1199,16 @@ class StableDiffusionGGML { auto lora = std::make_shared<LoraModel>(lora_id, backend, lora_path, is_high_noise ? 
"model.high_noise_" : "", version); if (!lora->load_from_file(n_threads, lora_tensor_filter)) { LOG_WARN("load lora tensors from %s failed", lora_path.c_str()); - return nullptr; + // also cache negatives to avoid I/O at runtime + lora = nullptr; + if (kcpp_at_runtime) + kcpp_lora_cache[lora_key] = lora; + return lora; } lora->multiplier = multiplier; + if (kcpp_at_runtime) + kcpp_lora_cache[lora_key] = lora; return lora; } @@ -1264,7 +1287,7 @@ class StableDiffusionGGML { const std::string& lora_id = kv.first; float multiplier = kv.second; - auto lora = load_lora_model_from_file(lora_id, multiplier, clip_backend, lora_tensor_filter); + auto lora = load_lora_model_from_file(lora_id, multiplier, clip_backend, "cond_stage", lora_tensor_filter); if (lora && !lora->lora_tensors.empty()) { lora->preprocess_lora_tensors(tensors); cond_stage_lora_models.push_back(lora); @@ -1296,7 +1319,7 @@ class StableDiffusionGGML { const std::string& lora_name = kv.first; float multiplier = kv.second; - auto lora = load_lora_model_from_file(lora_name, multiplier, backend, lora_tensor_filter); + auto lora = load_lora_model_from_file(lora_name, multiplier, backend, "diffusion", lora_tensor_filter); if (lora && !lora->lora_tensors.empty()) { lora->preprocess_lora_tensors(tensors); diffusion_lora_models.push_back(lora); @@ -1332,7 +1355,7 @@ class StableDiffusionGGML { const std::string& lora_name = kv.first; float multiplier = kv.second; - auto lora = load_lora_model_from_file(lora_name, multiplier, vae_backend, lora_tensor_filter); + auto lora = load_lora_model_from_file(lora_name, multiplier, vae_backend, "first_stage", lora_tensor_filter); if (lora && !lora->lora_tensors.empty()) { lora->preprocess_lora_tensors(tensors); first_stage_lora_models.push_back(lora);