Add Qwen3 by yuanwu2017 · Pull Request #3229 · huggingface/text-generation-inference · GitHub

Add Qwen3 #3229


Merged

merged 9 commits on May 23, 2025
Changes from all commits
21 changes: 21 additions & 0 deletions backends/gaudi/server/text_generation_server/models/__init__.py
@@ -109,6 +109,9 @@
 from text_generation_server.models.custom_modeling.flash_qwen2_modeling import (
     Qwen2ForCausalLM,
 )
+from text_generation_server.models.custom_modeling.flash_qwen3_modeling import (
+    Qwen3ForCausalLM,
+)
 from text_generation_server.models.custom_modeling.flash_mistral_modeling import (
     FlashMistralForCausalLM,
 )
@@ -293,6 +296,12 @@ class ModelType(enum.Enum):
"name": "Qwen 2.5 VL",
"url": "https://huggingface.co/collections/Qwen/qwen25-66e81a666513e518adb90d9e",
}
QWEN3 = {
"type": "qwen3",
"name": "Qwen 3",
"url": "https://huggingface.co/collections/Qwen/qwen3-67dd247413f0e2e4f653967f",
}

GALACTICA = {
"type": "galactica",
"name": "Galactica",
@@ -791,6 +800,18 @@ def get_model(
             config_class=Qwen2_5_VLConfig,
             processor_class=Qwen2_5_VLProcessor,
         )
+    elif model_type == QWEN3:
+        return FlashCausalLM(
+            model_id=model_id,
+            model_class=Qwen3ForCausalLM,
+            revision=revision,
+            quantize=quantize,
+            speculator=speculator,
+            dtype=dtype,
+            kv_cache_dtype=kv_cache_dtype,
+            trust_remote_code=trust_remote_code,
+            lora_adapter_ids=lora_adapter_ids,
+        )
     elif model_type == MLLAMA:
         return FlashMllamaCausalLM(
             model_id=model_id,
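Note that Qwen3 gets no dedicated runner here: FlashCausalLM is the backend's generic flash-attention causal-LM runner, and the concrete architecture is injected through model_class, so supporting a new dense decoder is mostly a registration. A rough sketch of that injection pattern — all names below are illustrative stand-ins, not TGI's API:

class DummyConfig:
    """Illustrative stand-in for a loaded model config."""
    def __init__(self, model_id):
        self.model_id = model_id

class Qwen3ForCausalLMStub:
    """Illustrative stand-in for the real Qwen3ForCausalLM."""
    def __init__(self, config):
        self.config = config

class FlashCausalLMSketch:
    """The runner stays generic; the architecture class is injected."""
    def __init__(self, model_id, model_class, **unused_kwargs):
        self.model = model_class(DummyConfig(model_id))

# Mirrors the dispatch above: per-model support reduces to one registration.
runner = FlashCausalLMSketch("Qwen/Qwen3-8B", Qwen3ForCausalLMStub)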
@@ -22,6 +22,7 @@
 from torch import nn
 import torch.nn.functional as F

+import habana_frameworks.torch as htorch
 from transformers.cache_utils import Cache
 from transformers.activations import ACT2FN
 from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS
@@ -567,6 +568,9 @@ def forward(
         )

         freqs_ci = self.rotary_emb(hidden_states, position_ids.view(bs, -1))
+        lazy_mode = htorch.utils.internal.is_lazy()
+        if lazy_mode:
+            htorch.core.mark_step()

         for i, layer in enumerate(self.layers):
             hidden_states = layer(
@@ -582,6 +586,8 @@
                 position_ids=position_ids,
                 hpu_attention_meta=hpu_attention_meta,
             )
+            if lazy_mode:
+                htorch.core.mark_step()

         hidden_states, _ = self.norm(hidden_states)
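
The htorch changes in this second file follow the usual HPU lazy-execution pattern: in lazy mode, PyTorch ops are queued into a graph, and htorch.core.mark_step() cuts that graph and launches it on the device. Marking once before the decoder loop and once per layer keeps each compiled graph small and structurally identical across layers, which helps the graph compiler cache and reuse them. A condensed sketch of the same pattern, assuming an HPU environment with habana_frameworks installed:

import habana_frameworks.torch as htorch

def run_decoder_layers(layers, hidden_states):
    # is_lazy() reports whether HPU lazy (graph) mode is active;
    # in eager mode the mark_step() calls are skipped entirely.
    lazy_mode = htorch.utils.internal.is_lazy()
    if lazy_mode:
        htorch.core.mark_step()  # flush everything queued so far
    for layer in layers:
        hidden_states = layer(hidden_states)
        if lazy_mode:
            # One graph per layer: small compile units, high reuse.
            htorch.core.mark_step()
    return hidden_states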