InferenceNexus/lmdeploy · Commit d8f9e35
Check whether device support bfloat16 (InternLM#2653)
* fallback to float16 if torch.cuda.is_bf16_supported is False
* fix failure when converting qwen2-awq
* update
* update
* rollback load.py
* update
1 parent 89f52bc commit d8f9e35

File tree: 8 files changed (+62 −16 lines)
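In short: every call site that previously asked `torch.cuda.is_bf16_supported()` now goes through a device-aware helper in `lmdeploy.utils`. A minimal sketch of the resulting selection pattern (assumes lmdeploy and a CUDA build of PyTorch with a visible GPU; this snippet is illustrative, not part of the diff):

import torch

from lmdeploy.utils import is_bf16_supported  # helper added by this commit

# Prefer bfloat16, fall back to float16 on devices without bf16 kernels.
dtype = torch.bfloat16 if is_bf16_supported() else torch.float16
print(f'selected dtype: {dtype}')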

lmdeploy/pytorch/check_env/__init__.py

Lines changed: 7 additions & 4 deletions

@@ -208,18 +208,21 @@ def __check_model_dtype_support(config):
         import torch
 
         from lmdeploy.pytorch.config import ModelConfig
+        from lmdeploy.utils import is_bf16_supported
 
         try:
             model_config = ModelConfig.from_hf_config(config,
                                                       model_path=model_path,
                                                       dtype=dtype)
             if model_config.dtype == torch.bfloat16:
-                assert torch.cuda.is_bf16_supported(), (
+                assert is_bf16_supported(), (
                     'bf16 is not supported on your device')
         except AssertionError as e:
-            message = (f'Your device does not support `{model_config.dtype}`. '
-                       'Try edit `torch_dtype` in `config.json`.\n'
-                       'Note that this might have negative effect!')
+            message = (
+                f'Your device does not support `{model_config.dtype}`. '
+                'You can set `dtype` to float16 in PyTorchEngineConfig or '
+                '`--dtype float16` to api_server.\n'
+                'Note that this might have negative effect!')
             _handle_exception(e, 'Model', logger, message=message)
         except Exception as e:
             message = (f'Checking failed with error {e}',

lmdeploy/pytorch/configurations/cogvlm.py

Lines changed: 2 additions & 3 deletions

@@ -14,12 +14,11 @@ def condition(cls, hf_config):
     @classmethod
     def build(cls, hf_config, model_path: str = None):
         """build."""
-        import torch
+        from lmdeploy.utils import is_bf16_supported
         cfg = DefaultModelConfigBuilder.build(hf_config)
         if getattr(hf_config, 'num_multi_query_heads', None):
             cfg.num_key_value_heads = hf_config.num_multi_query_heads
             cfg.cogvlm_style = True
-        torch_dtype = 'bfloat16' if torch.cuda.is_bf16_supported(
-        ) else 'float16'
+        torch_dtype = 'bfloat16' if is_bf16_supported() else 'float16'
         hf_config.torch_dtype = torch_dtype
         return cfg

lmdeploy/pytorch/configurations/qwen.py

Lines changed: 3 additions & 4 deletions

@@ -13,16 +13,15 @@ def condition(cls, hf_config):
     @classmethod
     def build(cls, hf_config, model_path: str = None):
         """build."""
-        import torch
+        from lmdeploy.utils import is_bf16_supported
         cfg = DefaultModelConfigBuilder.build(hf_config)
         if cfg.bos_token_id is None:
             cfg.bos_token_id = 151644
         if cfg.eos_token_id is None:
             cfg.eos_token_id = 151645
 
-        is_bf16_supported = torch.cuda.is_bf16_supported()
-        torch_dtype = 'bfloat16' if is_bf16_supported else 'float16'
-        if hf_config.bf16 and is_bf16_supported:
+        torch_dtype = 'bfloat16' if is_bf16_supported() else 'float16'
+        if hf_config.bf16 and is_bf16_supported():
             torch_dtype = 'bfloat16'
         elif hf_config.fp16:
             torch_dtype = 'float16'
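The Qwen builder now resolves the dtype in one pass: start from whatever the device supports, honor an explicit `bf16` flag only when bf16 actually works, and let an explicit `fp16` flag force float16. A runnable sketch of that decision with a stubbed support check, so it runs without a GPU (`pick_dtype` is illustrative, not part of the diff):

# A bf16 request on a device without bf16 support now lands on float16.
def pick_dtype(cfg_bf16, cfg_fp16, bf16_ok):
    torch_dtype = 'bfloat16' if bf16_ok else 'float16'
    if cfg_bf16 and bf16_ok:
        torch_dtype = 'bfloat16'
    elif cfg_fp16:
        torch_dtype = 'float16'
    return torch_dtype


for bf16_ok in (True, False):
    print(bf16_ok, pick_dtype(cfg_bf16=True, cfg_fp16=False, bf16_ok=bf16_ok))
# True  -> bfloat16
# False -> float16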

lmdeploy/turbomind/deploy/converter.py

Lines changed: 6 additions & 2 deletions

@@ -11,7 +11,7 @@
 from lmdeploy.model import MODELS, best_match_model
 from lmdeploy.utils import get_logger, get_model
 
-from ...utils import _get_and_verify_max_len
+from ...utils import _get_and_verify_max_len, is_bf16_supported
 from ..supported_models import SUPPORTED_ARCHS, is_supported
 from .config import TurbomindModelConfig
 from .exporter import get_exporter_factory

@@ -138,6 +138,10 @@ def get_output_model_registered_name_and_config(model_path: str,
     else:
         assert 0, f'unsupported specified data type {dtype}'
 
+    if weight_type == 'bfloat16' and not is_bf16_supported():
+        logger.warn('data type fallback to float16 since '
+                    'torch.cuda.is_bf16_supported is False')
+        weight_type = 'float16'
     config.model_config.model_arch = model_arch
     config.model_config.weight_type = weight_type
     config.model_config.model_format = model_format

@@ -226,7 +230,7 @@ def get_tm_model(model_path,
             f'mismatched quant method: user input ' \
             f'"{engine_config.model_format}" ' \
             f'vs model quant_config "{quant_method}"'
-        assert group_size is None or group_size == _group_size, \
+        assert not group_size or group_size == _group_size, \
             f'mismatched quant group size: user input "{group_size}" ' \
             f'vs model quant_config "{_group_size}"'
 
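Two behavioral notes on these hunks: the converter now downgrades bfloat16 weights to float16 (with a warning) when the device lacks bf16 support, and the group-size check treats 0 the same as None. A small runnable illustration of the second point (128 stands in for the value read from the model's quant_config):

# `not group_size` is True for both None and 0, so an unspecified or zero
# group size no longer trips the mismatch assertion.
_group_size = 128
for group_size in (None, 0, 128, 64):
    print(group_size, not group_size or group_size == _group_size)
# None -> True, 0 -> True, 128 -> True, 64 -> False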

lmdeploy/utils.py

Lines changed: 37 additions & 0 deletions

@@ -352,3 +352,40 @@ def get_max_batch_size(device_type: str):
         return 16
     elif device_type == 'maca':
         return 128
+
+
+def is_bf16_supported(device_type: str = 'cuda'):
+    """Check if the device supports bfloat16.
+
+    Args:
+        device_type (str): the type of device
+    """
+
+    if device_type == 'cuda':
+        import torch
+        device = torch.cuda.current_device()
+
+        # Check for CUDA version and device compute capability.
+        # This is a fast way to check for it.
+        cuda_version = torch.version.cuda
+        if (cuda_version is not None and int(cuda_version.split('.')[0]) >= 11
+                and torch.cuda.get_device_properties(device).major >= 8):
+            return True
+        else:
+            return False
+    elif device_type == 'ascend':
+        # The following API doesn't work reliably on multi-NPU devices. Since
+        # the `ascend910` device supports bfloat16, return True as a
+        # workaround.
+        return True
+        # import torch_npu
+        # device_name = torch_npu.npu.get_device_name(0)[:10]
+        # device_name = device_name.lower()
+        # if device_name.startswith('ascend910'):
+        #     return True
+        # else:
+        #     return False
+    elif device_type == 'maca':
+        return True
+    else:
+        return False
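A quick way to exercise the new helper (hedged: the 'cuda' branch assumes a CUDA build of PyTorch with a visible GPU, since it queries the current device; 'cpu' is just an example of an unknown device type):

from lmdeploy.utils import is_bf16_supported

print(is_bf16_supported())          # True on compute capability >= 8.0 with CUDA >= 11
print(is_bf16_supported('ascend'))  # always True (see the workaround comment above)
print(is_bf16_supported('cpu'))     # False: unknown device types fall through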

tests/pytorch/kernel/test_apply_rotary.py

Lines changed: 2 additions & 1 deletion

@@ -2,6 +2,7 @@
 import torch
 
 from lmdeploy.pytorch.kernels import apply_rotary_pos_emb
+from lmdeploy.utils import is_bf16_supported
 
 
 def _rotate_half(x):

@@ -12,7 +13,7 @@ def _rotate_half(x):
 
 
 def _bf16_mark():
-    return pytest.mark.skipif(not torch.cuda.is_bf16_supported(),
+    return pytest.mark.skipif(not is_bf16_supported(),
                               reason='bf16 not supported.')
 
 
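For context, this is how `_bf16_mark()` is typically consumed in these kernel tests (a hedged sketch, not the exact parametrization in the file): bf16 cases are skipped on devices where `is_bf16_supported()` returns False.

import pytest
import torch

from lmdeploy.utils import is_bf16_supported


def _bf16_mark():
    return pytest.mark.skipif(not is_bf16_supported(),
                              reason='bf16 not supported.')


@pytest.mark.parametrize('dtype', [
    torch.float16,
    pytest.param(torch.bfloat16, marks=_bf16_mark()),
])
def test_dtype_is_usable(dtype):
    assert torch.ones(2, dtype=dtype).dtype == dtype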

tests/pytorch/kernel/test_multinomial_sampling.py

Lines changed: 2 additions & 1 deletion

@@ -2,10 +2,11 @@
 import torch
 
 from lmdeploy.pytorch.kernels import multinomial_sampling
+from lmdeploy.utils import is_bf16_supported
 
 
 def _bf16_mark():
-    return pytest.mark.skipif(not torch.cuda.is_bf16_supported(),
+    return pytest.mark.skipif(not is_bf16_supported(),
                               reason='bf16 not supported.')
 
 

tests/pytorch/kernel/test_rms_norm.py

Lines changed: 3 additions & 1 deletion

@@ -1,9 +1,11 @@
 import pytest
 import torch
 
+from lmdeploy.utils import is_bf16_supported
+
 
 def _bf16_mark():
-    return pytest.mark.skipif(not torch.cuda.is_bf16_supported(),
+    return pytest.mark.skipif(not is_bf16_supported(),
                               reason='bf16 not supported.')
 
 
