Select the primary Unquantized MoE backend. Note: shape-specific fallbacks may still occur at runtime.
Source code in vllm/model_executor/layers/fused_moe/oracle/unquantized.py
def select_unquantized_moe_backend(
    moe_config: FusedMoEConfig,
    use_ep: bool,
    use_dp: bool,
) -> UnquantizedMoeBackend:
    """
    Select the primary Unquantized MoE backend
    Note: Shape-specific fallbacks may still occur at runtime.

    Args:
        moe_config: Fused-MoE configuration; its parallel config determines
            the activation format (batched vs. standard experts).
        use_ep: Whether expert parallelism is in use.
        use_dp: Whether data parallelism is in use.

    Returns:
        The selected ``UnquantizedMoeBackend`` member for the current
        platform and environment settings.
    """

    def _make_log_backend(backend: UnquantizedMoeBackend) -> str:
        # One-line message logged once the final backend is chosen.
        return f"Using {backend.value} backend for Unquantized MoE"

    rocm_aiter_moe_enabled = rocm_aiter_ops.is_fused_moe_enabled()

    # Batched vs. standard activation layout, driven by the parallel config.
    activation_format = (
        mk.FusedMoEActivationFormat.BatchedExperts
        if moe_config.moe_parallel_config.use_batched_activation_format
        else mk.FusedMoEActivationFormat.Standard
    )

    # Check if FlashInfer TRTLLM BF16 MoE is supported
    trtllm_supported, _ = is_supported_config_trtllm_bf16(
        moe_config=moe_config,
        activation_format=activation_format,
    )
    # TRTLLM path additionally requires the opt-in env flag and the
    # "latency" FlashInfer backend selection.
    flashinfer_trtllm_moe_enabled = (
        has_flashinfer()
        and envs.VLLM_USE_FLASHINFER_MOE_FP16
        and trtllm_supported
        and envs.VLLM_FLASHINFER_MOE_BACKEND == "latency"
    )

    # FlashInfer CUTLASS MoE is only supported on Hopper and later GPUS
    # (device capability >= 9.0), and only for EP without DP.
    flashinfer_cutlass_moe_enabled = (
        has_flashinfer_cutlass_fused_moe()
        and envs.VLLM_USE_FLASHINFER_MOE_FP16
        and use_ep
        and (not use_dp)
        and current_platform.has_device_capability(90)
    )

    if current_platform.is_rocm():
        # ROCm: AITER fused MoE when enabled, otherwise the Triton kernels.
        if rocm_aiter_moe_enabled:
            backend = UnquantizedMoeBackend.AITER
        else:
            backend = UnquantizedMoeBackend.TRITON

    if current_platform.is_cuda():
        if flashinfer_trtllm_moe_enabled:
            backend = UnquantizedMoeBackend.FLASHINFER_TRTLLM
        elif flashinfer_cutlass_moe_enabled:
            backend = UnquantizedMoeBackend.FLASHINFER_CUTLASS
            # CUTLASS was picked, but TRTLLM would also have been viable —
            # nudge the user toward the latency backend.
            if trtllm_supported:
                logger.info_once(
                    "FlashInfer TRTLLM MoE is available but not enabled, "
                    "consider setting VLLM_FLASHINFER_MOE_BACKEND=latency "
                    "to enable it for better performance.",
                    scope="local",
                )
        else:
            # Neither FlashInfer path is enabled: explain (once) which env
            # setting would unlock one, then fall back to Triton.
            if not envs.VLLM_USE_FLASHINFER_MOE_FP16 and trtllm_supported:
                logger.info_once(
                    "FlashInfer TRTLLM MoE is available but not enabled, "
                    "consider setting VLLM_USE_FLASHINFER_MOE_FP16=1 "
                    "and VLLM_FLASHINFER_MOE_BACKEND=latency "
                    "to enable it for better performance.",
                    scope="local",
                )
            elif use_ep and (not use_dp):
                logger.info_once(
                    "FlashInfer MoE is available for EP"
                    " but not enabled, consider setting"
                    " VLLM_USE_FLASHINFER_MOE_FP16=1 to enable it.",
                    scope="local",
                )
            elif use_dp:
                logger.info_once(
                    "FlashInfer CUTLASS MoE is currently not available for DP.",
                    scope="local",
                )
            backend = UnquantizedMoeBackend.TRITON

    # Remaining platforms each map to a single dedicated backend.
    if current_platform.is_xpu():
        backend = UnquantizedMoeBackend.XPU
    if current_platform.is_cpu():
        backend = UnquantizedMoeBackend.CPU
    if current_platform.is_tpu():
        backend = UnquantizedMoeBackend.TPU
    if current_platform.is_out_of_tree():
        backend = UnquantizedMoeBackend.OOT

    logger.info_once(_make_log_backend(backend), scope="local")
    return backend