CMakeLists.txt
LICENSE
MANIFEST.in
README.md
pyproject.toml
requirements-common.txt
requirements-cpu.txt
requirements-cuda.txt
requirements-neuron.txt
requirements-rocm.txt
setup.py
cmake/cpu_extension.cmake
cmake/hipify.py
cmake/utils.cmake
csrc/activation_kernels.cu
csrc/cache.h
csrc/cache_kernels.cu
csrc/cuda_compat.h
csrc/cuda_utils.h
csrc/cuda_utils_kernels.cu
csrc/custom_all_reduce.cu
csrc/custom_all_reduce.cuh
csrc/custom_all_reduce_test.cu
csrc/dispatch_utils.h
csrc/layernorm_kernels.cu
csrc/moe_align_block_size_kernels.cu
csrc/ops.h
csrc/pos_encoding_kernels.cu
csrc/torch_bindings.cpp
csrc/attention/attention_dtypes.h
csrc/attention/attention_generic.cuh
csrc/attention/attention_kernels.cu
csrc/attention/attention_utils.cuh
csrc/attention/dtype_bfloat16.cuh
csrc/attention/dtype_float16.cuh
csrc/attention/dtype_float32.cuh
csrc/attention/dtype_fp8.cuh
csrc/core/registration.h
csrc/core/scalar_type.hpp
csrc/core/torch_bindings.cpp
csrc/cpu/activation.cpp
csrc/cpu/attention.cpp
csrc/cpu/cache.cpp
csrc/cpu/cpu_types.hpp
csrc/cpu/cpu_types_vsx.hpp
csrc/cpu/cpu_types_x86.hpp
csrc/cpu/dnnl_helper.hpp
csrc/cpu/layernorm.cpp
csrc/cpu/pos_encoding.cpp
csrc/cpu/quant.cpp
csrc/cpu/torch_bindings.cpp
csrc/cpu/utils.cpp
csrc/cutlass_extensions/cute_utils.cuh
csrc/cutlass_extensions/torch_utils.hpp
csrc/cutlass_extensions/vllm_collective_builder.cuh
csrc/cutlass_extensions/vllm_custom_types.cuh
csrc/cutlass_extensions/vllm_cutlass_library_extension.py
csrc/cutlass_extensions/vllm_numeric_conversion.cuh
csrc/mamba/causal_conv1d/causal_conv1d.cu
csrc/mamba/causal_conv1d/causal_conv1d.h
csrc/mamba/causal_conv1d/static_switch.h
csrc/mamba/mamba_ssm/selective_scan.h
csrc/mamba/mamba_ssm/selective_scan_fwd.cu
csrc/mamba/mamba_ssm/static_switch.h
csrc/moe/marlin_moe_ops.cu
csrc/moe/marlin_moe_ops.h
csrc/moe/moe_ops.h
csrc/moe/topk_softmax_kernels.cu
csrc/moe/torch_bindings.cpp
csrc/prepare_inputs/advance_step.cu
csrc/prepare_inputs/advance_step.cuh
csrc/quantization/aqlm/gemm_kernels.cu
csrc/quantization/awq/dequantize.cuh
csrc/quantization/awq/gemm_kernels.cu
csrc/quantization/compressed_tensors/int8_quant_kernels.cu
csrc/quantization/cutlass_w8a8/Epilogues.md
csrc/quantization/cutlass_w8a8/broadcast_load_epilogue_c2x.hpp
csrc/quantization/cutlass_w8a8/broadcast_load_epilogue_c3x.hpp
csrc/quantization/cutlass_w8a8/common.hpp
csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu
csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cuh
csrc/quantization/cutlass_w8a8/scaled_mm_c2x_sm75_dispatch.cuh
csrc/quantization/cutlass_w8a8/scaled_mm_c2x_sm80_dispatch.cuh
csrc/quantization/cutlass_w8a8/scaled_mm_c2x_sm89_fp8_dispatch.cuh
csrc/quantization/cutlass_w8a8/scaled_mm_c2x_sm89_int8_dispatch.cuh
csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu
csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu
csrc/quantization/fp8/common.cu
csrc/quantization/fp8/fp8_marlin.cu
csrc/quantization/fp8/amd/hip_float8.h
csrc/quantization/fp8/amd/hip_float8_impl.h
csrc/quantization/fp8/amd/quant_utils.cuh
csrc/quantization/fp8/nvidia/quant_utils.cuh
csrc/quantization/gguf/dequantize.cuh
csrc/quantization/gguf/ggml-common.h
csrc/quantization/gguf/gguf_kernel.cu
csrc/quantization/gguf/mmq.cuh
csrc/quantization/gguf/mmvq.cuh
csrc/quantization/gguf/vecdotq.cuh
csrc/quantization/gptq/compat.cuh
csrc/quantization/gptq/matrix_view.cuh
csrc/quantization/gptq/q_gemm.cu
csrc/quantization/gptq/qdq_2.cuh
csrc/quantization/gptq/qdq_3.cuh
csrc/quantization/gptq/qdq_4.cuh
csrc/quantization/gptq/qdq_8.cuh
csrc/quantization/gptq/qdq_util.cuh
csrc/quantization/gptq_marlin/awq_marlin_repack.cu
csrc/quantization/gptq_marlin/gptq_marlin.cu
csrc/quantization/gptq_marlin/gptq_marlin_repack.cu
csrc/quantization/gptq_marlin/marlin.cuh
csrc/quantization/gptq_marlin/marlin_dtypes.cuh
csrc/quantization/machete/Readme.md
csrc/quantization/machete/generate.py
csrc/quantization/machete/machete_collective_builder.cuh
csrc/quantization/machete/machete_interleaving_utils.cuh
csrc/quantization/machete/machete_mainloop.cuh
csrc/quantization/machete/machete_mm_kernel.cuh
csrc/quantization/machete/machete_mm_launcher.cuh
csrc/quantization/machete/machete_prepack_kernel.cuh
csrc/quantization/machete/machete_prepack_launcher.cuh
csrc/quantization/machete/machete_prepacked_layout.cuh
csrc/quantization/machete/machete_pytorch.cu
csrc/quantization/marlin/dense/LICENSE
csrc/quantization/marlin/dense/marlin_cuda_kernel.cu
csrc/quantization/marlin/dense/common/base.h
csrc/quantization/marlin/dense/common/mem.h
csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu
csrc/quantization/marlin/sparse/LICENSE
csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu
csrc/quantization/marlin/sparse/common/base.h
csrc/quantization/marlin/sparse/common/mem.h
csrc/quantization/marlin/sparse/common/mma.h
tests/test_cache_block_hashing.py
tests/test_config.py
tests/test_embedded_commit.py
tests/test_inputs.py
tests/test_logger.py
tests/test_logits_processor.py
tests/test_regression.py
tests/test_sampling_params.py
tests/test_scalartype.py
tests/test_sequence.py
tests/test_sharded_state_loader.py
tests/test_utils.py
vllm/__init__.py
vllm/_core_ext.py
vllm/_custom_ops.py
vllm/_ipex_ops.py
vllm/block.py
vllm/commit_id.py
vllm/config.py
vllm/connections.py
vllm/envs.py
vllm/logger.py
vllm/outputs.py
vllm/pooling_params.py
vllm/py.typed
vllm/sampling_params.py
vllm/scalar_type.py
vllm/scripts.py
vllm/sequence.py
vllm/tracing.py
vllm/utils.py
vllm/version.py
vllm.egg-info/PKG-INFO
vllm.egg-info/SOURCES.txt
vllm.egg-info/dependency_links.txt
vllm.egg-info/entry_points.txt
vllm.egg-info/requires.txt
vllm.egg-info/top_level.txt
vllm/adapter_commons/__init__.py
vllm/adapter_commons/layers.py
vllm/adapter_commons/models.py
vllm/adapter_commons/request.py
vllm/adapter_commons/utils.py
vllm/adapter_commons/worker_manager.py
vllm/assets/__init__.py
vllm/assets/audio.py
vllm/assets/base.py
vllm/assets/image.py
vllm/assets/video.py
vllm/attention/__init__.py
vllm/attention/layer.py
vllm/attention/selector.py
vllm/attention/backends/__init__.py
vllm/attention/backends/abstract.py
vllm/attention/backends/blocksparse_attn.py
vllm/attention/backends/flash_attn.py
vllm/attention/backends/flashinfer.py
vllm/attention/backends/ipex_attn.py
vllm/attention/backends/openvino.py
vllm/attention/backends/pallas.py
vllm/attention/backends/rocm_flash_attn.py
vllm/attention/backends/torch_sdpa.py
vllm/attention/backends/utils.py
vllm/attention/backends/xformers.py
vllm/attention/ops/__init__.py
vllm/attention/ops/ipex_attn.py
vllm/attention/ops/paged_attn.py
vllm/attention/ops/prefix_prefill.py
vllm/attention/ops/triton_flash_attention.py
vllm/attention/ops/blocksparse_attention/__init__.py
vllm/attention/ops/blocksparse_attention/blocksparse_attention_kernel.py
vllm/attention/ops/blocksparse_attention/interface.py
vllm/attention/ops/blocksparse_attention/utils.py
vllm/compilation/__init__.py
vllm/compilation/wrapper.py
vllm/core/__init__.py
vllm/core/block_manager_v1.py
vllm/core/block_manager_v2.py
vllm/core/embedding_model_block_manager.py
vllm/core/evictor_v1.py
vllm/core/evictor_v2.py
vllm/core/interfaces.py
vllm/core/scheduler.py
vllm/core/block/__init__.py
vllm/core/block/block_table.py
vllm/core/block/common.py
vllm/core/block/cpu_gpu_block_allocator.py
vllm/core/block/interfaces.py
vllm/core/block/naive_block.py
vllm/core/block/prefix_caching_block.py
vllm/core/block/utils.py
vllm/distributed/__init__.py
vllm/distributed/communication_op.py
vllm/distributed/parallel_state.py
vllm/distributed/utils.py
vllm/distributed/device_communicators/__init__.py
vllm/distributed/device_communicators/cuda_wrapper.py
vllm/distributed/device_communicators/custom_all_reduce.py
vllm/distributed/device_communicators/custom_all_reduce_utils.py
vllm/distributed/device_communicators/pynccl.py
vllm/distributed/device_communicators/pynccl_wrapper.py
vllm/distributed/device_communicators/shm_broadcast.py
vllm/distributed/device_communicators/tpu_communicator.py
vllm/engine/__init__.py
vllm/engine/arg_utils.py
vllm/engine/async_llm_engine.py
vllm/engine/async_timeout.py
vllm/engine/llm_engine.py
vllm/engine/metrics.py
vllm/engine/metrics_types.py
vllm/engine/protocol.py
vllm/engine/output_processor/__init__.py
vllm/engine/output_processor/interfaces.py
vllm/engine/output_processor/multi_step.py
vllm/engine/output_processor/single_step.py
vllm/engine/output_processor/stop_checker.py
vllm/engine/output_processor/util.py
vllm/entrypoints/__init__.py
vllm/entrypoints/api_server.py
vllm/entrypoints/chat_utils.py
vllm/entrypoints/launcher.py
vllm/entrypoints/llm.py
vllm/entrypoints/logger.py
vllm/entrypoints/openai/__init__.py
vllm/entrypoints/openai/api_server.py
vllm/entrypoints/openai/cli_args.py
vllm/entrypoints/openai/logits_processors.py
vllm/entrypoints/openai/protocol.py
vllm/entrypoints/openai/run_batch.py
vllm/entrypoints/openai/serving_chat.py
vllm/entrypoints/openai/serving_completion.py
vllm/entrypoints/openai/serving_embedding.py
vllm/entrypoints/openai/serving_engine.py
vllm/entrypoints/openai/serving_tokenization.py
vllm/entrypoints/openai/rpc/__init__.py
vllm/entrypoints/openai/rpc/client.py
vllm/entrypoints/openai/rpc/server.py
vllm/entrypoints/openai/tool_parsers/__init__.py
vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py
vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py
vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py
vllm/entrypoints/openai/tool_parsers/utils.py
vllm/executor/__init__.py
vllm/executor/cpu_executor.py
vllm/executor/distributed_gpu_executor.py
vllm/executor/executor_base.py
vllm/executor/gpu_executor.py
vllm/executor/msgspec_utils.py
vllm/executor/multiproc_gpu_executor.py
vllm/executor/multiproc_worker_utils.py
vllm/executor/multiproc_xpu_executor.py
vllm/executor/neuron_executor.py
vllm/executor/openvino_executor.py
vllm/executor/ray_gpu_executor.py
vllm/executor/ray_tpu_executor.py
vllm/executor/ray_utils.py
vllm/executor/ray_xpu_executor.py
vllm/executor/tpu_executor.py
vllm/executor/xpu_executor.py
vllm/inputs/__init__.py
vllm/inputs/data.py
vllm/inputs/parse.py
vllm/inputs/preprocess.py
vllm/inputs/registry.py
vllm/logging/__init__.py
vllm/logging/formatter.py
vllm/lora/__init__.py
vllm/lora/fully_sharded_layers.py
vllm/lora/layers.py
vllm/lora/lora.py
vllm/lora/models.py
vllm/lora/punica.py
vllm/lora/request.py
vllm/lora/utils.py
vllm/lora/worker_manager.py
vllm/lora/ops/__init__.py
vllm/lora/ops/bgmv_expand.py
vllm/lora/ops/bgmv_expand_slice.py
vllm/lora/ops/bgmv_shrink.py
vllm/lora/ops/sgmv_expand.py
vllm/lora/ops/sgmv_expand_slice.py
vllm/lora/ops/sgmv_shrink.py
vllm/lora/ops/utils.py
vllm/model_executor/__init__.py
vllm/model_executor/custom_op.py
vllm/model_executor/parameter.py
vllm/model_executor/pooling_metadata.py
vllm/model_executor/sampling_metadata.py
vllm/model_executor/utils.py
vllm/model_executor/guided_decoding/__init__.py
vllm/model_executor/guided_decoding/guided_fields.py
vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py
vllm/model_executor/guided_decoding/outlines_decoding.py
vllm/model_executor/guided_decoding/outlines_logits_processors.py
vllm/model_executor/layers/__init__.py
vllm/model_executor/layers/activation.py
vllm/model_executor/layers/layernorm.py
vllm/model_executor/layers/linear.py
vllm/model_executor/layers/logits_processor.py
vllm/model_executor/layers/pooler.py
vllm/model_executor/layers/rejection_sampler.py
vllm/model_executor/layers/resampler.py
vllm/model_executor/layers/rotary_embedding.py
vllm/model_executor/layers/sampler.py
vllm/model_executor/layers/spec_decode_base_sampler.py
vllm/model_executor/layers/typical_acceptance_sampler.py
vllm/model_executor/layers/vocab_parallel_embedding.py
vllm/model_executor/layers/fused_moe/__init__.py
vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
vllm/model_executor/layers/fused_moe/fused_moe.py
vllm/model_executor/layers/fused_moe/layer.py
vllm/model_executor/layers/fused_moe/moe_pallas.py
vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json
vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json
vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json
vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json
vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json
vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json
vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json
vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json
vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json
vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json
vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json
vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json
vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json
vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json
vllm/model_executor/layers/fused_moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json
vllm/model_executor/layers/fused_moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json
vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json
vllm/model_executor/layers/fused_moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json
vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json
vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json
vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
vllm/model_executor/layers/mamba/__init__.py
vllm/model_executor/layers/mamba/ops/__init__.py
vllm/model_executor/layers/mamba/ops/causal_conv1d.py
vllm/model_executor/layers/mamba/ops/mamba_ssm.py
vllm/model_executor/layers/ops/__init__.py
vllm/model_executor/layers/ops/rand.py
vllm/model_executor/layers/ops/sample.py
vllm/model_executor/layers/quantization/__init__.py
vllm/model_executor/layers/quantization/aqlm.py
vllm/model_executor/layers/quantization/awq.py
vllm/model_executor/layers/quantization/awq_marlin.py
vllm/model_executor/layers/quantization/awq_triton.py
vllm/model_executor/layers/quantization/base_config.py
vllm/model_executor/layers/quantization/bitsandbytes.py
vllm/model_executor/layers/quantization/deepspeedfp.py
vllm/model_executor/layers/quantization/experts_int8.py
vllm/model_executor/layers/quantization/fbgemm_fp8.py
vllm/model_executor/layers/quantization/fp8.py
vllm/model_executor/layers/quantization/gguf.py
vllm/model_executor/layers/quantization/gptq.py
vllm/model_executor/layers/quantization/gptq_marlin.py
vllm/model_executor/layers/quantization/gptq_marlin_24.py
vllm/model_executor/layers/quantization/kv_cache.py
vllm/model_executor/layers/quantization/marlin.py
vllm/model_executor/layers/quantization/modelopt.py
vllm/model_executor/layers/quantization/neuron_quant.py
vllm/model_executor/layers/quantization/qqq.py
vllm/model_executor/layers/quantization/schema.py
vllm/model_executor/layers/quantization/tpu_int8.py
vllm/model_executor/layers/quantization/compressed_tensors/__init__.py
vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
vllm/model_executor/layers/quantization/compressed_tensors/utils.py
vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py
vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py
vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py
vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py
vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py
vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py
vllm/model_executor/layers/quantization/utils/__init__.py
vllm/model_executor/layers/quantization/utils/marlin_utils.py
vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py
vllm/model_executor/layers/quantization/utils/marlin_utils_test.py
vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py
vllm/model_executor/layers/quantization/utils/marlin_utils_test_qqq.py
vllm/model_executor/layers/quantization/utils/quant_utils.py
vllm/model_executor/layers/quantization/utils/w8a8_utils.py
vllm/model_executor/model_loader/__init__.py
vllm/model_executor/model_loader/loader.py
vllm/model_executor/model_loader/neuron.py
vllm/model_executor/model_loader/openvino.py
vllm/model_executor/model_loader/tensorizer.py
vllm/model_executor/model_loader/utils.py
vllm/model_executor/model_loader/weight_utils.py
vllm/model_executor/models/__init__.py
vllm/model_executor/models/arctic.py
vllm/model_executor/models/baichuan.py
vllm/model_executor/models/bart.py
vllm/model_executor/models/blip.py
vllm/model_executor/models/blip2.py
vllm/model_executor/models/bloom.py
vllm/model_executor/models/chameleon.py
vllm/model_executor/models/chatglm.py
vllm/model_executor/models/clip.py
vllm/model_executor/models/commandr.py
vllm/model_executor/models/dbrx.py
vllm/model_executor/models/decilm.py
vllm/model_executor/models/deepseek.py
vllm/model_executor/models/deepseek_v2.py
vllm/model_executor/models/eagle.py
vllm/model_executor/models/exaone.py
vllm/model_executor/models/falcon.py
vllm/model_executor/models/fuyu.py
vllm/model_executor/models/gemma.py
vllm/model_executor/models/gemma2.py
vllm/model_executor/models/gpt2.py
vllm/model_executor/models/gpt_bigcode.py
vllm/model_executor/models/gpt_j.py
vllm/model_executor/models/gpt_neox.py
vllm/model_executor/models/granite.py
vllm/model_executor/models/idefics2_vision_model.py
vllm/model_executor/models/interfaces.py
vllm/model_executor/models/intern_vit.py
vllm/model_executor/models/internlm2.py
vllm/model_executor/models/internvl.py
vllm/model_executor/models/jais.py
vllm/model_executor/models/jamba.py
vllm/model_executor/models/llama.py
vllm/model_executor/models/llama_embedding.py
vllm/model_executor/models/llava.py
vllm/model_executor/models/llava_next.py
vllm/model_executor/models/llava_next_video.py
vllm/model_executor/models/medusa.py
vllm/model_executor/models/minicpm.py
vllm/model_executor/models/minicpmv.py
vllm/model_executor/models/mixtral.py
vllm/model_executor/models/mixtral_quant.py
vllm/model_executor/models/mlp_speculator.py
vllm/model_executor/models/mpt.py
vllm/model_executor/models/na_vit.py
vllm/model_executor/models/nemotron.py
vllm/model_executor/models/olmo.py
vllm/model_executor/models/opt.py
vllm/model_executor/models/orion.py
vllm/model_executor/models/paligemma.py
vllm/model_executor/models/persimmon.py
vllm/model_executor/models/phi.py
vllm/model_executor/models/phi3_small.py
vllm/model_executor/models/phi3v.py
vllm/model_executor/models/phimoe.py
vllm/model_executor/models/pixtral.py
vllm/model_executor/models/qwen.py
vllm/model_executor/models/qwen2.py
vllm/model_executor/models/qwen2_moe.py
vllm/model_executor/models/qwen2_vl.py
vllm/model_executor/models/siglip.py
vllm/model_executor/models/stablelm.py
vllm/model_executor/models/starcoder2.py
vllm/model_executor/models/ultravox.py
vllm/model_executor/models/utils.py
vllm/model_executor/models/xverse.py
vllm/multimodal/__init__.py
vllm/multimodal/audio.py
vllm/multimodal/base.py
vllm/multimodal/image.py
vllm/multimodal/registry.py
vllm/multimodal/utils.py
vllm/multimodal/video.py
vllm/platforms/__init__.py
vllm/platforms/cpu.py
vllm/platforms/cuda.py
vllm/platforms/interface.py
vllm/platforms/rocm.py
vllm/platforms/tpu.py
vllm/plugins/__init__.py
vllm/prompt_adapter/__init__.py
vllm/prompt_adapter/layers.py
vllm/prompt_adapter/models.py
vllm/prompt_adapter/request.py
vllm/prompt_adapter/utils.py
vllm/prompt_adapter/worker_manager.py
vllm/spec_decode/__init__.py
vllm/spec_decode/batch_expansion.py
vllm/spec_decode/draft_model_runner.py
vllm/spec_decode/interfaces.py
vllm/spec_decode/medusa_worker.py
vllm/spec_decode/metrics.py
vllm/spec_decode/mlp_speculator_worker.py
vllm/spec_decode/multi_step_worker.py
vllm/spec_decode/ngram_worker.py
vllm/spec_decode/proposer_worker_base.py
vllm/spec_decode/smaller_tp_proposer_worker.py
vllm/spec_decode/spec_decode_worker.py
vllm/spec_decode/target_model_runner.py
vllm/spec_decode/top1_proposer.py
vllm/spec_decode/util.py
vllm/transformers_utils/__init__.py
vllm/transformers_utils/config.py
vllm/transformers_utils/detokenizer.py
vllm/transformers_utils/image_processor.py
vllm/transformers_utils/processor.py
vllm/transformers_utils/tokenizer.py
vllm/transformers_utils/utils.py
vllm/transformers_utils/configs/__init__.py
vllm/transformers_utils/configs/arctic.py
vllm/transformers_utils/configs/chatglm.py
vllm/transformers_utils/configs/dbrx.py
vllm/transformers_utils/configs/eagle.py
vllm/transformers_utils/configs/exaone.py
vllm/transformers_utils/configs/falcon.py
vllm/transformers_utils/configs/granite.py
vllm/transformers_utils/configs/internvl.py
vllm/transformers_utils/configs/jais.py
vllm/transformers_utils/configs/medusa.py
vllm/transformers_utils/configs/mlp_speculator.py
vllm/transformers_utils/configs/mpt.py
vllm/transformers_utils/configs/nemotron.py
vllm/transformers_utils/configs/ultravox.py
vllm/transformers_utils/tokenizer_group/__init__.py
vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py
vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py
vllm/transformers_utils/tokenizer_group/tokenizer_group.py
vllm/transformers_utils/tokenizers/__init__.py
vllm/transformers_utils/tokenizers/baichuan.py
vllm/transformers_utils/tokenizers/mistral.py
vllm/triton_utils/__init__.py
vllm/triton_utils/custom_cache_manager.py
vllm/triton_utils/importing.py
vllm/triton_utils/libentry.py
vllm/triton_utils/sample.py
vllm/usage/__init__.py
vllm/usage/usage_lib.py
vllm/worker/__init__.py
vllm/worker/cache_engine.py
vllm/worker/cpu_model_runner.py
vllm/worker/cpu_worker.py
vllm/worker/embedding_model_runner.py
vllm/worker/enc_dec_model_runner.py
vllm/worker/model_runner.py
vllm/worker/model_runner_base.py
vllm/worker/multi_step_model_runner.py
vllm/worker/multi_step_worker.py
vllm/worker/neuron_model_runner.py
vllm/worker/neuron_worker.py
vllm/worker/openvino_model_runner.py
vllm/worker/openvino_worker.py
vllm/worker/tpu_model_runner.py
vllm/worker/tpu_worker.py
vllm/worker/utils.py
vllm/worker/worker.py
vllm/worker/worker_base.py
vllm/worker/xpu_model_runner.py
vllm/worker/xpu_worker.py