vllm.models.deepseek_v4.common.ops.cache_utils ¶

Triton kernels for DeepseekV4 paged K-cache management and sparse-attention index preparation.

quantize_and_insert_k_cache: quantize bf16 K to UE8M0 FP8 and insert into the paged cache.
dequantize_and_gather_k_cache: gather and dequantize FP8 K from the paged cache for sparse/SWA prefill.
compute_global_topk_indices_and_lens: map local topk indices to global KV cache slots and count valid entries.
combine_topk_swa_indices: concatenate topk compressed indices with SWA window indices for sparse prefill.

build_flashinfer_mixed_sparse_indices ¶

build_flashinfer_mixed_sparse_indices(
    decode_swa_indices: Tensor,
    decode_compressed_indices: Tensor | None,
    decode_compressed_topk_lens: Tensor | None,
    prefill_topk_indices: Tensor,
    query_start_loc: Tensor,
    seq_lens: Tensor,
    token_to_req_indices: Tensor,
    swa_block_table: Tensor,
    swa_block_size: int,
    compressed_block_table: Tensor | None,
    compressed_block_size: int,
    window_size: int,
    compress_ratio: int,
    topk: int,
    decode_compressed_indices_are_local: bool = False,
    decode_is_valid_token: Tensor | None = None,
) -> tuple[Tensor, Tensor]

Build the FlashInfer DSV4 sparse-index matrix for decode-first batches.

Produces sparse_indices of shape [num_tokens, window_size + padded_topk] (the first window_size columns are SWA slot ids, the rest are compressed/top-k slot ids) and sparse_topk_lens (active length per token). Decode tokens read precomputed SWA/compressed indices; prefill tokens derive their SWA window from the position and translate local compressed indices to global slots via the block tables.

Source code in vllm/models/deepseek_v4/common/ops/cache_utils.py

def build_flashinfer_mixed_sparse_indices(
    decode_swa_indices: torch.Tensor,
    decode_compressed_indices: torch.Tensor | None,
    decode_compressed_topk_lens: torch.Tensor | None,
    prefill_topk_indices: torch.Tensor,
    query_start_loc: torch.Tensor,
    seq_lens: torch.Tensor,
    token_to_req_indices: torch.Tensor,
    swa_block_table: torch.Tensor,
    swa_block_size: int,
    compressed_block_table: torch.Tensor | None,
    compressed_block_size: int,
    window_size: int,
    compress_ratio: int,
    topk: int,
    decode_compressed_indices_are_local: bool = False,
    decode_is_valid_token: torch.Tensor | None = None,
) -> tuple[torch.Tensor, torch.Tensor]:
    """Build the FlashInfer DSV4 sparse-index matrix for decode-first batches.

    Produces ``sparse_indices`` of shape ``[num_tokens, window_size +
    padded_topk]`` (the first ``window_size`` columns are SWA slot ids, the rest
    are compressed/top-k slot ids) and ``sparse_topk_lens`` (active length per
    token). Decode tokens read precomputed SWA/compressed indices; prefill tokens
    derive their SWA window from the position and translate local compressed
    indices to global slots via the block tables.
    """
    assert decode_swa_indices.dtype == torch.int32
    assert decode_swa_indices.dim() == 2
    assert decode_swa_indices.shape[-1] == window_size
    if decode_compressed_topk_lens is not None:
        assert decode_compressed_topk_lens.dtype == torch.int32
    assert prefill_topk_indices.dtype == torch.int32
    assert prefill_topk_indices.dim() == 2
    assert query_start_loc.dtype == torch.int32
    assert seq_lens.dtype == torch.int32
    assert token_to_req_indices.dtype == torch.int32
    assert swa_block_table.dtype == torch.int32

    num_decode_tokens = decode_swa_indices.shape[0]
    num_prefill_tokens = prefill_topk_indices.shape[0]
    num_tokens = num_decode_tokens + num_prefill_tokens
    assert token_to_req_indices.shape[0] >= num_tokens
    if decode_compressed_topk_lens is not None:
        assert decode_compressed_topk_lens.shape[0] >= num_decode_tokens

    decode_compressed_topk = 0
    if decode_compressed_indices is None:
        decode_compressed_indices = prefill_topk_indices
    else:
        assert decode_compressed_indices.dtype == torch.int32
        assert decode_compressed_indices.dim() == 2
        assert decode_compressed_indices.shape[0] == num_decode_tokens
        decode_compressed_topk = decode_compressed_indices.shape[-1]
    if decode_compressed_topk > 0 and decode_compressed_indices_are_local:
        assert decode_is_valid_token is not None
        assert decode_is_valid_token.dtype == torch.bool
        assert decode_is_valid_token.shape[0] >= num_decode_tokens
    else:
        decode_is_valid_token = token_to_req_indices

    if compressed_block_table is None:
        compressed_block_table = swa_block_table
    assert compressed_block_table.dtype == torch.int32
    has_decode_compressed_lens = decode_compressed_topk_lens is not None
    if decode_compressed_topk_lens is None:
        decode_compressed_topk_lens = token_to_req_indices

    # The FlashInfer TRTLLM-gen sparse-MLA kernels require every per-token topk
    # index row to start on a 16-byte boundary: the kernel loads the compressed
    # indices with 128-bit (16-byte) vectorized loads, so a misaligned row would
    # fault or read across rows. 16 bytes = 4 int32 indices, so round the topk
    # width (and hence the row stride, since the SWA columns are fixed-width) up
    # to a multiple of 4. The extra columns are filled with -1 (invalid) and bounded
    # by ``sparse_topk_lens``, so padding never changes the attention result.
    padded_topk = max(topk, decode_compressed_topk)
    padded_topk = (padded_topk + 3) // 4 * 4
    sparse_indices = torch.empty(
        (num_tokens, window_size + padded_topk),
        dtype=torch.int32,
        device=decode_swa_indices.device,
    )
    sparse_topk_lens = torch.empty(
        num_tokens, dtype=torch.int32, device=decode_swa_indices.device
    )
    if num_tokens == 0:
        return sparse_indices, sparse_topk_lens

    window_block_size = triton.next_power_of_2(max(window_size, 1))
    topk_block_size = triton.next_power_of_2(max(padded_topk, 1))
    max_block_size = max(window_block_size, topk_block_size)
    num_warps = 4 if max_block_size >= 256 else 1

    _build_flashinfer_mixed_sparse_indices_kernel[(num_tokens,)](
        sparse_indices,
        sparse_indices.stride(0),
        sparse_topk_lens,
        decode_swa_indices,
        decode_swa_indices.stride(0),
        decode_compressed_indices,
        decode_compressed_indices.stride(0),
        decode_compressed_topk_lens,
        decode_is_valid_token,
        prefill_topk_indices,
        prefill_topk_indices.stride(0),
        query_start_loc,
        seq_lens,
        token_to_req_indices,
        swa_block_table,
        swa_block_table.stride(0),
        swa_block_size,
        compressed_block_table,
        compressed_block_table.stride(0),
        compressed_block_size,
        NUM_DECODE_TOKENS=num_decode_tokens,
        WINDOW_SIZE=window_size,
        COMPRESS_RATIO=compress_ratio,
        TOP_K=topk,
        PADDED_TOP_K=padded_topk,
        PREFILL_TOPK_STRIDE=prefill_topk_indices.shape[-1],
        DECODE_COMPRESSED_TOPK=decode_compressed_topk,
        DECODE_COMPRESSED_INDICES_ARE_LOCAL=decode_compressed_indices_are_local,
        HAS_DECODE_COMPRESSED_LENS=has_decode_compressed_lens,
        WINDOW_BLOCK_SIZE=window_block_size,
        TOPK_BLOCK_SIZE=topk_block_size,
        num_warps=num_warps,
    )
    return sparse_indices, sparse_topk_lens

compute_global_topk_indices_and_lens ¶

compute_global_topk_indices_and_lens(
    topk_indices: Tensor,
    token_to_req_indices: Tensor,
    block_table: Tensor,
    block_size: int,
    is_valid_token: Tensor,
) -> tuple[Tensor, Tensor]

Map local topk indices to global KV cache slots and count valid entries.

Fuses three operations into a single kernel: 1. Block-table lookup (local index → global slot id) 2. Valid-entry counting (topk_lens per token) 3. Masking padding tokens to length 0

Source code in vllm/models/deepseek_v4/common/ops/cache_utils.py

def compute_global_topk_indices_and_lens(
    topk_indices: torch.Tensor,
    token_to_req_indices: torch.Tensor,
    block_table: torch.Tensor,
    block_size: int,
    is_valid_token: torch.Tensor,
) -> tuple[torch.Tensor, torch.Tensor]:
    """Map local topk indices to global KV cache slots and count valid entries.

    Fuses three operations into a single kernel:
    1. Block-table lookup (local index → global slot id)
    2. Valid-entry counting (topk_lens per token)
    3. Masking padding tokens to length 0
    """
    num_tokens = topk_indices.shape[0]
    global_topk_indices = torch.empty_like(topk_indices)
    topk_lens = torch.empty(num_tokens, dtype=torch.int32, device=topk_indices.device)
    _compute_global_topk_indices_and_lens_kernel[(num_tokens,)](
        global_topk_indices,
        global_topk_indices.stride(0),
        topk_lens,
        topk_indices,
        topk_indices.stride(0),
        topk_indices.shape[-1],
        token_to_req_indices,
        block_table,
        block_table.stride(0),
        block_size,
        is_valid_token,
        TRITON_BLOCK_SIZE=1024,
    )
    return global_topk_indices, topk_lens

quantize_and_insert_k_cache ¶

quantize_and_insert_k_cache(
    k: Tensor,
    k_cache: Tensor,
    slot_mapping: Tensor,
    block_size: int = 64,
    is_ue8m0: bool = True,
)

Quantize K tensor and insert into paged K cache.

K Cache block layout (block_size=64 tokens): - First 64 * 576 = 36864 bytes: Token data - Each token: 448 bytes (fp8) + 128 bytes (bf16) - Next 64 * 8 = 512 bytes: Scales - Each token: 8 bytes (uint8 scales, 7 real + 1 padding) - Padded to multiple of 576

Source code in vllm/models/deepseek_v4/common/ops/cache_utils.py

def quantize_and_insert_k_cache(
    k: torch.Tensor,  # [num_tokens, 512] bf16
    k_cache: torch.Tensor,  # [num_blocks, block_bytes] uint8
    slot_mapping: torch.Tensor,  # [num_tokens] int64
    block_size: int = 64,
    is_ue8m0: bool = True,
):
    """
    Quantize K tensor and insert into paged K cache.

    K Cache block layout (block_size=64 tokens):
    - First 64 * 576 = 36864 bytes: Token data
      - Each token: 448 bytes (fp8) + 128 bytes (bf16)
    - Next 64 * 8 = 512 bytes: Scales
      - Each token: 8 bytes (uint8 scales, 7 real + 1 padding)
    - Padded to multiple of 576
    """
    assert k.dim() == 2 and k.shape[1] == 512, (
        f"K must be [num_tokens, 512], got {k.shape}"
    )
    assert k.dtype == torch.bfloat16, f"K must be bf16, got {k.dtype}"
    assert is_ue8m0, "Only support ue8m0 quantization."

    # NOTE: When using DP, slot_mapping.shape[0] can be less than k.shape[0] due to
    # padding. Always use slot_mapping.shape[0] as the token count.
    num_tokens = slot_mapping.shape[0]
    block_stride = k_cache.stride(0)  # bytes per block

    TOKEN_FP8_DIM = 448
    TOKEN_BF16_DIM = 64
    TOKEN_SCALE_DIM = 8
    QUANT_BLOCK_SIZE = 64
    FP8_MAX = 448.0
    TOKEN_DATA_SIZE = TOKEN_FP8_DIM + TOKEN_BF16_DIM * 2

    grid = (num_tokens,)

    quantize_and_insert_k_kernel[grid](
        k,
        slot_mapping,
        k_cache,
        num_tokens,
        input_dim=512,
        fp8_dim=TOKEN_FP8_DIM,
        bf16_dim=TOKEN_BF16_DIM,
        scale_dim=TOKEN_SCALE_DIM,
        quant_block=QUANT_BLOCK_SIZE,
        cache_block_size=block_size,
        token_data_size=TOKEN_DATA_SIZE,
        block_stride=block_stride,
        fp8_max=FP8_MAX,
        n_quant_blocks=8,
    )

quantize_and_insert_k_kernel ¶

quantize_and_insert_k_kernel(
    k_ptr,
    slot_mapping_ptr,
    k_cache_ptr,
    num_tokens,
    input_dim: constexpr,
    fp8_dim: constexpr,
    bf16_dim: constexpr,
    scale_dim: constexpr,
    quant_block: constexpr,
    cache_block_size: constexpr,
    token_data_size: constexpr,
    block_stride: constexpr,
    fp8_max: constexpr,
    n_quant_blocks: constexpr,
)

Quantize K tensor and insert into paged K cache.

K Cache block layout (block_size=64 tokens): - [0, 64576): Token data, each token has 448 fp8 + 128 bf16 - [64576, 64576 + 648): Scales, each token has 8 uint8 scales - [64576 + 648, block_stride): Padding

One program per token.

Source code in vllm/models/deepseek_v4/common/ops/cache_utils.py

@triton.jit
def quantize_and_insert_k_kernel(
    # Input tensors
    k_ptr,  # [num_tokens, 512] bf16
    slot_mapping_ptr,  # [num_tokens] int64
    # Output tensor
    k_cache_ptr,  # [num_blocks, block_bytes] as uint8 (flattened view)
    # Dimensions
    num_tokens,
    input_dim: tl.constexpr,  # 512
    fp8_dim: tl.constexpr,  # 448
    bf16_dim: tl.constexpr,  # 64
    scale_dim: tl.constexpr,  # 8
    quant_block: tl.constexpr,  # 64 (quantization block size)
    cache_block_size: tl.constexpr,  # 64 (paged cache block size)
    token_data_size: tl.constexpr,  # 576 bytes per token data
    block_stride: tl.constexpr,  # total bytes per block (padded)
    fp8_max: tl.constexpr,
    n_quant_blocks: tl.constexpr,  # 8 (7 real + 1 padding)
):
    """
    Quantize K tensor and insert into paged K cache.

    K Cache block layout (block_size=64 tokens):
    - [0, 64*576): Token data, each token has 448 fp8 + 128 bf16
    - [64*576, 64*576 + 64*8): Scales, each token has 8 uint8 scales
    - [64*576 + 64*8, block_stride): Padding

    One program per token.
    """
    pid = tl.program_id(0)

    if pid >= num_tokens:
        return

    # Get slot mapping
    slot_idx = tl.load(slot_mapping_ptr + pid)
    if slot_idx == -1:
        return

    block_idx = slot_idx // cache_block_size
    pos_in_block = slot_idx % cache_block_size

    # Input pointer for this token
    input_row_ptr = k_ptr + pid * input_dim

    # int64: block_idx * block_stride can exceed 2^31 with many KV-cache blocks
    # (e.g. >= 57K at block_stride ~37K). Matches gather path below.
    cache_block_ptr = k_cache_ptr + block_idx.to(tl.int64) * block_stride

    # Token data pointer: token data is stored contiguously at start of block
    # Each token's data is at offset pos_in_block * token_data_size
    token_data_ptr = cache_block_ptr + pos_in_block * token_data_size

    # Scale pointer: scales are stored after ALL token data in the block
    # Scale for this token is at offset (64 * 576) + pos_in_block * 8
    token_scale_ptr = (
        cache_block_ptr + cache_block_size * token_data_size + pos_in_block * scale_dim
    )

    # Token data layout: [0:448] fp8, [448:576] bf16
    token_fp8_ptr = token_data_ptr
    token_bf16_ptr = token_data_ptr + fp8_dim

    # ========== Quantize and store FP8 portion (first 448 elements) ==========
    # Using UE8M0 quantization strategy (scale is power of 2, stored as uint8 exponent)
    for qblock_idx in tl.static_range(n_quant_blocks):
        qblock_start = qblock_idx * quant_block

        if qblock_start < fp8_dim:
            offsets = qblock_start + tl.arange(0, quant_block)
            mask = offsets < fp8_dim

            # Load bf16 input
            x = tl.load(input_row_ptr + offsets, mask=mask, other=0.0)

            # Compute absmax scale (same as CUDA kernel)
            abs_x = tl.abs(x)
            block_max = tl.max(abs_x, axis=0)
            block_max = tl.maximum(block_max, 1e-4)  # Match CUDA: fmaxf(amax, 1e-4)

            # UE8M0: Round scale UP to next power of 2
            # scale = 2^ceil(log2(block_max / fp8_max))
            raw_scale = block_max / fp8_max
            log_scale = tl.log2(raw_scale)
            exponent = tl.ceil(log_scale)  # Round UP to next integer exponent
            scale = tl.exp2(exponent)  # scale = 2^exponent (power of 2)

            # Quantize to fp8: fp8_value = bf16_value / scale
            x_scaled = x / scale
            x_clamped = tl.clamp(x_scaled, -fp8_max, fp8_max)

            # Convert to fp8, then bitcast to uint8 for storage
            x_fp8 = x_clamped.to(tl.float8e4nv)
            x_uint8 = x_fp8.to(tl.uint8, bitcast=True)

            # Store as uint8 (1 byte each)
            tl.store(token_fp8_ptr + offsets, x_uint8, mask=mask)

            # UE8M0 scale encoding: stored_value = exponent + 127 (bias)
            # During dequant: scale = 2^(stored_value - 127)
            encoded_scale = exponent + 127.0
            encoded_scale = tl.maximum(tl.minimum(encoded_scale, 255.0), 0.0)
            tl.store(token_scale_ptr + qblock_idx, encoded_scale.to(tl.uint8))

    # Padding scale at index 7
    tl.store(token_scale_ptr + 7, tl.zeros((), dtype=tl.uint8))

    # ========== Store BF16 portion (last 64 elements, no quantization) ==========
    bf16_input_offset = fp8_dim

    # Process bf16 in chunks of 16
    bf16_out_ptr = token_bf16_ptr.to(tl.pointer_type(tl.bfloat16))
    for i in tl.static_range(bf16_dim // 16):
        chunk_offsets = i * 16 + tl.arange(0, 16)
        bf16_vals = tl.load(input_row_ptr + bf16_input_offset + chunk_offsets)
        tl.store(bf16_out_ptr + chunk_offsets, bf16_vals)