import argparse
from typing import Tuple, Union

import tensorrt as trt
import tensorrt.plugin as trtp
import torch
import torch_tensorrt
import triton
import triton.language as tl

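# Verbose TensorRT logger; handy when debugging plugin registration and engine
# builds (it is not passed to a builder explicitly in this example).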
trt_logger = trt.Logger(trt.Logger.VERBOSE)


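# Triton kernel: each program instance adds 1 to a BLOCK_SIZE-wide slice of the
# flattened input; the mask guards out-of-bounds lanes in the final block.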
@triton.jit
def add_one_kernel(x_ptr, n_elements, y_ptr, BLOCK_SIZE: tl.constexpr):
    pid = tl.program_id(0)
    block_start = pid * BLOCK_SIZE
    offsets = block_start + tl.arange(0, BLOCK_SIZE)
    mask = offsets < n_elements
    x = tl.load(x_ptr + offsets, mask=mask)
    output = x + 1
    tl.store(y_ptr + offsets, output, mask=mask)


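# Expose the Triton launch as a PyTorch custom op so it shows up in traced
# graphs as torch.ops.my.add_one.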
@torch.library.custom_op("my::add_one", mutates_args=())  # type: ignore[misc]
def add_one(X: torch.Tensor) -> torch.Tensor:
    # Ensure the tensor is on the GPU
    assert X.is_cuda

    # Create output tensor
    Y = torch.empty_like(X)

    # Define block size
    BLOCK_SIZE = 256

    # Grid of programs
    grid = lambda meta: (triton.cdiv(X.numel(), meta["BLOCK_SIZE"]),)

    # Launch the kernel
    add_one_kernel[grid](X, X.numel(), Y, BLOCK_SIZE=BLOCK_SIZE)

    return Y


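# Fake (meta) implementation: lets torch.compile/export infer output shape and
# dtype without launching the kernel.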
@torch.library.register_fake("my::add_one")
def _(X: torch.Tensor) -> torch.Tensor:
    # Custom ops may not return an alias of an input, so the fake
    # implementation must allocate a fresh tensor rather than return X itself.
    return torch.empty_like(X)


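# TensorRT plugin registration: the plugin's single output has the same shape
# and dtype as its input.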
@trtp.register("my::add_one")
def add_plugin_desc(X: trtp.TensorDesc) -> Tuple[trtp.TensorDesc]:
    return X.like()


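# Ahead-of-time plugin implementation: compile the Triton kernel to PTX once at
# engine build time and hand TensorRT the kernel name, PTX, launch dims, and
# the extra scalar arguments it must pass at runtime.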
@trtp.aot_impl("my::add_one")
def add_plugin_aot_impl(
    X: trtp.TensorDesc, outputs: Tuple[trtp.TensorDesc], tactic: int
) -> Tuple[
    Union[str, bytes], Union[str, bytes], trtp.KernelLaunchParams, trtp.SymExprs
]:
    type_str = "fp32" if X.dtype == trt.float32 else "fp16"

    block_size = 256
    src = triton.compiler.ASTSource(
        fn=add_one_kernel,
        signature={
            "x_ptr": f"*{type_str}",
            "n_elements": "i32",
            "y_ptr": f"*{type_str}",
            "BLOCK_SIZE": "constexpr",
        },
        constants={
            "BLOCK_SIZE": block_size,
        },
    )

    compiled_kernel = triton.compile(src)

    N = X.shape_expr.numel()
    launch_params = trtp.KernelLaunchParams()

    # grid dims
    launch_params.grid_x = trtp.cdiv(N, block_size)
    # block dims
    launch_params.block_x = compiled_kernel.metadata.num_warps * 32
    # shared memory
    launch_params.shared_mem = compiled_kernel.metadata.shared

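    # Extra scalar kernel arguments (here n_elements as a symbolic i32); the
    # tensor pointer arguments are bound by TensorRT itself.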
    extra_args = trtp.SymIntExprs(1)
    extra_args[0] = trtp.SymInt32(N)

    return (
        compiled_kernel.metadata.name,
        compiled_kernel.asm["ptx"],
        launch_params,
        extra_args,
    )


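# Auto-generate a Torch-TensorRT converter for my::add_one so compilation maps
# the op onto the plugin above, preferring the AOT implementation when present.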
torch_tensorrt.dynamo.conversion.plugins.generate_plugin_converter(
    "my::add_one",
    supports_dynamic_shapes=False,
    requires_output_allocator=False,
    use_aot_if_available=True,
)


class MyModel(torch.nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, X: torch.Tensor) -> torch.Tensor:
        res = torch.ops.my.add_one.default(X)

        return res


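# Demo: sanity-check the op in eager mode, compile with Torch-TensorRT, then
# verify the TensorRT engine agrees with eager execution.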
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--aot", action="store_true", help="Try to use AOT compilation", default=False
    )
    # Note: the converter above is registered at import time with
    # use_aot_if_available=True, so this flag is parsed but not wired up yet.
    args = parser.parse_args()

    my_model = MyModel().to("cuda")
    m = torch.full((64, 64), 2, device="cuda", dtype=torch.float)

    # Sanity-check the eager custom op: 2 + 1 == 3
    assert my_model(X=m)[0][0] == 3.0

    with torch_tensorrt.logging.debug():
        trt_inputs = [m]
        model_trt = torch_tensorrt.compile(
            my_model,
            inputs=trt_inputs,
            debug=True,
            min_block_size=1,
        )
        print("Model compiled successfully!")
        print("Running inference with compiled model...")
        for _ in range(10):
            res = model_trt(m)
            assert torch.allclose(res, my_model(m)), "Results do not match!"

    print("Inference successful!")