
Commit b1fea92

add tensorrt support.

1 parent 3c1dc1d

File tree: 4 files changed, +313 −0 lines
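The four files form a conversion pipeline: torch_to_onnx.py exports the trained SiameseNetwork checkpoint to ONNX, onnx_to_trt.py builds a TensorRT engine from the ONNX model, and infer_tensorrt.py / eval_tensorrt.py run single-pair inference and full validation-set evaluation against that engine.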


eval_tensorrt.py

Lines changed: 125 additions & 0 deletions
import os
import argparse

import numpy as np
import matplotlib.pyplot as plt

import tensorrt as trt
import pycuda.autoinit
import pycuda.driver as cuda

import torch
from torch.utils.data import DataLoader
from torchvision import transforms

from libs.dataset import Dataset

# logger to capture errors, warnings, and other information during the build and inference phases
TRT_LOGGER = trt.Logger()

if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    parser.add_argument(
        '-v',
        '--val_path',
        type=str,
        help="Path to directory containing validation dataset.",
        required=True
    )
    parser.add_argument(
        '-o',
        '--out_path',
        type=str,
        help="Path for saving prediction images.",
        required=True
    )
    parser.add_argument(
        '--engine',
        type=str,
        help="Path to tensorrt engine generated by 'onnx_to_trt.py'.",
        required=True
    )

    args = parser.parse_args()

    os.makedirs(args.out_path, exist_ok=True)

    val_dataset = Dataset(args.val_path, shuffle_pairs=False, augment=False)
    val_dataloader = DataLoader(val_dataset, batch_size=1)

    criterion = torch.nn.BCELoss()

    # deserialize the engine produced by 'onnx_to_trt.py'
    with open(args.engine, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
        engine = runtime.deserialize_cuda_engine(f.read())
        context = engine.create_execution_context()

    # allocate device buffers: two inputs (one per image of the pair) and one output
    device_input1, device_input2 = [None] * 2
    for binding in engine:
        if engine.binding_is_input(binding):
            input_shape = engine.get_binding_shape(binding)
            input_size = trt.volume(input_shape) * engine.max_batch_size * np.dtype(np.float32).itemsize  # in bytes
            if device_input1 is None:
                device_input1 = cuda.mem_alloc(input_size)
            elif device_input2 is None:
                device_input2 = cuda.mem_alloc(input_size)
            else:
                raise Exception("Network expects more than 2 inputs.")
        else:
            output_shape = engine.get_binding_shape(binding)
            # create page-locked memory buffers (i.e. won't be swapped to disk)
            host_output = cuda.pagelocked_empty(trt.volume(output_shape) * engine.max_batch_size, dtype=np.float32)
            device_output = cuda.mem_alloc(host_output.nbytes)

    # create a stream in which to copy inputs/outputs and run inference
    stream = cuda.Stream()

    losses = []
    correct = 0
    total = 0

    # undo the ImageNet normalization applied by the dataset so the images can be plotted
    inv_transform = transforms.Compose([
        transforms.Normalize(mean=[0., 0., 0.], std=[1/0.229, 1/0.224, 1/0.225]),
        transforms.Normalize(mean=[-0.485, -0.456, -0.406], std=[1., 1., 1.]),
    ])

    for i, ((img1, img2), y, (class1, class2)) in enumerate(val_dataloader):
        print("[{} / {}]".format(i, len(val_dataloader)))

        class1 = class1[0]
        class2 = class2[0]

        # copy the image pair to device memory
        cuda.memcpy_htod_async(device_input1, img1.numpy().astype(np.float32), stream)
        cuda.memcpy_htod_async(device_input2, img2.numpy().astype(np.float32), stream)

        # run inference
        context.execute_async(bindings=[int(device_input1), int(device_input2), int(device_output)], stream_handle=stream.handle)
        cuda.memcpy_dtoh_async(host_output, device_output, stream)
        stream.synchronize()

        # postprocess results
        prob = torch.Tensor(host_output).reshape(engine.max_batch_size, output_shape[0])

        loss = criterion(prob, y)

        losses.append(loss.item())
        correct += torch.count_nonzero(y == (prob > 0.5)).item()
        total += len(y)

        fig = plt.figure("class1={}\tclass2={}".format(class1, class2), figsize=(4, 2))
        plt.suptitle("cls1={} conf={:.2f} cls2={}".format(class1, prob[0][0].item(), class2))

        img1 = inv_transform(img1).cpu().numpy()[0]
        img2 = inv_transform(img2).cpu().numpy()[0]

        # show the first image
        ax = fig.add_subplot(1, 2, 1)
        plt.imshow(img1[0], cmap=plt.cm.gray)
        plt.axis("off")

        # show the second image
        ax = fig.add_subplot(1, 2, 2)
        plt.imshow(img2[0], cmap=plt.cm.gray)
        plt.axis("off")

        # save the plot
        plt.savefig(os.path.join(args.out_path, '{}.png'.format(i)))

    print("Validation: Loss={:.2f}\t Accuracy={:.2f}".format(sum(losses) / len(losses), correct / total))
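Example invocation (dataset, output, and engine paths are hypothetical):

python eval_tensorrt.py --val_path data/val --out_path outputs/ --engine siamese.engine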

infer_tensorrt.py

Lines changed: 93 additions & 0 deletions
import argparse

import torch
import numpy as np
from PIL import Image
import tensorrt as trt
import pycuda.autoinit
import pycuda.driver as cuda
from torchvision import transforms

# logger to capture errors, warnings, and other information during the build and inference phases
TRT_LOGGER = trt.Logger()

feed_shape = (224, 224)

transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    transforms.Resize(feed_shape)
])

def preprocess(filename1, filename2):
    # load both images, apply the same normalization used during training, and convert to float32 arrays
    image1 = Image.open(filename1).convert("RGB")
    image2 = Image.open(filename2).convert("RGB")

    image1 = transform(image1).float()
    image2 = transform(image2).float()

    return image1.numpy().astype(np.float32), image2.numpy().astype(np.float32)

if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    parser.add_argument(
        '--image1',
        type=str,
        help="Path to first image of the pair.",
        required=True
    )
    parser.add_argument(
        '--image2',
        type=str,
        help="Path to second image of the pair.",
        required=True
    )
    parser.add_argument(
        '--engine',
        type=str,
        help="Path to tensorrt engine generated by 'onnx_to_trt.py'.",
        required=True
    )

    args = parser.parse_args()

    with open(args.engine, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
        engine = runtime.deserialize_cuda_engine(f.read())
        context = engine.create_execution_context()

    device_input1, device_input2 = [None] * 2
    for binding in engine:
        if engine.binding_is_input(binding):  # we expect two inputs, one per image of the pair
            input_shape = engine.get_binding_shape(binding)
            input_size = trt.volume(input_shape) * engine.max_batch_size * np.dtype(np.float32).itemsize  # in bytes
            if device_input1 is None:
                device_input1 = cuda.mem_alloc(input_size)
            elif device_input2 is None:
                device_input2 = cuda.mem_alloc(input_size)
            else:
                raise Exception("Network expects more than 2 inputs.")
        else:  # and one output
            output_shape = engine.get_binding_shape(binding)
            # create page-locked memory buffers (i.e. won't be swapped to disk)
            host_output = cuda.pagelocked_empty(trt.volume(output_shape) * engine.max_batch_size, dtype=np.float32)
            device_output = cuda.mem_alloc(host_output.nbytes)

    # create a stream in which to copy inputs/outputs and run inference
    stream = cuda.Stream()

    # preprocess input data
    host_input = preprocess(args.image1, args.image2)
    cuda.memcpy_htod_async(device_input1, host_input[0], stream)
    cuda.memcpy_htod_async(device_input2, host_input[1], stream)

    # run inference
    context.execute_async(bindings=[int(device_input1), int(device_input2), int(device_output)], stream_handle=stream.handle)
    cuda.memcpy_dtoh_async(host_output, device_output, stream)
    stream.synchronize()

    # postprocess results
    output_data = torch.Tensor(host_output).reshape(engine.max_batch_size, output_shape[0])
    print(f"Similarity between the two images = {round(output_data[0][0].item(), 2)}")
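Example invocation (image and engine paths are hypothetical):

python infer_tensorrt.py --image1 a.png --image2 b.png --engine siamese.engine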

onnx_to_trt.py

Lines changed: 54 additions & 0 deletions
import argparse
import tensorrt as trt

# logger to capture errors, warnings, and other information during the build and inference phases
TRT_LOGGER = trt.Logger()

if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    parser.add_argument(
        '--onnx',
        type=str,
        help="Path of onnx model generated by 'torch_to_onnx.py'.",
        required=True
    )
    parser.add_argument(
        '--engine',
        type=str,
        help="Path for saving tensorrt engine.",
        required=True
    )

    args = parser.parse_args()

    onnx_file_path = args.onnx
    # initialize TensorRT engine and parse ONNX model
    EXPLICIT_BATCH = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)

    builder = trt.Builder(TRT_LOGGER)
    network = builder.create_network(EXPLICIT_BATCH)
    onnx_parser = trt.OnnxParser(network, TRT_LOGGER)

    # parse ONNX
    with open(onnx_file_path, 'rb') as model:
        print('Beginning ONNX file parsing')
        if not onnx_parser.parse(model.read()):
            for i in range(onnx_parser.num_errors):
                print(onnx_parser.get_error(i))
            raise RuntimeError("Failed to parse the ONNX file.")
    print('Completed parsing of ONNX file')

    # allow TensorRT to use up to 1GB of GPU memory for tactic selection
    builder.max_workspace_size = 1 << 30
    # we have only one image pair in a batch
    builder.max_batch_size = 1
    # use FP16 mode if possible
    if builder.platform_has_fast_fp16:
        builder.fp16_mode = True

    # generate TensorRT engine optimized for the target platform
    print('Building an engine...')
    engine = builder.build_cuda_engine(network)
    if engine is None:
        raise RuntimeError("Engine build failed.")
    print("Completed creating Engine")

    with open(args.engine, 'wb') as f:
        f.write(engine.serialize())
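Example invocation (file names are hypothetical):

python onnx_to_trt.py --onnx siamese.onnx --engine siamese.engine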

torch_to_onnx.py

Lines changed: 41 additions & 0 deletions
import argparse

import onnx
import torch

from siamese import SiameseNetwork

if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    parser.add_argument(
        '-c',
        '--checkpoint',
        type=str,
        help="Path of model checkpoint to be used for inference.",
        required=True
    )
    parser.add_argument(
        '-o',
        '--out_path',
        type=str,
        help="Path for saving the onnx model.",
        required=True
    )

    args = parser.parse_args()

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # restore the trained siamese network from the checkpoint
    checkpoint = torch.load(args.checkpoint)
    model = SiameseNetwork(backbone=checkpoint['backbone'])
    model.to(device)
    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()

    # export with a dummy image pair; the network takes two (1, 3, 224, 224) inputs
    torch.onnx.export(
        model,
        (torch.rand(1, 3, 224, 224).to(device), torch.rand(1, 3, 224, 224).to(device)),
        args.out_path,
        input_names=['input1', 'input2'],
        output_names=['output'],
        export_params=True
    )

    # verify that the exported model is well-formed
    onnx_model = onnx.load(args.out_path)
    onnx.checker.check_model(onnx_model)
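Taken together with the scripts above, an end-to-end conversion might look like this (checkpoint and file names are hypothetical):

python torch_to_onnx.py -c checkpoints/best.pth -o siamese.onnx
python onnx_to_trt.py --onnx siamese.onnx --engine siamese.engine
python infer_tensorrt.py --image1 a.png --image2 b.png --engine siamese.engine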
