
2023/08/26 11:03
阅读数 575

TensorRT的基本使用可以参考模型部署篇 中的TensorRT 部署。


  1. TensorRT的核心在于对模型算子的优化(合并算子、利用GPU特性选择特定核函数等多种策略),通过TensorRT,能够在NVIDIA系列GPU上获得最好的性能。
  2. 因此TensorRT的模型,需要在目标GPU上以实际运行的方式来选择最优算法和配置。
  3. 因此TensorRT生成的模型只能在特定条件下运行(编译时的TensorRT版本、CUDA版本以及GPU型号)。




#include <NvInfer.h>
#include <NvInferRuntime.h>
#include <cuda_runtime.h>
#include <stdio.h>

using namespace nvinfer1;
/// Logger handed to the TensorRT builder; prints every message it receives
/// to stdout as "<numeric severity>: <message>".
class TRTLogger: public ILogger {
public:
    /// Callback invoked by TensorRT for each diagnostic message.
    /// @param severity Severity level of the message.
    /// @param msg      NUL-terminated message text.
    virtual void log(Severity severity, const char *msg) noexcept override {
        // kVERBOSE is used as the threshold; the captured output of this
        // sample shows levels 3 and 4 being printed, i.e. informational and
        // verbose messages pass the filter. Raise the bar (e.g. kWARNING)
        // to quieten the build.
        if (severity <= Severity::kVERBOSE) {
            // Severity is a scoped enum: cast explicitly so the argument
            // really is the int that "%d" expects.
            printf("%d: %s\n", static_cast<int>(severity), msg);
        }
    }
};

/// Wraps a caller-owned float array in a TensorRT Weights descriptor.
///
/// The descriptor only stores the pointer; no data is copied here, so the
/// array behind `ptr` must stay alive for as long as the returned Weights
/// is in use.
///
/// @param ptr Pointer to the first float value.
/// @param n   Number of float values reachable through `ptr`.
/// @return    Weights of type kFLOAT with `count == n` and `values == ptr`.
Weights make_weights(float* ptr, int n) {
    Weights w;
    w.count = n;
    w.type = nvinfer1::DataType::kFLOAT;
    w.values = ptr;
    return w;
}

/// Builds a tiny network (fully connected 3 -> 2 followed by a sigmoid),
/// compiles it into a TensorRT engine and writes the serialized engine to
/// "engine.trtmodel" in the current directory.
///
/// @return 0 on success, -1 if any step fails.
int main() {
    TRTLogger logger;

    IBuilder *builder = createInferBuilder(logger);
    if (builder == nullptr) {
        printf("Create builder failed.\n");
        return -1;
    }
    IBuilderConfig *config = builder->createBuilderConfig();
    // Flag value 1 == (1U << kEXPLICIT_BATCH): the batch dimension is part of
    // the tensor shape given to addInput below.
    INetworkDefinition *network = builder->createNetworkV2(1);

    const int num_input = 3;
    const int num_output = 2;
    // These arrays live on main's stack and therefore outlive the build call,
    // which is required because make_weights only stores pointers to them.
    float layer1_weight_values[] = {1.0, 2.0, 0.5, 0.1, 0.2, 0.5};
    float layer1_bias_values[] = {0.3, 0.8};

    ITensor *input = network->addInput("image", nvinfer1::DataType::kFLOAT, Dims4(1, num_input, 1, 1));
    Weights layer1_weight = make_weights(layer1_weight_values, 6);
    Weights layer1_bias = make_weights(layer1_bias_values, 2);
    auto layer1 = network->addFullyConnected(*input, num_output, layer1_weight, layer1_bias);
    auto prob = network->addActivation(*layer1->getOutput(0), ActivationType::kSIGMOID);
    // A network needs at least one tensor marked as output, otherwise the
    // builder has nothing to produce and the engine build fails.
    network->markOutput(*prob->getOutput(0));

    printf("workspace Size = %.2f MB\n", (1 << 28) / 1024.0f / 1024.0f);
    config->setMaxWorkspaceSize(1 << 28); // 256 MB
    builder->setMaxBatchSize(1);  // inference batch size is 1

    int status = -1;
    ICudaEngine *engine = builder->buildEngineWithConfig(*network, *config);
    if (engine == nullptr) {
        printf("Build engine failed.\n");
    } else {
        IHostMemory *model_data = engine->serialize();
        if (model_data == nullptr) {
            printf("Serialize engine failed.\n");
        } else {
            FILE *f = fopen("engine.trtmodel", "wb");
            if (f == nullptr) {
                printf("Open engine.trtmodel for writing failed.\n");
            } else {
                const size_t written = fwrite(model_data->data(), 1, model_data->size(), f);
                // fclose flushes buffered data, so its result matters too.
                const bool closed = (fclose(f) == 0);
                if (written == model_data->size() && closed) {
                    status = 0;
                } else {
                    printf("Write engine.trtmodel failed.\n");
                }
            }
            model_data->destroy();
        }
        engine->destroy();
    }

    // Release the build-time objects in reverse order of creation.
    network->destroy();
    config->destroy();
    builder->destroy();
    return status;
}

Makefile(我这里是在英伟达Jetson Nano JetPack 4.5上开发,TensorRT版本号为7.1.1)


# Builds every .cpp file below the current directory into one executable
# linked against TensorRT (nvinfer) and the CUDA 10.2 runtime.
#
# NOTE(review): the target lines and the definitions of EXE, INCLUDE and
# LIBPATH were missing from this listing. The values and target names below
# are reconstructed defaults (typical JetPack locations) -- confirm them for
# your system, or override on the command line: make EXE=foo INCLUDE=...
EXE     ?= main
INCLUDE ?= /usr/include/aarch64-linux-gnu
LIBPATH ?= /usr/lib/aarch64-linux-gnu

CFLAGS= -I$(INCLUDE) -I/usr/local/cuda-10.2/include
LIBS= -L$(LIBPATH) -lnvinfer -L/usr/local/cuda-10.2/lib64 -lcudart -lcublas -lstdc++fs

CXX_OBJECTS := $(patsubst %.cpp,%.o,$(shell find . -name "*.cpp"))
DEP_FILES  =$(patsubst  %.o,  %.d, $(CXX_OBJECTS))

.PHONY: all clean show

# Default target: link the executable.
all: $(EXE)

$(EXE): $(CXX_OBJECTS)
	$(CXX)  $(CXX_OBJECTS) -o $(EXE) $(LIBS)

# Compile step only; linker flags belong to the link rule above.
%.o: %.cpp
	$(CXX) -c -o $@ $(CFLAGS) $<

# Remove all build products.
clean:
	rm  -rf  $(CXX_OBJECTS)  $(DEP_FILES)  $(EXE)

# Print the list of object files derived from the discovered sources.
show:
	echo $(CXX_OBJECTS)


workspace Size = 256.00 MB
4: Applying generic optimizations to the graph for inference.
4: Original: 2 layers
4: After dead-layer removal: 2 layers
4: After Myelin optimization: 2 layers
4: After scale fusion: 2 layers
4: After vertical fusions: 2 layers
4: After final dead-layer removal: 2 layers
4: After tensor merging: 2 layers
4: After concat removal: 2 layers
4: Graph construction and optimization completed in 0.0724424 seconds.
4: Constructing optimization profile number 0 [1/1].
4: *************** Autotuning format combination: Float(1,1,1,3) -> Float(1,1,1,2) ***************
4: (Unnamed Layer* 0) [Fully Connected] (caskFullyConnectedFP32) Set Tactic Name: maxwell_sgemm_128x128_relu_nn_v1
4: (Unnamed Layer* 0) [Fully Connected] (caskFullyConnectedFP32) Set Tactic Name: maxwell_sgemm_128x64_relu_nn_v1
4: (Unnamed Layer* 0) [Fully Connected] (caskFullyConnectedFP32) Set Tactic Name: maxwell_sgemm_64x64_relu_nn_v1
4: (Unnamed Layer* 0) [Fully Connected] (caskFullyConnectedFP32) Set Tactic Name: maxwell_sgemm_32x128_relu_nn_v1
4: (Unnamed Layer* 0) [Fully Connected] (caskFullyConnectedFP32) Set Tactic Name: maxwell_sgemm_128x32_relu_nn_v1
4: --------------- Timing Runner: (Unnamed Layer* 0) [Fully Connected] (CaskFullyConnected)
4: (Unnamed Layer* 0) [Fully Connected] (caskFullyConnectedFP32) Set Tactic Name: maxwell_sgemm_128x128_relu_nn_v1
4: Tactic: 8883888914904656451 time 0.0325
4: (Unnamed Layer* 0) [Fully Connected] (caskFullyConnectedFP32) Set Tactic Name: maxwell_sgemm_128x64_relu_nn_v1
4: Tactic: 5453137127347942357 time 0.028385
4: (Unnamed Layer* 0) [Fully Connected] (caskFullyConnectedFP32) Set Tactic Name: maxwell_sgemm_64x64_relu_nn_v1
4: Tactic: 5373503982740029499 time 0.028333
4: (Unnamed Layer* 0) [Fully Connected] (caskFullyConnectedFP32) Set Tactic Name: maxwell_sgemm_32x128_relu_nn_v1
4: Tactic: 4133936625481774016 time 0.016875
4: (Unnamed Layer* 0) [Fully Connected] (caskFullyConnectedFP32) Set Tactic Name: maxwell_sgemm_128x32_relu_nn_v1
4: Tactic: 1933552664043962183 time 0.016927
4: Fastest Tactic: 4133936625481774016 Time: 0.016875
4: --------------- Timing Runner: (Unnamed Layer* 0) [Fully Connected] (CudaFullyConnected)
4: Tactic: 0 time 0.01974
4: Tactic: 1 time 0.023021
4: Tactic: 9 time 0.026927
4: Tactic: 26 time 0.019167
4: Tactic: 27 time 0.018907
4: Tactic: 48 time 0.019167
4: Tactic: 49 time 0.019844
4: Fastest Tactic: 27 Time: 0.018907
4: >>>>>>>>>>>>>>> Chose Runner Type: CaskFullyConnected Tactic: 4133936625481774016
4: (Unnamed Layer* 0) [Fully Connected] (caskFullyConnectedFP32) Set Tactic Name: maxwell_sgemm_32x128_relu_nn_v1
4: (Unnamed Layer* 0) [Fully Connected] (caskFullyConnectedFP32) Set Tactic Name: maxwell_sgemm_128x128_relu_nn_v1
4: (Unnamed Layer* 0) [Fully Connected] (caskFullyConnectedFP32) Set Tactic Name: maxwell_sgemm_128x64_relu_nn_v1
4: (Unnamed Layer* 0) [Fully Connected] (caskFullyConnectedFP32) Set Tactic Name: maxwell_sgemm_64x64_relu_nn_v1
4: (Unnamed Layer* 0) [Fully Connected] (caskFullyConnectedFP32) Set Tactic Name: maxwell_sgemm_32x128_relu_nn_v1
4: (Unnamed Layer* 0) [Fully Connected] (caskFullyConnectedFP32) Set Tactic Name: maxwell_sgemm_128x32_relu_nn_v1
4: (Unnamed Layer* 0) [Fully Connected] (caskFullyConnectedFP32) Set Tactic Name: maxwell_sgemm_32x128_relu_nn_v1
4: *************** Autotuning format combination: Float(1,1,1,2) -> Float(1,1,1,2) ***************
4: --------------- Timing Runner: (Unnamed Layer* 1) [Activation] (Activation)
4: Tactic: 0 is the only option, timing skipped
4: Fastest Tactic: 0 Time: 0
4: Formats and tactics selection completed in 0.281916 seconds.
4: After reformat layers: 2 layers
4: Block size 268435456
4: Block size 512
4: Total Activation Memory: 268435968
3: Detected 1 inputs and 1 output network tensors.
4: (Unnamed Layer* 0) [Fully Connected] (caskFullyConnectedFP32) Set Tactic Name: maxwell_sgemm_32x128_relu_nn_v1
4: Layer: (Unnamed Layer* 0) [Fully Connected] Weights: 24 HostPersistent: 384 DevicePersistent: 1536
4: Layer: (Unnamed Layer* 1) [Activation] Weights: 0 HostPersistent: 0 DevicePersistent: 0
4: Total Host Persistent Memory: 384
4: Total Device Persistent Memory: 1536
4: Total Weight Memory: 24
4: Builder timing cache: created 1 entries, 0 hit(s)
4: Engine generation completed in 11.306 seconds.
4: Engine Layer Information:
4: Layer(caskFullyConnectedFP32): (Unnamed Layer* 0) [Fully Connected], Tactic: 4133936625481774016, image[Float(3,1,1)] -> (Unnamed Layer* 0) [Fully Connected]_output[Float(2,1,1)]
4: Layer(Activation): (Unnamed Layer* 1) [Activation], Tactic: 0, (Unnamed Layer* 0) [Fully Connected]_output[Float(2,1,1)] -> (Unnamed Layer* 1) [Activation]_output[Float(2,1,1)]


点击引领话题📣 发布并加入讨论🔥
0 评论
0 收藏