文档章节

使用TensorRT对caffe和pytorch onnx模型进行fp32和fp16推理

kezunlin
 kezunlin
发布于 11/20 10:22
字数 1452
阅读 46
收藏 0

【推荐】2019 Java 开发者跳槽指南.pdf(吐血整理) >>>

本文首发于个人博客https://kezunlin.me/post/bcdfb73c/,欢迎阅读最新内容!

tensorrt fp32 fp16 tutorial with caffe pytorch minist model

Series

Code Example

include headers

#include <assert.h>
#include <sys/stat.h>
#include <time.h>

#include <iostream>
#include <fstream>
#include <sstream>
#include <iomanip>
#include <cmath>
#include <algorithm>

#include <cuda_runtime_api.h>

#include "NvCaffeParser.h"
#include "NvOnnxConfig.h"
#include "NvOnnxParser.h"
#include "NvInfer.h"
#include "common.h"

using namespace nvinfer1;
using namespace nvcaffeparser1;

static Logger gLogger;

// Attributes of MNIST Caffe model
static const int INPUT_H = 28;
static const int INPUT_W = 28;
static const int OUTPUT_SIZE = 10;
//const char* INPUT_BLOB_NAME = "data";
const char* OUTPUT_BLOB_NAME = "prob";
const std::string mnist_data_dir = "data/mnist/";


// Simple PGM (portable greyscale map) reader
void readPGMFile(const std::string& fileName, uint8_t buffer[INPUT_H * INPUT_W])
{
    readPGMFile(fileName, buffer, INPUT_H, INPUT_W);
}

caffe model to tensorrt

void caffeToTRTModel(const std::string& deployFilepath,       // Path of Caffe prototxt file
                     const std::string& modelFilepath,        // Path of Caffe model file
                     const std::vector<std::string>& outputs, // Names of network outputs
                     unsigned int maxBatchSize,               // Note: Must be at least as large as the batch we want to run with
                     IHostMemory*& trtModelStream)            // Output buffer for the TRT model
{
    // Create builder
    IBuilder* builder = createInferBuilder(gLogger);

    // Parse caffe model to populate network, then set the outputs
    std::cout << "Reading Caffe prototxt: " << deployFilepath << "\n";
    std::cout << "Reading Caffe model: " << modelFilepath << "\n";
    INetworkDefinition* network = builder->createNetwork();
    ICaffeParser* parser = createCaffeParser();

    bool useFp16 = builder->platformHasFastFp16();
    std::cout << "platformHasFastFp16: " << useFp16 << "\n";

    bool useInt8 = builder->platformHasFastInt8();
    std::cout << "platformHasFastInt8: " << useInt8 << "\n";

    // create a 16-bit model if it's natively supported
    DataType modelDataType = useFp16 ? DataType::kHALF : DataType::kFLOAT; 
    
    const IBlobNameToTensor* blobNameToTensor = parser->parse(deployFilepath.c_str(),
                                                              modelFilepath.c_str(),
                                                              *network,
                                                              modelDataType);
    // Specify output tensors of network
    // ERROR: Network must have at least one output
    for (auto& s : outputs){
        std::cout<<"output = "<< s.c_str() << std::endl;
        network->markOutput(*blobNameToTensor->find(s.c_str())); // prob
    } 

    builder->setMaxBatchSize(maxBatchSize);
    builder->setMaxWorkspaceSize(1 << 20);

    // set up the network for paired-fp16 format if available
    if(useFp16)
        builder->setFp16Mode(true);

    // Build engine
    ICudaEngine* engine = builder->buildCudaEngine(*network);
    assert(engine);

    // Destroy parser and network
    network->destroy();
    parser->destroy();

    // Serialize engine and destroy it
    trtModelStream = engine->serialize();
    engine->destroy();
    builder->destroy();

    //shutdownProtobufLibrary();
}

pytorch onnx to tensorrt

void onnxToTRTModel( const std::string& modelFilepath,        // name of the onnx model 
                     unsigned int maxBatchSize,            // batch size - NB must be at least as large as the batch we want to run with
                     IHostMemory *&trtModelStream)      // output buffer for the TensorRT model
{
    // create the builder
    IBuilder* builder = createInferBuilder(gLogger);

    nvonnxparser::IOnnxConfig* config = nvonnxparser::createONNXConfig();
    config->setModelFileName(modelFilepath.c_str());
    
    nvonnxparser::IONNXParser* parser = nvonnxparser::createONNXParser(*config);
    
    //Optional - uncomment below lines to view network layer information
    //config->setPrintLayerInfo(true);
    //parser->reportParsingInfo();
    
    if (!parser->parse(modelFilepath.c_str(), DataType::kFLOAT))
    {
        string msg("failed to parse onnx file");
        gLogger.log(nvinfer1::ILogger::Severity::kERROR, msg.c_str());
        exit(EXIT_FAILURE);
    }
    
    if (!parser->convertToTRTNetwork()) {
        string msg("ERROR, failed to convert onnx network into TRT network");
        gLogger.log(nvinfer1::ILogger::Severity::kERROR, msg.c_str());
        exit(EXIT_FAILURE);
    }
    nvinfer1::INetworkDefinition* network = parser->getTRTNetwork();
    
    // Build the engine
    builder->setMaxBatchSize(maxBatchSize);
    builder->setMaxWorkspaceSize(1 << 20);

    ICudaEngine* engine = builder->buildCudaEngine(*network);
    assert(engine);

    // we don't need the network any more, and we can destroy the parser
    network->destroy();
    parser->destroy();

    // serialize the engine, then close everything down
    trtModelStream = engine->serialize();
    engine->destroy();
    builder->destroy();

    //shutdownProtobufLibrary();
}

do inference

void doInference(IExecutionContext& context, float* input, float* output, int batchSize)
{
    const ICudaEngine& engine = context.getEngine();
    // Pointers to input and output device buffers to pass to engine.
    // Engine requires exactly IEngine::getNbBindings() number of buffers.
    assert(engine.getNbBindings() == 2);
    void* buffers[2];

    // In order to bind the buffers, we need to know the names of the input and output tensors.
    // Note that indices are guaranteed to be less than IEngine::getNbBindings()
    int inputIndex, outputIndex;

    printf("Bindings after deserializing:\n");
    for (int bi = 0; bi < engine.getNbBindings(); bi++) 
    {
        if (engine.bindingIsInput(bi) == true) 
        {
            inputIndex = bi;
            printf("Binding %d (%s): Input.\n",  bi, engine.getBindingName(bi));
        } else 
        {
            outputIndex = bi;
            printf("Binding %d (%s): Output.\n", bi, engine.getBindingName(bi));
        }
    }

    //const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME);
    //const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME);

    std::cout<<"inputIndex = "<< inputIndex << std::endl; // 0   data
    std::cout<<"outputIndex = "<< outputIndex << std::endl; // 1  prob

    // Create GPU buffers on device
    CHECK(cudaMalloc(&buffers[inputIndex], batchSize * INPUT_H * INPUT_W * sizeof(float)));
    CHECK(cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float)));

    // Create stream
    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));

    // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
    CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream));
    context.enqueue(batchSize, buffers, stream, nullptr);
    CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);

    // Release stream and buffers
    cudaStreamDestroy(stream);
    CHECK(cudaFree(buffers[inputIndex]));
    CHECK(cudaFree(buffers[outputIndex]));
}

save and load engine

void SaveEngine(const nvinfer1::IHostMemory& trtModelStream, const std::string& engine_filepath)
{
    std::ofstream file;
    file.open(engine_filepath, std::ios::binary | std::ios::out);
    if(!file.is_open())
    {
        std::cout << "read create engine file" << engine_filepath <<" failed" << std::endl;
        return;
    }
    file.write((const char*)trtModelStream.data(), trtModelStream.size());
    file.close();
};


ICudaEngine* LoadEngine(IRuntime& runtime, const std::string& engine_filepath)
{
    ifstream file;
    file.open(engine_filepath, ios::binary | ios::in);
    file.seekg(0, ios::end); 
    int length = file.tellg();         
    file.seekg(0, ios::beg); 

    std::shared_ptr<char> data(new char[length], std::default_delete<char[]>());
    file.read(data.get(), length);
    file.close();

    // runtime->deserializeCudaEngine(trtModelStream->data(), trtModelStream->size(), nullptr);
    ICudaEngine* engine = runtime.deserializeCudaEngine(data.get(), length, nullptr);
    assert(engine != nullptr);
    return engine;
}

example

void demo_save_caffe_to_trt(const std::string& engine_filepath)
{
    std::string deploy_filepath = mnist_data_dir + "mnist.prototxt";
    std::string model_filepath = mnist_data_dir + "mnist.caffemodel";
    
     // Create TRT model from caffe model and serialize it to a stream
    IHostMemory* trtModelStream{nullptr};
    caffeToTRTModel(deploy_filepath, model_filepath, std::vector<std::string>{OUTPUT_BLOB_NAME}, 1, trtModelStream);
    assert(trtModelStream != nullptr);

    SaveEngine(*trtModelStream, engine_filepath);

    // destroy stream
    trtModelStream->destroy();
}


void demo_save_onnx_to_trt(const std::string& engine_filepath)
{
    std::string onnx_filepath = mnist_data_dir + "mnist.onnx";
    
     // Create TRT model from caffe model and serialize it to a stream
    IHostMemory* trtModelStream{nullptr};
    onnxToTRTModel(onnx_filepath, 1, trtModelStream);
    assert(trtModelStream != nullptr);

    SaveEngine(*trtModelStream, engine_filepath);

    // destroy stream
    trtModelStream->destroy();
}


int mnist_demo()
{
    bool use_caffe = false; 
    std::string engine_filepath;
    if (use_caffe){
        engine_filepath = "cfg/mnist/caffe_minist_fp32.trt";
        demo_save_caffe_to_trt(engine_filepath);
    } else {
        engine_filepath = "cfg/mnist/onnx_minist_fp32.trt";
        demo_save_onnx_to_trt(engine_filepath);
    }
    std::cout<<"[API] Save engine to "<< engine_filepath <<std::endl;

    //if (watrix::algorithm::FilesystemUtil::not_exists(engine_filepath)){
    
    const int num = 6;
    std::string digit_filepath = mnist_data_dir + std::to_string(num) + ".pgm";

     // Read a digit file
    uint8_t fileData[INPUT_H * INPUT_W];
    readPGMFile(digit_filepath, fileData);
    float data[INPUT_H * INPUT_W];

    if (use_caffe){

        std::string mean_filepath = mnist_data_dir + "mnist_mean.binaryproto";
        // Parse mean file
        ICaffeParser* parser = createCaffeParser();
        IBinaryProtoBlob* meanBlob = parser->parseBinaryProto(mean_filepath.c_str());
        parser->destroy();

        // Subtract mean from image
        const float* meanData = reinterpret_cast<const float*>(meanBlob->getData()); // size 786

        for (int i = 0; i < INPUT_H * INPUT_W; i++)
            data[i] = float(fileData[i]) - meanData[i];
        
        meanBlob->destroy();
    } else {

        for (int i = 0; i < INPUT_H * INPUT_W; i++)
            data[i] = 1.0 - float(fileData[i]/255.0);
    }
    

    // Deserialize engine we serialized earlier
    IRuntime* runtime = createInferRuntime(gLogger);
    assert(runtime != nullptr);

    std::cout<<"[API] Load engine from "<< engine_filepath <<std::endl;
    ICudaEngine* engine = LoadEngine(*runtime, engine_filepath);
    assert(engine != nullptr);
    
    IExecutionContext* context = engine->createExecutionContext();
    assert(context != nullptr);

    // Run inference on input data
    float prob[OUTPUT_SIZE];
    doInference(*context, data, prob, 1);

    // Destroy the engine
    context->destroy();
    engine->destroy();
    runtime->destroy();

    // Print histogram of the output distribution
    std::cout << "\nOutput:\n\n";

    // for onnx,we get z as output, we need to use softmax to get probs
    if ( !use_caffe){

        //Calculate Softmax
        float sum{0.0f};
        for(int i = 0; i < OUTPUT_SIZE; i++)
        {
            prob[i] = exp(prob[i]);
            sum += prob[i];
        }
        for(int i = 0; i < OUTPUT_SIZE; i++)
        {
            prob[i] /= sum;
        }
    }
    
    // find max probs
    float val{0.0f};
    int idx{0};
    for (unsigned int i = 0; i < 10; i++)
    {
        val = std::max(val, prob[i]);
        if (val == prob[i]) {
            idx = i;
        }
        cout << " Prob " << i << "  "<< std::fixed << std::setw(5) << std::setprecision(4) << prob[i];
        std::cout << i << ": " << std::string(int(std::floor(prob[i] * 10 + 0.5f)), '*') << "\n";
    }
    std::cout << std::endl;

    return (idx == num && val > 0.9f) ? EXIT_SUCCESS : EXIT_FAILURE;
}


int main(int argc, char** argv)
{
    mnist_demo();
    return 0;
}

results

./bin/sample_mnist 
[API] Save engine to cfg/mnist/onnx_minist_fp32.trt
[API] Load engine from cfg/mnist/onnx_minist_fp32.trt
Bindings after deserializing:
Binding 0 (Input3): Input.
Binding 1 (Plus214_Output_0): Output.
inputIndex = 0
outputIndex = 1

Output:

 Prob 0  0.00000: 
 Prob 1  0.00001: 
 Prob 2  0.00002: 
 Prob 3  0.00003: 
 Prob 4  0.00004: 
 Prob 5  0.00005: 
 Prob 6  1.00006: **********
 Prob 7  0.00007: 
 Prob 8  0.00008: 
 Prob 9  0.00009: 

Reference

History

  • 20190422 created.

Copyright

© 著作权归作者所有

kezunlin
粉丝 0
博文 62
码字总数 55214
作品 0
东城
程序员
私信 提问
使用ONNX+TensorRT部署人脸检测和关键点250fps

使用ONNX+TensorRT部署人脸检测和关键点250fps This article was original written by Jin Tian, welcome re-post, first come with https://jinfagang.github.io . but please keep this c......

LucasJin
10/15
0
0
整合PyTorch 0.4和Caffe 2,PyTorch 1.0能挑战TensorFlow吗?

 译者 | 梁红丽 编辑 | Mavis 出品 | AI科技大本营(公众号ID:rgznai100) 【AI 科技大本营导读】5月2日,在加利福尼亚州举办的年度开发者 F8 大会上,Facebook 正式推出 PyTorch 1.0 ...

dqcfkyqdxym3f8rb0
2018/05/04
0
0
融合 Caffe2、ONNX 的新版 PyTorch 发布在即,能否赶超 TensorFlow?

雷锋网(公众号:雷锋网) AI 研习社按,上个月,Caffe2 代码正式并入 PyTorch,就在今天,Facebook AI 系统与平台部(AI Infra and Platform)副总 Bill Jia 发文表示,PyTorch 1.0 发布在即,...

思颖
2018/05/03
0
0
NVIDIA深度学习Tensor Core全面解析(下篇)

深度学习的基准测试 雷锋网(公众号:雷锋网)消息,在《NVIDIA深度学习Tensor Core全面解析(上篇)》中,我们从硬件上分析了Titan V的Volta核心,本篇将通过多项测试来考验Volta架构,利用各...

任然
2018/09/08
0
0
Caffe2 代码全部并入 PyTorch:深度学习框架格局剧震

昨日,Caffe2 的 Github 页面突然出现了一个「巨大的改动」:Caffe2 开源代码正式并入 PyTorch,至此,Facebook 主力支持的两大深度学习框架已合二为一。这两大框架,在整个深度学习框架格局...

周其
2018/04/02
6.2K
5

没有更多内容

加载失败,请刷新页面

加载更多

如何管stderr,而不是stdout?

我有一个要写入信息的程序stdout和stderr ,我需要grep通过什么是未来标准错误 ,而忽视标准输出 。 我当然可以分2步完成: command > /dev/null 2> temp.filegrep 'something' temp.file...

技术盛宴
19分钟前
4
0
centos7.5上通过docker安装并运行mysql5.7

1. docker pull mysql:5.7 2. docker run --name mysql5.7 -p 3306:3306 -e MYSQL_ROOT_PASSWORD=123456 -d mysql:5.7...

Ryub
23分钟前
5
0
什么是比赛条件?

在编写多线程应用程序时,遇到的最常见问题之一是竞争条件。 我对社区的问题是: 什么是比赛条件? 您如何检测到它们? 您如何处理它们? 最后,如何防止它们发生? #1楼 当设备或系统试图同...

javail
34分钟前
5
0
SpringMVC源码分析-DispatcherServlet-init方法分析

上一篇:SpringMVC源码分析-DispatcherServlet实例化干了些什么 先吐槽一下。。。写了两小时的博客突然被俺家小屁孩按了刷新,东西不见了,建议OSCHINA能够自动定时保存啊。让我先安静一下。...

特拉仔
42分钟前
5
0
python协程 生成器

协程,又称微线程,纤程。英文名Coroutine。 线程是系统级别的它们由操作系统调度,而协程则是程序级别的由程序根据需要自己调度。在一个线程中会有很多函数,我们把这些函数称为子程序,在子...

沙门行道
52分钟前
5
0

没有更多内容

加载失败,请刷新页面

加载更多

返回顶部
顶部