c++ boost compute opencl调用

原创
2020/05/13 15:51
阅读数 1.2K

概述

     在大量数据的计算中,CPU 越来越疲于应对:例如对几百万条数据进行排序后再做批量处理,或者对几百万像素甚至更多的图像做处理,CPU 在这类运算上已经力不从心,而使用 GPU 等设备进行运算可以有效地解决此类问题。通过 boost::compute,传统 C++ 开发者可以更加轻松地切换到 OpenCL 编程,极大地降低了开发难度。

 

例子1

    

// Boost.Compute configuration macros. The Boost.Compute headers read these
// while they are being included, so they must be defined BEFORE the first
// Boost.Compute #include; defining them afterwards leaves the headers unaffected.
// NOTE(review): BOOST_COMPUTE_USE_OFFLINE_CACHE is expected to pull in extra
// Boost libraries at link time (e.g. Boost.Filesystem) - confirm against the build.
#define BOOST_COMPUTE_DEBUG_KERNEL_COMPILATION
#define BOOST_COMPUTE_HAVE_THREAD_LOCAL
#define BOOST_COMPUTE_THREAD_SAFE
#define BOOST_COMPUTE_USE_OFFLINE_CACHE

// Standard headers used directly by this example
// (std::sort/std::generate, std::chrono, rand, std::cout, std::vector).
#include <algorithm>
#include <chrono>
#include <cstdlib>
#include <iostream>
#include <vector>

#include <boost/compute/core.hpp>
#include <boost/compute/device.hpp>
#include <boost/compute/container/vector.hpp>
#include <boost/compute/algorithm/copy.hpp>
#include <boost/compute/algorithm/transform.hpp>
#include <boost/compute/algorithm/sort.hpp>

namespace compute = boost::compute;

// Prints one line per device returned by boost::compute::system::devices():
// the device name, its clock frequency and its compute-unit count,
// separated by single spaces.
void listDevice() {
    const auto available = boost::compute::system::devices();
    for (const auto &dev : available) {
        std::cout << dev.name() << " "
                  << dev.clock_frequency() << " "
                  << dev.compute_units() << "\n";
    }
}

// CPU baseline: fills 5,000,000 floats with rand() values, sorts them with
// std::sort and prints the elapsed sort time in milliseconds.
void testStd() {
    using clock_type = std::chrono::high_resolution_clock;

    std::vector<float> values(5000000);
    for (float &value : values) {
        value = static_cast<float>(rand());
    }

    // Only the sort is timed; generating the data is excluded.
    const auto started = clock_type::now();
    std::sort(values.begin(), values.end());
    const auto elapsed = clock_type::now() - started;

    const auto elapsed_ms =
            std::chrono::duration_cast<std::chrono::milliseconds>(elapsed).count();
    std::cout << "test std Sort=" << elapsed_ms << "\n";
}

// GPU benchmark: copies 5,000,000 random floats to the default OpenCL device,
// sorts them there with compute::sort and copies them back, printing the
// elapsed time in milliseconds for each of the three phases.
void testCompute() {
    // get default device and setup context
    compute::device device = compute::system::default_device();
    compute::context context(device);
    compute::command_queue queue(context, device);

    // generate random data on the host
    std::vector<float> host_vector(5000000);
    std::generate(host_vector.begin(), host_vector.end(), rand);

    // Phase 1: allocate the device buffer and upload the host data.
    auto begin = std::chrono::high_resolution_clock::now();
    compute::vector<float> device_vector(host_vector.size(), context);

    compute::copy(
            host_vector.begin(), host_vector.end(), device_vector.begin(), queue
    );
    auto end = std::chrono::high_resolution_clock::now();
    std::cout << "copy cpu to gpu time=" << std::chrono::duration_cast<std::chrono::milliseconds>(end - begin).count() <<"\n";

    // Phase 2: sort on the device.
    begin = std::chrono::high_resolution_clock::now();
    compute::sort(device_vector.begin(), device_vector.end(), queue);
    // compute::sort may return once its kernels are enqueued, before the device
    // has finished running them. Block until the queue is drained so the
    // measurement covers the sort itself and is not shifted into the next phase.
    queue.finish();
    end = std::chrono::high_resolution_clock::now();
    std::cout << "gpu sort time=" << std::chrono::duration_cast<std::chrono::milliseconds>(end - begin).count() <<"\n";

    // Phase 3: download the sorted data back into the host vector.
    begin = std::chrono::high_resolution_clock::now();
    compute::copy(
            device_vector.begin(), device_vector.end(), host_vector.begin(), queue
    );
    end = std::chrono::high_resolution_clock::now();
    std::cout << "gpu to cpu time=" << std::chrono::duration_cast<std::chrono::milliseconds>(end - begin).count() <<"\n";
}

// Entry point of the first example: prints the available devices, then runs
// the Boost.Compute sort benchmark followed by the std::sort baseline.
int main() {
    listDevice();

    testCompute();
    testStd();

    return 0;
}

输出结果

GeForce GTX 1050 Ti 1417 6
copy cpu to gpu time=3
gpu sort time=32
gpu to cpu time=2
test std Sort=1758

在 GTX 1050 Ti 显卡上,两者的性能差异已经不是一个量级了,并且随着数据量的增加,OpenCL 的运行优势将会进一步放大。

 

例子2

          boost::compute提供一组宏,让开发者可以直接在opencl中使用自己定义的结构体.

#include <algorithm>
#include <cstddef>
#include <cstdlib>
#include <iostream>
#include <vector>

#include <boost/compute.hpp>

// User-defined structure handed to OpenCL code.
// r is the radius used by the area computation; x and y are presumably the
// centre coordinates. Every field defaults to zero.
struct Circle {
    float x = 0.0f;
    float y = 0.0f;
    float r = 0.0f;
};

auto make_circles(size_t n) {
    auto circles = std::vector<Circle>{};
    circles.resize(n);
    std::generate(circles.begin(), circles.end(), []() {
        auto x = float(std::rand());
        auto y = float(std::rand());
        auto r = float(std::rand());
        return Circle{x, y, r};
    });
    return circles;
}

//结构体添加到openclBOOST_COMPUTE_ADAPT_STRUCT(Circle, Circle, (x, y, r));

// Define the OpenCL compute function circle_area_gpu: it takes a Circle and
// returns a float, computed as r * r * pi with pi approximated as 3.14f.
BOOST_COMPUTE_FUNCTION(
        float,
        circle_area_gpu,
        (Circle c),
        {
            // The code in this body is OpenCL code, so anything can be written here:
            // the C++ compiler will not report errors in it; mistakes show up as exceptions at run time.
            float pi = 3.14f;
            return c.r * c.r * pi;
        }
);
// Short alias for the Boost.Compute namespace, used by main below.
namespace bc = boost::compute;

// Entry point of the second example: computes the area of one million random
// circles on the default OpenCL device and copies the results back to the host.
// Returns 0 on success.
int main() {
    // Print the devices that support OpenCL; without a device the program cannot run.
    const auto ds = boost::compute::system::devices();
    for (const auto &d : ds) {
        std::cout << d.name() << " " << d.clock_frequency() << "\n";
    }
    // Use the default device.
    bc::device gpu = boost::compute::system::default_device();

    // Create the OpenCL context object.
    boost::compute::context context(gpu);
    // Create the command queue object.
    boost::compute::command_queue q(context, gpu);
    // Create one million test circles on the host.
    std::vector<Circle> cpu_circles = make_circles(1000000);
    const auto n = cpu_circles.size();
    // Create the Boost.Compute (device-side) container for the circles.
    auto gpu_circles = bc::vector<Circle>(n, context);
    // Copy the data from the CPU to the GPU.
    bc::copy(cpu_circles.begin(), cpu_circles.end(), gpu_circles.begin(), q);
    // Device-side container that receives the computed areas.
    auto gpu_areas = bc::vector<float>(n, context);
    // Compute the area of every circle on the device.
    bc::transform(
            gpu_circles.begin(),
            gpu_circles.end(),
            gpu_areas.begin(),
            circle_area_gpu,
            q
    );
    // Copy the results back to the CPU.
    auto cpu_area = std::vector<float>(gpu_areas.size());
    bc::copy(gpu_areas.begin(), gpu_areas.end(), cpu_area.begin(), q);
    // A non-zero exit status signals failure to the caller, so report success with 0.
    return 0;
}

特别需要注意的是 BOOST_COMPUTE_FUNCTION:宏体中的代码是 OpenCL 代码,C++ 编译器不会对它进行检查,即使写错也不会在编译期报错,而是在运行时抛出异常。

更多的api https://www.boost.org/doc/libs/1_65_1/libs/compute/doc/html/boost_compute/reference.html#boost_compute.reference.api_overview

boost compute 已经提供了大量类似 STL 的算法供开发者使用,可以在自己的 C++ 程序中非常快速地引入 OpenCL 以提高性能。

展开阅读全文
加载中
点击引领话题📣 发布并加入讨论🔥
打赏
0 评论
0 收藏
0
分享
返回顶部
顶部