CUDA卷积加速(一)

创建时间：2016-10-26 投稿人：浏览次数：2985

推出这个系列的目的呢，主要是因为CUDA学习的一个主要用途就是对图像处理进行加速，而处理图像的过程中经常要用到卷积。卷积的计算有多种形式，本系列主要研究的是二维矩阵的卷积计算。从最原始的计算方法（就是本科教科书上的那种）再到优化后的适用于大型数据的算法，均提供并行化的设计思路。

考虑到机器学习很火，很多人也用python来做图形处理。但不好的地方在于python目前还不支持GPGPU编程，这就很尴尬。所以说编程语言掌握全面一点总归是好的嘛！

下面是普通版的CUDA进行卷积的操作方法，是对卷积最初始的算法进行的GPU平台的移植。我们称之为版本二（版本一就是原始版的嘛）。算法本身不难，但他考验了一些CUDA的基本功，对以后的深入学习有着铺垫作用。

那么利用计算机计算卷积最原始的版本呢，莫过于离散乘积求和，即对f[n-m]g[m]求和。那么整个卷积就只需要几个循环便可以做出来了。

那么这里我们也提供对上述过程最简单的CUDA程序给大家看：

__global__ void conv_kernel(const float *ina, const float *inb, float *out, size_t len_a, size_t len_b, size_t len_out)
{
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= len_out)
    {
        return;
    }

    float sum = 0.0f;
    for (int m = 0; m < len_b; ++m)
    {
        int k = tid - m;
        if (0 <= k && k < len_a)
        {
            sum += ina[k] * inb[m];
        }
    }
    out[tid] = sum;
}

<pre class="cpp" name="code">__global__ void conv2_kernel(const float *ina, const float *inb, float *out, size_t len_a, size_t len_b, size_t len_out)
{
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int tx = threadIdx.x;                  //线程数
    const size_t len_s = blockDim.x;                      //Block x宽度

    extern __shared__ float sIna[];                       //共享内存

    if (tid >= len_out)
    {
        return;
    }

    if (tid < len_a)
    {
        sIna[tx] = ina[tid];
    }
    __syncthreads();

    float sum = 0.0f;
    for (int m = 0; m < len_b; ++m)
    {
        int k = tid - len_b + m + 1;
        int sk = tx - len_b + m + 1;
        if (0 <= sk && sk < len_s)
        {
            sum += sIna[sk] * inb[m];
        }
        else
        {
            if (0 <= k && k < len_a)
            {
                sum += ina[k] * inb[m];
            }
        }
    }
    out[tid] = sum;
}

__constant__ static float c_b[1024];
__global__ void conv3_kernel(const float *ina, float *out, size_t len_a, size_t len_b, size_t len_out)
{
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int tx = threadIdx.x;
    const size_t len_s = blockDim.x;

    extern __shared__ float sIna[];

    const float *inb = &c_b[0];

    if (tid >= len_out)
    {
        return;
    }

    if (tid < len_a)
    {
        sIna[tx] = ina[tid];
    }
    __syncthreads();

    float sum = 0.0f;
    for (int m = 0; m < len_b; ++m)
    {
        int k = tid - len_b + m + 1;
        int sk = tx - len_b + m + 1;
        if (0 <= sk && sk < len_s)
        {
            sum += sIna[sk] * inb[m];
        }
        else
        {
            if (0 <= k && k < len_a)
            {
                sum += ina[k] * inb[m];
            }
        }
    }
    out[tid] = sum;
}

一口气列了三种，都是非常经典的加速技巧。

下面附上main()

<pre class="cpp" name="code">int main(int argc, char const *argv[])
{
    thrust::host_vector<float> h_a(N_A);
    thrust::device_vector<float> d_a(N_A);
    thrust::host_vector<float> h_b(N_B);
    thrust::device_vector<float> d_b(N_B);

    size_t N = h_a.size() + h_b.size() - 1;
    size_t L = pow( 2, static_cast<int>(log2(N - 1)) + 1 );

    thrust::host_vector<float> h_result(N);
    thrust::device_vector<float> d_result(N);

    thrust::sequence(h_a.begin(), h_a.end());
    // thrust::sequence(h_b.begin(), h_b.end());
    thrust::sequence(h_b.rbegin(), h_b.rend());
    
    // get raw pointer for kernel
    float *raw_point_a = thrust::raw_pointer_cast( &d_a[0] );
    float *raw_point_b = thrust::raw_pointer_cast( &d_b[0] );
    float *raw_point_result = thrust::raw_pointer_cast( &d_result[0] );

    int numThreads = NUM_THREADS;
    int numBlocks = (L + numThreads - 1) / numThreads;

    cudaEvent_t start, stop;
    checkCudaErrors( cudaEventCreate(&start) );
    checkCudaErrors( cudaEventCreate(&stop) );
    cudaEventRecord(start);

    // copy a b to device
    thrust::copy(h_a.begin(), h_a.end(), d_a.begin());
    thrust::copy(h_b.begin(), h_b.end(), d_b.begin());

    // conv(raw_point_a, raw_point_b, raw_point_result, N_A, N_B, L, numBlocks, numThreads);
    conv2(raw_point_a, raw_point_b, raw_point_result, N_A, N_B, N, numBlocks, numThreads);

    thrust::copy(d_result.begin(), d_result.end(), h_result.begin());

    cudaEventRecord(stop);
    checkCudaErrors( cudaThreadSynchronize() );
    float time = 0;
    cudaEventElapsedTime(&time, start, stop);

    cout << "run times: " << time << " ms " << endl;

    // for (thrust::host_vector<float>::iterator i = h_result.begin(); i != h_result.end(); ++i)
    // {
    //     cout << *i << "	";
    // }
    cout << endl;

    return 0;
}

声明：该文观点仅代表作者本人，牛骨文系教育信息发布平台，牛骨文仅提供信息存储空间服务。

上一篇：图像均值滤波的CUDA并行化优化
下一篇： linux 中php以及nginx的重启命令

热门文章: CTF writeup 2_南邮网络攻防训...; SSM框架——详细整合教程（...; Linux Shell脚本编程－－curl命...; HttpClient使用详解; Java面试题全集（上）; JAVA设计模式之单例模式; java.lang.OutOfMemoryError: PermGen ...; TCP协议中的三次握手和四次...; form表单的两种提交方式，su...; String,StringBuffer与StringBuilder...

最新文章: Java之品优购课程讲义_day20（7）; 剑指 Offer - 8：跳台阶; Netty权威指南_札记02_NIO编程; mysql时间属性之时间戳和datetime之...; 虚拟现实或许可以拯救古埃及的“...; spring cloud服务注册中心eureka---集群...; Java SE 第六章; HTTP请求+数据库; HIDL学习笔记之HIDL C++（第二天）; ubuntu系统下指定tomcat运行时为JDK1.8...