CUDA 中当数组元素个数大于线程总数时的处理方法


参考stackoverflow一篇帖子的处理方法:https://stackoverflow.com/questions/26913683/different-way-to-index-threads-in-cuda-c

代码中的 cuda_gridsize 函数参考自 YOLO(darknet)中的同名实现。

代码如下:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctime>
#include <iostream>

using namespace std;
#define BLOCK 512

// Computes a launch grid containing enough BLOCK-sized thread blocks to cover
// n elements.
//
// n: number of elements to process.
// Returns a dim3 (x, y, 1). When the required block count fits in 65535 the
// grid is one-dimensional (y == 1); otherwise the blocks are arranged in a
// roughly square x-by-y layout so that x * y * BLOCK >= n.
// For n == 0 a single block is returned so the result is always a launchable
// configuration.
dim3 cuda_gridsize(size_t n){
    // (n - 1) would wrap around for n == 0 and end up producing a grid with
    // x == 0, which is not a valid launch configuration.
    if (n == 0){
        return dim3(1, 1, 1);
    }

    // Ceil-divide n by BLOCK. Kept in size_t so the 65535 comparison below is
    // made on the real block count, not on a truncated 32-bit copy.
    size_t k = (n - 1) / BLOCK + 1;
    size_t x = k;
    size_t y = 1;
    if (x > 65535){
        // Too many blocks for one dimension: spread them over two.
        x = (size_t)ceil(sqrt((double)k));
        y = (n - 1) / (x * BLOCK) + 1;
    }
    return dim3((unsigned int)x, (unsigned int)y, 1);
}

// Inverts every byte of img in place: img[i] = 255 - img[i] for i in [0, H*W).
//
// img:  device pointer to at least H*W bytes.
// H, W: image dimensions; only their product is used.
//
// Launch layout: any 1D or 2D block with a 1D or 2D grid. Each thread starts
// at its flattened global index and then advances by the total number of
// launched threads, so the whole array is covered even when it has more
// elements than there are threads.
__global__ void gpuCalc(unsigned char *img,long H,long W)
{
    // All index math is done in 64 bits: products of gridDim/blockDim are
    // 32-bit unsigned and can wrap, and `long` is only 32 bits wide on LLP64
    // platforms.
    const long long total = (long long)H * (long long)W;

    const long long threadsPerBlock = (long long)blockDim.x * blockDim.y;
    const long long threadInBlock = threadIdx.x + (long long)threadIdx.y * blockDim.x;
    const long long blockInGrid = blockIdx.x + (long long)blockIdx.y * gridDim.x;

    // Total number of threads in the launch == distance between two
    // consecutive elements handled by the same thread.
    const long long stride = threadsPerBlock * gridDim.x * gridDim.y;

    // Alternative start index (row-major over the whole 2D thread grid):
    //   (gridDim.x*blockDim.x)*(threadIdx.y + blockDim.y*blockIdx.y)
    //       + (threadIdx.x + blockDim.x*blockIdx.x)

    for (long long i = threadInBlock + threadsPerBlock * blockInGrid;
         i < total;
         i += stride){
        img[i] = 255 - img[i];
    }
}

// Inverts an H*W byte image on the GPU and writes the result back into img.
//
// img:  host buffer of H*W bytes; overwritten with the inverted image on
//       success and left untouched if any CUDA call fails before the final
//       copy back.
// H, W: image dimensions; only their product is used.
//
// Errors are reported on stderr (the function returns void). The device
// buffer is released on every path.
void addWithCuda(unsigned char *img, long H,long W)
{
    // Nothing to do for an empty image; this also avoids a zero-byte
    // allocation and a launch over zero elements.
    if (img == NULL || H <= 0 || W <= 0)
        return;

    // Compute sizes in size_t so large images do not overflow `long`.
    const size_t count = (size_t)H * (size_t)W;
    const size_t bytes = count * sizeof(unsigned char);

    unsigned char *dev_a = NULL;

    cudaError_t status = cudaSetDevice(0);
    if (status == cudaSuccess)
        status = cudaMalloc((void**)&dev_a, bytes);
    if (status == cudaSuccess)
        status = cudaMemcpy(dev_a, img, bytes, cudaMemcpyHostToDevice);
    if (status == cudaSuccess){
        gpuCalc<<<cuda_gridsize(count), BLOCK>>>(dev_a, H, W);
        // A kernel launch returns nothing; a bad launch configuration is
        // only visible through cudaGetLastError().
        status = cudaGetLastError();
    }
    if (status == cudaSuccess){
        // Blocking copy: also surfaces errors raised while the kernel ran.
        status = cudaMemcpy(img, dev_a, bytes, cudaMemcpyDeviceToHost);
    }

    // dev_a is still NULL if allocation never happened; freeing a null
    // pointer performs no operation.
    cudaFree(dev_a);

    if (status != cudaSuccess)
        fprintf(stderr, "addWithCuda failed: %s\n", cudaGetErrorString(status));
}

// CPU reference: replaces each of the first H*W bytes of img with
// 255 minus its value.
void cpuCalc(unsigned char *img,long W, long H)
{
    long idx = 0;
    while (idx < H*W) {
        unsigned char *px = img + idx;
        *px = 255 - *px;
        ++idx;
    }
}

// Self-check driver: fills a 20000 x 20000 byte image with pseudo-random
// values, inverts one copy on the CPU and the other on the GPU, and prints
// "pass" when the two results are identical ("no pass" otherwise).
int main()
{
    const long W = 20000;
    const long H = 20000;
    const size_t count = (size_t)W * (size_t)H;

    unsigned char *img = new unsigned char[count];   // GPU input / output
    unsigned char *cmp = new unsigned char[count];   // CPU reference

    for (size_t i = 0; i < count; i++)
        img[i] = rand() % 100;

    memcpy(cmp, img, count);

    // Run the CPU reference on its own copy so the GPU result is checked
    // against an independently computed answer, instead of relying on a
    // second inversion of the same buffer restoring the original data.
    cpuCalc(cmp, W, H);
    printf("cpu calc end\n");

    // Argument order matches the (img, H, W) signature.
    addWithCuda(img, H, W);
    printf("gpu calc end\n");

    if (memcmp(img, cmp, count) != 0)
        printf("no pass\n");
    else
        printf("pass");

    delete[] cmp;
    delete[] img;
    getchar();   // keep the console window open until a key is pressed

    return 0;
}
优质内容筛选与推荐>>
1、Bzoj2115: [Wc2011] Xor
2、Contest 20140923 登月计划 BabyStepGaintStep
3、互质
4、04 将当前位置用大头针标注到百度地图上
5、CSS 文本字体颜色(CSS color)


长按二维码向我转账

受苹果公司新规定影响,微信 iOS 版的赞赏功能被关闭,可通过二维码转账支持公众号。

    阅读
    好看
    已推荐到看一看
    你的朋友可以在“发现”-“看一看”看到你认为好看的文章。
    已取消,“好看”想法已同步删除
    已推荐到看一看 和朋友分享想法
    最多200字,当前共 发送

    已发送

    朋友将在看一看看到

    确定
    分享你的想法...
    取消

    分享想法到看一看

    确定
    最多200字,当前共

    发送中

    网络异常,请稍后重试

    微信扫一扫
    关注该公众号