CUDA中数组求最大值
1个回答
展开全部
cudaMallocPitch()
cudaMemcpy2D()
Use threadIdx and blockIdx to access each element. Before you start working on a 2D matrix, check the definition of pitch: cudaMallocPitch() makes sure that the allocation is appropriately padded to meet the memory alignment requirements.
// Host code
// Allocate a width x height float matrix in pitched device memory and launch
// MyKernel over it.
int width = 64, height = 64;
float* devPtr = nullptr;
size_t pitch = 0;  // row stride in BYTES; filled in by cudaMallocPitch
// cudaMallocPitch may pad every row, so the returned pitch (not
// width * sizeof(float)) must be used for all later row addressing.
cudaError_t err = cudaMallocPitch(&devPtr, &pitch, width * sizeof(float), height);
if (err == cudaSuccess)
{
    // Size the grid from the data instead of hard-coding 100 x 512
    // (51,200 threads for only 4,096 elements): one thread per element,
    // rounded up to whole blocks.
    const int threadsPerBlock = 256;
    const int numBlocks = (width * height + threadsPerBlock - 1) / threadsPerBlock;
    MyKernel<<<numBlocks, threadsPerBlock>>>(devPtr, pitch, width, height);
    // A kernel launch returns nothing; launch-configuration errors are only
    // visible through cudaGetLastError().
    err = cudaGetLastError();
}
// err now holds the first failure (allocation or launch), or cudaSuccess.
// Errors raised while the kernel runs surface at the next synchronizing call.
// Release the matrix with cudaFree(devPtr) once it is no longer needed.
// Device code
// Visits the elements of a pitched 2D float matrix, dividing them among the
// launched threads.
//
//   devPtr : base address of the matrix (allocation from cudaMallocPitch)
//   pitch  : row stride in BYTES as returned by cudaMallocPitch, not in elements
//   width  : number of float elements per row
//   height : number of rows
//
// Launch layout: any 1D grid of 1D blocks. The loop below advances by the
// total number of launched threads, so each element is visited once across
// the grid no matter how many blocks/threads are used. No shared memory.
__global__ void MyKernel(float* devPtr, size_t pitch, int width, int height)
{
    if (width <= 0 || height <= 0)
        return;  // empty matrix: nothing to visit

    // 64-bit indices so blockIdx.x * blockDim.x and width * height cannot overflow.
    const size_t cols = (size_t)width;
    const size_t total = cols * (size_t)height;
    const size_t stride = (size_t)gridDim.x * blockDim.x;

    for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x; i < total; i += stride)
    {
        const size_t r = i / cols;
        const size_t c = i % cols;  // neighbouring threads read neighbouring columns
        // Rows are pitch BYTES apart, hence the char* arithmetic before
        // converting back to a float pointer.
        const float* row = (const float*)((const char*)devPtr + r * pitch);
        float element = row[c];
        (void)element;  // placeholder: per-element work goes here
    }
}
It would be easier to use a 1D array as a starting point.
cudaMemcpy2D()
Use threadIdx and blockIdx to access each element. Before you start working on a 2D matrix, check the definition of pitch: cudaMallocPitch() makes sure that the allocation is appropriately padded to meet the memory alignment requirements.
// Host code
// Allocate a width x height float matrix in pitched device memory and launch
// MyKernel over it.
int width = 64, height = 64;
float* devPtr = nullptr;
size_t pitch = 0;  // row stride in BYTES; filled in by cudaMallocPitch
// cudaMallocPitch may pad every row, so the returned pitch (not
// width * sizeof(float)) must be used for all later row addressing.
cudaError_t err = cudaMallocPitch(&devPtr, &pitch, width * sizeof(float), height);
if (err == cudaSuccess)
{
    // Size the grid from the data instead of hard-coding 100 x 512
    // (51,200 threads for only 4,096 elements): one thread per element,
    // rounded up to whole blocks.
    const int threadsPerBlock = 256;
    const int numBlocks = (width * height + threadsPerBlock - 1) / threadsPerBlock;
    MyKernel<<<numBlocks, threadsPerBlock>>>(devPtr, pitch, width, height);
    // A kernel launch returns nothing; launch-configuration errors are only
    // visible through cudaGetLastError().
    err = cudaGetLastError();
}
// err now holds the first failure (allocation or launch), or cudaSuccess.
// Errors raised while the kernel runs surface at the next synchronizing call.
// Release the matrix with cudaFree(devPtr) once it is no longer needed.
// Device code
// Visits the elements of a pitched 2D float matrix, dividing them among the
// launched threads.
//
//   devPtr : base address of the matrix (allocation from cudaMallocPitch)
//   pitch  : row stride in BYTES as returned by cudaMallocPitch, not in elements
//   width  : number of float elements per row
//   height : number of rows
//
// Launch layout: any 1D grid of 1D blocks. The loop below advances by the
// total number of launched threads, so each element is visited once across
// the grid no matter how many blocks/threads are used. No shared memory.
__global__ void MyKernel(float* devPtr, size_t pitch, int width, int height)
{
    if (width <= 0 || height <= 0)
        return;  // empty matrix: nothing to visit

    // 64-bit indices so blockIdx.x * blockDim.x and width * height cannot overflow.
    const size_t cols = (size_t)width;
    const size_t total = cols * (size_t)height;
    const size_t stride = (size_t)gridDim.x * blockDim.x;

    for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x; i < total; i += stride)
    {
        const size_t r = i / cols;
        const size_t c = i % cols;  // neighbouring threads read neighbouring columns
        // Rows are pitch BYTES apart, hence the char* arithmetic before
        // converting back to a float pointer.
        const float* row = (const float*)((const char*)devPtr + r * pitch);
        float element = row[c];
        (void)element;  // placeholder: per-element work goes here
    }
}
It would be easier to use a 1D array as a starting point.
本回答被网友采纳
已赞过
已踩过
评论
收起
你对这个回答的评价是?
推荐律师服务:
若未解决您的问题,请您详细描述您的问题,通过百度律临进行免费专业咨询