CUDA矩阵转置,结果一直不对,大侠帮忙看看 5
#include"cuda_runtime.h"#include"device_launch_parameters.h"#include<stdlib.h>#includ...
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <math.h>
//#include "device_fuctions.h"
#define BLOCK_DIM 4
#define N 16
/*
 * Tiled matrix transpose through shared memory.
 *
 * idata is read as a row-major matrix with `height` rows and `width` columns;
 * odata is written as a row-major matrix with `width` rows and `height`
 * columns, so that odata[c * height + r] == idata[r * width + c].
 *
 * Launch layout: a 2D grid of 2D blocks. Global coordinates are derived from
 * BLOCK_DIM (not blockDim), so the block is expected to be
 * BLOCK_DIM x BLOCK_DIM threads. Threads that fall outside the matrix skip
 * the load/store but still reach the barrier.
 *
 * Shared memory: one BLOCK_DIM x (BLOCK_DIM + 1) float tile per block; the
 * extra column is padding (the usual trick to avoid bank conflicts when the
 * tile is read column-wise).
 */
__global__ void transpose(float *odata, float *idata, int width, int height)
{
    __shared__ float tile[BLOCK_DIM][BLOCK_DIM + 1];

    const unsigned int tx = threadIdx.x;
    const unsigned int ty = threadIdx.y;

    // Phase 1: each thread copies one input element into the tile.
    const unsigned int srcCol = blockIdx.x * BLOCK_DIM + tx;
    const unsigned int srcRow = blockIdx.y * BLOCK_DIM + ty;
    if (srcCol < width && srcRow < height)
        tile[ty][tx] = idata[srcRow * width + srcCol];

    // Every thread of the block must arrive here, so the barrier stays
    // outside of the bounds checks.
    __syncthreads();

    // Phase 2: block coordinates are swapped, and the tile is read with
    // swapped thread indices, which performs the actual transposition.
    const unsigned int dstCol = blockIdx.y * BLOCK_DIM + tx;
    const unsigned int dstRow = blockIdx.x * BLOCK_DIM + ty;
    if (dstCol < height && dstRow < width)
        odata[dstRow * height + dstCol] = tile[tx][ty];
}
/* Prints a diagnostic and terminates the program if a CUDA runtime call failed. */
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Host driver: builds an N x N input matrix, transposes it on the GPU with
 * the `transpose` kernel and prints the input followed by the result.
 */
int main()
{
    // Host matrices. Only the top-left 4x4 corner of `a` is given explicit
    // values; aggregate initialization zero-fills every remaining element.
    float a[N][N] = {{1,2,1,4},{3,6,7,9},{6,5,4,3},{1,2,1,8}};
    float b[N][N];
    float *idata = NULL;
    float *odata = NULL;
    const size_t bytes = sizeof(float) * N * N;

    // Allocate the device input and output buffers.
    checkCuda(cudaMalloc((void **)&idata, bytes), "cudaMalloc(idata)");
    checkCuda(cudaMalloc((void **)&odata, bytes), "cudaMalloc(odata)");

    // Show the input matrix.
    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < N; j++)
        {
            printf("%.0f\t", a[i][j]);
        }
        printf("\n");
    }

    // Copy the input to the device.
    checkCuda(cudaMemcpy(idata, a, bytes, cudaMemcpyHostToDevice),
              "cudaMemcpy(host -> device)");

    // Round the grid up so that a partial tile at the edge is still covered;
    // the kernel bounds-checks every access.
    const unsigned int tiles = (N + BLOCK_DIM - 1) / BLOCK_DIM;
    dim3 blocks(tiles, tiles, 1);
    dim3 threads(BLOCK_DIM, BLOCK_DIM, 1);
    transpose<<<blocks, threads>>>(odata, idata, N, N);
    // A kernel launch returns nothing, so configuration errors must be
    // queried explicitly.
    checkCuda(cudaGetLastError(), "transpose launch");

    // Copy the result back. cudaMemcpy takes (dst, src, ...): the host array
    // `b` is the destination and the device buffer `odata` is the source.
    // This blocking copy also reports errors raised while the kernel ran.
    checkCuda(cudaMemcpy(b, odata, bytes, cudaMemcpyDeviceToHost),
              "cudaMemcpy(device -> host)");

    // Show the transposed matrix; the elements are floats, so they need a
    // floating-point conversion specifier.
    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < N; j++)
        {
            printf("%.0f\t ", b[i][j]);
        }
        printf("\n");
    }

    checkCuda(cudaFree(idata), "cudaFree(idata)");
    checkCuda(cudaFree(odata), "cudaFree(odata)");
    return 0;
}
#include "device_launch_parameters.h"
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <math.h>
//#include "device_fuctions.h"
#define BLOCK_DIM 4
#define N 16
/*
 * Tiled matrix transpose through shared memory.
 *
 * idata is read as a row-major matrix with `height` rows and `width` columns;
 * odata is written as a row-major matrix with `width` rows and `height`
 * columns, so that odata[c * height + r] == idata[r * width + c].
 *
 * Launch layout: a 2D grid of 2D blocks. Global coordinates are derived from
 * BLOCK_DIM (not blockDim), so the block is expected to be
 * BLOCK_DIM x BLOCK_DIM threads. Threads that fall outside the matrix skip
 * the load/store but still reach the barrier.
 *
 * Shared memory: one BLOCK_DIM x (BLOCK_DIM + 1) float tile per block; the
 * extra column is padding (the usual trick to avoid bank conflicts when the
 * tile is read column-wise).
 */
__global__ void transpose(float *odata, float *idata, int width, int height)
{
    __shared__ float tile[BLOCK_DIM][BLOCK_DIM + 1];

    const unsigned int tx = threadIdx.x;
    const unsigned int ty = threadIdx.y;

    // Phase 1: each thread copies one input element into the tile.
    const unsigned int srcCol = blockIdx.x * BLOCK_DIM + tx;
    const unsigned int srcRow = blockIdx.y * BLOCK_DIM + ty;
    if (srcCol < width && srcRow < height)
        tile[ty][tx] = idata[srcRow * width + srcCol];

    // Every thread of the block must arrive here, so the barrier stays
    // outside of the bounds checks.
    __syncthreads();

    // Phase 2: block coordinates are swapped, and the tile is read with
    // swapped thread indices, which performs the actual transposition.
    const unsigned int dstCol = blockIdx.y * BLOCK_DIM + tx;
    const unsigned int dstRow = blockIdx.x * BLOCK_DIM + ty;
    if (dstCol < height && dstRow < width)
        odata[dstRow * height + dstCol] = tile[tx][ty];
}
/* Prints a diagnostic and terminates the program if a CUDA runtime call failed. */
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Host driver: builds an N x N input matrix, transposes it on the GPU with
 * the `transpose` kernel and prints the input followed by the result.
 */
int main()
{
    // Host matrices. Only the top-left 4x4 corner of `a` is given explicit
    // values; aggregate initialization zero-fills every remaining element.
    float a[N][N] = {{1,2,1,4},{3,6,7,9},{6,5,4,3},{1,2,1,8}};
    float b[N][N];
    float *idata = NULL;
    float *odata = NULL;
    const size_t bytes = sizeof(float) * N * N;

    // Allocate the device input and output buffers.
    checkCuda(cudaMalloc((void **)&idata, bytes), "cudaMalloc(idata)");
    checkCuda(cudaMalloc((void **)&odata, bytes), "cudaMalloc(odata)");

    // Show the input matrix.
    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < N; j++)
        {
            printf("%.0f\t", a[i][j]);
        }
        printf("\n");
    }

    // Copy the input to the device.
    checkCuda(cudaMemcpy(idata, a, bytes, cudaMemcpyHostToDevice),
              "cudaMemcpy(host -> device)");

    // Round the grid up so that a partial tile at the edge is still covered;
    // the kernel bounds-checks every access.
    const unsigned int tiles = (N + BLOCK_DIM - 1) / BLOCK_DIM;
    dim3 blocks(tiles, tiles, 1);
    dim3 threads(BLOCK_DIM, BLOCK_DIM, 1);
    transpose<<<blocks, threads>>>(odata, idata, N, N);
    // A kernel launch returns nothing, so configuration errors must be
    // queried explicitly.
    checkCuda(cudaGetLastError(), "transpose launch");

    // Copy the result back. cudaMemcpy takes (dst, src, ...): the host array
    // `b` is the destination and the device buffer `odata` is the source.
    // This blocking copy also reports errors raised while the kernel ran.
    checkCuda(cudaMemcpy(b, odata, bytes, cudaMemcpyDeviceToHost),
              "cudaMemcpy(device -> host)");

    // Show the transposed matrix; the elements are floats, so they need a
    // floating-point conversion specifier.
    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < N; j++)
        {
            printf("%.0f\t ", b[i][j]);
        }
        printf("\n");
    }

    checkCuda(cudaFree(idata), "cudaFree(idata)");
    checkCuda(cudaFree(odata), "cudaFree(odata)");
    return 0;
}
展开全部
你好,
错误的地方如下:
// 1. CUDA kernel 不接受二维数组
// 2. N = 16, 然而你的a只有4x4个元素,而不是16x16
float a[N][N]={{1,2,1,4},{3,6,7,9},{6,5,4,3},{1,2,1,8}};
float b[N][N];
// 3. odata和b的位置反了
cudaMemcpy(odata,b,sizeof(float)*N*N,cudaMemcpyDeviceToHost);
你这样写不是最优的方案,不是合并的(coalesced)内存访问。请参考以下这篇文章的方法:
http://brianmykietka.net/projects.php?project=finalmatrixtranspose
谢谢,望采纳
推荐律师服务:
若未解决您的问题,请您详细描述您的问题,通过百度律临进行免费专业咨询