00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031 #include "GpuIntegralImage_kernel.h"
00032
namespace asrl {

  /// Converts an 8-bit sample to float by dividing by 255, so 0 maps to 0.0f
  /// and 255 maps to 1.0f.
  __device__ void convert_dev(float & out, unsigned char in){ out = (float) in / 255.0f; }

  /// Pass-through overload so the templated kernel can treat float input
  /// uniformly; the value is copied unchanged (no rescaling).
  __device__ void convert_dev(float & out, float in){ out = in; }


  /**
   * Transposes a width x height image of type T into a height x width float
   * image, converting each sample with convert_dev() on the way.
   *
   * Each thread block stages one tile in shared memory so that both the read
   * from idata and the write to odata walk along rows with threadIdx.x.
   *
   * Launch requirements (derived from how the tile is indexed below):
   *  - blockDim.x and blockDim.y must not exceed ASRL_TRANSPOSE_BLOCK_DIM,
   *    otherwise the shared tile is indexed out of bounds.
   *  - blockDim.x must equal blockDim.y; the second phase reads
   *    block[threadIdx.x][threadIdx.y], which is only guaranteed to have been
   *    written when the block is square.
   *  - the grid must cover the input: gridDim.x * blockDim.x >= width and
   *    gridDim.y * blockDim.y >= height. Threads outside the image are masked.
   *
   * @param odata   output image, indexed as odata[o_pitch * row + col]
   * @param o_pitch row stride of odata in ELEMENTS (floats), not bytes
   * @param idata   input image, indexed as idata[i_pitch * row + col]
   * @param i_pitch row stride of idata in ELEMENTS (of T), not bytes
   * @param width   number of columns of the input (rows of the output)
   * @param height  number of rows of the input (columns of the output)
   */
  template <typename T>
  __global__ void transpose_kernel(float *odata, size_t o_pitch, T *idata, size_t i_pitch, size_t width, size_t height)
  {
    // The extra column of padding keeps the column-wise reads in the second
    // phase from hitting the same shared-memory bank.
    __shared__ float block[ASRL_TRANSPOSE_BLOCK_DIM][ASRL_TRANSPOSE_BLOCK_DIM+1];

    // Origin of this block's tile in input coordinates. Plain multiplication
    // is used instead of __mul24: the latter is a signed 24-bit multiply that
    // silently truncates large operands and is slower than a native multiply
    // on compute capability 2.x and newer.
    const unsigned int xBlock = blockDim.x * blockIdx.x;
    const unsigned int yBlock = blockDim.y * blockIdx.y;

    // Phase 1: read the input tile row-wise and convert it to float.
    unsigned int xIndex = xBlock + threadIdx.x;
    unsigned int yIndex = yBlock + threadIdx.y;
    if ((xIndex < width) && (yIndex < height))
    {
      // Computed in size_t so pitch * row cannot wrap for large images.
      const size_t index_in = i_pitch * yIndex + xIndex;
      convert_dev(block[threadIdx.y][threadIdx.x], idata[index_in]);
    }

    // Every thread of the block must reach this barrier (it is deliberately
    // outside the bounds check) before any thread reads the tile.
    __syncthreads();

    // Phase 2: write the tile transposed. The block origin is swapped so that
    // threadIdx.x again runs along an output row.
    xIndex = yBlock + threadIdx.x;
    yIndex = xBlock + threadIdx.y;
    if ((xIndex < height) && (yIndex < width))
    {
      const size_t index_out = o_pitch * yIndex + xIndex;
      odata[index_out] = block[threadIdx.x][threadIdx.y];
    }
  }

  /**
   * Host-side launcher for transpose_kernel<unsigned char>.
   *
   * The kernel is launched on the default stream with no dynamic shared
   * memory. The launch is asynchronous and no error checking is performed
   * here; callers that need to detect launch failures should query
   * cudaGetLastError() afterwards. See transpose_kernel for the constraints on
   * grid/block and the meaning of the pitch arguments.
   */
  void run_transpose_kernel_uchar(dim3 grid, dim3 block, float *odata, size_t o_pitch, unsigned char *idata, size_t i_pitch, size_t width, size_t height)
  {
    transpose_kernel<unsigned char> <<< grid, block, 0 >>> (odata, o_pitch, idata, i_pitch, width, height);
  }

  /**
   * Host-side launcher for transpose_kernel<float>.
   *
   * The kernel is launched on the default stream with no dynamic shared
   * memory. The launch is asynchronous and no error checking is performed
   * here; callers that need to detect launch failures should query
   * cudaGetLastError() afterwards. See transpose_kernel for the constraints on
   * grid/block and the meaning of the pitch arguments.
   */
  void run_transpose_kernel_float(dim3 grid, dim3 block, float *odata, size_t o_pitch, float *idata, size_t i_pitch, size_t width, size_t height)
  {
    transpose_kernel<float> <<< grid, block, 0 >>> (odata, o_pitch, idata, i_pitch, width, height);
  }

} // namespace asrl
00082
00083