Autonomous Space Robotics Lab: gpusurf: GpuIntegralImage

00001 /*
00002 Copyright (c) 2010, Paul Furgale and Chi Hay Tong
00003 All rights reserved.
00004 
00005 Redistribution and use in source and binary forms, with or without 
00006 modification, are permitted provided that the following conditions are 
00007 met:
00008 
00009 * Redistributions of source code must retain the above copyright notice, 
00010   this list of conditions and the following disclaimer.
00011 * Redistributions in binary form must reproduce the above copyright 
00012   notice, this list of conditions and the following disclaimer in the 
00013   documentation and/or other materials provided with the distribution.
00014 * The names of its contributors may not be used to endorse or promote 
00015   products derived from this software without specific prior written 
00016   permission.
00017 
00018 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 
00019 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
00020 TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 
00021 PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER
00022 OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 
00023 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 
00024 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 
00025 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 
00026 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 
00027 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 
00028 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00029 */
00030 
00031 #include "GpuIntegralImage_kernel.h"
00032 
namespace asrl {

  /**
   * Converts an 8-bit sample to float, dividing by 255.0f so that the
   * input range [0, 255] maps onto [0.0f, 1.0f].
   *
   * @param out destination for the converted value
   * @param in  8-bit input sample
   */
  __device__ void convert_dev(float & out, unsigned char in){ out = (float) in  / 255.0f; }

  /**
   * Pass-through overload: copies a float sample unchanged. It exists so
   * that the templated transpose kernel can call convert_dev() for either
   * input type.
   *
   * @param out destination for the value
   * @param in  float input sample
   */
  __device__ void convert_dev(float & out, float in){ out = in; }


  /**
   * Tiled matrix transpose with conversion to float (on the GPU).
   *
   * Reads a width x height input and writes its transpose, i.e.
   *   odata[o_pitch * x + y] = convert_dev(idata[i_pitch * y + x])
   * for every 0 <= x < width, 0 <= y < height.
   *
   * Launch requirements (derived from the indexing below):
   *  - 2D thread blocks with blockDim.x == blockDim.y, neither larger than
   *    ASRL_TRANSPOSE_BLOCK_DIM; the shared tile is indexed directly by
   *    threadIdx and is read back with the two indices swapped.
   *  - The grid must cover the input: gridDim.x * blockDim.x >= width and
   *    gridDim.y * blockDim.y >= height.
   *  - Uses a static shared tile of
   *    ASRL_TRANSPOSE_BLOCK_DIM x (ASRL_TRANSPOSE_BLOCK_DIM + 1) floats;
   *    no dynamic shared memory is needed.
   *
   * @tparam T      input element type; a convert_dev(float&, T) overload must exist
   * @param odata   output buffer (device memory), at least width rows of o_pitch floats
   * @param o_pitch output row stride, in ELEMENTS (it is used directly as an index
   *                multiplier on a float*), not in bytes
   * @param idata   input buffer (device memory), at least height rows of i_pitch elements
   * @param i_pitch input row stride, in ELEMENTS of T, not in bytes
   * @param width   number of valid columns in the input
   * @param height  number of valid rows in the input
   */
  template <typename T>
  __global__ void transpose_kernel(float *odata, size_t o_pitch, T *idata, size_t i_pitch, size_t width, size_t height)
  {
    // One extra column of padding: the usual way to keep the transposed
    // (column-wise) read of the tile from hitting a single shared-memory bank.
    __shared__ float block[ASRL_TRANSPOSE_BLOCK_DIM][ASRL_TRANSPOSE_BLOCK_DIM+1];

    // Origin of this block's tile in the input image.
    // Index math is done in size_t with ordinary multiplication: __mul24 only
    // multiplies the low 24 bits of its operands (silently truncating the
    // size_t pitches) and a 32-bit index can wrap for large buffers.
    const size_t xBlock = (size_t) blockDim.x * blockIdx.x;
    const size_t yBlock = (size_t) blockDim.y * blockIdx.y;
    size_t xIndex = xBlock + threadIdx.x;
    size_t yIndex = yBlock + threadIdx.y;

    if ((xIndex < width) && (yIndex < height))
      {
        // load block into shared memory
        const size_t index_in = i_pitch * yIndex + xIndex;              // where from in data
        convert_dev(block[threadIdx.y][threadIdx.x], idata[index_in]);  // convert to float (if not already)
      }

    // Every thread of the block reaches this barrier (it sits outside the
    // bounds check), so the whole tile is visible before it is read back.
    __syncthreads();

    // write it back to global memory: the block origin is swapped so that
    // consecutive threadIdx.x values still write consecutive output addresses.
    xIndex = yBlock + threadIdx.x;
    yIndex = xBlock + threadIdx.y;
    if ((xIndex < height) && (yIndex < width))
      {
        const size_t index_out = o_pitch * yIndex + xIndex;
        // Tile entries outside the image were never written; the bounds
        // check above guarantees they are never read here either.
        odata[index_out] = block[threadIdx.x][threadIdx.y];
      }
  }

  /**
   * Launches transpose_kernel for unsigned char input (values are scaled
   * by 1/255 while being converted to float).
   *
   * The launch is issued on the default stream with no dynamic shared
   * memory. Launch errors are NOT checked here; callers that need them
   * should query cudaGetLastError() after this call returns.
   *
   * @param grid    grid dimensions; see transpose_kernel for coverage requirements
   * @param block   block dimensions; see transpose_kernel for shape requirements
   * @param odata   output buffer (device memory)
   * @param o_pitch output row stride in elements
   * @param idata   input buffer (device memory)
   * @param i_pitch input row stride in elements
   * @param width   number of valid input columns
   * @param height  number of valid input rows
   */
  void run_transpose_kernel_uchar(dim3 grid, dim3 block, float *odata, size_t o_pitch, unsigned char *idata, size_t i_pitch, size_t width, size_t height)
  {
    transpose_kernel<unsigned char> <<< grid, block, 0>>> (odata, o_pitch, idata, i_pitch, width, height);
  }

  /**
   * Launches transpose_kernel for float input (values are copied unchanged).
   *
   * The launch is issued on the default stream with no dynamic shared
   * memory. Launch errors are NOT checked here; callers that need them
   * should query cudaGetLastError() after this call returns.
   *
   * @param grid    grid dimensions; see transpose_kernel for coverage requirements
   * @param block   block dimensions; see transpose_kernel for shape requirements
   * @param odata   output buffer (device memory)
   * @param o_pitch output row stride in elements
   * @param idata   input buffer (device memory)
   * @param i_pitch input row stride in elements
   * @param width   number of valid input columns
   * @param height  number of valid input rows
   */
  void run_transpose_kernel_float(dim3 grid, dim3 block, float *odata, size_t o_pitch, float *idata, size_t i_pitch, size_t width, size_t height)
  {
    transpose_kernel<float> <<< grid, block, 0>>> (odata, o_pitch, idata, i_pitch, width, height);
  }

} // namespace asrl
00082 
00083
Autonomous Space Robotics Lab

Speeded Up SURF

GpuIntegralImage_kernel.cu