Autonomous Space Robotics Lab: gpusurf: descriptors.cu Source File

00001 /*
00002 Copyright (c) 2010, Paul Furgale and Chi Hay Tong
00003 All rights reserved.
00004 
00005 Redistribution and use in source and binary forms, with or without 
00006 modification, are permitted provided that the following conditions are 
00007 met:
00008 
00009 * Redistributions of source code must retain the above copyright notice, 
00010   this list of conditions and the following disclaimer.
00011 * Redistributions in binary form must reproduce the above copyright 
00012   notice, this list of conditions and the following disclaimer in the 
00013   documentation and/or other materials provided with the distribution.
00014 * The names of its contributors may not be used to endorse or promote 
00015   products derived from this software without specific prior written 
00016   permission.
00017 
00018 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 
00019 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
00020 TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 
00021 PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER
00022 OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 
00023 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 
00024 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 
00025 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 
00026 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 
00027 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 
00028 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00029 */
00030 
00031 #include "descriptors.h"
00032 #include "gpu_globals.h"
00033 #include "gpu_utils.h"
00034 
00035 namespace asrl {
00036   // precomputed values for a Gaussian with a standard deviation of 3.3
00037   // - it appears SURF uses a different value, but not sure what it is
00038   __constant__ float dc_3p3gauss1D[20] = {0.001917811039f, 0.004382549939f, 0.009136246641f, 0.017375153068f, 0.030144587513f,
00039                                           0.047710056854f, 0.068885910797f, 0.090734146446f, 0.109026229640f, 0.119511889092f,
00040                                           0.119511889092f, 0.109026229640f, 0.090734146446f, 0.068885910797f, 0.047710056854f,
00041                                           0.030144587513f, 0.017375153068f, 0.009136246641f, 0.004382549939f, 0.001917811039f};
00042 
00043 
00044   // Spawn 16 blocks per interest point
00045   // - computes unnormalized 64 dimensional descriptor, puts it into d_descriptors in the correct location
00046   __global__ void compute_descriptors_kernel(float * d_descriptors, Keypoint * d_features)
00047   {
00048     // compute thread IDs (row-major)
00049     int tid = __mul24(threadIdx.y,blockDim.x) + threadIdx.x;
00050 
00051     // allocate shared memory
00052     __shared__ float smem[2*5*5];               // 2 floats (dx,dy) for each thread (5x5 sample points in each sub-region)
00053 
00054     // get the interest point parameters (x, y, scale, strength, theta)
00055     __shared__ float ipt[5];
00056     if (tid < 5)
00057       {
00058         //ipt[tid] = d_features[__mul24(blockIdx.x, SF_FEATURE_STRIDE) + tid];
00059         ipt[tid] = ((float*)&d_features[blockIdx.x])[tid];
00060       }
00061     __syncthreads();
00062 
00063 
00064     // compute sin(theta), cos(theta)
00065     // (there are faster, but less accurate trig functions: __sinf, __cosf)
00066     __shared__ float sin_theta;
00067     __shared__ float cos_theta;
00068     if (tid == 0)
00069       {
00070         sin_theta = sinf(ipt[SF_ANGLE]);
00071       }
00072     else if (tid == 24) // another number in a different half-warp (to ensure no repeated branching logic)
00073       {
00074         cos_theta = cosf(ipt[SF_ANGLE]);
00075       }
00076     __syncthreads();
00077 
00078 
00079     // Compute sampling points
00080     // since grids are 2D, need to compute xBlock and yBlock indices
00081     int xBlock = (blockIdx.y & 3);      // blockIdx.y % 4
00082     int yBlock = (blockIdx.y >> 2);     // floor(blockIdx.y/4)
00083     int xIndex = __mul24(xBlock, blockDim.x) + threadIdx.x;
00084     int yIndex = __mul24(yBlock, blockDim.y) + threadIdx.y;
00085 
00086     // Compute rotated sampling points
00087     // (clockwise rotation since we are rotating the lattice)
00088     // (subtract 9.5f to start sampling at the top left of the lattice, 0.5f is to space points out properly - there is no center pixel)
00089     float sample_x = ipt[SF_X] + (  cos_theta * ((float) (xIndex-9.5f)) * ipt[SF_SIZE]
00090                                     + sin_theta * ((float) (yIndex-9.5f)) * ipt[SF_SIZE]);
00091     float sample_y = ipt[SF_Y] + ( -sin_theta * ((float) (xIndex-9.5f)) * ipt[SF_SIZE]
00092                                    + cos_theta * ((float) (yIndex-9.5f)) * ipt[SF_SIZE]);
00093 
00094     // gather integral image lookups for Haar wavelets at each point (some lookups are shared between dx and dy)
00095     //  a b c
00096     //  d        f
00097     //  g h i
00098     float a = tex2D(d_integralTex, sample_x - ipt[SF_SIZE], sample_y - ipt[SF_SIZE]);
00099     float b = tex2D(d_integralTex, sample_x,                                     sample_y - ipt[SF_SIZE]);
00100     float c = tex2D(d_integralTex, sample_x + ipt[SF_SIZE], sample_y - ipt[SF_SIZE]);
00101     float d = tex2D(d_integralTex, sample_x - ipt[SF_SIZE], sample_y);
00102     float f = tex2D(d_integralTex, sample_x + ipt[SF_SIZE], sample_y);
00103     float g = tex2D(d_integralTex, sample_x - ipt[SF_SIZE], sample_y + ipt[SF_SIZE]);
00104     float h = tex2D(d_integralTex, sample_x,                                     sample_y + ipt[SF_SIZE]);
00105     float i = tex2D(d_integralTex, sample_x + ipt[SF_SIZE], sample_y + ipt[SF_SIZE]);   
00106 
00107     // compute axis-aligned HaarX, HaarY
00108     // (could group the additions together into multiplications)
00109     float gauss = dc_3p3gauss1D[xIndex] * dc_3p3gauss1D[yIndex];        // separable because independent (circular)
00110     float aa_dx = gauss * (-(a-b-g+h) + (b-c-h+i));             // unrotated dx
00111     float aa_dy = gauss * (-(a-c-d+f) + (d-f-g+i));             // unrotated dy
00112 
00113     // rotate responses (store all dxs then all dys)
00114     // - counterclockwise rotation to rotate back to zero orientation
00115     smem[tid] =  aa_dx*cos_theta - aa_dy*sin_theta;             // rotated dx
00116     smem[25+tid] = aa_dx*sin_theta + aa_dy*cos_theta;           // rotated dy
00117     __syncthreads();
00118 
00119 
00120     // sum (reduce) 5x5 area response
00121     __shared__ float rmem[5*5];         // buffer for conducting reductions
00122 
00123     // copy all of the dx responses to a |dx| array
00124     rmem[tid] = fabs(smem[tid]);        // |dx| array
00125     __syncthreads();
00126 
00127 
00128     // sum (reduce) dx and |dx|
00129     // first step is to reduce from 25 to 16
00130     if (tid < 9)        // use 9 threads
00131       {
00132         smem[tid] = smem[tid] + smem[tid + 16];
00133         rmem[tid] = rmem[tid] + rmem[tid + 16];
00134       }
00135     __syncthreads();
00136 
00137 
00138     // sum (reduce) from 16 to 1 (unrolled - aligned to a half-warp)
00139     if (tid < 16)
00140       {
00141         smem[tid] = smem[tid] + smem[tid + 8];
00142         smem[tid] = smem[tid] + smem[tid + 4];
00143         smem[tid] = smem[tid] + smem[tid + 2];
00144         smem[tid] = smem[tid] + smem[tid + 1];
00145 
00146         rmem[tid] = rmem[tid] + rmem[tid + 8];
00147         rmem[tid] = rmem[tid] + rmem[tid + 4];
00148         rmem[tid] = rmem[tid] + rmem[tid + 2];
00149         rmem[tid] = rmem[tid] + rmem[tid + 1];
00150       }
00151     __syncthreads();
00152 
00153 
00154     // write dx and |dx| result out (order matches SURF)
00155     if (tid == 0)
00156       {
00157         int block_start = __mul24(blockIdx.x,ASRL_SURF_DESCRIPTOR_DIM) + __mul24(blockIdx.y,4);
00158         d_descriptors[block_start] = smem[0];
00159         d_descriptors[block_start+1] = rmem[0];
00160       }
00161     __syncthreads();
00162 
00163 
00164     // index shift for the dy values
00165     int dy_index = tid + 25;
00166 
00167     // copy all of the dy responses to a |dy| array
00168     rmem[tid] = fabs(smem[dy_index]);   // |dy| array
00169     __syncthreads();
00170 
00171 
00172     // sum (reduce) dy and |dy|
00173     // first step is to reduce from 25 to 16
00174     if (tid < 9)        // use 9 threads
00175       {
00176         smem[dy_index] = smem[dy_index] + smem[dy_index + 16];
00177         rmem[tid] = rmem[tid] + rmem[tid + 16];
00178       }
00179     __syncthreads();
00180 
00181 
00182     // sum (reduce) from 16 to 1 (unrolled - aligned to a half-warp)
00183     if (tid < 16)
00184       {
00185         smem[dy_index] = smem[dy_index] + smem[dy_index + 8];
00186         smem[dy_index] = smem[dy_index] + smem[dy_index + 4];
00187         smem[dy_index] = smem[dy_index] + smem[dy_index + 2];
00188         smem[dy_index] = smem[dy_index] + smem[dy_index + 1];
00189 
00190         rmem[tid] = rmem[tid] + rmem[tid + 8];
00191         rmem[tid] = rmem[tid] + rmem[tid + 4];
00192         rmem[tid] = rmem[tid] + rmem[tid + 2];
00193         rmem[tid] = rmem[tid] + rmem[tid + 1];
00194       }
00195     __syncthreads();
00196 
00197 
00198     // write dy and |dy| result out (order matches SURF)
00199     if (tid == 0)
00200       {
00201         int block_start = __mul24(blockIdx.x, ASRL_SURF_DESCRIPTOR_DIM) + __mul24(blockIdx.y,4);
00202         d_descriptors[block_start+2] = smem[25];
00203         d_descriptors[block_start+3] = rmem[0];
00204       }
00205     // at this time, d_descriptors is composed of unnormalized values of: (dx, dy, |dx|, |dy|)
00206   }
00207 
00208 
00209   __global__ void normalize_descriptors_kernel(float * d_descriptors)
00210   {
00211     // no need for thread ID
00212     int descriptor_base = __mul24(blockIdx.x, ASRL_SURF_DESCRIPTOR_DIM);
00213 
00214 
00215     // read in the unnormalized descriptor values (squared)
00216     __shared__ float sqDesc[ASRL_SURF_DESCRIPTOR_DIM];
00217     float lookup = d_descriptors[descriptor_base + threadIdx.x];
00218     sqDesc[threadIdx.x] = lookup * lookup;
00219     __syncthreads();
00220 
00221 
00222     // reduction to get total
00223     if (threadIdx.x < 32)
00224       {
00225         sqDesc[threadIdx.x] = sqDesc[threadIdx.x] + sqDesc[threadIdx.x + 32];
00226         sqDesc[threadIdx.x] = sqDesc[threadIdx.x] + sqDesc[threadIdx.x + 16];
00227         sqDesc[threadIdx.x] = sqDesc[threadIdx.x] + sqDesc[threadIdx.x + 8];
00228         sqDesc[threadIdx.x] = sqDesc[threadIdx.x] + sqDesc[threadIdx.x + 4];
00229         sqDesc[threadIdx.x] = sqDesc[threadIdx.x] + sqDesc[threadIdx.x + 2];
00230         sqDesc[threadIdx.x] = sqDesc[threadIdx.x] + sqDesc[threadIdx.x + 1];
00231       }
00232     __syncthreads();
00233 
00234 
00235     // compute length (square root)
00236     __shared__ float len;
00237     if (threadIdx.x == 0)
00238       {
00239         len = sqrt(sqDesc[0]);
00240       }
00241     __syncthreads();
00242 
00243 
00244     // normalize and store in output
00245     d_descriptors[descriptor_base + threadIdx.x] = lookup / len;        
00246   }
00247   
00248 
00249   void compute_descriptors(float * d_descriptors, Keypoint * d_features, int nFeaturesFound)
00250   {
00251     // Should we be checking for errors here?
00252     // compute unnormalized descriptors, then normalize them - odd indexing since grid must be 2D
00253     compute_descriptors_kernel <<< dim3(nFeaturesFound,16,1), dim3(5,5,1) >>> (d_descriptors, d_features);
00254     normalize_descriptors_kernel <<< dim3(nFeaturesFound,1,1), dim3(ASRL_SURF_DESCRIPTOR_DIM,1,1) >>> (d_descriptors);
00255   }
00256 
00257 
00258 } // namespace asrl
Autonomous Space Robotics Lab

Speeded Up SURF

descriptors.cu