00001 
00002 
00003 
00004 
00005 
00006 
00007 
00008 
00009 
00010 
00011 
00012 
00013 
00014 
00015 
00016 
00017 
00018 
00019 
00020 
00021 
00022 
00023 
00024 
00025 
00026 
00027 
00028 
00029 
00030 
00031 #include "descriptors.h"
00032 #include "gpu_globals.h"
00033 #include "gpu_utils.h"
00034 
00035 namespace asrl {
00036   
00037   
00038   __constant__ float dc_3p3gauss1D[20] = {0.001917811039f, 0.004382549939f, 0.009136246641f, 0.017375153068f, 0.030144587513f,
00039                                           0.047710056854f, 0.068885910797f, 0.090734146446f, 0.109026229640f, 0.119511889092f,
00040                                           0.119511889092f, 0.109026229640f, 0.090734146446f, 0.068885910797f, 0.047710056854f,
00041                                           0.030144587513f, 0.017375153068f, 0.009136246641f, 0.004382549939f, 0.001917811039f};
00042 
00043 
00044   
00045   
00046   __global__ void compute_descriptors_kernel(float * d_descriptors, Keypoint * d_features)
00047   {
00048     
00049     int tid = __mul24(threadIdx.y,blockDim.x) + threadIdx.x;
00050 
00051     
00052     __shared__ float smem[2*5*5];               
00053 
00054     
00055     __shared__ float ipt[5];
00056     if (tid < 5)
00057       {
00058         
00059         ipt[tid] = ((float*)&d_features[blockIdx.x])[tid];
00060       }
00061     __syncthreads();
00062 
00063 
00064     
00065     
00066     __shared__ float sin_theta;
00067     __shared__ float cos_theta;
00068     if (tid == 0)
00069       {
00070         sin_theta = sinf(ipt[SF_ANGLE]);
00071       }
00072     else if (tid == 24) 
00073       {
00074         cos_theta = cosf(ipt[SF_ANGLE]);
00075       }
00076     __syncthreads();
00077 
00078 
00079     
00080     
00081     int xBlock = (blockIdx.y & 3);      
00082     int yBlock = (blockIdx.y >> 2);     
00083     int xIndex = __mul24(xBlock, blockDim.x) + threadIdx.x;
00084     int yIndex = __mul24(yBlock, blockDim.y) + threadIdx.y;
00085 
00086     
00087     
00088     
00089     float sample_x = ipt[SF_X] + (  cos_theta * ((float) (xIndex-9.5f)) * ipt[SF_SIZE]
00090                                     + sin_theta * ((float) (yIndex-9.5f)) * ipt[SF_SIZE]);
00091     float sample_y = ipt[SF_Y] + ( -sin_theta * ((float) (xIndex-9.5f)) * ipt[SF_SIZE]
00092                                    + cos_theta * ((float) (yIndex-9.5f)) * ipt[SF_SIZE]);
00093 
00094     
00095     
00096     
00097     
00098     float a = tex2D(d_integralTex, sample_x - ipt[SF_SIZE], sample_y - ipt[SF_SIZE]);
00099     float b = tex2D(d_integralTex, sample_x,                                     sample_y - ipt[SF_SIZE]);
00100     float c = tex2D(d_integralTex, sample_x + ipt[SF_SIZE], sample_y - ipt[SF_SIZE]);
00101     float d = tex2D(d_integralTex, sample_x - ipt[SF_SIZE], sample_y);
00102     float f = tex2D(d_integralTex, sample_x + ipt[SF_SIZE], sample_y);
00103     float g = tex2D(d_integralTex, sample_x - ipt[SF_SIZE], sample_y + ipt[SF_SIZE]);
00104     float h = tex2D(d_integralTex, sample_x,                                     sample_y + ipt[SF_SIZE]);
00105     float i = tex2D(d_integralTex, sample_x + ipt[SF_SIZE], sample_y + ipt[SF_SIZE]);   
00106 
00107     
00108     
00109     float gauss = dc_3p3gauss1D[xIndex] * dc_3p3gauss1D[yIndex];        
00110     float aa_dx = gauss * (-(a-b-g+h) + (b-c-h+i));             
00111     float aa_dy = gauss * (-(a-c-d+f) + (d-f-g+i));             
00112 
00113     
00114     
00115     smem[tid] =  aa_dx*cos_theta - aa_dy*sin_theta;             
00116     smem[25+tid] = aa_dx*sin_theta + aa_dy*cos_theta;           
00117     __syncthreads();
00118 
00119 
00120     
00121     __shared__ float rmem[5*5];         
00122 
00123     
00124     rmem[tid] = fabs(smem[tid]);        
00125     __syncthreads();
00126 
00127 
00128     
00129     
00130     if (tid < 9)        
00131       {
00132         smem[tid] = smem[tid] + smem[tid + 16];
00133         rmem[tid] = rmem[tid] + rmem[tid + 16];
00134       }
00135     __syncthreads();
00136 
00137 
00138     
00139     if (tid < 16)
00140       {
00141         smem[tid] = smem[tid] + smem[tid + 8];
00142         smem[tid] = smem[tid] + smem[tid + 4];
00143         smem[tid] = smem[tid] + smem[tid + 2];
00144         smem[tid] = smem[tid] + smem[tid + 1];
00145 
00146         rmem[tid] = rmem[tid] + rmem[tid + 8];
00147         rmem[tid] = rmem[tid] + rmem[tid + 4];
00148         rmem[tid] = rmem[tid] + rmem[tid + 2];
00149         rmem[tid] = rmem[tid] + rmem[tid + 1];
00150       }
00151     __syncthreads();
00152 
00153 
00154     
00155     if (tid == 0)
00156       {
00157         int block_start = __mul24(blockIdx.x,ASRL_SURF_DESCRIPTOR_DIM) + __mul24(blockIdx.y,4);
00158         d_descriptors[block_start] = smem[0];
00159         d_descriptors[block_start+1] = rmem[0];
00160       }
00161     __syncthreads();
00162 
00163 
00164     
00165     int dy_index = tid + 25;
00166 
00167     
00168     rmem[tid] = fabs(smem[dy_index]);   
00169     __syncthreads();
00170 
00171 
00172     
00173     
00174     if (tid < 9)        
00175       {
00176         smem[dy_index] = smem[dy_index] + smem[dy_index + 16];
00177         rmem[tid] = rmem[tid] + rmem[tid + 16];
00178       }
00179     __syncthreads();
00180 
00181 
00182     
00183     if (tid < 16)
00184       {
00185         smem[dy_index] = smem[dy_index] + smem[dy_index + 8];
00186         smem[dy_index] = smem[dy_index] + smem[dy_index + 4];
00187         smem[dy_index] = smem[dy_index] + smem[dy_index + 2];
00188         smem[dy_index] = smem[dy_index] + smem[dy_index + 1];
00189 
00190         rmem[tid] = rmem[tid] + rmem[tid + 8];
00191         rmem[tid] = rmem[tid] + rmem[tid + 4];
00192         rmem[tid] = rmem[tid] + rmem[tid + 2];
00193         rmem[tid] = rmem[tid] + rmem[tid + 1];
00194       }
00195     __syncthreads();
00196 
00197 
00198     
00199     if (tid == 0)
00200       {
00201         int block_start = __mul24(blockIdx.x, ASRL_SURF_DESCRIPTOR_DIM) + __mul24(blockIdx.y,4);
00202         d_descriptors[block_start+2] = smem[25];
00203         d_descriptors[block_start+3] = rmem[0];
00204       }
00205     
00206   }
00207 
00208 
00209   __global__ void normalize_descriptors_kernel(float * d_descriptors)
00210   {
00211     
00212     int descriptor_base = __mul24(blockIdx.x, ASRL_SURF_DESCRIPTOR_DIM);
00213 
00214 
00215     
00216     __shared__ float sqDesc[ASRL_SURF_DESCRIPTOR_DIM];
00217     float lookup = d_descriptors[descriptor_base + threadIdx.x];
00218     sqDesc[threadIdx.x] = lookup * lookup;
00219     __syncthreads();
00220 
00221 
00222     
00223     if (threadIdx.x < 32)
00224       {
00225         sqDesc[threadIdx.x] = sqDesc[threadIdx.x] + sqDesc[threadIdx.x + 32];
00226         sqDesc[threadIdx.x] = sqDesc[threadIdx.x] + sqDesc[threadIdx.x + 16];
00227         sqDesc[threadIdx.x] = sqDesc[threadIdx.x] + sqDesc[threadIdx.x + 8];
00228         sqDesc[threadIdx.x] = sqDesc[threadIdx.x] + sqDesc[threadIdx.x + 4];
00229         sqDesc[threadIdx.x] = sqDesc[threadIdx.x] + sqDesc[threadIdx.x + 2];
00230         sqDesc[threadIdx.x] = sqDesc[threadIdx.x] + sqDesc[threadIdx.x + 1];
00231       }
00232     __syncthreads();
00233 
00234 
00235     
00236     __shared__ float len;
00237     if (threadIdx.x == 0)
00238       {
00239         len = sqrt(sqDesc[0]);
00240       }
00241     __syncthreads();
00242 
00243 
00244     
00245     d_descriptors[descriptor_base + threadIdx.x] = lookup / len;        
00246   }
00247   
00248 
00249   void compute_descriptors(float * d_descriptors, Keypoint * d_features, int nFeaturesFound)
00250   {
00251     
00252     
00253     compute_descriptors_kernel <<< dim3(nFeaturesFound,16,1), dim3(5,5,1) >>> (d_descriptors, d_features);
00254     normalize_descriptors_kernel <<< dim3(nFeaturesFound,1,1), dim3(ASRL_SURF_DESCRIPTOR_DIM,1,1) >>> (d_descriptors);
00255   }
00256 
00257 
00258 }