00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031 #include "orientation.h"
00032
00033 #define FLT_MAX 3.4028235E38
00034 #include "gpu_globals.h"
00035 #include "gpu_utils.h"
00036
00037
00038 namespace asrl {
00039
00040 __constant__ float dc_gauss1D[13] = {0.002215924206f, 0.008764150247f, 0.026995483257f, 0.064758797833f, 0.120985362260f, 0.176032663382f, 0.199471140201f, 0.176032663382f, 0.120985362260f, 0.064758797833f, 0.026995483257f, 0.008764150247f, 0.002215924206f};
00041
00042 __device__ inline void swap(float3 & a, float3 & b)
00043 {
00044 float3 tmp = a;
00045 a = b;
00046 b = tmp;
00047 }
00048
00049 __device__ void setMaxXZ(float3 & dest, float3 & comp)
00050 {
00051 if(dest.x < comp.x) {
00052 dest.x = comp.x;
00053 dest.z = comp.z;
00054 }
00055 }
00056
00057 __device__ void setSumXY(float2 & dest, float2 & src)
00058 {
00059 dest.x += src.x;
00060 dest.y += src.y;
00061 }
00062
00063 __device__ void setMaxZ3(float3 & dest, float3 & comp)
00064 {
00065 if(dest.z < comp.z) {
00066 dest.x = comp.x;
00067 dest.y = comp.y;
00068 dest.z = comp.z;
00069 }
00070 }
00071
00072 __global__ void find_orientation_fast_kernel(Keypoint * d_features)
00073 {
00074 int tid = __mul24(threadIdx.y, 17) + threadIdx.x;
00075 int tid2 = INT_MAX;
00076 if(threadIdx.x < 13 && threadIdx.y < 13) {
00077 tid2 = __mul24(threadIdx.y,13) + threadIdx.x;
00078 }
00079
00080
00081 __shared__ float texLookups[17][17];
00082
00083
00084 __shared__ float Edx[13*13];
00085 __shared__ float Edy[13*13];
00086 __shared__ float xys[3];
00087
00088 if(tid < 3)
00089 {
00090 xys[tid] = ((float*)(&d_features[blockIdx.x]))[tid];
00091 }
00092
00093
00094 __syncthreads();
00095
00096
00097
00098 texLookups[threadIdx.x][threadIdx.y] =
00099 tex2D(d_integralTex, xys[0] + ((int)threadIdx.x - 8)*xys[2],
00100 xys[1] + ((int)threadIdx.y - 8)*xys[2]);
00101
00102 __syncthreads();
00103
00104 float dx = 0.f;
00105 float dy = 0.f;
00106
00107
00108
00109
00110 if(tid2 < 169)
00111 {
00112
00113 dx -= texLookups[threadIdx.x ][threadIdx.y ];
00114 dx += 2.f*texLookups[threadIdx.x + 2][threadIdx.y ];
00115 dx -= texLookups[threadIdx.x + 4][threadIdx.y ];
00116 dx += texLookups[threadIdx.x ][threadIdx.y + 4];
00117 dx -= 2.f*texLookups[threadIdx.x + 2][threadIdx.y + 4];
00118 dx += texLookups[threadIdx.x + 4][threadIdx.y + 4];
00119
00120 dy -= texLookups[threadIdx.x ][threadIdx.y ];
00121 dy += 2.f*texLookups[threadIdx.x ][threadIdx.y + 2];
00122 dy -= texLookups[threadIdx.x ][threadIdx.y + 4];
00123 dy += texLookups[threadIdx.x + 4][threadIdx.y ];
00124 dy -= 2.f*texLookups[threadIdx.x + 4][threadIdx.y + 2];
00125 dy += texLookups[threadIdx.x + 4][threadIdx.y + 4];
00126
00127 float g = dc_gauss1D[threadIdx.x] * dc_gauss1D[threadIdx.y];
00128
00129 Edx[tid2] = dx * g;
00130 Edy[tid2] = dy * g;
00131
00132 }
00133
00134 __syncthreads();
00135
00136
00137
00138 if (tid < 41) {Edx[tid] += Edx[tid+128]; } __syncthreads();
00139 if (tid < 64) { Edx[tid] += Edx[tid + 64]; } __syncthreads();
00140 if (tid < 32) {
00141 Edx[tid]+=Edx[tid + 32];
00142 Edx[tid]+=Edx[tid + 16];
00143 Edx[tid]+=Edx[tid + 8];
00144 Edx[tid]+=Edx[tid + 4];
00145 Edx[tid]+=Edx[tid + 2];
00146 Edx[tid]+=Edx[tid + 1];
00147 }
00148
00149
00150 if (tid < 41) {Edy[tid] += Edy[tid+128]; } __syncthreads();
00151 if (tid < 64) {Edy[tid] += Edy[tid + 64]; } __syncthreads();
00152 if (tid < 32) {
00153 Edy[tid]+=Edy[tid + 32];
00154 Edy[tid]+=Edy[tid + 16];
00155 Edy[tid]+=Edy[tid + 8];
00156 Edy[tid]+=Edy[tid + 4];
00157 Edy[tid]+=Edy[tid + 2];
00158 Edy[tid]+=Edy[tid + 1];
00159 }
00160
00161
00162
00163 if (tid == 0)
00164 {
00165 d_features[blockIdx.x].angle = -atan2(Edy[0],Edx[0]);
00166 }
00167
00168 }
00169
00170 __global__ void find_orientation_kernel(Keypoint * d_features)
00171 {
00172 int tid = __mul24(threadIdx.y, 17) + threadIdx.x;
00173 int tid2 = INT_MAX;
00174 if(threadIdx.x < 13 && threadIdx.y < 13) {
00175 tid2 = __mul24(threadIdx.y,13) + threadIdx.x;
00176 }
00177
00178
00179
00180
00181 __shared__ float texLookups[17][17];
00182 __shared__ float3 dxDyAngle[17*17];
00183 __shared__ float xys[3];
00184
00185 if(tid < 3)
00186 {
00187
00188 xys[tid] = ((float*)(&d_features[blockIdx.x]))[tid];
00189 }
00190
00191 __syncthreads();
00192
00193
00194
00195 texLookups[threadIdx.x][threadIdx.y] =
00196 tex2D(d_integralTex, xys[0] + ((int)threadIdx.x - 8)*xys[2],
00197 xys[1] + ((int)threadIdx.y - 8)*xys[2]);
00198
00199 __syncthreads();
00200
00201
00202
00203
00204
00205
00206 float dx = 0.f;
00207 float dy = 0.f;
00208 float angle = 0.f;
00209 dxDyAngle[tid].z = FLT_MAX;
00210
00211
00212
00213
00214 if(tid2 < 169)
00215 {
00216 dx -= texLookups[threadIdx.x ][threadIdx.y ];
00217 dx += 2.f*texLookups[threadIdx.x + 2][threadIdx.y ];
00218 dx -= texLookups[threadIdx.x + 4][threadIdx.y ];
00219 dx += texLookups[threadIdx.x ][threadIdx.y + 4];
00220 dx -= 2.f*texLookups[threadIdx.x + 2][threadIdx.y + 4];
00221 dx += texLookups[threadIdx.x + 4][threadIdx.y + 4];
00222
00223 dy -= texLookups[threadIdx.x ][threadIdx.y ];
00224 dy += 2.f*texLookups[threadIdx.x ][threadIdx.y + 2];
00225 dy -= texLookups[threadIdx.x ][threadIdx.y + 4];
00226 dy += texLookups[threadIdx.x + 4][threadIdx.y ];
00227 dy -= 2.f*texLookups[threadIdx.x + 4][threadIdx.y + 2];
00228 dy += texLookups[threadIdx.x + 4][threadIdx.y + 4];
00229
00230
00231 float g = dc_gauss1D[threadIdx.x] * dc_gauss1D[threadIdx.y];
00232
00233 dxDyAngle[tid2].x = dx * g;
00234 dxDyAngle[tid2].y = dy * g;
00235 angle = atan2(dy,dx);
00236 dxDyAngle[tid2].z = angle;
00237
00238 }
00239
00240 __syncthreads();
00241
00242 #if 0
00243
00244
00245
00246
00247 if(tid < 169) {
00248 angle = dxDyAngle[tid].z;
00249 dx = dxDyAngle[tid].x;
00250 dy = dxDyAngle[tid].y;
00251
00252 int idx = tid;
00253
00254 for(int i = 1; i < 169; i++) {
00255 idx++;
00256 if(idx == 169){
00257 idx = 0;
00258 }
00259 float a2 = dxDyAngle[idx].z;
00260 if(a2 < angle)
00261 {
00262 a2 += 6.2831853071796f;
00263 }
00264
00265 if( a2 < angle + 1.047197551197f)
00266 {
00267 dx += dxDyAngle[idx].x;
00268 dy += dxDyAngle[idx].y;
00269 }
00270 }
00271
00272 angle = atan2f(dy,dx);
00273 dx = dx*dx + dy*dy;
00274
00275 }
00276 __syncthreads();
00277
00278 if(tid < 169) {
00279 dxDyAngle[tid].x = dx;
00280 dxDyAngle[tid].z = angle;
00281 } else {
00282 dxDyAngle[tid].x = -FLT_MAX;
00283 }
00284 #else
00285
00286
00287
00288
00289
00290
00291 for (unsigned int k = 2; k <= 256; k *= 2)
00292 {
00293
00294 for (unsigned int j = k / 2; j>0; j /= 2)
00295 {
00296 unsigned int ixj = tid ^ j;
00297
00298 if(tid < 256 && ixj > tid)
00299 {
00300 if ((tid & k) == 0)
00301 {
00302 if (dxDyAngle[tid].z > dxDyAngle[ixj].z)
00303 {
00304 swap(dxDyAngle[tid], dxDyAngle[ixj]);
00305 }
00306 }
00307 else
00308 {
00309 if (dxDyAngle[tid].z < dxDyAngle[ixj].z)
00310 {
00311 swap(dxDyAngle[tid], dxDyAngle[ixj]);
00312 }
00313 }
00314 }
00315
00316 __syncthreads();
00317 }
00318 }
00319
00320
00321
00322
00323 if(tid < 169) {
00324 angle = dxDyAngle[tid].z;
00325 dx = dxDyAngle[tid].x;
00326 dy = dxDyAngle[tid].y;
00327
00328 int idx = tid;
00329 float wrapped = 0.f;
00330 for(int i = 1; i < 169; i++) {
00331 idx++;
00332 if(idx == 169){
00333 idx = 0;
00334 wrapped = 6.2831853071796f;
00335 }
00336
00337 if( (dxDyAngle[idx].z + wrapped) > angle + 1.047197551197f){
00338 break;
00339 } else {
00340 dx += dxDyAngle[idx].x;
00341 dy += dxDyAngle[idx].y;
00342 }
00343 }
00344
00345 angle = atan2f(dy,dx);
00346 dx = dx*dx + dy*dy;
00347 }
00348 __syncthreads();
00349
00350 if(tid < 169) {
00351 dxDyAngle[tid].x = dx;
00352 dxDyAngle[tid].z = angle;
00353 } else {
00354 dxDyAngle[tid].x = -FLT_MAX;
00355 }
00356 #endif
00357
00358 __syncthreads();
00359
00360
00361
00362 if (tid < 128) { setMaxXZ(dxDyAngle[tid],dxDyAngle[tid+128]); } __syncthreads();
00363
00364 if (tid < 64) { setMaxXZ(dxDyAngle[tid], dxDyAngle[tid + 64]); } __syncthreads();
00365
00366 if (tid < 32) {
00367 setMaxXZ(dxDyAngle[tid],dxDyAngle[tid + 32]);
00368 setMaxXZ(dxDyAngle[tid],dxDyAngle[tid + 16]);
00369 setMaxXZ(dxDyAngle[tid],dxDyAngle[tid + 8]);
00370 setMaxXZ(dxDyAngle[tid],dxDyAngle[tid + 4]);
00371 setMaxXZ(dxDyAngle[tid],dxDyAngle[tid + 2]);
00372 setMaxXZ(dxDyAngle[tid],dxDyAngle[tid + 1]);
00373 }
00374
00375
00376 if (tid == 0)
00377 {
00378
00379 d_features[blockIdx.x].angle = -dxDyAngle[0].z;
00380 }
00381
00382 }
00383
00384
00385
00386 void find_orientation( Keypoint * d_features, int nFeatures)
00387 {
00388
00389 dim3 threads;
00390 threads.x = 17;
00391 threads.y = 17;
00392
00393 dim3 grid;
00394 grid.x = nFeatures;
00395 grid.y = 1;
00396 grid.z = 1;
00397
00398 find_orientation_kernel <<< grid, threads >>>(d_features);
00399
00400 }
00401
00402 void find_orientation_fast( Keypoint * d_features, int nFeatures)
00403 {
00404
00405 dim3 threads;
00406 threads.x = 17;
00407 threads.y = 17;
00408
00409 dim3 grid;
00410 grid.x = nFeatures;
00411 grid.y = 1;
00412 grid.z = 1;
00413
00414 find_orientation_fast_kernel <<< grid, threads >>>(d_features);
00415 }
00416
00417
00418 }