00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031 #include "descriptors.h"
00032 #include "gpu_globals.h"
00033 #include "gpu_utils.h"
00034
00035 namespace asrl {
00036
00037
00038 __constant__ float dc_3p3gauss1D[20] = {0.001917811039f, 0.004382549939f, 0.009136246641f, 0.017375153068f, 0.030144587513f,
00039 0.047710056854f, 0.068885910797f, 0.090734146446f, 0.109026229640f, 0.119511889092f,
00040 0.119511889092f, 0.109026229640f, 0.090734146446f, 0.068885910797f, 0.047710056854f,
00041 0.030144587513f, 0.017375153068f, 0.009136246641f, 0.004382549939f, 0.001917811039f};
00042
00043
00044
00045
00046 __global__ void compute_descriptors_kernel(float * d_descriptors, Keypoint * d_features)
00047 {
00048
00049 int tid = __mul24(threadIdx.y,blockDim.x) + threadIdx.x;
00050
00051
00052 __shared__ float smem[2*5*5];
00053
00054
00055 __shared__ float ipt[5];
00056 if (tid < 5)
00057 {
00058
00059 ipt[tid] = ((float*)&d_features[blockIdx.x])[tid];
00060 }
00061 __syncthreads();
00062
00063
00064
00065
00066 __shared__ float sin_theta;
00067 __shared__ float cos_theta;
00068 if (tid == 0)
00069 {
00070 sin_theta = sinf(ipt[SF_ANGLE]);
00071 }
00072 else if (tid == 24)
00073 {
00074 cos_theta = cosf(ipt[SF_ANGLE]);
00075 }
00076 __syncthreads();
00077
00078
00079
00080
00081 int xBlock = (blockIdx.y & 3);
00082 int yBlock = (blockIdx.y >> 2);
00083 int xIndex = __mul24(xBlock, blockDim.x) + threadIdx.x;
00084 int yIndex = __mul24(yBlock, blockDim.y) + threadIdx.y;
00085
00086
00087
00088
00089 float sample_x = ipt[SF_X] + ( cos_theta * ((float) (xIndex-9.5f)) * ipt[SF_SIZE]
00090 + sin_theta * ((float) (yIndex-9.5f)) * ipt[SF_SIZE]);
00091 float sample_y = ipt[SF_Y] + ( -sin_theta * ((float) (xIndex-9.5f)) * ipt[SF_SIZE]
00092 + cos_theta * ((float) (yIndex-9.5f)) * ipt[SF_SIZE]);
00093
00094
00095
00096
00097
00098 float a = tex2D(d_integralTex, sample_x - ipt[SF_SIZE], sample_y - ipt[SF_SIZE]);
00099 float b = tex2D(d_integralTex, sample_x, sample_y - ipt[SF_SIZE]);
00100 float c = tex2D(d_integralTex, sample_x + ipt[SF_SIZE], sample_y - ipt[SF_SIZE]);
00101 float d = tex2D(d_integralTex, sample_x - ipt[SF_SIZE], sample_y);
00102 float f = tex2D(d_integralTex, sample_x + ipt[SF_SIZE], sample_y);
00103 float g = tex2D(d_integralTex, sample_x - ipt[SF_SIZE], sample_y + ipt[SF_SIZE]);
00104 float h = tex2D(d_integralTex, sample_x, sample_y + ipt[SF_SIZE]);
00105 float i = tex2D(d_integralTex, sample_x + ipt[SF_SIZE], sample_y + ipt[SF_SIZE]);
00106
00107
00108
00109 float gauss = dc_3p3gauss1D[xIndex] * dc_3p3gauss1D[yIndex];
00110 float aa_dx = gauss * (-(a-b-g+h) + (b-c-h+i));
00111 float aa_dy = gauss * (-(a-c-d+f) + (d-f-g+i));
00112
00113
00114
00115 smem[tid] = aa_dx*cos_theta - aa_dy*sin_theta;
00116 smem[25+tid] = aa_dx*sin_theta + aa_dy*cos_theta;
00117 __syncthreads();
00118
00119
00120
00121 __shared__ float rmem[5*5];
00122
00123
00124 rmem[tid] = fabs(smem[tid]);
00125 __syncthreads();
00126
00127
00128
00129
00130 if (tid < 9)
00131 {
00132 smem[tid] = smem[tid] + smem[tid + 16];
00133 rmem[tid] = rmem[tid] + rmem[tid + 16];
00134 }
00135 __syncthreads();
00136
00137
00138
00139 if (tid < 16)
00140 {
00141 smem[tid] = smem[tid] + smem[tid + 8];
00142 smem[tid] = smem[tid] + smem[tid + 4];
00143 smem[tid] = smem[tid] + smem[tid + 2];
00144 smem[tid] = smem[tid] + smem[tid + 1];
00145
00146 rmem[tid] = rmem[tid] + rmem[tid + 8];
00147 rmem[tid] = rmem[tid] + rmem[tid + 4];
00148 rmem[tid] = rmem[tid] + rmem[tid + 2];
00149 rmem[tid] = rmem[tid] + rmem[tid + 1];
00150 }
00151 __syncthreads();
00152
00153
00154
00155 if (tid == 0)
00156 {
00157 int block_start = __mul24(blockIdx.x,ASRL_SURF_DESCRIPTOR_DIM) + __mul24(blockIdx.y,4);
00158 d_descriptors[block_start] = smem[0];
00159 d_descriptors[block_start+1] = rmem[0];
00160 }
00161 __syncthreads();
00162
00163
00164
00165 int dy_index = tid + 25;
00166
00167
00168 rmem[tid] = fabs(smem[dy_index]);
00169 __syncthreads();
00170
00171
00172
00173
00174 if (tid < 9)
00175 {
00176 smem[dy_index] = smem[dy_index] + smem[dy_index + 16];
00177 rmem[tid] = rmem[tid] + rmem[tid + 16];
00178 }
00179 __syncthreads();
00180
00181
00182
00183 if (tid < 16)
00184 {
00185 smem[dy_index] = smem[dy_index] + smem[dy_index + 8];
00186 smem[dy_index] = smem[dy_index] + smem[dy_index + 4];
00187 smem[dy_index] = smem[dy_index] + smem[dy_index + 2];
00188 smem[dy_index] = smem[dy_index] + smem[dy_index + 1];
00189
00190 rmem[tid] = rmem[tid] + rmem[tid + 8];
00191 rmem[tid] = rmem[tid] + rmem[tid + 4];
00192 rmem[tid] = rmem[tid] + rmem[tid + 2];
00193 rmem[tid] = rmem[tid] + rmem[tid + 1];
00194 }
00195 __syncthreads();
00196
00197
00198
00199 if (tid == 0)
00200 {
00201 int block_start = __mul24(blockIdx.x, ASRL_SURF_DESCRIPTOR_DIM) + __mul24(blockIdx.y,4);
00202 d_descriptors[block_start+2] = smem[25];
00203 d_descriptors[block_start+3] = rmem[0];
00204 }
00205
00206 }
00207
00208
00209 __global__ void normalize_descriptors_kernel(float * d_descriptors)
00210 {
00211
00212 int descriptor_base = __mul24(blockIdx.x, ASRL_SURF_DESCRIPTOR_DIM);
00213
00214
00215
00216 __shared__ float sqDesc[ASRL_SURF_DESCRIPTOR_DIM];
00217 float lookup = d_descriptors[descriptor_base + threadIdx.x];
00218 sqDesc[threadIdx.x] = lookup * lookup;
00219 __syncthreads();
00220
00221
00222
00223 if (threadIdx.x < 32)
00224 {
00225 sqDesc[threadIdx.x] = sqDesc[threadIdx.x] + sqDesc[threadIdx.x + 32];
00226 sqDesc[threadIdx.x] = sqDesc[threadIdx.x] + sqDesc[threadIdx.x + 16];
00227 sqDesc[threadIdx.x] = sqDesc[threadIdx.x] + sqDesc[threadIdx.x + 8];
00228 sqDesc[threadIdx.x] = sqDesc[threadIdx.x] + sqDesc[threadIdx.x + 4];
00229 sqDesc[threadIdx.x] = sqDesc[threadIdx.x] + sqDesc[threadIdx.x + 2];
00230 sqDesc[threadIdx.x] = sqDesc[threadIdx.x] + sqDesc[threadIdx.x + 1];
00231 }
00232 __syncthreads();
00233
00234
00235
00236 __shared__ float len;
00237 if (threadIdx.x == 0)
00238 {
00239 len = sqrt(sqDesc[0]);
00240 }
00241 __syncthreads();
00242
00243
00244
00245 d_descriptors[descriptor_base + threadIdx.x] = lookup / len;
00246 }
00247
00248
00249 void compute_descriptors(float * d_descriptors, Keypoint * d_features, int nFeaturesFound)
00250 {
00251
00252
00253 compute_descriptors_kernel <<< dim3(nFeaturesFound,16,1), dim3(5,5,1) >>> (d_descriptors, d_features);
00254 normalize_descriptors_kernel <<< dim3(nFeaturesFound,1,1), dim3(ASRL_SURF_DESCRIPTOR_DIM,1,1) >>> (d_descriptors);
00255 }
00256
00257
00258 }