00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031 #include "non_max_suppression.h"
00032
00033 namespace asrl {
00034
00035 extern __shared__ float fh_vals[];
00036 __global__ void surf_nonmaxonly_kernel(float * d_hessian, int octave, int4 * d_maxmin, unsigned int * d_maxmin_counter, float threshold)
00037 {
00038
00039 int hidx_x = threadIdx.x + __mul24(blockIdx.x, (blockDim.x-2));
00040 int hidx_y = threadIdx.y + __mul24(blockIdx.y, (blockDim.y-2));
00041 int hidx_z = threadIdx.z;
00042 int localLin = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y;
00043
00044
00045 if( hidx_x < d_octave_params[octave].x_size &&
00046 hidx_y < d_octave_params[octave].y_size &&
00047 hidx_z < d_octave_params[octave].nIntervals ){
00048
00049 int hidx_lin = hidx_x +
00050 d_hessian_stride[0] * hidx_y +
00051 d_hessian_stride[0] * d_octave_params[octave].y_size * hidx_z;
00052 fh_vals[localLin] = d_hessian[hidx_lin];
00053 }
00054 __syncthreads();
00055
00056
00057
00058 bool inBounds2 = threadIdx.x > 0 && threadIdx.x < blockDim.x-1 && hidx_x < d_octave_params[octave].x_size - 1
00059 && threadIdx.y > 0 && threadIdx.y < blockDim.y-1 && hidx_y < d_octave_params[octave].y_size - 1
00060 && threadIdx.z > 0 && threadIdx.z < blockDim.z-1;
00061
00062 float val = fh_vals[localLin];
00063
00064 if(inBounds2 && val >= threshold){
00065
00066 int zoff = __mul24(blockDim.x, blockDim.y);
00067 bool condmax = val > fh_vals[localLin + 1]
00068 && val > fh_vals[localLin - 1]
00069 && val > fh_vals[localLin - blockDim.x + 1]
00070 && val > fh_vals[localLin - blockDim.x ]
00071 && val > fh_vals[localLin - blockDim.x - 1]
00072 && val > fh_vals[localLin + blockDim.x + 1]
00073 && val > fh_vals[localLin + blockDim.x ]
00074 && val > fh_vals[localLin + blockDim.x - 1]
00075
00076 && val > fh_vals[localLin - zoff + 1]
00077 && val > fh_vals[localLin - zoff ]
00078 && val > fh_vals[localLin - zoff - 1]
00079 && val > fh_vals[localLin - zoff - blockDim.x + 1]
00080 && val > fh_vals[localLin - zoff - blockDim.x ]
00081 && val > fh_vals[localLin - zoff - blockDim.x - 1]
00082 && val > fh_vals[localLin - zoff + blockDim.x + 1]
00083 && val > fh_vals[localLin - zoff + blockDim.x ]
00084 && val > fh_vals[localLin - zoff + blockDim.x - 1]
00085
00086 && val > fh_vals[localLin + zoff + 1]
00087 && val > fh_vals[localLin + zoff ]
00088 && val > fh_vals[localLin + zoff - 1]
00089 && val > fh_vals[localLin + zoff - blockDim.x + 1]
00090 && val > fh_vals[localLin + zoff - blockDim.x ]
00091 && val > fh_vals[localLin + zoff - blockDim.x - 1]
00092 && val > fh_vals[localLin + zoff + blockDim.x + 1]
00093 && val > fh_vals[localLin + zoff + blockDim.x ]
00094 && val > fh_vals[localLin + zoff + blockDim.x - 1]
00095 ;
00096
00097 if(condmax) {
00098 unsigned i = atomicInc(d_maxmin_counter,(unsigned int) -1);
00099
00100 if(i < ASRL_SURF_MAX_CANDIDATES) {
00101 int4 f = {hidx_x, hidx_y, threadIdx.z, octave};
00102 d_maxmin[i] = f;
00103
00104 }
00105 }
00106 }
00107
00108
00109 }
00110
00111 void run_surf_nonmaxonly_kernel(dim3 grid, dim3 threads, size_t sharedBytes, float * d_hessian, int octave, int4 * d_maxmin, unsigned int * d_maxmin_counter, float threshold)
00112 {
00113 surf_nonmaxonly_kernel <<< grid, threads, sharedBytes >>>
00114 (d_hessian, octave, d_maxmin, d_maxmin_counter, threshold);
00115 }
00116
00117 }