Autonomous Space Robotics Lab: gpusurf: GpuIntegralImageProcessor.cpp Source File

00001 /*
00002 Copyright (c) 2010, Paul Furgale and Chi Hay Tong
00003 All rights reserved.
00004 
00005 Redistribution and use in source and binary forms, with or without 
00006 modification, are permitted provided that the following conditions are 
00007 met:
00008 
00009 * Redistributions of source code must retain the above copyright notice, 
00010   this list of conditions and the following disclaimer.
00011 * Redistributions in binary form must reproduce the above copyright 
00012   notice, this list of conditions and the following disclaimer in the 
00013   documentation and/or other materials provided with the distribution.
00014 * The names of its contributors may not be used to endorse or promote 
00015   products derived from this software without specific prior written 
00016   permission.
00017 
00018 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 
00019 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
00020 TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 
00021 PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER
00022 OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 
00023 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 
00024 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 
00025 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 
00026 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 
00027 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 
00028 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00029 */
00030 
00031 #include "GpuIntegralImageProcessor.hpp"
00032 #include <cudpp.h>
00033 #include <cuda.h>
00034 #include <builtin_types.h>
00035 #include <channel_descriptor.h>
00036 #include <iostream>
00037 #include "GpuIntegralImage_kernel.h"
00038 #include "GpuIntegralImage.hpp"
00039 #include "assert_macros.hpp"
00040 
00041 
00042 namespace asrl {
00043 
00044   GpuIntegralImageProcessor::GpuIntegralImageProcessor(int width, int height)
00045   {
00046     cudaError_t err;
00047     m_width = width;
00048     m_height = height;
00049 
00050     // allocate GPU memory (char, normal, and transposed data)
00051     unsigned char * char_ptr;
00052     err = cudaMallocPitch( (void**) &char_ptr, &char_pitch, width*sizeof(unsigned char), height);
00053     ASRL_ASSERT_EQ(err,cudaSuccess, "Unable to allocate CUDA char* input image.");
00054     char_data.reset(char_ptr,&cudaFree);
00055 
00056     float * norm_ptr;
00057     err = cudaMallocPitch( (void**) &norm_ptr, &norm_pitch, width*sizeof(float), height);
00058     ASRL_ASSERT_EQ(err, cudaSuccess, "Unable to allocate CUDA normally oriented float integral image.");
00059     norm_data.reset(norm_ptr,&cudaFree);
00060 
00061     float * trans_ptr;
00062     err = cudaMallocPitch( (void**) &trans_ptr, &trans_pitch, height*sizeof(float), width);
00063     ASRL_ASSERT_EQ(err, cudaSuccess, "Unable to allocate CUDA transpose oriented integral image.");
00064     trans_data.reset(trans_ptr,&cudaFree);
00065 
00066     CUDPPConfiguration config = { CUDPP_SCAN, CUDPP_ADD, CUDPP_FLOAT, CUDPP_OPTION_FORWARD | CUDPP_OPTION_INCLUSIVE };
00067     CUDPPResult result = cudppPlan(&colPlan, config, width*height, width, trans_pitch/sizeof(float));
00068     ASRL_ASSERT_EQ(result, CUDPP_SUCCESS, "Error creating scanPlan (column scan):" << result);
00069 
00070     result = cudppPlan(&rowPlan, config, width*height, height, norm_pitch/sizeof(float));
00071     ASRL_ASSERT_EQ(result,CUDPP_SUCCESS, "Error creating scanPlan (row scan): " << result);
00072   }
00073 
00074   GpuIntegralImageProcessor::~GpuIntegralImageProcessor()
00075   {
00076     try{
00077       CUDPPResult result = cudppDestroyPlan(colPlan);
00078       if(result != CUDPP_SUCCESS)
00079         std::cerr << "Unable to destroy column plan. Err code: " << result;
00080     } catch(std::exception const & e) {
00081       std::cout << e.what() << std::endl;
00082     }
00083     try{
00084       CUDPPResult result = cudppDestroyPlan(rowPlan);
00085       if(result != CUDPP_SUCCESS)
00086         std::cerr << "Unable to destroy column plan. Err code: " << result;
00087     } catch(std::exception const & e) {
00088       std::cout << e.what() << std::endl;
00089     }
00090   }
00091 
00092 
00093   void GpuIntegralImageProcessor::upload(cv::Mat & image)
00094   {
00095     ASRL_ASSERT_EQ(image.type(),CV_8UC1,"The image must be single channel, 8 bit");
00096     ASRL_ASSERT(image.isContinuous(),"The image must be tightly packed. width: " << image.cols << ", step: " << image.step);
00097     // This is synchronous. Asynchronous calls have to go through page-locked memory. I'm not sure it would be faster to
00098     // copy the buffer to PLM and then transfer asynchronously...
00099     cudaError_t err = cudaMemcpy2D( (void*) char_data.get(), char_pitch, (void*) image.ptr(), image.cols*sizeof(unsigned char),
00100                                     image.cols*sizeof(unsigned char), image.rows, cudaMemcpyHostToDevice );
00101     ASRL_ASSERT_EQ(err,cudaSuccess, "Unable to copy image to GPU: (" << err << "): " << cudaGetErrorString(err));               
00102   }
00103 
00104   void GpuIntegralImageProcessor::process(cv::Mat & image, GpuIntegralImage & outImage, cudaStream_t stream){
00105     upload(image);
00106     process(outImage, stream);
00107   }
00108 
00109   void GpuIntegralImageProcessor::process(GpuIntegralImage & outImage, cudaStream_t stream)
00110   {
00111     call_integral_kernel((size_t) width(), (size_t)height(),
00112                          char_data.get(),               // initial storage on the gpu of the unsigned char data
00113                          norm_data.get(), 
00114                          trans_data.get(),      // storage of floats on the gpu
00115                          outImage.d_get(),
00116                          norm_pitch, 
00117                          trans_pitch, 
00118                          char_pitch,
00119                          rowPlan,
00120                          colPlan,
00121                          stream);
00122   }
00123 
00124 } // namespace asrl
Autonomous Space Robotics Lab

Speeded Up SURF

GpuIntegralImageProcessor.cpp