00001
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00041
00042 #ifndef ASRL_CUDA_SYNCHRONIZED_MEMORY_HPP
00043 #define ASRL_CUDA_SYNCHRONIZED_MEMORY_HPP
00044
00045 #include <builtin_types.h>
00046 #include <cuda_runtime_api.h>
00047 #include "assert_macros.hpp"
00048 #include <limits>
00049 #include <iostream>
00050 #include <typeinfo>
00051 #include <cstring>
00052
00053 namespace asrl {
00054
00064 template<typename T>
00065 class CudaSynchronizedMemory
00066 {
00071 CudaSynchronizedMemory(CudaSynchronizedMemory const & rhs);
00072
00077 CudaSynchronizedMemory & operator=(CudaSynchronizedMemory const & rhs);
00078 public:
00080 typedef T value_type;
00081
00086 CudaSynchronizedMemory();
00087
00095 CudaSynchronizedMemory(unsigned N, bool pageLocked = false);
00096
00097
00101 ~CudaSynchronizedMemory();
00102
00110 void init(unsigned N, bool pageLocked = false);
00111
00112
00113
00114
00119 void reset();
00120
00121
00126 T * begin();
00127
00133 T * end();
00134
00139 T * h_get();
00140
00146 T const * h_get() const;
00147
00148
00154 T * d_get();
00155
00161 T const * d_get() const;
00162
00168 size_t size() const;
00169
00170
00178 T & operator[](size_t i);
00179
00187 T const & operator[](size_t i) const;
00188
00194 void pushToDevice(size_t nElements = std::numeric_limits<size_t>::max());
00195
00201 void pullFromDevice(size_t nElements = std::numeric_limits<size_t>::max());
00202
00209 void pullFromDeviceAsync(cudaStream_t stream, size_t nElements = std::numeric_limits<size_t>::max());
00210
00215 void memsetHost(int val);
00216
00221 void memsetDevice(int val);
00222
00227 void memset(int val);
00228
00233 bool isPageLocked();
00234 private:
00235
00236 void freeHostMemory();
00237 void freeDeviceMemory();
00238
00240 T * m_host;
00242 T * m_device;
00244 size_t m_size;
00246 bool m_pageLocked;
00247
00248 };
00249
00250
00251 template<typename T>
00252 CudaSynchronizedMemory<T>::CudaSynchronizedMemory() : m_host(0), m_device(0), m_size(0), m_pageLocked(false) {}
00253
00254 template<typename T>
00255 CudaSynchronizedMemory<T>::CudaSynchronizedMemory(unsigned N, bool pageLocked) : m_host(0), m_device(0), m_size(0), m_pageLocked(false)
00256 {
00257 init(N, pageLocked);
00258 }
00259
00260 template<typename T>
00261 CudaSynchronizedMemory<T>::~CudaSynchronizedMemory()
00262 {
00263 try
00264 {
00265 reset();
00266 }
00267 catch(std::exception const & e)
00268 {
00269 std::cerr << "Error in " << __FUNCTION__ << ": " << e.what() << std::endl;
00270 }
00271 }
00272
00273 template<typename T>
00274 void CudaSynchronizedMemory<T>::reset()
00275 {
00276 m_size = 0;
00277 freeHostMemory();
00278 freeDeviceMemory();
00279
00280 m_pageLocked = false;
00281 }
00282
00283 template<typename T>
00284 void CudaSynchronizedMemory<T>::freeHostMemory()
00285 {
00286 if(m_host != NULL)
00287 {
00288 if(m_pageLocked)
00289 {
00290 cudaError_t err = cudaFreeHost(m_host);
00291 ASRL_ASSERT_EQ(err, cudaSuccess,
00292 "Unable to free page-locked host memory for " << m_size << " elements of type "
00293 << typeid(T).name() << ": " << cudaGetErrorString(err));
00294 }
00295 else
00296 {
00297 delete [] m_host;
00298 }
00299 m_host = NULL;
00300 }
00301 }
00302
00303 template<typename T>
00304 void CudaSynchronizedMemory<T>::freeDeviceMemory()
00305 {
00306 if(m_device != NULL)
00307 {
00308 cudaError_t err = cudaFree(m_device);
00309 ASRL_ASSERT_EQ(err, cudaSuccess,
00310 "Unable to free device memory for " << m_size << " elements of type "
00311 << typeid(T).name() << ": " << cudaGetErrorString(err));
00312
00313 m_device = NULL;
00314 }
00315
00316 }
00317
00318
00319 template<typename T>
00320 void CudaSynchronizedMemory<T>::memset(int val)
00321 {
00322 ASRL_ASSERT_GT_DBG(m_size, 0, "The array is empty");
00323 memsetHost(val);
00324 memsetDevice(val);
00325 }
00326
00327 template<typename T>
00328 void CudaSynchronizedMemory<T>::memsetHost(int val)
00329 {
00330 ASRL_ASSERT_GT_DBG(m_size, 0, "The array is empty");
00331 ::memset(m_host, val, m_size * sizeof(T));
00332 }
00333
00334 template<typename T>
00335 void CudaSynchronizedMemory<T>::memsetDevice(int val)
00336 {
00337 ASRL_ASSERT_GT_DBG(m_size, 0, "The array is empty");
00338 cudaError_t err = cudaMemset(m_device,val,m_size*sizeof(T));
00339 ASRL_ASSERT_EQ(err, cudaSuccess, "Unable to set device memory");
00340 }
00341
00342
00343 template<typename T>
00344 void CudaSynchronizedMemory<T>::init(unsigned N, bool pageLocked)
00345 {
00346 try {
00347 if(pageLocked) {
00348 cudaError_t e = cudaMallocHost((void**)&m_host,N*sizeof(T));
00349 ASRL_ASSERT_EQ(e, cudaSuccess,
00350 "Unable to allocate page-locked host memory for " << N << " elements of type "
00351 << typeid(T).name() << ": " << cudaGetErrorString(e));
00352
00353 } else {
00354 try {
00355 m_host = new T[N];
00356 } catch(std::exception const & e) {
00357 ASRL_THROW(
00358 "Unable to allocate host memory for " << N << " elements of type "
00359 << typeid(T).name() << ": " << e.what());
00360 }
00361 }
00362
00363 cudaError_t err = cudaMalloc((void**)&m_device,N*sizeof(T));
00364 ASRL_ASSERT_EQ(err, cudaSuccess,
00365 "Unable to allocate device memory for " << N << " elements of type "
00366 << typeid(T).name() << ": " << cudaGetErrorString(err));
00367
00368 m_size = N;
00369 m_pageLocked = pageLocked;
00370 } catch(std::exception const &)
00371 {
00372 reset();
00373 throw;
00374 }
00375 }
00376
00377 template<typename T>
00378 bool CudaSynchronizedMemory<T>::isPageLocked()
00379 {
00380 return m_pageLocked;
00381 }
00382
00383 template<typename T>
00384 T * CudaSynchronizedMemory<T>::begin()
00385 {
00386 ASRL_ASSERT_GT_DBG(m_size, 0, "The array is empty");
00387 return m_host;
00388 }
00389
00390 template<typename T>
00391 T * CudaSynchronizedMemory<T>::end()
00392 {
00393 ASRL_ASSERT_GT_DBG(m_size, 0, "The array is empty");
00394 return m_host + m_size;
00395 }
00396
00397 template<typename T>
00398 T * CudaSynchronizedMemory<T>::h_get() { return m_host; }
00399
00400 template<typename T>
00401 T const * CudaSynchronizedMemory<T>::h_get() const { return m_host; }
00402
00403 template<typename T>
00404 T * CudaSynchronizedMemory<T>::d_get() { return m_device; }
00405
00406 template<typename T>
00407 T const * CudaSynchronizedMemory<T>::d_get() const { return m_device; }
00408
00409 template<typename T>
00410 size_t CudaSynchronizedMemory<T>::size() const { return m_size; }
00411
00412 template<typename T>
00413 T & CudaSynchronizedMemory<T>::operator[](size_t i) {
00414 ASRL_ASSERT_GT_DBG(m_size, 0, "The array is empty");
00415 ASRL_ASSERT_LT_DBG(i, m_size, "Index out of range: " << i << " >= " << m_size);
00416 return m_host[i];
00417 }
00418
00419 template<typename T>
00420 T const & CudaSynchronizedMemory<T>::operator[](size_t i) const {
00421 ASRL_ASSERT_GT_DBG(m_size, 0, "The array is empty");
00422 ASRL_ASSERT_LT_DBG(i, m_size, "Index out of range: " << i << " >= " << m_size);
00423 return m_host[i];
00424 }
00425
00426 template<typename T>
00427 void CudaSynchronizedMemory<T>::pushToDevice(size_t nElements)
00428 {
00429 ASRL_ASSERT_GT_DBG(m_size, 0, "The array is empty");
00430 if(nElements > m_size)
00431 nElements = m_size;
00432
00433 cudaError_t err = (cudaMemcpy((void*) m_device, (void *)m_host, nElements*sizeof(T), cudaMemcpyHostToDevice));
00434 ASRL_ASSERT_EQ(err, cudaSuccess, "Unable to copy " << typeid(T).name() << " array of size " << m_size << " to device (" << err << "): " << cudaGetErrorString(err));
00435 }
00436
00437 template<typename T>
00438 void CudaSynchronizedMemory<T>::pullFromDevice(size_t nElements)
00439 {
00440 ASRL_ASSERT_GT_DBG(m_size,0, "The array is empty");
00441 if(nElements > m_size)
00442 nElements = m_size;
00443 cudaError_t err = (cudaMemcpy((void*) m_host, (void *)m_device, nElements*sizeof(T), cudaMemcpyDeviceToHost));
00444 ASRL_ASSERT_EQ(err, cudaSuccess, "Unable to copy " << typeid(T).name() << " array of size " << m_size << " from device (" << err << "): " << cudaGetErrorString(err));
00445 }
00446
00447 template<typename T>
00448 void CudaSynchronizedMemory<T>::pullFromDeviceAsync(cudaStream_t stream, size_t nElements)
00449 {
00450 ASRL_ASSERT_GT(m_size,0, "The array is empty");
00451 ASRL_ASSERT(m_pageLocked, "Asynchronous transfer is only valid for page-locked host memory");
00452 if(nElements > m_size)
00453 nElements = m_size;
00454 cudaError_t err = (cudaMemcpyAsync((void*) m_host, (void *)m_device, nElements*sizeof(T), cudaMemcpyDeviceToHost, stream));
00455 ASRL_ASSERT_EQ(err,cudaSuccess, "Unable to copy " << typeid(T).name() << " array of size " << m_size << " from device. Stream " << stream << ": (" << err << "): " << cudaGetErrorString(err));
00456 }
00457
00458 }
00459
00460 #endif // ASRL_CUDA_SYNCHRONIZED_MEMORY_HPP