Refreshing code with changes from recent development.

romerojosh · romerojosh · commit b794717791ce · 2020-10-01T12:55:01.000-07:00
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -30,7 +30,7 @@ endif()
 #decide whether to use CUDA or not
 #find_package(CUDAToolkit REQUIRED)
 if(NOT CUDA_COMPUTE_CAPABILITY)
-  set(CUDA_COMPUTE_CAPABILITY 70)
+  set(CUDA_COMPUTE_CAPABILITY 70 80)
 endif()
 
 #Find OpenMP
diff --git a/cuslines/cuslines.cpp b/cuslines/cuslines.cpp
@@ -60,7 +60,11 @@ py::capsule cleanup(T* ptr) {
 
 class GPUTracker {
   public:
-    GPUTracker(np_array_cast dataf,
+    GPUTracker(double max_angle,
+               double min_signal,
+               double tc_threshold,
+               double step_size,
+               np_array_cast dataf,
                np_array H,
                np_array R,
                np_array delta_b,
@@ -117,6 +121,11 @@ class GPUTracker {
       std::cerr << "Creating GPUTracker with " << ngpus << " GPUs..." << std::endl;
       ngpus_ = ngpus;
 
+      max_angle_ = max_angle;
+      min_signal_ = min_signal;
+      tc_threshold_ = tc_threshold;
+      step_size_ = step_size;
+
       // Allocate/copy constant problem data on GPUs
       dataf_d.resize(ngpus_, nullptr);
       H_d.resize(ngpus_, nullptr);
@@ -132,7 +141,8 @@ class GPUTracker {
       //#pragma omp parallel for
       for (int n = 0; n < ngpus_; ++n) {
         CHECK_CUDA(cudaSetDevice(n));
-        CHECK_CUDA(cudaMalloc(&dataf_d[n], sizeof(*dataf_d[n]) * dataf_info.size));
+        CHECK_CUDA(cudaMallocManaged(&dataf_d[n], sizeof(*dataf_d[n]) * dataf_info.size));
+        CHECK_CUDA(cudaMemAdvise(dataf_d[n], sizeof(*dataf_d[n]) * dataf_info.size, cudaMemAdviseSetPreferredLocation, n));
         CHECK_CUDA(cudaMalloc(&H_d[n], sizeof(*H_d[n]) * H_info.size));
         CHECK_CUDA(cudaMalloc(&R_d[n], sizeof(*R_d[n]) * R_info.size));
         CHECK_CUDA(cudaMalloc(&delta_b_d[n], sizeof(*delta_b_d[n]) * delta_b_info.size));
@@ -210,7 +220,8 @@ class GPUTracker {
       std::vector<int> nSlines(ngpus_);
 
       // Call GPU routine
-      generate_streamlines_cuda_mgpu(nseeds, seeds_d,
+      generate_streamlines_cuda_mgpu(max_angle_, min_signal_, tc_threshold_, step_size_,
+                                     nseeds, seeds_d,
                                      dimx_, dimy_, dimz_, dimt_,
                                      dataf_d, H_d, R_d, delta_nr_, delta_b_d, delta_q_d, b0s_mask_d, metric_map_d, samplm_nr_, sampling_matrix_d,
                                      sphere_vertices_d, sphere_edges_d, nedges_,
@@ -270,6 +281,11 @@ class GPUTracker {
     int nedges_;
     int delta_nr_, samplm_nr_;
 
+    double max_angle_;
+    double tc_threshold_;
+    double min_signal_;
+    double step_size_;
+
     std::vector<int> nSlines_old_;
     std::vector<REAL*> slines_;
     std::vector<int*> slinesLen_;
@@ -292,12 +308,14 @@ class GPUTracker {
 
 PYBIND11_MODULE(cuslines, m) {
   py::class_<GPUTracker>(m, "GPUTracker")
-    .def(py::init<np_array_cast, np_array,
+    .def(py::init<double, double, double, double,
+		  np_array_cast, np_array,
                   np_array, np_array,
                   np_array, np_array_int,
                   np_array, np_array,
                   np_array, np_array_int,
                   int, int, int>(),
+                  py::arg().noconvert(), py::arg().noconvert(), py::arg().noconvert(), py::arg().noconvert(),
                   py::arg().noconvert(), py::arg().noconvert(),
                   py::arg().noconvert(), py::arg().noconvert(),
                   py::arg().noconvert(), py::arg().noconvert(),
diff --git a/cuslines/generate_streamlines_cuda.cu b/cuslines/generate_streamlines_cuda.cu
@@ -92,7 +92,7 @@ __device__ int trilinear_interp_d(const int dimx,
                 return -1;
         }
 
-        int  coo[3][2];
+        long long  coo[3][2];
         REAL wgh[3][2]; // could use just one...
 
         const REAL_T ONE  = static_cast<REAL_T>(1.0);
@@ -647,7 +647,8 @@ template<int BDIM_X,
          int BDIM_Y,
          typename REAL_T,
          typename REAL3_T>
-__device__ int closest_peak_d(const REAL3_T  direction, //dir
+__device__ int closest_peak_d(const REAL_T max_angle,
+			      const REAL3_T  direction, //dir
                               const int npeaks,
                               const REAL3_T *__restrict__ peaks,
                                     REAL3_T *__restrict__ peak) {// dirs,
@@ -657,7 +658,8 @@ __device__ int closest_peak_d(const REAL3_T  direction, //dir
         const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32;
         const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1)));
 
-        const REAL_T cos_similarity = COS(MAX_ANGLE_P);
+        //const REAL_T cos_similarity = COS(MAX_ANGLE_P);
+        const REAL_T cos_similarity = COS(max_angle);
 #if 0
         if (!threadIdx.y && !tidx) {
                 printf("direction: (%f, %f, %f)\n",
@@ -804,7 +806,9 @@ template<int BDIM_X,
          typename REAL_T,
          typename REAL3_T>
 __device__ int get_direction_d(curandStatePhilox4_32_10_t *st,
-                               REAL3_T dir,
+                               const REAL_T max_angle,
+			       const REAL_T min_signal,
+			       REAL3_T dir,
                                const int dimx,
                                const int dimy,
                                const int dimz,
@@ -919,7 +923,8 @@ __device__ int get_direction_d(curandStatePhilox4_32_10_t *st,
 			//__syncwarp();
 
 			for(int j = tidx; j < dimt; j += BDIM_X) {
-				__vox_data_sh[j] = MAX(MIN_SIGNAL_P, __vox_data_sh[j]);
+				//__vox_data_sh[j] = MAX(MIN_SIGNAL_P, __vox_data_sh[j]);
+				__vox_data_sh[j] = MAX(min_signal, __vox_data_sh[j]);
 			}
 			__syncwarp(WMASK);
 
@@ -1021,7 +1026,7 @@ __device__ int get_direction_d(curandStatePhilox4_32_10_t *st,
                                 }
                                 */
                                 REAL3_T peak;
-                                const int foundPeak = closest_peak_d<BDIM_X, BDIM_Y, REAL_T, REAL3_T>(dir, ndir, dirs, &peak);
+                                const int foundPeak = closest_peak_d<BDIM_X, BDIM_Y, REAL_T, REAL3_T>(max_angle, dir, ndir, dirs, &peak);
                                 __syncwarp(WMASK);
                                 if (foundPeak) {
                                         if (tidx == 0) {
@@ -1041,7 +1046,8 @@ template<int BDIM_X,
          int BDIM_Y,
          typename REAL_T,
          typename REAL3_T>
-__device__ int check_point_d(const REAL3_T point,
+__device__ int check_point_d(const REAL_T tc_threshold,
+			     const REAL3_T point,
                              const int dimx,
                              const int dimy,
                              const int dimz,
@@ -1064,14 +1070,19 @@ __device__ int check_point_d(const REAL3_T point,
         if (rv != 0) {
                 return OUTSIDEIMAGE;
         }
-        return (__shInterpOut[tidy] > TC_THRESHOLD_P) ? TRACKPOINT : ENDPOINT;
+        //return (__shInterpOut[tidy] > TC_THRESHOLD_P) ? TRACKPOINT : ENDPOINT;
+        return (__shInterpOut[tidy] > tc_threshold) ? TRACKPOINT : ENDPOINT;
 }
 
 template<int BDIM_X,
          int BDIM_Y,
          typename REAL_T,
          typename REAL3_T>
 __device__ int tracker_d(curandStatePhilox4_32_10_t *st,
+			 const REAL_T max_angle,
+			 const REAL_T min_signal,
+			 const REAL_T tc_threshold,
+			 const REAL_T step_size,
                          REAL3_T seed,
                          REAL3_T first_step,
                          REAL3_T voxel_size,
@@ -1088,7 +1099,7 @@ __device__ int tracker_d(curandStatePhilox4_32_10_t *st,
                          // max_angle, pmf_threshold from global defines
                          // b0s_mask already passed
                          // min_signal from global defines
-                         // tc_threashold from global defines
+                         // tc_threshold from global defines
                          // pmf_threashold from global defines
                          const REAL_T *__restrict__ metric_map,
 			 const int delta_nr,
@@ -1131,6 +1142,8 @@ __device__ int tracker_d(curandStatePhilox4_32_10_t *st,
                 int ndir = get_direction_d<BDIM_X,
                                            BDIM_Y,
                                            5>(st,
+					      max_angle,
+					      min_signal,
                                               direction,
                                               dimx, dimy, dimz, dimt, dataf,
                                               b0s_mask /* !dwi_mask */,
@@ -1161,9 +1174,12 @@ __device__ int tracker_d(curandStatePhilox4_32_10_t *st,
                 }
                 //return;
 #endif
-                point.x += (direction.x / voxel_size.x) * STEP_SIZE_P;
-                point.y += (direction.y / voxel_size.y) * STEP_SIZE_P;
-                point.z += (direction.z / voxel_size.z) * STEP_SIZE_P;
+                //point.x += (direction.x / voxel_size.x) * STEP_SIZE_P;
+                //point.y += (direction.y / voxel_size.y) * STEP_SIZE_P;
+                //point.z += (direction.z / voxel_size.z) * STEP_SIZE_P;
+                point.x += (direction.x / voxel_size.x) * step_size;
+                point.y += (direction.y / voxel_size.y) * step_size;
+                point.z += (direction.z / voxel_size.z) * step_size;
 
                 if (tidx == 0) {
                         streamline[i] = point;
@@ -1175,7 +1191,7 @@ __device__ int tracker_d(curandStatePhilox4_32_10_t *st,
                 }
                 __syncwarp(WMASK);
 
-                tissue_class = check_point_d<BDIM_X, BDIM_Y>(point, dimx, dimy, dimz, metric_map); 
+                tissue_class = check_point_d<BDIM_X, BDIM_Y>(tc_threshold, point, dimx, dimy, dimz, metric_map);
 
                 if (tissue_class == ENDPOINT ||
                     tissue_class == INVALIDPOINT ||
@@ -1192,7 +1208,9 @@ template<int BDIM_X,
          int BDIM_Y,
          typename REAL_T,
          typename REAL3_T>
-__global__ void getNumStreamlines_k(const long long rndSeed,
+__global__ void getNumStreamlines_k(const REAL_T max_angle,
+				    const REAL_T min_signal,
+				    const long long rndSeed,
                                     const int rndOffset,
                                     const int nseed,
                                     const REAL3_T *__restrict__ seeds,
@@ -1244,6 +1262,8 @@ __global__ void getNumStreamlines_k(const long long rndSeed,
         int ndir = get_direction_d<BDIM_X,
                                    BDIM_Y,
                                    1>(&st,
+				      max_angle,
+				      min_signal,
                                       MAKE_REAL3(0,0,0),
                                       dimx, dimy, dimz, dimt, dataf,
                                       b0s_mask /* !dwi_mask */,
@@ -1280,7 +1300,11 @@ template<int BDIM_X,
          int BDIM_Y,
          typename REAL_T,
          typename REAL3_T>
-__global__ void genStreamlinesMerge_k(const long long rndSeed,
+__global__ void genStreamlinesMerge_k(const REAL_T max_angle,
+				      const REAL_T min_signal,
+				      const REAL_T tc_threshold,
+				      const REAL_T step_size,
+				      const long long rndSeed,
                                       const int rndOffset,
                                       const int nseed,
                                       const REAL3_T *__restrict__ seeds,
@@ -1358,6 +1382,10 @@ __global__ void genStreamlinesMerge_k(const long long rndSeed,
                 int stepsB;
                 const int tissue_classB = tracker_d<BDIM_X,
                                                     BDIM_Y>(&st,
+						            max_angle,
+						            min_signal,
+							    tc_threshold,
+							    step_size,
                                                             seed,
                                                             MAKE_REAL3(-first_step.x, -first_step.y, -first_step.z),
                                                             MAKE_REAL3(1, 1, 1),
@@ -1391,6 +1419,10 @@ __global__ void genStreamlinesMerge_k(const long long rndSeed,
                 int stepsF;
                 const int tissue_classF = tracker_d<BDIM_X,
                                                     BDIM_Y>(&st,
+						            max_angle,
+						            min_signal,
+							    tc_threshold,
+							    step_size,
                                                             seed,
                                                             first_step,
                                                             MAKE_REAL3(1, 1, 1),
@@ -1433,7 +1465,8 @@ __global__ void genStreamlinesMerge_k(const long long rndSeed,
         return;
 }
 
-void generate_streamlines_cuda_mgpu(const int nseeds, const std::vector<REAL*> &seeds_d,
+void generate_streamlines_cuda_mgpu(const REAL max_angle, const REAL min_signal, const REAL tc_threshold, const REAL step_size,
+                                    const int nseeds, const std::vector<REAL*> &seeds_d,
                                     const int dimx, const int dimy, const int dimz, const int dimt,
                                     const std::vector<REAL*> &dataf_d, const std::vector<REAL*> &H_d, const std::vector<REAL*> &R_d,
 			            const int delta_nr,
@@ -1464,10 +1497,6 @@ void generate_streamlines_cuda_mgpu(const int nseeds, const std::vector<REAL*> &
     CHECK_CUDA(cudaMalloc(&shDirTemp1_d[n], sizeof(*shDirTemp1_d[n])*samplm_nr*grid.x*block.y));
   }
 
-
-//  int delta_nr = 28;    // TO BE MADE PARAMETERS!
-//  int samplm_nr = 181;
-
   int n32dimt = ((dimt+31)/32)*32;
 
   size_t shSizeGNS = sizeof(REAL)*(THR_X_BL/THR_X_SL)*(2*n32dimt + 2*MAX(n32dimt, samplm_nr)) + // for get_direction_d
@@ -1486,7 +1515,9 @@ void generate_streamlines_cuda_mgpu(const int nseeds, const std::vector<REAL*> &
     // Precompute number of streamlines before allocating memory
     getNumStreamlines_k<THR_X_SL,
                         THR_X_BL/THR_X_SL>
-                        <<<grid, block, shSizeGNS>>>(rng_seed,
+                        <<<grid, block, shSizeGNS>>>(max_angle,
+						     min_signal,
+						     rng_seed,
 						     rng_offset + n*nseeds_per_gpu,
 						     nseeds_gpu,
 						     reinterpret_cast<const REAL3 *>(seeds_d[n]),
@@ -1591,7 +1622,11 @@ void generate_streamlines_cuda_mgpu(const int nseeds, const std::vector<REAL*> &
     //fprintf(stderr, "Launching kernel with %u blocks of size (%u, %u)\n", grid.x, block.x, block.y);
     genStreamlinesMerge_k<THR_X_SL,
                           THR_X_BL/THR_X_SL>
-                          <<<grid, block, shSizeGNS, streams[n]>>>(rng_seed,
+                          <<<grid, block, shSizeGNS, streams[n]>>>(max_angle,
+								   min_signal,
+								   tc_threshold,
+								   step_size,
+								   rng_seed,
 								   rng_offset + n*nseeds_per_gpu,
 								   nseeds_gpu,
 								   reinterpret_cast<const REAL3 *>(seeds_d[n]),
diff --git a/cuslines/generate_streamlines_cuda.h b/cuslines/generate_streamlines_cuda.h
@@ -33,7 +33,8 @@
 
 #include "globals.h"
 
-void generate_streamlines_cuda_mgpu(const int nseeds, const std::vector<REAL*> &seeds_d,
+void generate_streamlines_cuda_mgpu(const REAL max_angle, const REAL min_signal, const REAL tc_threshold, const REAL step_size,
+				    const int nseeds, const std::vector<REAL*> &seeds_d,
                                     const int dimx, const int dimy, const int dimz, const int dimt,
                                     const std::vector<REAL*> &dataf_d, const std::vector<REAL*> &H_d, const std::vector<REAL*> &R_d,
 			            const int delta_nr,
diff --git a/cuslines/globals.h b/cuslines/globals.h
@@ -69,10 +69,11 @@
 
 #define MAX_SLINE_LEN	(501)
 #define PMF_THRESHOLD_P	((REAL)0.1)
-#define TC_THRESHOLD_P	((REAL)0.1)
-#define STEP_SIZE_P	((REAL)0.5)
-#define MAX_ANGLE_P	((REAL)1.0471975511965976) // 60 deg in radians
-#define MIN_SIGNAL_P	((REAL)1.0)
+
+//#define TC_THRESHOLD_P	((REAL)0.1)
+//#define STEP_SIZE_P	((REAL)0.5)  // only for TRK generation
+//#define MAX_ANGLE_P	((REAL)1.0471975511965976) // 60 deg in radians
+//#define MIN_SIGNAL_P	((REAL)1.0)
 
 #define MAX_SLINES_PER_SEED (10)
 
diff --git a/docker/Dockerfile b/docker/Dockerfile
@@ -8,7 +8,7 @@ ENV DEBIAN_FRONTEND=noninteractive
 RUN apt update && \
     apt install --assume-yes apt-transport-https \
     	ca-certificates gnupg \
-	software-properties-common gcc git wget
+	software-properties-common gcc git wget numactl
 RUN wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null \
     	      | gpg --dearmor - | tee /etc/apt/trusted.gpg.d/kitware.gpg >/dev/null
 RUN apt-add-repository "deb https://apt.kitware.com/ubuntu/ focal main"
@@ -23,7 +23,7 @@ ENV PATH /opt/anaconda/bin:${PATH}
 ENV LD_LIBRARY_PATH /opt/anaconda/lib:${LD_LIBRARY_PATH}
 
 # python prereqs
-RUN pip install numpy scipy cython nibabel dipy
+RUN pip install numpy scipy cython nibabel dipy tqdm
 
 # copy stuff
 COPY CMakeLists.txt /opt/GPUStreamlines/CMakeLists.txt
diff --git a/run_dipy_gpu.py b/run_dipy_gpu.py
diff --git a/run_dipy_gpu_hardi.py b/run_dipy_gpu_hardi.py