@@ -92,7 +92,7 @@ __device__ int trilinear_interp_d(const int dimx,
9292 return -1 ;
9393 }
9494
95- int coo[3 ][2 ];
95+ long long coo[3 ][2 ];
9696 REAL wgh[3 ][2 ]; // could use just one...
9797
9898 const REAL_T ONE = static_cast <REAL_T>(1.0 );
@@ -647,7 +647,8 @@ template<int BDIM_X,
647647 int BDIM_Y,
648648 typename REAL_T,
649649 typename REAL3_T>
650- __device__ int closest_peak_d (const REAL3_T direction, // dir
650+ __device__ int closest_peak_d (const REAL_T max_angle,
651+ const REAL3_T direction, // dir
651652 const int npeaks,
652653 const REAL3_T *__restrict__ peaks,
653654 REAL3_T *__restrict__ peak) {// dirs,
@@ -657,7 +658,8 @@ __device__ int closest_peak_d(const REAL3_T direction, //dir
657658 const int lid = (threadIdx .y *BDIM_X + threadIdx .x ) % 32 ;
658659 const unsigned int WMASK = ((1ull << BDIM_X)-1 ) << (lid & (~(BDIM_X-1 )));
659660
660- const REAL_T cos_similarity = COS (MAX_ANGLE_P);
661+ // const REAL_T cos_similarity = COS(MAX_ANGLE_P);
662+ const REAL_T cos_similarity = COS (max_angle);
661663#if 0
662664 if (!threadIdx.y && !tidx) {
663665 printf("direction: (%f, %f, %f)\n",
@@ -804,7 +806,9 @@ template<int BDIM_X,
804806 typename REAL_T,
805807 typename REAL3_T>
806808__device__ int get_direction_d (curandStatePhilox4_32_10_t *st,
807- REAL3_T dir,
809+ const REAL_T max_angle,
810+ const REAL_T min_signal,
811+ REAL3_T dir,
808812 const int dimx,
809813 const int dimy,
810814 const int dimz,
@@ -919,7 +923,8 @@ __device__ int get_direction_d(curandStatePhilox4_32_10_t *st,
919923 // __syncwarp();
920924
921925 for (int j = tidx; j < dimt; j += BDIM_X) {
922- __vox_data_sh[j] = MAX (MIN_SIGNAL_P, __vox_data_sh[j]);
926+ // __vox_data_sh[j] = MAX(MIN_SIGNAL_P, __vox_data_sh[j]);
927+ __vox_data_sh[j] = MAX (min_signal, __vox_data_sh[j]);
923928 }
924929 __syncwarp (WMASK);
925930
@@ -1021,7 +1026,7 @@ __device__ int get_direction_d(curandStatePhilox4_32_10_t *st,
10211026 }
10221027 */
10231028 REAL3_T peak;
1024- const int foundPeak = closest_peak_d<BDIM_X, BDIM_Y, REAL_T, REAL3_T>(dir, ndir, dirs, &peak);
1029+ const int foundPeak = closest_peak_d<BDIM_X, BDIM_Y, REAL_T, REAL3_T>(max_angle, dir, ndir, dirs, &peak);
10251030 __syncwarp (WMASK);
10261031 if (foundPeak) {
10271032 if (tidx == 0 ) {
@@ -1041,7 +1046,8 @@ template<int BDIM_X,
10411046 int BDIM_Y,
10421047 typename REAL_T,
10431048 typename REAL3_T>
1044- __device__ int check_point_d (const REAL3_T point,
1049+ __device__ int check_point_d (const REAL_T tc_threshold,
1050+ const REAL3_T point,
10451051 const int dimx,
10461052 const int dimy,
10471053 const int dimz,
@@ -1064,14 +1070,19 @@ __device__ int check_point_d(const REAL3_T point,
10641070 if (rv != 0 ) {
10651071 return OUTSIDEIMAGE;
10661072 }
1067- return (__shInterpOut[tidy] > TC_THRESHOLD_P) ? TRACKPOINT : ENDPOINT;
1073+ // return (__shInterpOut[tidy] > TC_THRESHOLD_P) ? TRACKPOINT : ENDPOINT;
1074+ return (__shInterpOut[tidy] > tc_threshold) ? TRACKPOINT : ENDPOINT;
10681075}
10691076
10701077template <int BDIM_X,
10711078 int BDIM_Y,
10721079 typename REAL_T,
10731080 typename REAL3_T>
10741081__device__ int tracker_d (curandStatePhilox4_32_10_t *st,
1082+ const REAL_T max_angle,
1083+ const REAL_T min_signal,
1084+ const REAL_T tc_threshold,
1085+ const REAL_T step_size,
10751086 REAL3_T seed,
10761087 REAL3_T first_step,
10771088 REAL3_T voxel_size,
@@ -1088,7 +1099,7 @@ __device__ int tracker_d(curandStatePhilox4_32_10_t *st,
10881099 // max_angle, pmf_threshold from global defines
10891100 // b0s_mask already passed
10901101 // min_signal from global defines
1091- // tc_threashold from global defines
1102+ // tc_threshold from global defines
10921103 // pmf_threashold from global defines
10931104 const REAL_T *__restrict__ metric_map,
10941105 const int delta_nr,
@@ -1131,6 +1142,8 @@ __device__ int tracker_d(curandStatePhilox4_32_10_t *st,
11311142 int ndir = get_direction_d<BDIM_X,
11321143 BDIM_Y,
11331144 5 >(st,
1145+ max_angle,
1146+ min_signal,
11341147 direction,
11351148 dimx, dimy, dimz, dimt, dataf,
11361149 b0s_mask /* !dwi_mask */ ,
@@ -1161,9 +1174,12 @@ __device__ int tracker_d(curandStatePhilox4_32_10_t *st,
11611174 }
11621175 //return;
11631176#endif
1164- point.x += (direction.x / voxel_size.x ) * STEP_SIZE_P;
1165- point.y += (direction.y / voxel_size.y ) * STEP_SIZE_P;
1166- point.z += (direction.z / voxel_size.z ) * STEP_SIZE_P;
1177+ // point.x += (direction.x / voxel_size.x) * STEP_SIZE_P;
1178+ // point.y += (direction.y / voxel_size.y) * STEP_SIZE_P;
1179+ // point.z += (direction.z / voxel_size.z) * STEP_SIZE_P;
1180+ point.x += (direction.x / voxel_size.x ) * step_size;
1181+ point.y += (direction.y / voxel_size.y ) * step_size;
1182+ point.z += (direction.z / voxel_size.z ) * step_size;
11671183
11681184 if (tidx == 0 ) {
11691185 streamline[i] = point;
@@ -1175,7 +1191,7 @@ __device__ int tracker_d(curandStatePhilox4_32_10_t *st,
11751191 }
11761192 __syncwarp (WMASK);
11771193
1178- tissue_class = check_point_d<BDIM_X, BDIM_Y>(point, dimx, dimy, dimz, metric_map);
1194+ tissue_class = check_point_d<BDIM_X, BDIM_Y>(tc_threshold, point, dimx, dimy, dimz, metric_map);
11791195
11801196 if (tissue_class == ENDPOINT ||
11811197 tissue_class == INVALIDPOINT ||
@@ -1192,7 +1208,9 @@ template<int BDIM_X,
11921208 int BDIM_Y,
11931209 typename REAL_T,
11941210 typename REAL3_T>
1195- __global__ void getNumStreamlines_k (const long long rndSeed,
1211+ __global__ void getNumStreamlines_k (const REAL_T max_angle,
1212+ const REAL_T min_signal,
1213+ const long long rndSeed,
11961214 const int rndOffset,
11971215 const int nseed,
11981216 const REAL3_T *__restrict__ seeds,
@@ -1244,6 +1262,8 @@ __global__ void getNumStreamlines_k(const long long rndSeed,
12441262 int ndir = get_direction_d<BDIM_X,
12451263 BDIM_Y,
12461264 1 >(&st,
1265+ max_angle,
1266+ min_signal,
12471267 MAKE_REAL3 (0 ,0 ,0 ),
12481268 dimx, dimy, dimz, dimt, dataf,
12491269 b0s_mask /* !dwi_mask */ ,
@@ -1280,7 +1300,11 @@ template<int BDIM_X,
12801300 int BDIM_Y,
12811301 typename REAL_T,
12821302 typename REAL3_T>
1283- __global__ void genStreamlinesMerge_k (const long long rndSeed,
1303+ __global__ void genStreamlinesMerge_k (const REAL_T max_angle,
1304+ const REAL_T min_signal,
1305+ const REAL_T tc_threshold,
1306+ const REAL_T step_size,
1307+ const long long rndSeed,
12841308 const int rndOffset,
12851309 const int nseed,
12861310 const REAL3_T *__restrict__ seeds,
@@ -1358,6 +1382,10 @@ __global__ void genStreamlinesMerge_k(const long long rndSeed,
13581382 int stepsB;
13591383 const int tissue_classB = tracker_d<BDIM_X,
13601384 BDIM_Y>(&st,
1385+ max_angle,
1386+ min_signal,
1387+ tc_threshold,
1388+ step_size,
13611389 seed,
13621390 MAKE_REAL3 (-first_step.x , -first_step.y , -first_step.z ),
13631391 MAKE_REAL3 (1 , 1 , 1 ),
@@ -1391,6 +1419,10 @@ __global__ void genStreamlinesMerge_k(const long long rndSeed,
13911419 int stepsF;
13921420 const int tissue_classF = tracker_d<BDIM_X,
13931421 BDIM_Y>(&st,
1422+ max_angle,
1423+ min_signal,
1424+ tc_threshold,
1425+ step_size,
13941426 seed,
13951427 first_step,
13961428 MAKE_REAL3 (1 , 1 , 1 ),
@@ -1433,7 +1465,8 @@ __global__ void genStreamlinesMerge_k(const long long rndSeed,
14331465 return ;
14341466}
14351467
1436- void generate_streamlines_cuda_mgpu (const int nseeds, const std::vector<REAL*> &seeds_d,
1468+ void generate_streamlines_cuda_mgpu (const REAL max_angle, const REAL min_signal, const REAL tc_threshold, const REAL step_size,
1469+ const int nseeds, const std::vector<REAL*> &seeds_d,
14371470 const int dimx, const int dimy, const int dimz, const int dimt,
14381471 const std::vector<REAL*> &dataf_d, const std::vector<REAL*> &H_d, const std::vector<REAL*> &R_d,
14391472 const int delta_nr,
@@ -1464,10 +1497,6 @@ void generate_streamlines_cuda_mgpu(const int nseeds, const std::vector<REAL*> &
14641497 CHECK_CUDA (cudaMalloc (&shDirTemp1_d[n], sizeof (*shDirTemp1_d[n])*samplm_nr*grid.x *block.y ));
14651498 }
14661499
1467-
1468- // int delta_nr = 28; // TO BE MADE PARAMETERS!
1469- // int samplm_nr = 181;
1470-
14711500 int n32dimt = ((dimt+31 )/32 )*32 ;
14721501
14731502 size_t shSizeGNS = sizeof (REAL)*(THR_X_BL/THR_X_SL)*(2 *n32dimt + 2 *MAX (n32dimt, samplm_nr)) + // for get_direction_d
@@ -1486,7 +1515,9 @@ void generate_streamlines_cuda_mgpu(const int nseeds, const std::vector<REAL*> &
14861515 // Precompute number of streamlines before allocating memory
14871516 getNumStreamlines_k<THR_X_SL,
14881517 THR_X_BL/THR_X_SL>
1489- <<<grid, block, shSizeGNS>>> (rng_seed,
1518+ <<<grid, block, shSizeGNS>>> (max_angle,
1519+ min_signal,
1520+ rng_seed,
14901521 rng_offset + n*nseeds_per_gpu,
14911522 nseeds_gpu,
14921523 reinterpret_cast <const REAL3 *>(seeds_d[n]),
@@ -1591,7 +1622,11 @@ void generate_streamlines_cuda_mgpu(const int nseeds, const std::vector<REAL*> &
15911622 // fprintf(stderr, "Launching kernel with %u blocks of size (%u, %u)\n", grid.x, block.x, block.y);
15921623 genStreamlinesMerge_k<THR_X_SL,
15931624 THR_X_BL/THR_X_SL>
1594- <<<grid, block, shSizeGNS, streams[n]>>> (rng_seed,
1625+ <<<grid, block, shSizeGNS, streams[n]>>> (max_angle,
1626+ min_signal,
1627+ tc_threshold,
1628+ step_size,
1629+ rng_seed,
15951630 rng_offset + n*nseeds_per_gpu,
15961631 nseeds_gpu,
15971632 reinterpret_cast <const REAL3 *>(seeds_d[n]),
0 commit comments