Skip to content

Commit b794717

Browse files
committed
Refreshing code with changes from recent development.
1 parent 0ecab0c commit b794717

File tree

8 files changed

+185
-76
lines changed

8 files changed

+185
-76
lines changed

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ endif()
3030
#decide whether to use CUDA or not
3131
#find_package(CUDAToolkit REQUIRED)
3232
if(NOT CUDA_COMPUTE_CAPABILITY)
33-
set(CUDA_COMPUTE_CAPABILITY 70)
33+
set(CUDA_COMPUTE_CAPABILITY 70 80)
3434
endif()
3535

3636
#Find OpenMP

cuslines/cuslines.cpp

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,11 @@ py::capsule cleanup(T* ptr) {
6060

6161
class GPUTracker {
6262
public:
63-
GPUTracker(np_array_cast dataf,
63+
GPUTracker(double max_angle,
64+
double min_signal,
65+
double tc_threshold,
66+
double step_size,
67+
np_array_cast dataf,
6468
np_array H,
6569
np_array R,
6670
np_array delta_b,
@@ -117,6 +121,11 @@ class GPUTracker {
117121
std::cerr << "Creating GPUTracker with " << ngpus << " GPUs..." << std::endl;
118122
ngpus_ = ngpus;
119123

124+
max_angle_ = max_angle;
125+
min_signal_ = min_signal;
126+
tc_threshold_ = tc_threshold;
127+
step_size_ = step_size;
128+
120129
// Allocate/copy constant problem data on GPUs
121130
dataf_d.resize(ngpus_, nullptr);
122131
H_d.resize(ngpus_, nullptr);
@@ -132,7 +141,8 @@ class GPUTracker {
132141
//#pragma omp parallel for
133142
for (int n = 0; n < ngpus_; ++n) {
134143
CHECK_CUDA(cudaSetDevice(n));
135-
CHECK_CUDA(cudaMalloc(&dataf_d[n], sizeof(*dataf_d[n]) * dataf_info.size));
144+
CHECK_CUDA(cudaMallocManaged(&dataf_d[n], sizeof(*dataf_d[n]) * dataf_info.size));
145+
CHECK_CUDA(cudaMemAdvise(dataf_d[n], sizeof(*dataf_d[n]) * dataf_info.size, cudaMemAdviseSetPreferredLocation, n));
136146
CHECK_CUDA(cudaMalloc(&H_d[n], sizeof(*H_d[n]) * H_info.size));
137147
CHECK_CUDA(cudaMalloc(&R_d[n], sizeof(*R_d[n]) * R_info.size));
138148
CHECK_CUDA(cudaMalloc(&delta_b_d[n], sizeof(*delta_b_d[n]) * delta_b_info.size));
@@ -210,7 +220,8 @@ class GPUTracker {
210220
std::vector<int> nSlines(ngpus_);
211221

212222
// Call GPU routine
213-
generate_streamlines_cuda_mgpu(nseeds, seeds_d,
223+
generate_streamlines_cuda_mgpu(max_angle_, min_signal_, tc_threshold_, step_size_,
224+
nseeds, seeds_d,
214225
dimx_, dimy_, dimz_, dimt_,
215226
dataf_d, H_d, R_d, delta_nr_, delta_b_d, delta_q_d, b0s_mask_d, metric_map_d, samplm_nr_, sampling_matrix_d,
216227
sphere_vertices_d, sphere_edges_d, nedges_,
@@ -270,6 +281,11 @@ class GPUTracker {
270281
int nedges_;
271282
int delta_nr_, samplm_nr_;
272283

284+
double max_angle_;
285+
double tc_threshold_;
286+
double min_signal_;
287+
double step_size_;
288+
273289
std::vector<int> nSlines_old_;
274290
std::vector<REAL*> slines_;
275291
std::vector<int*> slinesLen_;
@@ -292,12 +308,14 @@ class GPUTracker {
292308

293309
PYBIND11_MODULE(cuslines, m) {
294310
py::class_<GPUTracker>(m, "GPUTracker")
295-
.def(py::init<np_array_cast, np_array,
311+
.def(py::init<double, double, double, double,
312+
np_array_cast, np_array,
296313
np_array, np_array,
297314
np_array, np_array_int,
298315
np_array, np_array,
299316
np_array, np_array_int,
300317
int, int, int>(),
318+
py::arg().noconvert(), py::arg().noconvert(), py::arg().noconvert(), py::arg().noconvert(),
301319
py::arg().noconvert(), py::arg().noconvert(),
302320
py::arg().noconvert(), py::arg().noconvert(),
303321
py::arg().noconvert(), py::arg().noconvert(),

cuslines/generate_streamlines_cuda.cu

Lines changed: 57 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,7 @@ __device__ int trilinear_interp_d(const int dimx,
9292
return -1;
9393
}
9494

95-
int coo[3][2];
95+
long long coo[3][2];
9696
REAL wgh[3][2]; // could use just one...
9797

9898
const REAL_T ONE = static_cast<REAL_T>(1.0);
@@ -647,7 +647,8 @@ template<int BDIM_X,
647647
int BDIM_Y,
648648
typename REAL_T,
649649
typename REAL3_T>
650-
__device__ int closest_peak_d(const REAL3_T direction, //dir
650+
__device__ int closest_peak_d(const REAL_T max_angle,
651+
const REAL3_T direction, //dir
651652
const int npeaks,
652653
const REAL3_T *__restrict__ peaks,
653654
REAL3_T *__restrict__ peak) {// dirs,
@@ -657,7 +658,8 @@ __device__ int closest_peak_d(const REAL3_T direction, //dir
657658
const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32;
658659
const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1)));
659660

660-
const REAL_T cos_similarity = COS(MAX_ANGLE_P);
661+
//const REAL_T cos_similarity = COS(MAX_ANGLE_P);
662+
const REAL_T cos_similarity = COS(max_angle);
661663
#if 0
662664
if (!threadIdx.y && !tidx) {
663665
printf("direction: (%f, %f, %f)\n",
@@ -804,7 +806,9 @@ template<int BDIM_X,
804806
typename REAL_T,
805807
typename REAL3_T>
806808
__device__ int get_direction_d(curandStatePhilox4_32_10_t *st,
807-
REAL3_T dir,
809+
const REAL_T max_angle,
810+
const REAL_T min_signal,
811+
REAL3_T dir,
808812
const int dimx,
809813
const int dimy,
810814
const int dimz,
@@ -919,7 +923,8 @@ __device__ int get_direction_d(curandStatePhilox4_32_10_t *st,
919923
//__syncwarp();
920924

921925
for(int j = tidx; j < dimt; j += BDIM_X) {
922-
__vox_data_sh[j] = MAX(MIN_SIGNAL_P, __vox_data_sh[j]);
926+
//__vox_data_sh[j] = MAX(MIN_SIGNAL_P, __vox_data_sh[j]);
927+
__vox_data_sh[j] = MAX(min_signal, __vox_data_sh[j]);
923928
}
924929
__syncwarp(WMASK);
925930

@@ -1021,7 +1026,7 @@ __device__ int get_direction_d(curandStatePhilox4_32_10_t *st,
10211026
}
10221027
*/
10231028
REAL3_T peak;
1024-
const int foundPeak = closest_peak_d<BDIM_X, BDIM_Y, REAL_T, REAL3_T>(dir, ndir, dirs, &peak);
1029+
const int foundPeak = closest_peak_d<BDIM_X, BDIM_Y, REAL_T, REAL3_T>(max_angle, dir, ndir, dirs, &peak);
10251030
__syncwarp(WMASK);
10261031
if (foundPeak) {
10271032
if (tidx == 0) {
@@ -1041,7 +1046,8 @@ template<int BDIM_X,
10411046
int BDIM_Y,
10421047
typename REAL_T,
10431048
typename REAL3_T>
1044-
__device__ int check_point_d(const REAL3_T point,
1049+
__device__ int check_point_d(const REAL_T tc_threshold,
1050+
const REAL3_T point,
10451051
const int dimx,
10461052
const int dimy,
10471053
const int dimz,
@@ -1064,14 +1070,19 @@ __device__ int check_point_d(const REAL3_T point,
10641070
if (rv != 0) {
10651071
return OUTSIDEIMAGE;
10661072
}
1067-
return (__shInterpOut[tidy] > TC_THRESHOLD_P) ? TRACKPOINT : ENDPOINT;
1073+
//return (__shInterpOut[tidy] > TC_THRESHOLD_P) ? TRACKPOINT : ENDPOINT;
1074+
return (__shInterpOut[tidy] > tc_threshold) ? TRACKPOINT : ENDPOINT;
10681075
}
10691076

10701077
template<int BDIM_X,
10711078
int BDIM_Y,
10721079
typename REAL_T,
10731080
typename REAL3_T>
10741081
__device__ int tracker_d(curandStatePhilox4_32_10_t *st,
1082+
const REAL_T max_angle,
1083+
const REAL_T min_signal,
1084+
const REAL_T tc_threshold,
1085+
const REAL_T step_size,
10751086
REAL3_T seed,
10761087
REAL3_T first_step,
10771088
REAL3_T voxel_size,
@@ -1088,7 +1099,7 @@ __device__ int tracker_d(curandStatePhilox4_32_10_t *st,
10881099
// max_angle is passed as an argument; pmf_threshold from global defines
10891100
// b0s_mask already passed
10901101
// min_signal is passed as an argument
1091-
// tc_threashold from global defines
1102+
// tc_threshold is passed as an argument
10921103
// pmf_threshold from global defines
10931104
const REAL_T *__restrict__ metric_map,
10941105
const int delta_nr,
@@ -1131,6 +1142,8 @@ __device__ int tracker_d(curandStatePhilox4_32_10_t *st,
11311142
int ndir = get_direction_d<BDIM_X,
11321143
BDIM_Y,
11331144
5>(st,
1145+
max_angle,
1146+
min_signal,
11341147
direction,
11351148
dimx, dimy, dimz, dimt, dataf,
11361149
b0s_mask /* !dwi_mask */,
@@ -1161,9 +1174,12 @@ __device__ int tracker_d(curandStatePhilox4_32_10_t *st,
11611174
}
11621175
//return;
11631176
#endif
1164-
point.x += (direction.x / voxel_size.x) * STEP_SIZE_P;
1165-
point.y += (direction.y / voxel_size.y) * STEP_SIZE_P;
1166-
point.z += (direction.z / voxel_size.z) * STEP_SIZE_P;
1177+
//point.x += (direction.x / voxel_size.x) * STEP_SIZE_P;
1178+
//point.y += (direction.y / voxel_size.y) * STEP_SIZE_P;
1179+
//point.z += (direction.z / voxel_size.z) * STEP_SIZE_P;
1180+
point.x += (direction.x / voxel_size.x) * step_size;
1181+
point.y += (direction.y / voxel_size.y) * step_size;
1182+
point.z += (direction.z / voxel_size.z) * step_size;
11671183

11681184
if (tidx == 0) {
11691185
streamline[i] = point;
@@ -1175,7 +1191,7 @@ __device__ int tracker_d(curandStatePhilox4_32_10_t *st,
11751191
}
11761192
__syncwarp(WMASK);
11771193

1178-
tissue_class = check_point_d<BDIM_X, BDIM_Y>(point, dimx, dimy, dimz, metric_map);
1194+
tissue_class = check_point_d<BDIM_X, BDIM_Y>(tc_threshold, point, dimx, dimy, dimz, metric_map);
11791195

11801196
if (tissue_class == ENDPOINT ||
11811197
tissue_class == INVALIDPOINT ||
@@ -1192,7 +1208,9 @@ template<int BDIM_X,
11921208
int BDIM_Y,
11931209
typename REAL_T,
11941210
typename REAL3_T>
1195-
__global__ void getNumStreamlines_k(const long long rndSeed,
1211+
__global__ void getNumStreamlines_k(const REAL_T max_angle,
1212+
const REAL_T min_signal,
1213+
const long long rndSeed,
11961214
const int rndOffset,
11971215
const int nseed,
11981216
const REAL3_T *__restrict__ seeds,
@@ -1244,6 +1262,8 @@ __global__ void getNumStreamlines_k(const long long rndSeed,
12441262
int ndir = get_direction_d<BDIM_X,
12451263
BDIM_Y,
12461264
1>(&st,
1265+
max_angle,
1266+
min_signal,
12471267
MAKE_REAL3(0,0,0),
12481268
dimx, dimy, dimz, dimt, dataf,
12491269
b0s_mask /* !dwi_mask */,
@@ -1280,7 +1300,11 @@ template<int BDIM_X,
12801300
int BDIM_Y,
12811301
typename REAL_T,
12821302
typename REAL3_T>
1283-
__global__ void genStreamlinesMerge_k(const long long rndSeed,
1303+
__global__ void genStreamlinesMerge_k(const REAL_T max_angle,
1304+
const REAL_T min_signal,
1305+
const REAL_T tc_threshold,
1306+
const REAL_T step_size,
1307+
const long long rndSeed,
12841308
const int rndOffset,
12851309
const int nseed,
12861310
const REAL3_T *__restrict__ seeds,
@@ -1358,6 +1382,10 @@ __global__ void genStreamlinesMerge_k(const long long rndSeed,
13581382
int stepsB;
13591383
const int tissue_classB = tracker_d<BDIM_X,
13601384
BDIM_Y>(&st,
1385+
max_angle,
1386+
min_signal,
1387+
tc_threshold,
1388+
step_size,
13611389
seed,
13621390
MAKE_REAL3(-first_step.x, -first_step.y, -first_step.z),
13631391
MAKE_REAL3(1, 1, 1),
@@ -1391,6 +1419,10 @@ __global__ void genStreamlinesMerge_k(const long long rndSeed,
13911419
int stepsF;
13921420
const int tissue_classF = tracker_d<BDIM_X,
13931421
BDIM_Y>(&st,
1422+
max_angle,
1423+
min_signal,
1424+
tc_threshold,
1425+
step_size,
13941426
seed,
13951427
first_step,
13961428
MAKE_REAL3(1, 1, 1),
@@ -1433,7 +1465,8 @@ __global__ void genStreamlinesMerge_k(const long long rndSeed,
14331465
return;
14341466
}
14351467

1436-
void generate_streamlines_cuda_mgpu(const int nseeds, const std::vector<REAL*> &seeds_d,
1468+
void generate_streamlines_cuda_mgpu(const REAL max_angle, const REAL min_signal, const REAL tc_threshold, const REAL step_size,
1469+
const int nseeds, const std::vector<REAL*> &seeds_d,
14371470
const int dimx, const int dimy, const int dimz, const int dimt,
14381471
const std::vector<REAL*> &dataf_d, const std::vector<REAL*> &H_d, const std::vector<REAL*> &R_d,
14391472
const int delta_nr,
@@ -1464,10 +1497,6 @@ void generate_streamlines_cuda_mgpu(const int nseeds, const std::vector<REAL*> &
14641497
CHECK_CUDA(cudaMalloc(&shDirTemp1_d[n], sizeof(*shDirTemp1_d[n])*samplm_nr*grid.x*block.y));
14651498
}
14661499

1467-
1468-
// int delta_nr = 28; // TO BE MADE PARAMETERS!
1469-
// int samplm_nr = 181;
1470-
14711500
int n32dimt = ((dimt+31)/32)*32;
14721501

14731502
size_t shSizeGNS = sizeof(REAL)*(THR_X_BL/THR_X_SL)*(2*n32dimt + 2*MAX(n32dimt, samplm_nr)) + // for get_direction_d
@@ -1486,7 +1515,9 @@ void generate_streamlines_cuda_mgpu(const int nseeds, const std::vector<REAL*> &
14861515
// Precompute number of streamlines before allocating memory
14871516
getNumStreamlines_k<THR_X_SL,
14881517
THR_X_BL/THR_X_SL>
1489-
<<<grid, block, shSizeGNS>>>(rng_seed,
1518+
<<<grid, block, shSizeGNS>>>(max_angle,
1519+
min_signal,
1520+
rng_seed,
14901521
rng_offset + n*nseeds_per_gpu,
14911522
nseeds_gpu,
14921523
reinterpret_cast<const REAL3 *>(seeds_d[n]),
@@ -1591,7 +1622,11 @@ void generate_streamlines_cuda_mgpu(const int nseeds, const std::vector<REAL*> &
15911622
//fprintf(stderr, "Launching kernel with %u blocks of size (%u, %u)\n", grid.x, block.x, block.y);
15921623
genStreamlinesMerge_k<THR_X_SL,
15931624
THR_X_BL/THR_X_SL>
1594-
<<<grid, block, shSizeGNS, streams[n]>>>(rng_seed,
1625+
<<<grid, block, shSizeGNS, streams[n]>>>(max_angle,
1626+
min_signal,
1627+
tc_threshold,
1628+
step_size,
1629+
rng_seed,
15951630
rng_offset + n*nseeds_per_gpu,
15961631
nseeds_gpu,
15971632
reinterpret_cast<const REAL3 *>(seeds_d[n]),

cuslines/generate_streamlines_cuda.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,8 @@
3333

3434
#include "globals.h"
3535

36-
void generate_streamlines_cuda_mgpu(const int nseeds, const std::vector<REAL*> &seeds_d,
36+
void generate_streamlines_cuda_mgpu(const REAL max_angle, const REAL min_signal, const REAL tc_threshold, const REAL step_size,
37+
const int nseeds, const std::vector<REAL*> &seeds_d,
3738
const int dimx, const int dimy, const int dimz, const int dimt,
3839
const std::vector<REAL*> &dataf_d, const std::vector<REAL*> &H_d, const std::vector<REAL*> &R_d,
3940
const int delta_nr,

cuslines/globals.h

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -69,10 +69,11 @@
6969

7070
#define MAX_SLINE_LEN (501)
7171
#define PMF_THRESHOLD_P ((REAL)0.1)
72-
#define TC_THRESHOLD_P ((REAL)0.1)
73-
#define STEP_SIZE_P ((REAL)0.5)
74-
#define MAX_ANGLE_P ((REAL)1.0471975511965976) // 60 deg in radians
75-
#define MIN_SIGNAL_P ((REAL)1.0)
72+
73+
//#define TC_THRESHOLD_P ((REAL)0.1)
74+
//#define STEP_SIZE_P ((REAL)0.5) // only for TRK generation
75+
//#define MAX_ANGLE_P ((REAL)1.0471975511965976) // 60 deg in radians
76+
//#define MIN_SIGNAL_P ((REAL)1.0)
7677

7778
#define MAX_SLINES_PER_SEED (10)
7879

docker/Dockerfile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ ENV DEBIAN_FRONTEND=noninteractive
88
RUN apt update && \
99
apt install --assume-yes apt-transport-https \
1010
ca-certificates gnupg \
11-
software-properties-common gcc git wget
11+
software-properties-common gcc git wget numactl
1212
RUN wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null \
1313
| gpg --dearmor - | tee /etc/apt/trusted.gpg.d/kitware.gpg >/dev/null
1414
RUN apt-add-repository "deb https://apt.kitware.com/ubuntu/ focal main"
@@ -23,7 +23,7 @@ ENV PATH /opt/anaconda/bin:${PATH}
2323
ENV LD_LIBRARY_PATH /opt/anaconda/lib:${LD_LIBRARY_PATH}
2424

2525
# python prereqs
26-
RUN pip install numpy scipy cython nibabel dipy
26+
RUN pip install numpy scipy cython nibabel dipy tqdm
2727

2828
# copy stuff
2929
COPY CMakeLists.txt /opt/GPUStreamlines/CMakeLists.txt

0 commit comments

Comments
 (0)