From 3f9b6449b57d98004a731771dba595d3a851f2e2 Mon Sep 17 00:00:00 2001 From: Georg Schramm Date: Wed, 9 Apr 2025 18:19:49 +0100 Subject: [PATCH 01/35] add support when cuda absent --- CMakeLists.txt | 7 +- atomic_sum.h | 3 +- cuda_compat.h | 5 +- joseph3d_fwd.cpp | 21 +++ joseph3d_fwd.cu | 402 ++---------------------------------------- joseph3d_fwd_worker.h | 401 +++++++++++++++++++++++++++++++++++++++++ parallelproj.h | 2 +- 7 files changed, 439 insertions(+), 402 deletions(-) create mode 100644 joseph3d_fwd.cpp create mode 100644 joseph3d_fwd_worker.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 3e63938..2ee254a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -44,9 +44,8 @@ if(CMAKE_CUDA_COMPILER) target_include_directories(main PRIVATE ${CUDAToolkit_INCLUDE_DIRS}) target_link_libraries(main PRIVATE parallelproj CUDA::cudart) else() - #find_package(OpenMP REQUIRED) - #add_library(parallelproj SHARED parallelproj.cpp) - #target_link_libraries(parallelproj PRIVATE OpenMP::OpenMP_CXX) + find_package(OpenMP REQUIRED) + add_library(parallelproj SHARED joseph3d_fwd.cpp) + target_link_libraries(parallelproj PRIVATE OpenMP::OpenMP_CXX) #add_executable(main main_no_cuda.cpp) endif() - diff --git a/atomic_sum.h b/atomic_sum.h index a3848e0..dc08883 100644 --- a/atomic_sum.h +++ b/atomic_sum.h @@ -1,7 +1,7 @@ #pragma once #include "cuda_compat.h" -CUDA_HOST_DEVICE inline void atomic_sum(float* target, float value) { +WORKER_QUALIFIER inline void atomic_sum(float* target, float value) { #ifdef __CUDA_ARCH__ atomicAdd(target, value); #else @@ -9,4 +9,3 @@ CUDA_HOST_DEVICE inline void atomic_sum(float* target, float value) { *target += value; #endif } - diff --git a/cuda_compat.h b/cuda_compat.h index 9205bf2..21451f1 100644 --- a/cuda_compat.h +++ b/cuda_compat.h @@ -1,8 +1,7 @@ #pragma once #ifdef __CUDACC__ - #define CUDA_HOST_DEVICE __host__ __device__ + #define WORKER_QUALIFIER __device__ #else - #define CUDA_HOST_DEVICE + #define WORKER_QUALIFIER #endif - diff 
--git a/joseph3d_fwd.cpp b/joseph3d_fwd.cpp new file mode 100644 index 0000000..bcd480c --- /dev/null +++ b/joseph3d_fwd.cpp @@ -0,0 +1,21 @@ +#include "parallelproj.h" +#include "joseph3d_fwd_worker.h" +#include "debug.h" + +void joseph3d_fwd(const float *xstart, + const float *xend, + const float *img, + const float *img_origin, + const float *voxsize, + float *p, + size_t nlors, + const int *img_dim, + int device_id, + int threadsperblock){ + +#pragma omp parallel for + for (size_t i = 0; i < nlors; ++i) { + joseph3d_fwd_worker(i, xstart, xend, img, img_origin, voxsize, p, img_dim); + } + +} diff --git a/joseph3d_fwd.cu b/joseph3d_fwd.cu index 3e72a27..575e1cb 100644 --- a/joseph3d_fwd.cu +++ b/joseph3d_fwd.cu @@ -1,417 +1,35 @@ #include "parallelproj.h" +#include "joseph3d_fwd_worker.h" #include "debug.h" #include #include #include -__device__ unsigned char ray_cube_intersection_cuda(float orig0, - float orig1, - float orig2, - float bounds0_min, - float bounds1_min, - float bounds2_min, - float bounds0_max, - float bounds1_max, - float bounds2_max, - float rdir0, - float rdir1, - float rdir2, - float* t1, - float* t2){ - // the inverse of the directional vector - // using the inverse of the directional vector and IEEE floating point arith standard 754 - // makes sure that 0's in the directional vector are handled correctly - float invdir0 = 1.f/rdir0; - float invdir1 = 1.f/rdir1; - float invdir2 = 1.f/rdir2; - - unsigned char intersec = 1; - - float t11, t12, t21, t22; - - if (invdir0 >= 0){ - *t1 = (bounds0_min - orig0) * invdir0; - *t2 = (bounds0_max - orig0) * invdir0; - } - else{ - *t1 = (bounds0_max - orig0) * invdir0; - *t2 = (bounds0_min - orig0) * invdir0; - } - - if (invdir1 >= 0){ - t11 = (bounds1_min - orig1) * invdir1; - t12 = (bounds1_max - orig1) * invdir1; - } - else{ - t11 = (bounds1_max - orig1) * invdir1; - t12 = (bounds1_min - orig1) * invdir1; - } - - if ((*t1 > t12) || (t11 > *t2)){intersec = 0;} - if (t11 > *t1){*t1 = t11;} - if (t12 < 
*t2){*t2 = t12;} - - if (invdir2 >= 0){ - t21 = (bounds2_min - orig2) * invdir2; - t22 = (bounds2_max - orig2) * invdir2; - } - else{ - t21 = (bounds2_max - orig2) * invdir2; - t22 = (bounds2_min - orig2) * invdir2; - } - - if ((*t1 > t22) || (t21 > *t2)){intersec = 0;} - if (t21 > *t1){*t1 = t21;} - if (t22 < *t2){*t2 = t22;} - - return(intersec); -} - - __global__ void joseph3d_fwd_kernel(const float *xstart, const float *xend, const float *img, const float *img_origin, const float *voxsize, float *p, - long long nlors, - const int *img_dim) -{ - long long i = blockDim.x * blockIdx.x + threadIdx.x; - - if(i < nlors) - { - int n0 = img_dim[0]; - int n1 = img_dim[1]; - int n2 = img_dim[2]; - - float d0, d1, d2, d0_sq, d1_sq, d2_sq; - float lsq, cos0_sq, cos1_sq, cos2_sq; - unsigned short direction; - int i0, i1, i2; - int i0_floor, i1_floor, i2_floor; - int i0_ceil, i1_ceil, i2_ceil; - float x_pr0, x_pr1, x_pr2; - float tmp_0, tmp_1, tmp_2; - - float toAdd, cf; - - float xstart0 = xstart[i*3 + 0]; - float xstart1 = xstart[i*3 + 1]; - float xstart2 = xstart[i*3 + 2]; - - float xend0 = xend[i*3 + 0]; - float xend1 = xend[i*3 + 1]; - float xend2 = xend[i*3 + 2]; - - float voxsize0 = voxsize[0]; - float voxsize1 = voxsize[1]; - float voxsize2 = voxsize[2]; - - float img_origin0 = img_origin[0]; - float img_origin1 = img_origin[1]; - float img_origin2 = img_origin[2]; - - unsigned char intersec; - float t1, t2; - float istart_f, iend_f, tmp; - int istart, iend; - - // test whether the ray between the two detectors is most parallel - // with the 0, 1, or 2 axis - d0 = xend0 - xstart0; - d1 = xend1 - xstart1; - d2 = xend2 - xstart2; - - //----------- - //--- test whether ray and cube intersect - intersec = ray_cube_intersection_cuda(xstart0, xstart1, xstart2, - img_origin0 - 1*voxsize0, img_origin1 - 1*voxsize1, img_origin2 - 1*voxsize2, - img_origin0 + n0*voxsize0, img_origin1 + n1*voxsize1, img_origin2 + n2*voxsize2, - d0, d1, d2, &t1, &t2); - - if (intersec == 1) - { - 
d0_sq = d0*d0; - d1_sq = d1*d1; - d2_sq = d2*d2; - - lsq = d0_sq + d1_sq + d2_sq; - - cos0_sq = d0_sq / lsq; - cos1_sq = d1_sq / lsq; - cos2_sq = d2_sq / lsq; - - direction = 0; - if ((cos1_sq >= cos0_sq) && (cos1_sq >= cos2_sq)) - { - direction = 1; - } - else - { - if ((cos2_sq >= cos0_sq) && (cos2_sq >= cos1_sq)) - { - direction = 2; - } - } - - if (direction == 0) - { - cf = voxsize0 / sqrtf(cos0_sq); - - // case where ray is most parallel to the 0 axis - // we step through the volume along the 0 direction - - //--- check where ray enters / leaves cube - istart_f = (xstart0 + t1*d0 - img_origin0) / voxsize0; - iend_f = (xstart0 + t2*d0 - img_origin0) / voxsize0; - - if (istart_f > iend_f){ - tmp = iend_f; - iend_f = istart_f; - istart_f = tmp; - } - - istart = (int)floor(istart_f); - iend = (int)ceil(iend_f); - - if (istart < 0){istart = 0;} - if (iend >= n0){iend = n0;} - - // check in which "plane" the start and end points are - // we have to do this to avoid that we include voxels - // that are "outside" the line segment bewteen xstart and xend - - // !! for these calculations we overwrite the istart_f and iend_f variables !! 
- istart_f = (xstart0 - img_origin0) / voxsize0; - iend_f = (xend0 - img_origin0) / voxsize0; - - if (istart_f > iend_f){ - tmp = iend_f; - iend_f = istart_f; - istart_f = tmp; - } - - if (istart < (int)floor(istart_f)){istart = (int)floor(istart_f);} - if (iend >= (int)ceil(iend_f)){iend = (int)ceil(iend_f);} - //--- - - for(i0 = istart; i0 < iend; i0++) - { - // get the indices where the ray intersects the image plane - x_pr1 = xstart1 + (img_origin0 + i0*voxsize0 - xstart0)*d1 / d0; - x_pr2 = xstart2 + (img_origin0 + i0*voxsize0 - xstart0)*d2 / d0; - - i1_floor = (int)floor((x_pr1 - img_origin1)/voxsize1); - i1_ceil = i1_floor + 1; - - i2_floor = (int)floor((x_pr2 - img_origin2)/voxsize2); - i2_ceil = i2_floor + 1; - - // calculate the distances to the floor normalized to [0,1] - // for the bilinear interpolation - tmp_1 = (x_pr1 - (i1_floor*voxsize1 + img_origin1)) / voxsize1; - tmp_2 = (x_pr2 - (i2_floor*voxsize2 + img_origin2)) / voxsize2; - - toAdd = 0; - - if ((i1_floor >= 0) && (i1_floor < n1) && (i2_floor >= 0) && (i2_floor < n2)) - { - toAdd += img[n1*n2*i0 + n2*i1_floor + i2_floor] * (1 - tmp_1) * (1 - tmp_2); - } - if ((i1_ceil >= 0) && (i1_ceil < n1) && (i2_floor >= 0) && (i2_floor < n2)) - { - toAdd += img[n1*n2*i0 + n2*i1_ceil + i2_floor] * tmp_1 * (1 - tmp_2); - } - if ((i1_floor >= 0) && (i1_floor < n1) && (i2_ceil >= 0) && (i2_ceil < n2)) - { - toAdd += img[n1*n2*i0 + n2*i1_floor + i2_ceil] * (1 - tmp_1) * tmp_2; - } - if ((i1_ceil >= 0) && (i1_ceil < n1) && (i2_ceil >= 0) && (i2_ceil < n2)) - { - toAdd += img[n1*n2*i0 + n2*i1_ceil + i2_ceil] * tmp_1 * tmp_2; - } - - if(toAdd != 0){p[i] += (cf * toAdd);} - } - } - - //--------------------------------------------------------------------------------- - if (direction == 1) - { - cf = voxsize1 / sqrtf(cos1_sq); - - // case where ray is most parallel to the 1 axis - // we step through the volume along the 1 direction - - //--- check where ray enters / leaves cube - istart_f = (xstart1 + t1*d1 - 
img_origin1) / voxsize1; - iend_f = (xstart1 + t2*d1 - img_origin1) / voxsize1; - - if (istart_f > iend_f){ - tmp = iend_f; - iend_f = istart_f; - istart_f = tmp; - } - - istart = (int)floor(istart_f); - iend = (int)ceil(iend_f); - - if (istart < 0){istart = 0;} - if (iend >= n1){iend = n1;} - - // check in which "plane" the start and end points are - // we have to do this to avoid that we include voxels - // that are "outside" the line segment bewteen xstart and xend - - // !! for these calculations we overwrite the istart_f and iend_f variables !! - istart_f = (xstart1 - img_origin1) / voxsize1; - iend_f = (xend1 - img_origin1) / voxsize1; - - if (istart_f > iend_f){ - tmp = iend_f; - iend_f = istart_f; - istart_f = tmp; - } - - if (istart < (int)floor(istart_f)){istart = (int)floor(istart_f);} - if (iend >= (int)ceil(iend_f)){iend = (int)ceil(iend_f);} - //--- - - for (i1 = istart; i1 < iend; i1++) - { - // get the indices where the ray intersects the image plane - x_pr0 = xstart0 + (img_origin1 + i1*voxsize1 - xstart1)*d0 / d1; - x_pr2 = xstart2 + (img_origin1 + i1*voxsize1 - xstart1)*d2 / d1; - - i0_floor = (int)floor((x_pr0 - img_origin0)/voxsize0); - i0_ceil = i0_floor + 1; - - i2_floor = (int)floor((x_pr2 - img_origin2)/voxsize2); - i2_ceil = i2_floor + 1; - - // calculate the distances to the floor normalized to [0,1] - // for the bilinear interpolation - tmp_0 = (x_pr0 - (i0_floor*voxsize0 + img_origin0)) / voxsize0; - tmp_2 = (x_pr2 - (i2_floor*voxsize2 + img_origin2)) / voxsize2; - - toAdd = 0; - - if ((i0_floor >= 0) && (i0_floor < n0) && (i2_floor >= 0) && (i2_floor < n2)) - { - toAdd += img[n1*n2*i0_floor + n2*i1 + i2_floor] * (1 - tmp_0) * (1 - tmp_2); - } - if ((i0_ceil >= 0) && (i0_ceil < n0) && (i2_floor >= 0) && (i2_floor < n2)) - { - toAdd += img[n1*n2*i0_ceil + n2*i1 + i2_floor] * tmp_0 * (1 - tmp_2); - } - if ((i0_floor >= 0) && (i0_floor < n0) && (i2_ceil >= 0) && (i2_ceil < n2)) - { - toAdd += img[n1*n2*i0_floor + n2*i1 + i2_ceil] * (1 - 
tmp_0) * tmp_2; - } - if ((i0_ceil >= 0) && (i0_ceil < n0) && (i2_ceil >= 0) && (i2_ceil < n2)) - { - toAdd += img[n1*n2*i0_ceil + n2*i1 + i2_ceil] * tmp_0 * tmp_2; - } - - if(toAdd != 0){p[i] += (cf * toAdd);} - } - } - - //--------------------------------------------------------------------------------- - if (direction == 2) - { - cf = voxsize2 / sqrtf(cos2_sq); - - // case where ray is most parallel to the 2 axis - // we step through the volume along the 2 direction - - //--- check where ray enters / leaves cube - istart_f = (xstart2 + t1*d2 - img_origin2) / voxsize2; - iend_f = (xstart2 + t2*d2 - img_origin2) / voxsize2; - - if (istart_f > iend_f){ - tmp = iend_f; - iend_f = istart_f; - istart_f = tmp; - } - - istart = (int)floor(istart_f); - iend = (int)ceil(iend_f); - - if (istart < 0){istart = 0;} - if (iend >= n2){iend = n2;} - - // check in which "plane" the start and end points are - // we have to do this to avoid that we include voxels - // that are "outside" the line segment bewteen xstart and xend - - // !! for these calculations we overwrite the istart_f and iend_f variables !! 
- istart_f = (xstart2 - img_origin2) / voxsize2; - iend_f = (xend2 - img_origin2) / voxsize2; - - if (istart_f > iend_f){ - tmp = iend_f; - iend_f = istart_f; - istart_f = tmp; - } - - if (istart < (int)floor(istart_f)){istart = (int)floor(istart_f);} - if (iend >= (int)ceil(iend_f)){iend = (int)ceil(iend_f);} - //--- - - for(i2 = istart; i2 < iend; i2++) - { - // get the indices where the ray intersects the image plane - x_pr0 = xstart0 + (img_origin2 + i2*voxsize2 - xstart2)*d0 / d2; - x_pr1 = xstart1 + (img_origin2 + i2*voxsize2 - xstart2)*d1 / d2; - - i0_floor = (int)floor((x_pr0 - img_origin0)/voxsize0); - i0_ceil = i0_floor + 1; - - i1_floor = (int)floor((x_pr1 - img_origin1)/voxsize1); - i1_ceil = i1_floor + 1; - - // calculate the distances to the floor normalized to [0,1] - // for the bilinear interpolation - tmp_0 = (x_pr0 - (i0_floor*voxsize0 + img_origin0)) / voxsize0; - tmp_1 = (x_pr1 - (i1_floor*voxsize1 + img_origin1)) / voxsize1; - - toAdd = 0; - - if ((i0_floor >= 0) && (i0_floor < n0) && (i1_floor >= 0) && (i1_floor < n1)) - { - toAdd += img[n1*n2*i0_floor + n2*i1_floor + i2] * (1 - tmp_0) * (1 - tmp_1); - } - if ((i0_ceil >= 0) && (i0_ceil < n0) && (i1_floor >= 0) && (i1_floor < n1)) - { - toAdd += img[n1*n2*i0_ceil + n2*i1_floor + i2] * tmp_0 * (1 - tmp_1); - } - if ((i0_floor >= 0) && (i0_floor < n0) && (i1_ceil >= 0) & (i1_ceil < n1)) - { - toAdd += img[n1*n2*i0_floor + n2*i1_ceil + i2] * (1 - tmp_0) * tmp_1; - } - if ((i0_ceil >= 0) && (i0_ceil < n0) && (i1_ceil >= 0) && (i1_ceil < n1)) - { - toAdd += img[n1*n2*i0_ceil + n2*i1_ceil + i2] * tmp_0 * tmp_1; - } - - if(toAdd != 0){p[i] += (cf * toAdd);} - } - } + size_t nlors, + const int *img_dim){ + size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < n) { + joseph3d_fwd_worker(i, xstart, xend, img, img_origin, voxsize, p, img_dim); } - } } +////////////////////////////////////////////////////////////////// +////////////////////////////////////////////////////////////////// 
+////////////////////////////////////////////////////////////////// -extern "C" void joseph3d_fwd(const float *xstart, const float *xend, const float *img, const float *img_origin, const float *voxsize, float *p, - long long nlors, + size_t nlors, const int *img_dim, int device_id, int threadsperblock) diff --git a/joseph3d_fwd_worker.h b/joseph3d_fwd_worker.h new file mode 100644 index 0000000..b97892f --- /dev/null +++ b/joseph3d_fwd_worker.h @@ -0,0 +1,401 @@ +#pragma once +#include "cuda_compat.h" +#include "atomic_sum.h" + +// we need to import math.h for the floor, ceil, sqrtf if not compiling for CUDA +#ifndef __CUDA_ARCH__ +#include +#endif + +WORKER_QUALIFIER unsigned char ray_cube_intersection(float orig0, + float orig1, + float orig2, + float bounds0_min, + float bounds1_min, + float bounds2_min, + float bounds0_max, + float bounds1_max, + float bounds2_max, + float rdir0, + float rdir1, + float rdir2, + float* t1, + float* t2){ + // the inverse of the directional vector + // using the inverse of the directional vector and IEEE floating point arith standard 754 + // makes sure that 0's in the directional vector are handled correctly + float invdir0 = 1.f/rdir0; + float invdir1 = 1.f/rdir1; + float invdir2 = 1.f/rdir2; + + unsigned char intersec = 1; + + float t11, t12, t21, t22; + + if (invdir0 >= 0){ + *t1 = (bounds0_min - orig0) * invdir0; + *t2 = (bounds0_max - orig0) * invdir0; + } + else{ + *t1 = (bounds0_max - orig0) * invdir0; + *t2 = (bounds0_min - orig0) * invdir0; + } + + if (invdir1 >= 0){ + t11 = (bounds1_min - orig1) * invdir1; + t12 = (bounds1_max - orig1) * invdir1; + } + else{ + t11 = (bounds1_max - orig1) * invdir1; + t12 = (bounds1_min - orig1) * invdir1; + } + + if ((*t1 > t12) || (t11 > *t2)){intersec = 0;} + if (t11 > *t1){*t1 = t11;} + if (t12 < *t2){*t2 = t12;} + + if (invdir2 >= 0){ + t21 = (bounds2_min - orig2) * invdir2; + t22 = (bounds2_max - orig2) * invdir2; + } + else{ + t21 = (bounds2_max - orig2) * invdir2; + t22 = 
(bounds2_min - orig2) * invdir2; + } + + if ((*t1 > t22) || (t21 > *t2)){intersec = 0;} + if (t21 > *t1){*t1 = t21;} + if (t22 < *t2){*t2 = t22;} + + return(intersec); +} + +WORKER_QUALIFIER inline void joseph3d_fwd_worker(size_t i, + const float *xstart, + const float *xend, + const float *img, + const float *img_origin, + const float *voxsize, + float *p, + const int *img_dim){ + + int n0 = img_dim[0]; + int n1 = img_dim[1]; + int n2 = img_dim[2]; + + float d0, d1, d2, d0_sq, d1_sq, d2_sq; + float lsq, cos0_sq, cos1_sq, cos2_sq; + unsigned short direction; + int i0, i1, i2; + int i0_floor, i1_floor, i2_floor; + int i0_ceil, i1_ceil, i2_ceil; + float x_pr0, x_pr1, x_pr2; + float tmp_0, tmp_1, tmp_2; + + float toAdd, cf; + + float xstart0 = xstart[i*3 + 0]; + float xstart1 = xstart[i*3 + 1]; + float xstart2 = xstart[i*3 + 2]; + + float xend0 = xend[i*3 + 0]; + float xend1 = xend[i*3 + 1]; + float xend2 = xend[i*3 + 2]; + + float voxsize0 = voxsize[0]; + float voxsize1 = voxsize[1]; + float voxsize2 = voxsize[2]; + + float img_origin0 = img_origin[0]; + float img_origin1 = img_origin[1]; + float img_origin2 = img_origin[2]; + + unsigned char intersec; + float t1, t2; + float istart_f, iend_f, tmp; + int istart, iend; + + // test whether the ray between the two detectors is most parallel + // with the 0, 1, or 2 axis + d0 = xend0 - xstart0; + d1 = xend1 - xstart1; + d2 = xend2 - xstart2; + + //----------- + //--- test whether ray and cube intersect + intersec = ray_cube_intersection(xstart0, xstart1, xstart2, + img_origin0 - 1*voxsize0, img_origin1 - 1*voxsize1, img_origin2 - 1*voxsize2, + img_origin0 + n0*voxsize0, img_origin1 + n1*voxsize1, img_origin2 + n2*voxsize2, + d0, d1, d2, &t1, &t2); + + if (intersec == 1) + { + d0_sq = d0*d0; + d1_sq = d1*d1; + d2_sq = d2*d2; + + lsq = d0_sq + d1_sq + d2_sq; + + cos0_sq = d0_sq / lsq; + cos1_sq = d1_sq / lsq; + cos2_sq = d2_sq / lsq; + + direction = 0; + if ((cos1_sq >= cos0_sq) && (cos1_sq >= cos2_sq)) + { + direction = 
1; + } + else + { + if ((cos2_sq >= cos0_sq) && (cos2_sq >= cos1_sq)) + { + direction = 2; + } + } + + if (direction == 0) + { + cf = voxsize0 / sqrtf(cos0_sq); + + // case where ray is most parallel to the 0 axis + // we step through the volume along the 0 direction + + //--- check where ray enters / leaves cube + istart_f = (xstart0 + t1*d0 - img_origin0) / voxsize0; + iend_f = (xstart0 + t2*d0 - img_origin0) / voxsize0; + + if (istart_f > iend_f){ + tmp = iend_f; + iend_f = istart_f; + istart_f = tmp; + } + + istart = (int)floor(istart_f); + iend = (int)ceil(iend_f); + + if (istart < 0){istart = 0;} + if (iend >= n0){iend = n0;} + + // check in which "plane" the start and end points are + // we have to do this to avoid that we include voxels + // that are "outside" the line segment bewteen xstart and xend + + // !! for these calculations we overwrite the istart_f and iend_f variables !! + istart_f = (xstart0 - img_origin0) / voxsize0; + iend_f = (xend0 - img_origin0) / voxsize0; + + if (istart_f > iend_f){ + tmp = iend_f; + iend_f = istart_f; + istart_f = tmp; + } + + if (istart < (int)floor(istart_f)){istart = (int)floor(istart_f);} + if (iend >= (int)ceil(iend_f)){iend = (int)ceil(iend_f);} + //--- + + for(i0 = istart; i0 < iend; i0++) + { + // get the indices where the ray intersects the image plane + x_pr1 = xstart1 + (img_origin0 + i0*voxsize0 - xstart0)*d1 / d0; + x_pr2 = xstart2 + (img_origin0 + i0*voxsize0 - xstart0)*d2 / d0; + + i1_floor = (int)floor((x_pr1 - img_origin1)/voxsize1); + i1_ceil = i1_floor + 1; + + i2_floor = (int)floor((x_pr2 - img_origin2)/voxsize2); + i2_ceil = i2_floor + 1; + + // calculate the distances to the floor normalized to [0,1] + // for the bilinear interpolation + tmp_1 = (x_pr1 - (i1_floor*voxsize1 + img_origin1)) / voxsize1; + tmp_2 = (x_pr2 - (i2_floor*voxsize2 + img_origin2)) / voxsize2; + + toAdd = 0; + + if ((i1_floor >= 0) && (i1_floor < n1) && (i2_floor >= 0) && (i2_floor < n2)) + { + toAdd += img[n1*n2*i0 + 
n2*i1_floor + i2_floor] * (1 - tmp_1) * (1 - tmp_2); + } + if ((i1_ceil >= 0) && (i1_ceil < n1) && (i2_floor >= 0) && (i2_floor < n2)) + { + toAdd += img[n1*n2*i0 + n2*i1_ceil + i2_floor] * tmp_1 * (1 - tmp_2); + } + if ((i1_floor >= 0) && (i1_floor < n1) && (i2_ceil >= 0) && (i2_ceil < n2)) + { + toAdd += img[n1*n2*i0 + n2*i1_floor + i2_ceil] * (1 - tmp_1) * tmp_2; + } + if ((i1_ceil >= 0) && (i1_ceil < n1) && (i2_ceil >= 0) && (i2_ceil < n2)) + { + toAdd += img[n1*n2*i0 + n2*i1_ceil + i2_ceil] * tmp_1 * tmp_2; + } + + if(toAdd != 0){p[i] += (cf * toAdd);} + } + } + + //--------------------------------------------------------------------------------- + if (direction == 1) + { + cf = voxsize1 / sqrtf(cos1_sq); + + // case where ray is most parallel to the 1 axis + // we step through the volume along the 1 direction + + //--- check where ray enters / leaves cube + istart_f = (xstart1 + t1*d1 - img_origin1) / voxsize1; + iend_f = (xstart1 + t2*d1 - img_origin1) / voxsize1; + + if (istart_f > iend_f){ + tmp = iend_f; + iend_f = istart_f; + istart_f = tmp; + } + + istart = (int)floor(istart_f); + iend = (int)ceil(iend_f); + + if (istart < 0){istart = 0;} + if (iend >= n1){iend = n1;} + + // check in which "plane" the start and end points are + // we have to do this to avoid that we include voxels + // that are "outside" the line segment bewteen xstart and xend + + // !! for these calculations we overwrite the istart_f and iend_f variables !! 
+ istart_f = (xstart1 - img_origin1) / voxsize1; + iend_f = (xend1 - img_origin1) / voxsize1; + + if (istart_f > iend_f){ + tmp = iend_f; + iend_f = istart_f; + istart_f = tmp; + } + + if (istart < (int)floor(istart_f)){istart = (int)floor(istart_f);} + if (iend >= (int)ceil(iend_f)){iend = (int)ceil(iend_f);} + //--- + + for (i1 = istart; i1 < iend; i1++) + { + // get the indices where the ray intersects the image plane + x_pr0 = xstart0 + (img_origin1 + i1*voxsize1 - xstart1)*d0 / d1; + x_pr2 = xstart2 + (img_origin1 + i1*voxsize1 - xstart1)*d2 / d1; + + i0_floor = (int)floor((x_pr0 - img_origin0)/voxsize0); + i0_ceil = i0_floor + 1; + + i2_floor = (int)floor((x_pr2 - img_origin2)/voxsize2); + i2_ceil = i2_floor + 1; + + // calculate the distances to the floor normalized to [0,1] + // for the bilinear interpolation + tmp_0 = (x_pr0 - (i0_floor*voxsize0 + img_origin0)) / voxsize0; + tmp_2 = (x_pr2 - (i2_floor*voxsize2 + img_origin2)) / voxsize2; + + toAdd = 0; + + if ((i0_floor >= 0) && (i0_floor < n0) && (i2_floor >= 0) && (i2_floor < n2)) + { + toAdd += img[n1*n2*i0_floor + n2*i1 + i2_floor] * (1 - tmp_0) * (1 - tmp_2); + } + if ((i0_ceil >= 0) && (i0_ceil < n0) && (i2_floor >= 0) && (i2_floor < n2)) + { + toAdd += img[n1*n2*i0_ceil + n2*i1 + i2_floor] * tmp_0 * (1 - tmp_2); + } + if ((i0_floor >= 0) && (i0_floor < n0) && (i2_ceil >= 0) && (i2_ceil < n2)) + { + toAdd += img[n1*n2*i0_floor + n2*i1 + i2_ceil] * (1 - tmp_0) * tmp_2; + } + if ((i0_ceil >= 0) && (i0_ceil < n0) && (i2_ceil >= 0) && (i2_ceil < n2)) + { + toAdd += img[n1*n2*i0_ceil + n2*i1 + i2_ceil] * tmp_0 * tmp_2; + } + + if(toAdd != 0){p[i] += (cf * toAdd);} + } + } + + //--------------------------------------------------------------------------------- + if (direction == 2) + { + cf = voxsize2 / sqrtf(cos2_sq); + + // case where ray is most parallel to the 2 axis + // we step through the volume along the 2 direction + + //--- check where ray enters / leaves cube + istart_f = (xstart2 + t1*d2 - 
img_origin2) / voxsize2; + iend_f = (xstart2 + t2*d2 - img_origin2) / voxsize2; + + if (istart_f > iend_f){ + tmp = iend_f; + iend_f = istart_f; + istart_f = tmp; + } + + istart = (int)floor(istart_f); + iend = (int)ceil(iend_f); + + if (istart < 0){istart = 0;} + if (iend >= n2){iend = n2;} + + // check in which "plane" the start and end points are + // we have to do this to avoid that we include voxels + // that are "outside" the line segment bewteen xstart and xend + + // !! for these calculations we overwrite the istart_f and iend_f variables !! + istart_f = (xstart2 - img_origin2) / voxsize2; + iend_f = (xend2 - img_origin2) / voxsize2; + + if (istart_f > iend_f){ + tmp = iend_f; + iend_f = istart_f; + istart_f = tmp; + } + + if (istart < (int)floor(istart_f)){istart = (int)floor(istart_f);} + if (iend >= (int)ceil(iend_f)){iend = (int)ceil(iend_f);} + //--- + + for(i2 = istart; i2 < iend; i2++) + { + // get the indices where the ray intersects the image plane + x_pr0 = xstart0 + (img_origin2 + i2*voxsize2 - xstart2)*d0 / d2; + x_pr1 = xstart1 + (img_origin2 + i2*voxsize2 - xstart2)*d1 / d2; + + i0_floor = (int)floor((x_pr0 - img_origin0)/voxsize0); + i0_ceil = i0_floor + 1; + + i1_floor = (int)floor((x_pr1 - img_origin1)/voxsize1); + i1_ceil = i1_floor + 1; + + // calculate the distances to the floor normalized to [0,1] + // for the bilinear interpolation + tmp_0 = (x_pr0 - (i0_floor*voxsize0 + img_origin0)) / voxsize0; + tmp_1 = (x_pr1 - (i1_floor*voxsize1 + img_origin1)) / voxsize1; + + toAdd = 0; + + if ((i0_floor >= 0) && (i0_floor < n0) && (i1_floor >= 0) && (i1_floor < n1)) + { + toAdd += img[n1*n2*i0_floor + n2*i1_floor + i2] * (1 - tmp_0) * (1 - tmp_1); + } + if ((i0_ceil >= 0) && (i0_ceil < n0) && (i1_floor >= 0) && (i1_floor < n1)) + { + toAdd += img[n1*n2*i0_ceil + n2*i1_floor + i2] * tmp_0 * (1 - tmp_1); + } + if ((i0_floor >= 0) && (i0_floor < n0) && (i1_ceil >= 0) & (i1_ceil < n1)) + { + toAdd += img[n1*n2*i0_floor + n2*i1_ceil + i2] * (1 - 
tmp_0) * tmp_1; + } + if ((i0_ceil >= 0) && (i0_ceil < n0) && (i1_ceil >= 0) && (i1_ceil < n1)) + { + toAdd += img[n1*n2*i0_ceil + n2*i1_ceil + i2] * tmp_0 * tmp_1; + } + + if(toAdd != 0){p[i] += (cf * toAdd);} + } + } + } +} diff --git a/parallelproj.h b/parallelproj.h index bc4b7b8..ad98e1a 100644 --- a/parallelproj.h +++ b/parallelproj.h @@ -11,7 +11,7 @@ void joseph3d_fwd(const float *xstart, const float *img_origin, const float *voxsize, float *p, - long long nlors, + size_t nlors, const int *img_dim, int device_id = 0, int threadsperblock = 64); From 0b8de6f88908825a9a319d81e654d5e4120b9db5 Mon Sep 17 00:00:00 2001 From: Georg Schramm Date: Wed, 9 Apr 2025 18:41:01 +0100 Subject: [PATCH 02/35] add openmp example --- CMakeLists.txt | 5 +- main.cpp | 194 +++++++++++++++++++++++++++++++++++++++++ main_pp.cpp => main.cu | 46 ++++------ 3 files changed, 216 insertions(+), 29 deletions(-) create mode 100644 main.cpp rename main_pp.cpp => main.cu (85%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2ee254a..e7ec75a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -40,12 +40,13 @@ if(CMAKE_CUDA_COMPILER) add_library(parallelproj SHARED joseph3d_fwd.cu) target_link_libraries(parallelproj PRIVATE CUDA::cudart) - add_executable(main main_pp.cpp) + add_executable(main main.cu) target_include_directories(main PRIVATE ${CUDAToolkit_INCLUDE_DIRS}) target_link_libraries(main PRIVATE parallelproj CUDA::cudart) else() find_package(OpenMP REQUIRED) add_library(parallelproj SHARED joseph3d_fwd.cpp) target_link_libraries(parallelproj PRIVATE OpenMP::OpenMP_CXX) - #add_executable(main main_no_cuda.cpp) + add_executable(main main.cpp) + target_link_libraries(main PRIVATE parallelproj) endif() diff --git a/main.cpp b/main.cpp new file mode 100644 index 0000000..4f0bb76 --- /dev/null +++ b/main.cpp @@ -0,0 +1,194 @@ +#include "parallelproj.h" +#include +#include +#include + +int main() { + const size_t repetitions = 5; + size_t nlors = 10; + + 
//////////////////////////////////////////////////////// + // OpenMP managed memory use case + //////////////////////////////////////////////////////// + + std::cout << "OpenMP use case\n"; + + int img_dim[3] = {2, 3, 4}; + + float voxsize[3] = {4, 3, 2}; + + float img_origin[3]; + for (int i = 0; i < 3; ++i) { + img_origin[i] = (-(float)img_dim[i] / 2 + 0.5) * voxsize[i]; + } + + float* img = new float[img_dim[0] * img_dim[1] * img_dim[2]]; + + // fill the test image + for (int i0 = 0; i0 < img_dim[0]; i0++) + { + for (int i1 = 0; i1 < img_dim[1]; i1++) + { + for (int i2 = 0; i2 < img_dim[2]; i2++) + { + img[img_dim[1] * img_dim[2] * i0 + img_dim[2] * i1 + i2] = float(img_dim[1] * img_dim[2] * i0 + img_dim[2] * i1 + i2 + 1); + printf("%.1f ", img[img_dim[1] * img_dim[2] * i0 + img_dim[2] * i1 + i2]); + } + printf("\n"); + } + printf("\n"); + } + + float id0 = static_cast(img_dim[0]); + float id1 = static_cast(img_dim[1]); + float id2 = static_cast(img_dim[2]); + + float vstart[] = { + 0, -1, 0, // 0 + 0, -1, 0, // 1 + 0, -1, 1, // 2 + 0, -1, 0.5, // 3 + 0, 0, -1, // 4 + -1, 0, 0, // 5 + id0 - 1, -1, 0, // 6 - (shifted 1) + id0 - 1, -1, id2 - 1, // 7 - (shifted 6) + id0 - 1, 0, -1, // 8 - (shifted 4) + id0 - 1, id1 - 1, -1, // 9 - (shifted 8) + }; + + float vend[] = { + 0, id1, 0, // 0 + 0, id1, 0, // 1 + 0, id1, 1, // 2 + 0, id1, 0.5, // 3 + 0, 0, id2, // 4 + id0, 0, 0, // 5 + id0 - 1, id1, 0, // 6 - (shifted 1) + id0 - 1, id1, id2 - 1, // 7 - (shifted 6) + id0 - 1, 0, id2, // 8 - (shifted 4) + id0 - 1, id1 - 1, id2, // 9 - (shifted 8) + }; + + for (int ir = 0; ir < nlors; ir++) + { + printf("test ray %d\n", ir); + printf("start voxel num .: %.1f %.1f %.1f\n", vstart[ir * 3 + 0], vstart[ir * 3 + 1], vstart[ir * 3 + 2]); + printf("end voxel num .: %.1f %.1f %.1f\n", vend[ir * 3 + 0], vend[ir * 3 + 1], vend[ir * 3 + 2]); + } + + // calculate the start and end coordinates in world coordinates + + float *xstart = new float[3 * nlors]; + float *xend = new float[3 * 
nlors]; + + for (int ir = 0; ir < nlors; ir++) + + { + for (int j = 0; j < 3; j++) + { + xstart[ir * 3 + j] = img_origin[j] + vstart[ir * 3 + j] * voxsize[j]; + xend[ir * 3 + j] = img_origin[j] + vend[ir * 3 + j] * voxsize[j]; + } + } + + float *img_fwd = new float[nlors]; + joseph3d_fwd(xstart, xend, img, img_origin, voxsize, img_fwd, nlors, img_dim, 0, 64); + + ///////////////////////////////////////////////////////////////////////////// + ///////////////////////////////////////////////////////////////////////////// + ///////////////////////////////////////////////////////////////////////////// + + // calculate the expected values + + int retval = 0; + float eps = 1e-7; + + float* expected_fwd_vals = new float[nlors]; + // initialize expected_fwd_vals with 0s + for (int ir = 0; ir < nlors; ir++) + { + expected_fwd_vals[ir] = 0; + } + + for (int i1 = 0; i1 < img_dim[1]; i1++) + { + expected_fwd_vals[0] += img[0 * img_dim[1] * img_dim[2] + i1 * img_dim[2] + 0] * voxsize[1]; + } + + expected_fwd_vals[1] = expected_fwd_vals[0]; + + // calculate the expected value of ray2 from [0,-1,1] to [0,last+1,1] + for (int i1 = 0; i1 < img_dim[1]; i1++) + { + expected_fwd_vals[2] += img[0 * img_dim[1] * img_dim[2] + i1 * img_dim[2] + 1] * voxsize[1]; + } + + // calculate the expected value of ray3 from [0,-1,0.5] to [0,last+1,0.5] + expected_fwd_vals[3] = 0.5 * (expected_fwd_vals[0] + expected_fwd_vals[2]); + + // calculate the expected value of ray4 from [0,0,-1] to [0,0,last+1] + for (int i2 = 0; i2 < img_dim[2]; i2++) + { + expected_fwd_vals[4] += img[0 * img_dim[1] * img_dim[2] + 0 * img_dim[2] + i2] * voxsize[2]; + } + + // calculate the expected value of ray5 from [-1,0,0] to [last+1,0,0] + for (int i0 = 0; i0 < img_dim[0]; i0++) + { + expected_fwd_vals[5] += img[i0 * img_dim[1] * img_dim[2] + 0 * img_dim[2] + 0] * voxsize[0]; + } + + // calculate the expected value of rays6 from [img_dim[0]-1,-1,0] to [img_dim[0]-1,last+1,0] + for (int i1 = 0; i1 < img_dim[1]; i1++) + { + 
expected_fwd_vals[6] += img[(img_dim[0] - 1) * img_dim[1] * img_dim[2] + i1 * img_dim[2] + 0] * voxsize[1]; + } + + // calculate the expected value of rays7 from [img_dim[0]-1,-1,img_dim[2]-1] to [img_dim[0]-1,last+1,img_dim[2]-1] + for (int i1 = 0; i1 < img_dim[1]; i1++) + { + expected_fwd_vals[7] += img[(img_dim[0] - 1) * img_dim[1] * img_dim[2] + i1 * img_dim[2] + (img_dim[2] - 1)] * voxsize[1]; + } + + // calculate the expected value of ray4 from [img_dim[0]-1,0,-1] to [img_dim[0]-1,0,last+1] + for (int i2 = 0; i2 < img_dim[2]; i2++) + { + expected_fwd_vals[8] += img[(img_dim[0] - 1) * img_dim[1] * img_dim[2] + 0 * img_dim[2] + i2] * voxsize[2]; + } + + // calculate the expected value of ray4 from [img_dim[0]-1,0,-1] to [img_dim[0]-1,0,last+1] + for (int i2 = 0; i2 < img_dim[2]; i2++) + { + expected_fwd_vals[9] += img[(img_dim[0] - 1) * img_dim[1] * img_dim[2] + (img_dim[1] - 1) * img_dim[2] + i2] * voxsize[2]; + } + + // check if we got the expected results + float fwd_diff = 0; + printf("\nforward projection test\n"); + for (int ir = 0; ir < nlors; ir++) + { + printf("test ray %d: fwd projected: %.7e expected: %.7e\n", ir, img_fwd[ir], expected_fwd_vals[ir]); + + fwd_diff = std::abs(img_fwd[ir] - expected_fwd_vals[ir]); + if (fwd_diff > eps) + { + printf("\n################################################################################"); + printf("\nabs(fwd projected - expected value) = %.2e for ray%d above tolerance %.2e", fwd_diff, ir, eps); + printf("\n################################################################################\n"); + retval = 1; + } + } + + //////////////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////////// + + free(img); + free(xstart); + free(xend); + free(img_fwd); + free(expected_fwd_vals); + + return 0; +} + diff --git a/main_pp.cpp b/main.cu similarity index 85% 
rename from main_pp.cpp rename to main.cu index 32b8339..fcc6e35 100644 --- a/main_pp.cpp +++ b/main.cu @@ -4,21 +4,9 @@ #include #include -void print_array(const char* label, float* array, size_t size) { - std::cout << label << ": "; - // print max 10 elements - size_t print_size = (size > 10) ? 10 : size; - for (size_t i = 0; i < print_size; ++i) - std::cout << array[i] << " "; - // print ellipses if size > 10 and the last element - if (size > 10) - std::cout << "... " << array[size - 1]; - std::cout << "\n"; -} - int main() { const size_t repetitions = 5; - long long nlors = 10; + size_t nlors = 10; // get the number of cuda devices - because we want to run on the last device int device_count; @@ -68,6 +56,10 @@ int main() { printf("\n"); } + float id0 = static_cast(img_dim[0]); + float id1 = static_cast(img_dim[1]); + float id2 = static_cast(img_dim[2]); + float vstart[] = { 0, -1, 0, // 0 0, -1, 0, // 1 @@ -75,23 +67,23 @@ int main() { 0, -1, 0.5, // 3 0, 0, -1, // 4 -1, 0, 0, // 5 - img_dim[0] - 1, -1, 0, // 6 - (shifted 1) - img_dim[0] - 1, -1, img_dim[2] - 1, // 7 - (shifted 6) - img_dim[0] - 1, 0, -1, // 8 - (shifted 4) - img_dim[0] - 1, img_dim[1] - 1, -1, // 9 - (shifted 8) + id0 - 1, -1, 0, // 6 - (shifted 1) + id0 - 1, -1, id2 - 1, // 7 - (shifted 6) + id0 - 1, 0, -1, // 8 - (shifted 4) + id0 - 1, id1 - 1, -1, // 9 - (shifted 8) }; float vend[] = { - 0, img_dim[1], 0, // 0 - 0, img_dim[1], 0, // 1 - 0, img_dim[1], 1, // 2 - 0, img_dim[1], 0.5, // 3 - 0, 0, img_dim[2], // 4 - img_dim[0], 0, 0, // 5 - img_dim[0] - 1, img_dim[1], 0, // 6 - (shifted 1) - img_dim[0] - 1, img_dim[1], img_dim[2] - 1, // 7 - (shifted 6) - img_dim[0] - 1, 0, img_dim[2], // 8 - (shifted 4) - img_dim[0] - 1, img_dim[1] - 1, img_dim[2], // 9 - (shifted 8) + 0, id1, 0, // 0 + 0, id1, 0, // 1 + 0, id1, 1, // 2 + 0, id1, 0.5, // 3 + 0, 0, id2, // 4 + id0, 0, 0, // 5 + id0 - 1, id1, 0, // 6 - (shifted 1) + id0 - 1, id1, id2 - 1, // 7 - (shifted 6) + id0 - 1, 0, id2, // 8 - (shifted 4) 
+ id0 - 1, id1 - 1, id2, // 9 - (shifted 8) }; for (int ir = 0; ir < nlors; ir++) From 8d1793d201349905b0fd89fc7254a1605bdaca5b Mon Sep 17 00:00:00 2001 From: Georg Schramm Date: Wed, 9 Apr 2025 18:49:07 +0100 Subject: [PATCH 03/35] return retval --- main.cpp | 2 +- main.cu | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/main.cpp b/main.cpp index 4f0bb76..d48a01c 100644 --- a/main.cpp +++ b/main.cpp @@ -189,6 +189,6 @@ int main() { free(img_fwd); free(expected_fwd_vals); - return 0; + return retval; } diff --git a/main.cu b/main.cu index fcc6e35..5bdf872 100644 --- a/main.cu +++ b/main.cu @@ -216,6 +216,6 @@ int main() { free(expected_fwd_vals); - return 0; + return retval; } From 28a65cde0c64c1b962ea8f84e8c77f95ddf5cd80 Mon Sep 17 00:00:00 2001 From: Georg Schramm Date: Wed, 9 Apr 2025 18:51:42 +0100 Subject: [PATCH 04/35] mv ray cube intersection to utils --- joseph3d_fwd_worker.h | 64 +----------------------------------------- utils.h | 65 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 66 insertions(+), 63 deletions(-) create mode 100644 utils.h diff --git a/joseph3d_fwd_worker.h b/joseph3d_fwd_worker.h index b97892f..a506d61 100644 --- a/joseph3d_fwd_worker.h +++ b/joseph3d_fwd_worker.h @@ -1,75 +1,13 @@ #pragma once #include "cuda_compat.h" #include "atomic_sum.h" +#include "utils.h" // we need to import math.h for the floor, ceil, sqrtf if not compiling for CUDA #ifndef __CUDA_ARCH__ #include #endif -WORKER_QUALIFIER unsigned char ray_cube_intersection(float orig0, - float orig1, - float orig2, - float bounds0_min, - float bounds1_min, - float bounds2_min, - float bounds0_max, - float bounds1_max, - float bounds2_max, - float rdir0, - float rdir1, - float rdir2, - float* t1, - float* t2){ - // the inverse of the directional vector - // using the inverse of the directional vector and IEEE floating point arith standard 754 - // makes sure that 0's in the directional vector are handled correctly - float invdir0 = 1.f/rdir0; - 
float invdir1 = 1.f/rdir1; - float invdir2 = 1.f/rdir2; - - unsigned char intersec = 1; - - float t11, t12, t21, t22; - - if (invdir0 >= 0){ - *t1 = (bounds0_min - orig0) * invdir0; - *t2 = (bounds0_max - orig0) * invdir0; - } - else{ - *t1 = (bounds0_max - orig0) * invdir0; - *t2 = (bounds0_min - orig0) * invdir0; - } - - if (invdir1 >= 0){ - t11 = (bounds1_min - orig1) * invdir1; - t12 = (bounds1_max - orig1) * invdir1; - } - else{ - t11 = (bounds1_max - orig1) * invdir1; - t12 = (bounds1_min - orig1) * invdir1; - } - - if ((*t1 > t12) || (t11 > *t2)){intersec = 0;} - if (t11 > *t1){*t1 = t11;} - if (t12 < *t2){*t2 = t12;} - - if (invdir2 >= 0){ - t21 = (bounds2_min - orig2) * invdir2; - t22 = (bounds2_max - orig2) * invdir2; - } - else{ - t21 = (bounds2_max - orig2) * invdir2; - t22 = (bounds2_min - orig2) * invdir2; - } - - if ((*t1 > t22) || (t21 > *t2)){intersec = 0;} - if (t21 > *t1){*t1 = t21;} - if (t22 < *t2){*t2 = t22;} - - return(intersec); -} - WORKER_QUALIFIER inline void joseph3d_fwd_worker(size_t i, const float *xstart, const float *xend, diff --git a/utils.h b/utils.h new file mode 100644 index 0000000..69ea5ce --- /dev/null +++ b/utils.h @@ -0,0 +1,65 @@ +#pragma once +#include "cuda_compat.h" + +WORKER_QUALIFIER unsigned char ray_cube_intersection(float orig0, + float orig1, + float orig2, + float bounds0_min, + float bounds1_min, + float bounds2_min, + float bounds0_max, + float bounds1_max, + float bounds2_max, + float rdir0, + float rdir1, + float rdir2, + float* t1, + float* t2){ + // the inverse of the directional vector + // using the inverse of the directional vector and IEEE floating point arith standard 754 + // makes sure that 0's in the directional vector are handled correctly + float invdir0 = 1.f/rdir0; + float invdir1 = 1.f/rdir1; + float invdir2 = 1.f/rdir2; + + unsigned char intersec = 1; + + float t11, t12, t21, t22; + + if (invdir0 >= 0){ + *t1 = (bounds0_min - orig0) * invdir0; + *t2 = (bounds0_max - orig0) * invdir0; + } + 
else{ + *t1 = (bounds0_max - orig0) * invdir0; + *t2 = (bounds0_min - orig0) * invdir0; + } + + if (invdir1 >= 0){ + t11 = (bounds1_min - orig1) * invdir1; + t12 = (bounds1_max - orig1) * invdir1; + } + else{ + t11 = (bounds1_max - orig1) * invdir1; + t12 = (bounds1_min - orig1) * invdir1; + } + + if ((*t1 > t12) || (t11 > *t2)){intersec = 0;} + if (t11 > *t1){*t1 = t11;} + if (t12 < *t2){*t2 = t12;} + + if (invdir2 >= 0){ + t21 = (bounds2_min - orig2) * invdir2; + t22 = (bounds2_max - orig2) * invdir2; + } + else{ + t21 = (bounds2_max - orig2) * invdir2; + t22 = (bounds2_min - orig2) * invdir2; + } + + if ((*t1 > t22) || (t21 > *t2)){intersec = 0;} + if (t21 > *t1){*t1 = t21;} + if (t22 < *t2){*t2 = t22;} + + return(intersec); +} From a79b17181371737a277e7549a239b2a389f443a9 Mon Sep 17 00:00:00 2001 From: Georg Schramm Date: Wed, 9 Apr 2025 21:47:30 +0200 Subject: [PATCH 05/35] add back kernel --- CMakeLists.txt | 2 +- joseph3d_back.cpp | 21 ++ joseph3d_back_worker.h | 329 +++++++++++++++++++++++++++++ joseph3d_fwd_worker.h | 3 +- utils.h => ray_cube_intersection.h | 0 5 files changed, 352 insertions(+), 3 deletions(-) create mode 100644 joseph3d_back.cpp create mode 100644 joseph3d_back_worker.h rename utils.h => ray_cube_intersection.h (100%) diff --git a/CMakeLists.txt b/CMakeLists.txt index e7ec75a..934f362 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -45,7 +45,7 @@ if(CMAKE_CUDA_COMPILER) target_link_libraries(main PRIVATE parallelproj CUDA::cudart) else() find_package(OpenMP REQUIRED) - add_library(parallelproj SHARED joseph3d_fwd.cpp) + add_library(parallelproj SHARED joseph3d_fwd.cpp joseph3d_back.cpp) target_link_libraries(parallelproj PRIVATE OpenMP::OpenMP_CXX) add_executable(main main.cpp) target_link_libraries(main PRIVATE parallelproj) diff --git a/joseph3d_back.cpp b/joseph3d_back.cpp new file mode 100644 index 0000000..a58e92b --- /dev/null +++ b/joseph3d_back.cpp @@ -0,0 +1,21 @@ +#include "parallelproj.h" +#include "joseph3d_back_worker.h" 
+#include "debug.h" + +void joseph3d_back(const float *xstart, + const float *xend, + float *img, + const float *img_origin, + const float *voxsize, + const float *p, + size_t nlors, + const int *img_dim, + int device_id, + int threadsperblock){ + +#pragma omp parallel for + for (size_t i = 0; i < nlors; ++i) { + joseph3d_back_worker(i, xstart, xend, img, img_origin, voxsize, p, img_dim); + } + +} diff --git a/joseph3d_back_worker.h b/joseph3d_back_worker.h new file mode 100644 index 0000000..93c96f0 --- /dev/null +++ b/joseph3d_back_worker.h @@ -0,0 +1,329 @@ +#pragma once +#include "cuda_compat.h" +#include "atomic_sum.h" +#include "ray_cube_intersection.h" + +// we need to import math.h for the floor, ceil, sqrtf if not compiling for CUDA +#ifndef __CUDA_ARCH__ +#include +#endif + +WORKER_QUALIFIER inline void joseph3d_back_worker(size_t i, + const float *xstart, + const float *xend, + float *img, + const float *img_origin, + const float *voxsize, + const float *p, + const int *img_dim){ + + int n0 = img_dim[0]; + int n1 = img_dim[1]; + int n2 = img_dim[2]; + + float voxsize0 = voxsize[0]; + float voxsize1 = voxsize[1]; + float voxsize2 = voxsize[2]; + + float img_origin0 = img_origin[0]; + float img_origin1 = img_origin[1]; + float img_origin2 = img_origin[2]; + + if(p[i] != 0) + { + + float d0, d1, d2, d0_sq, d1_sq, d2_sq; + float cs0, cs1, cs2, cf; + float lsq, cos0_sq, cos1_sq, cos2_sq; + unsigned short direction; + int i0, i1, i2; + int i0_floor, i1_floor, i2_floor; + int i0_ceil, i1_ceil, i2_ceil; + float x_pr0, x_pr1, x_pr2; + float tmp_0, tmp_1, tmp_2; + + float xstart0 = xstart[i*3 + 0]; + float xstart1 = xstart[i*3 + 1]; + float xstart2 = xstart[i*3 + 2]; + + float xend0 = xend[i*3 + 0]; + float xend1 = xend[i*3 + 1]; + float xend2 = xend[i*3 + 2]; + + unsigned char intersec; + float t1, t2; + float istart_f, iend_f, tmp; + int istart, iend; + + // test whether the ray between the two detectors is most parallel + // with the 0, 1, or 2 axis + d0 = 
xend0 - xstart0; + d1 = xend1 - xstart1; + d2 = xend2 - xstart2; + + //----------- + //--- test whether ray and cube intersect + intersec = ray_cube_intersection(xstart0, xstart1, xstart2, + img_origin0 - 1*voxsize0, img_origin1 - 1*voxsize1, img_origin2 - 1*voxsize2, + img_origin0 + n0*voxsize0, img_origin1 + n1*voxsize1, img_origin2 + n2*voxsize2, + d0, d1, d2, &t1, &t2); + + if (intersec == 1) + { + d0_sq = d0*d0; + d1_sq = d1*d1; + d2_sq = d2*d2; + + lsq = d0_sq + d1_sq + d2_sq; + + cos0_sq = d0_sq / lsq; + cos1_sq = d1_sq / lsq; + cos2_sq = d2_sq / lsq; + + cs0 = sqrtf(cos0_sq); + cs1 = sqrtf(cos1_sq); + cs2 = sqrtf(cos2_sq); + + direction = 0; + if ((cos1_sq >= cos0_sq) && (cos1_sq >= cos2_sq)) + { + direction = 1; + } + if ((cos2_sq >= cos0_sq) && (cos2_sq >= cos1_sq)) + { + direction = 2; + } + + if(direction == 0) + { + // case where ray is most parallel to the 0 axis + // we step through the volume along the 0 direction + + // factor for correctiong voxel size and |cos(theta)| + cf = voxsize0/cs0; + + //--- check where ray enters / leaves cube + istart_f = (xstart0 + t1*d0 - img_origin0) / voxsize0; + iend_f = (xstart0 + t2*d0 - img_origin0) / voxsize0; + + if (istart_f > iend_f){ + tmp = iend_f; + iend_f = istart_f; + istart_f = tmp; + } + + istart = (int)floor(istart_f); + iend = (int)ceil(iend_f); + if (istart < 0){istart = 0;} + if (iend >= n0){iend = n0;} + + // check in which "plane" the start and end points are + // we have to do this to avoid that we include voxels + // that are "outside" the line segment bewteen xstart and xend + + // !! for these calculations we overwrite the istart_f and iend_f variables !! 
+ istart_f = (xstart0 - img_origin0) / voxsize0; + iend_f = (xend0 - img_origin0) / voxsize0; + + if (istart_f > iend_f){ + tmp = iend_f; + iend_f = istart_f; + istart_f = tmp; + } + + if (istart < (int)floor(istart_f)){istart = (int)floor(istart_f);} + if (iend >= (int)ceil(iend_f)){iend = (int)ceil(iend_f);} + //--- + + for(i0 = istart; i0 < iend; i0++) + { + // get the indices where the ray intersects the image plane + x_pr1 = xstart1 + (img_origin0 + i0*voxsize0 - xstart0)*d1 / d0; + x_pr2 = xstart2 + (img_origin0 + i0*voxsize0 - xstart0)*d2 / d0; + + i1_floor = (int)floor((x_pr1 - img_origin1)/voxsize1); + i1_ceil = i1_floor + 1; + + i2_floor = (int)floor((x_pr2 - img_origin2)/voxsize2); + i2_ceil = i2_floor + 1; + + // calculate the distances to the floor normalized to [0,1] + // for the bilinear interpolation + tmp_1 = (x_pr1 - (i1_floor*voxsize1 + img_origin1)) / voxsize1; + tmp_2 = (x_pr2 - (i2_floor*voxsize2 + img_origin2)) / voxsize2; + + if ((i1_floor >= 0) && (i1_floor < n1) && (i2_floor >= 0) && (i2_floor < n2)) + { + atomic_sum(img + n1*n2*i0 + n2*i1_floor + i2_floor, (p[i] * (1 - tmp_1) * (1 - tmp_2) * cf)); + } + if ((i1_ceil >= 0) && (i1_ceil < n1) && (i2_floor >= 0) && (i2_floor < n2)) + { + atomic_sum(img + n1*n2*i0 + n2*i1_ceil + i2_floor, (p[i] * tmp_1 * (1 - tmp_2) * cf)); + } + if ((i1_floor >= 0) && (i1_floor < n1) && (i2_ceil >= 0) && (i2_ceil < n2)) + { + atomic_sum(img + n1*n2*i0 + n2*i1_floor + i2_ceil, (p[i] * (1 - tmp_1) * tmp_2 * cf)); + } + if ((i1_ceil >= 0) && (i1_ceil < n1) && (i2_ceil >= 0) && (i2_ceil < n2)) + { + atomic_sum(img + n1*n2*i0 + n2*i1_ceil + i2_ceil, (p[i] * tmp_1 * tmp_2 * cf)); + } + } + } + // --------------------------------------------------------------------------------- + if(direction == 1) + { + // case where ray is most parallel to the 1 axis + // we step through the volume along the 1 direction + + // factor for correctiong voxel size and |cos(theta)| + cf = voxsize1/cs1; + + //--- check where ray enters 
/ leaves cube + istart_f = (xstart1 + t1*d1 - img_origin1) / voxsize1; + iend_f = (xstart1 + t2*d1 - img_origin1) / voxsize1; + + if (istart_f > iend_f){ + tmp = iend_f; + iend_f = istart_f; + istart_f = tmp; + } + + istart = (int)floor(istart_f); + iend = (int)ceil(iend_f); + if (istart < 0){istart = 0;} + if (iend >= n1){iend = n1;} + + // check in which "plane" the start and end points are + // we have to do this to avoid that we include voxels + // that are "outside" the line segment bewteen xstart and xend + + // !! for these calculations we overwrite the istart_f and iend_f variables !! + istart_f = (xstart1 - img_origin1) / voxsize1; + iend_f = (xend1 - img_origin1) / voxsize1; + + if (istart_f > iend_f){ + tmp = iend_f; + iend_f = istart_f; + istart_f = tmp; + } + + if (istart < (int)floor(istart_f)){istart = (int)floor(istart_f);} + if (iend >= (int)ceil(iend_f)){iend = (int)ceil(iend_f);} + //--- + + for(i1 = istart; i1 < iend; i1++) + { + // get the indices where the ray intersects the image plane + x_pr0 = xstart0 + (img_origin1 + i1*voxsize1 - xstart1)*d0 / d1; + x_pr2 = xstart2 + (img_origin1 + i1*voxsize1 - xstart1)*d2 / d1; + + i0_floor = (int)floor((x_pr0 - img_origin0)/voxsize0); + i0_ceil = i0_floor + 1; + + i2_floor = (int)floor((x_pr2 - img_origin2)/voxsize2); + i2_ceil = i2_floor + 1; + + // calculate the distances to the floor normalized to [0,1] + // for the bilinear interpolation + tmp_0 = (x_pr0 - (i0_floor*voxsize0 + img_origin0)) / voxsize0; + tmp_2 = (x_pr2 - (i2_floor*voxsize2 + img_origin2)) / voxsize2; + + if ((i0_floor >= 0) && (i0_floor < n0) && (i2_floor >= 0) && (i2_floor < n2)) + { + atomic_sum(img + n1*n2*i0_floor + n2*i1 + i2_floor, (p[i] * (1 - tmp_0) * (1 - tmp_2) * cf)); + } + if ((i0_ceil >= 0) && (i0_ceil < n0) && (i2_floor >= 0) && (i2_floor < n2)) + { + atomic_sum(img + n1*n2*i0_ceil + n2*i1 + i2_floor, (p[i] * tmp_0 * (1 - tmp_2) * cf)); + } + if ((i0_floor >= 0) && (i0_floor < n0) && (i2_ceil >= 0) && (i2_ceil < n2)) 
+ { + atomic_sum(img + n1*n2*i0_floor + n2*i1 + i2_ceil, (p[i] * (1 - tmp_0) * tmp_2 * cf)); + } + if((i0_ceil >= 0) && (i0_ceil < n0) && (i2_ceil >= 0) && (i2_ceil < n2)) + { + atomic_sum(img + n1*n2*i0_ceil + n2*i1 + i2_ceil, (p[i] * tmp_0 * tmp_2 * cf)); + } + } + } + //--------------------------------------------------------------------------------- + if (direction == 2) + { + // case where ray is most parallel to the 2 axis + // we step through the volume along the 2 direction + + // factor for correctiong voxel size and |cos(theta)| + cf = voxsize2/cs2; + + //--- check where ray enters / leaves cube + istart_f = (xstart2 + t1*d2 - img_origin2) / voxsize2; + iend_f = (xstart2 + t2*d2 - img_origin2) / voxsize2; + + if (istart_f > iend_f){ + tmp = iend_f; + iend_f = istart_f; + istart_f = tmp; + } + + istart = (int)floor(istart_f); + iend = (int)ceil(iend_f); + if (istart < 0){istart = 0;} + if (iend >= n2){iend = n2;} + + // check in which "plane" the start and end points are + // we have to do this to avoid that we include voxels + // that are "outside" the line segment bewteen xstart and xend + + // !! for these calculations we overwrite the istart_f and iend_f variables !! 
+ istart_f = (xstart2 - img_origin2) / voxsize2; + iend_f = (xend2 - img_origin2) / voxsize2; + + if (istart_f > iend_f){ + tmp = iend_f; + iend_f = istart_f; + istart_f = tmp; + } + + if (istart < (int)floor(istart_f)){istart = (int)floor(istart_f);} + if (iend >= (int)ceil(iend_f)){iend = (int)ceil(iend_f);} + //--- + + for(i2 = istart; i2 < iend; i2++) + { + // get the indices where the ray intersects the image plane + x_pr0 = xstart0 + (img_origin2 + i2*voxsize2 - xstart2)*d0 / d2; + x_pr1 = xstart1 + (img_origin2 + i2*voxsize2 - xstart2)*d1 / d2; + + i0_floor = (int)floor((x_pr0 - img_origin0)/voxsize0); + i0_ceil = i0_floor + 1; + + i1_floor = (int)floor((x_pr1 - img_origin1)/voxsize1); + i1_ceil = i1_floor + 1; + + // calculate the distances to the floor normalized to [0,1] + // for the bilinear interpolation + tmp_0 = (x_pr0 - (i0_floor*voxsize0 + img_origin0)) / voxsize0; + tmp_1 = (x_pr1 - (i1_floor*voxsize1 + img_origin1)) / voxsize1; + + if ((i0_floor >= 0) && (i0_floor < n0) && (i1_floor >= 0) && (i1_floor < n1)) + { + atomic_sum(img + n1*n2*i0_floor + n2*i1_floor + i2, (p[i] * (1 - tmp_0) * (1 - tmp_1) * cf)); + } + if ((i0_ceil >= 0) && (i0_ceil < n0) && (i1_floor >= 0) && (i1_floor < n1)) + { + atomic_sum(img + n1*n2*i0_ceil + n2*i1_floor + i2, (p[i] * tmp_0 * (1 - tmp_1) * cf)); + } + if ((i0_floor >= 0) && (i0_floor < n0) && (i1_ceil >= 0) && (i1_ceil < n1)) + { + atomic_sum(img + n1*n2*i0_floor + n2*i1_ceil + i2, (p[i] * (1 - tmp_0) * tmp_1 * cf)); + } + if ((i0_ceil >= 0) && (i0_ceil < n0) && (i1_ceil >= 0) && (i1_ceil < n1)) + { + atomic_sum(img + n1*n2*i0_ceil + n2*i1_ceil + i2, (p[i] * tmp_0 * tmp_1 * cf)); + } + } + } + } + } +} diff --git a/joseph3d_fwd_worker.h b/joseph3d_fwd_worker.h index a506d61..a8b3fe4 100644 --- a/joseph3d_fwd_worker.h +++ b/joseph3d_fwd_worker.h @@ -1,7 +1,6 @@ #pragma once #include "cuda_compat.h" -#include "atomic_sum.h" -#include "utils.h" +#include "ray_cube_intersection.h" // we need to import math.h for the 
floor, ceil, sqrtf if not compiling for CUDA #ifndef __CUDA_ARCH__ diff --git a/utils.h b/ray_cube_intersection.h similarity index 100% rename from utils.h rename to ray_cube_intersection.h From 86fc2d03ad537bcddebdfcfcff27faebc1995638 Mon Sep 17 00:00:00 2001 From: Georg Schramm Date: Wed, 9 Apr 2025 21:53:17 +0200 Subject: [PATCH 06/35] rename files --- atomic_sum.h | 11 ----------- joseph3d_back_worker.h | 3 +-- joseph3d_fwd_worker.h | 2 +- square_op.h | 2 +- ray_cube_intersection.h => utils.h | 11 ++++++++++- 5 files changed, 13 insertions(+), 16 deletions(-) delete mode 100644 atomic_sum.h rename ray_cube_intersection.h => utils.h (89%) diff --git a/atomic_sum.h b/atomic_sum.h deleted file mode 100644 index dc08883..0000000 --- a/atomic_sum.h +++ /dev/null @@ -1,11 +0,0 @@ -#pragma once -#include "cuda_compat.h" - -WORKER_QUALIFIER inline void atomic_sum(float* target, float value) { -#ifdef __CUDA_ARCH__ - atomicAdd(target, value); -#else -#pragma omp atomic - *target += value; -#endif -} diff --git a/joseph3d_back_worker.h b/joseph3d_back_worker.h index 93c96f0..f3c7500 100644 --- a/joseph3d_back_worker.h +++ b/joseph3d_back_worker.h @@ -1,7 +1,6 @@ #pragma once #include "cuda_compat.h" -#include "atomic_sum.h" -#include "ray_cube_intersection.h" +#include "utils.h" // we need to import math.h for the floor, ceil, sqrtf if not compiling for CUDA #ifndef __CUDA_ARCH__ diff --git a/joseph3d_fwd_worker.h b/joseph3d_fwd_worker.h index a8b3fe4..f4f8f73 100644 --- a/joseph3d_fwd_worker.h +++ b/joseph3d_fwd_worker.h @@ -1,6 +1,6 @@ #pragma once #include "cuda_compat.h" -#include "ray_cube_intersection.h" +#include "utils.h" // we need to import math.h for the floor, ceil, sqrtf if not compiling for CUDA #ifndef __CUDA_ARCH__ diff --git a/square_op.h b/square_op.h index 2eda75b..78d9b65 100644 --- a/square_op.h +++ b/square_op.h @@ -1,6 +1,6 @@ #pragma once #include "cuda_compat.h" -#include "atomic_sum.h" +#include "utils.h" CUDA_HOST_DEVICE inline void 
compute_and_accumulate(float* array, size_t idx, float* result_sum) { array[idx] += 1; diff --git a/ray_cube_intersection.h b/utils.h similarity index 89% rename from ray_cube_intersection.h rename to utils.h index 69ea5ce..a9e642c 100644 --- a/ray_cube_intersection.h +++ b/utils.h @@ -1,7 +1,16 @@ #pragma once #include "cuda_compat.h" -WORKER_QUALIFIER unsigned char ray_cube_intersection(float orig0, +WORKER_QUALIFIER inline void atomic_sum(float* target, float value) { +#ifdef __CUDA_ARCH__ + atomicAdd(target, value); +#else +#pragma omp atomic + *target += value; +#endif +} + +WORKER_QUALIFIER inline unsigned char ray_cube_intersection(float orig0, float orig1, float orig2, float bounds0_min, From 10269350b8a0098d3941749ed01df80770b50290 Mon Sep 17 00:00:00 2001 From: Georg Schramm Date: Wed, 9 Apr 2025 22:10:57 +0200 Subject: [PATCH 07/35] add back projection test to cpp file --- main.cpp | 63 +++++++++++++++++++++++++++++++++++++++++++++++++- parallelproj.h | 29 +++++++++++++++++++---- 2 files changed, 86 insertions(+), 6 deletions(-) diff --git a/main.cpp b/main.cpp index d48a01c..641c0b1 100644 --- a/main.cpp +++ b/main.cpp @@ -183,12 +183,73 @@ int main() { //////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////// + // test the back projection + + float* bimg = new float[img_dim[0] * img_dim[1] * img_dim[2]]; + for (size_t i = 0; i < (img_dim[0]*img_dim[1]*img_dim[2]); i++){ + bimg[i] = 0; + } + + float* ones = new float[nlors]; + for (size_t i = 0; i < nlors; i++){ + ones[i] = 1; + } + + joseph3d_back(xstart, xend, bimg, img_origin, voxsize, ones, nlors, img_dim); + + printf("\nback projection of ones along all rays:\n"); + for (size_t i0 = 0; i0 < img_dim[0]; i0++) + { + for (size_t i1 = 0; i1 < img_dim[1]; i1++) + { + for (size_t i2 = 0; i2 < img_dim[2]; i2++) + { + printf("%.1f ", bimg[img_dim[1] * img_dim[2] * i0 + img_dim[2] * i1 + i2]); + } + printf("\n"); 
+ } + printf("\n"); + } + + // To test whether the back projection is correct, we test if the back projector is the adjoint + // of the forward projector. This is more practical than checking a lot of single voxels in the + // back projected image. + + float inner_product1 = 0; + float inner_product2 = 0; + + for (size_t i = 0; i < (img_dim[0] * img_dim[1] * img_dim[2]); i++) + { + inner_product1 += (img[i] * bimg[i]); + } + + for (size_t ir = 0; ir < nlors; ir++) + { + inner_product2 += (img_fwd[ir] * ones[ir]); + } + + float ip_diff = fabs(inner_product1 - inner_product2); + + if (ip_diff > eps) + { + printf("\n#########################################################################"); + printf("\nback projection test failed. back projection seems not to be the adjoint."); + printf("\n %.7e", ip_diff); + printf("\n#########################################################################\n"); + retval = 1; + } + + //////////////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////////// free(img); free(xstart); free(xend); free(img_fwd); free(expected_fwd_vals); + free(bimg); + free(ones); + return retval; } - diff --git a/parallelproj.h b/parallelproj.h index ad98e1a..e554942 100644 --- a/parallelproj.h +++ b/parallelproj.h @@ -5,13 +5,13 @@ extern "C" { #endif -void joseph3d_fwd(const float *xstart, - const float *xend, +void joseph3d_fwd(const float *xstart, + const float *xend, const float *img, - const float *img_origin, - const float *voxsize, + const float *img_origin, + const float *voxsize, float *p, - size_t nlors, + size_t nlors, const int *img_dim, int device_id = 0, int threadsperblock = 64); @@ -19,3 +19,22 @@ void joseph3d_fwd(const float *xstart, #ifdef __cplusplus } #endif + +#ifdef __cplusplus +extern "C" { +#endif + +void joseph3d_back(const float *xstart, + const float *xend, + float 
*img, + const float *img_origin, + const float *voxsize, + const float *p, + size_t nlors, + const int *img_dim, + int device_id = 0, + int threadsperblock = 64); + +#ifdef __cplusplus +} +#endif From 3cc377a05a2a1bda2bf3ab76895f836862ca2dde Mon Sep 17 00:00:00 2001 From: Georg Schramm Date: Wed, 9 Apr 2025 22:18:06 +0200 Subject: [PATCH 08/35] reformat code --- cuda_compat.h | 4 +- debug.h | 13 +- joseph3d_back.cpp | 25 +-- joseph3d_back_worker.h | 335 +++++++++++++++++++++++------------------ joseph3d_fwd.cpp | 17 ++- joseph3d_fwd.cu | 73 ++++----- joseph3d_fwd_worker.h | 274 +++++++++++++++++++-------------- main.cpp | 54 ++++--- main.cu | 57 ++++--- utils.h | 128 +++++++++------- 10 files changed, 559 insertions(+), 421 deletions(-) diff --git a/cuda_compat.h b/cuda_compat.h index 21451f1..9b3752c 100644 --- a/cuda_compat.h +++ b/cuda_compat.h @@ -1,7 +1,7 @@ #pragma once #ifdef __CUDACC__ - #define WORKER_QUALIFIER __device__ +#define WORKER_QUALIFIER __device__ #else - #define WORKER_QUALIFIER +#define WORKER_QUALIFIER #endif diff --git a/debug.h b/debug.h index 9535f81..fe5244e 100644 --- a/debug.h +++ b/debug.h @@ -4,11 +4,18 @@ #include // Safe printf wrapper -#define DPRINTF(...) do { printf(__VA_ARGS__); } while (0) +#define DPRINTF(...) \ + do \ + { \ + printf(__VA_ARGS__); \ + } while (0) // Conditional debug print #ifdef DEBUG - #define DEBUG_PRINT(...) DPRINTF(__VA_ARGS__) +#define DEBUG_PRINT(...) DPRINTF(__VA_ARGS__) #else - #define DEBUG_PRINT(...) do {} while (0) +#define DEBUG_PRINT(...) 
\ + do \ + { \ + } while (0) #endif diff --git a/joseph3d_back.cpp b/joseph3d_back.cpp index a58e92b..f63d8da 100644 --- a/joseph3d_back.cpp +++ b/joseph3d_back.cpp @@ -2,20 +2,21 @@ #include "joseph3d_back_worker.h" #include "debug.h" -void joseph3d_back(const float *xstart, - const float *xend, - float *img, - const float *img_origin, - const float *voxsize, - const float *p, - size_t nlors, - const int *img_dim, - int device_id, - int threadsperblock){ +void joseph3d_back(const float *xstart, + const float *xend, + float *img, + const float *img_origin, + const float *voxsize, + const float *p, + size_t nlors, + const int *img_dim, + int device_id, + int threadsperblock) +{ #pragma omp parallel for - for (size_t i = 0; i < nlors; ++i) { + for (size_t i = 0; i < nlors; ++i) + { joseph3d_back_worker(i, xstart, xend, img, img_origin, voxsize, p, img_dim); } - } diff --git a/joseph3d_back_worker.h b/joseph3d_back_worker.h index f3c7500..b7fdb23 100644 --- a/joseph3d_back_worker.h +++ b/joseph3d_back_worker.h @@ -8,13 +8,14 @@ #endif WORKER_QUALIFIER inline void joseph3d_back_worker(size_t i, - const float *xstart, - const float *xend, - float *img, - const float *img_origin, - const float *voxsize, - const float *p, - const int *img_dim){ + const float *xstart, + const float *xend, + float *img, + const float *img_origin, + const float *voxsize, + const float *p, + const int *img_dim) +{ int n0 = img_dim[0]; int n1 = img_dim[1]; @@ -28,61 +29,61 @@ WORKER_QUALIFIER inline void joseph3d_back_worker(size_t i, float img_origin1 = img_origin[1]; float img_origin2 = img_origin[2]; - if(p[i] != 0) + if (p[i] != 0) { float d0, d1, d2, d0_sq, d1_sq, d2_sq; - float cs0, cs1, cs2, cf; + float cs0, cs1, cs2, cf; float lsq, cos0_sq, cos1_sq, cos2_sq; - unsigned short direction; + unsigned short direction; int i0, i1, i2; int i0_floor, i1_floor, i2_floor; int i0_ceil, i1_ceil, i2_ceil; float x_pr0, x_pr1, x_pr2; float tmp_0, tmp_1, tmp_2; - float xstart0 = xstart[i*3 + 0]; - 
float xstart1 = xstart[i*3 + 1]; - float xstart2 = xstart[i*3 + 2]; + float xstart0 = xstart[i * 3 + 0]; + float xstart1 = xstart[i * 3 + 1]; + float xstart2 = xstart[i * 3 + 2]; - float xend0 = xend[i*3 + 0]; - float xend1 = xend[i*3 + 1]; - float xend2 = xend[i*3 + 2]; + float xend0 = xend[i * 3 + 0]; + float xend1 = xend[i * 3 + 1]; + float xend2 = xend[i * 3 + 2]; unsigned char intersec; float t1, t2; float istart_f, iend_f, tmp; - int istart, iend; + int istart, iend; // test whether the ray between the two detectors is most parallel // with the 0, 1, or 2 axis - d0 = xend0 - xstart0; - d1 = xend1 - xstart1; - d2 = xend2 - xstart2; - + d0 = xend0 - xstart0; + d1 = xend1 - xstart1; + d2 = xend2 - xstart2; + //----------- //--- test whether ray and cube intersect - intersec = ray_cube_intersection(xstart0, xstart1, xstart2, - img_origin0 - 1*voxsize0, img_origin1 - 1*voxsize1, img_origin2 - 1*voxsize2, - img_origin0 + n0*voxsize0, img_origin1 + n1*voxsize1, img_origin2 + n2*voxsize2, - d0, d1, d2, &t1, &t2); + intersec = ray_cube_intersection(xstart0, xstart1, xstart2, + img_origin0 - 1 * voxsize0, img_origin1 - 1 * voxsize1, img_origin2 - 1 * voxsize2, + img_origin0 + n0 * voxsize0, img_origin1 + n1 * voxsize1, img_origin2 + n2 * voxsize2, + d0, d1, d2, &t1, &t2); if (intersec == 1) { - d0_sq = d0*d0; - d1_sq = d1*d1; - d2_sq = d2*d2; - + d0_sq = d0 * d0; + d1_sq = d1 * d1; + d2_sq = d2 * d2; + lsq = d0_sq + d1_sq + d2_sq; - + cos0_sq = d0_sq / lsq; cos1_sq = d1_sq / lsq; cos2_sq = d2_sq / lsq; - cs0 = sqrtf(cos0_sq); - cs1 = sqrtf(cos1_sq); - cs2 = sqrtf(cos2_sq); - + cs0 = sqrtf(cos0_sq); + cs1 = sqrtf(cos1_sq); + cs2 = sqrtf(cos2_sq); + direction = 0; if ((cos1_sq >= cos0_sq) && (cos1_sq >= cos2_sq)) { @@ -93,233 +94,275 @@ WORKER_QUALIFIER inline void joseph3d_back_worker(size_t i, direction = 2; } - if(direction == 0) + if (direction == 0) { // case where ray is most parallel to the 0 axis // we step through the volume along the 0 direction // factor for 
correctiong voxel size and |cos(theta)| - cf = voxsize0/cs0; + cf = voxsize0 / cs0; //--- check where ray enters / leaves cube - istart_f = (xstart0 + t1*d0 - img_origin0) / voxsize0; - iend_f = (xstart0 + t2*d0 - img_origin0) / voxsize0; + istart_f = (xstart0 + t1 * d0 - img_origin0) / voxsize0; + iend_f = (xstart0 + t2 * d0 - img_origin0) / voxsize0; - if (istart_f > iend_f){ - tmp = iend_f; - iend_f = istart_f; + if (istart_f > iend_f) + { + tmp = iend_f; + iend_f = istart_f; istart_f = tmp; } - + istart = (int)floor(istart_f); - iend = (int)ceil(iend_f); - if (istart < 0){istart = 0;} - if (iend >= n0){iend = n0;} + iend = (int)ceil(iend_f); + if (istart < 0) + { + istart = 0; + } + if (iend >= n0) + { + iend = n0; + } // check in which "plane" the start and end points are // we have to do this to avoid that we include voxels // that are "outside" the line segment bewteen xstart and xend - + // !! for these calculations we overwrite the istart_f and iend_f variables !! istart_f = (xstart0 - img_origin0) / voxsize0; - iend_f = (xend0 - img_origin0) / voxsize0; + iend_f = (xend0 - img_origin0) / voxsize0; - if (istart_f > iend_f){ - tmp = iend_f; - iend_f = istart_f; + if (istart_f > iend_f) + { + tmp = iend_f; + iend_f = istart_f; istart_f = tmp; } - if (istart < (int)floor(istart_f)){istart = (int)floor(istart_f);} - if (iend >= (int)ceil(iend_f)){iend = (int)ceil(iend_f);} + if (istart < (int)floor(istart_f)) + { + istart = (int)floor(istart_f); + } + if (iend >= (int)ceil(iend_f)) + { + iend = (int)ceil(iend_f); + } //--- - for(i0 = istart; i0 < iend; i0++) + for (i0 = istart; i0 < iend; i0++) { // get the indices where the ray intersects the image plane - x_pr1 = xstart1 + (img_origin0 + i0*voxsize0 - xstart0)*d1 / d0; - x_pr2 = xstart2 + (img_origin0 + i0*voxsize0 - xstart0)*d2 / d0; - - i1_floor = (int)floor((x_pr1 - img_origin1)/voxsize1); - i1_ceil = i1_floor + 1; - - i2_floor = (int)floor((x_pr2 - img_origin2)/voxsize2); - i2_ceil = i2_floor + 1; - + 
x_pr1 = xstart1 + (img_origin0 + i0 * voxsize0 - xstart0) * d1 / d0; + x_pr2 = xstart2 + (img_origin0 + i0 * voxsize0 - xstart0) * d2 / d0; + + i1_floor = (int)floor((x_pr1 - img_origin1) / voxsize1); + i1_ceil = i1_floor + 1; + + i2_floor = (int)floor((x_pr2 - img_origin2) / voxsize2); + i2_ceil = i2_floor + 1; + // calculate the distances to the floor normalized to [0,1] // for the bilinear interpolation - tmp_1 = (x_pr1 - (i1_floor*voxsize1 + img_origin1)) / voxsize1; - tmp_2 = (x_pr2 - (i2_floor*voxsize2 + img_origin2)) / voxsize2; - + tmp_1 = (x_pr1 - (i1_floor * voxsize1 + img_origin1)) / voxsize1; + tmp_2 = (x_pr2 - (i2_floor * voxsize2 + img_origin2)) / voxsize2; + if ((i1_floor >= 0) && (i1_floor < n1) && (i2_floor >= 0) && (i2_floor < n2)) { - atomic_sum(img + n1*n2*i0 + n2*i1_floor + i2_floor, (p[i] * (1 - tmp_1) * (1 - tmp_2) * cf)); + atomic_sum(img + n1 * n2 * i0 + n2 * i1_floor + i2_floor, (p[i] * (1 - tmp_1) * (1 - tmp_2) * cf)); } if ((i1_ceil >= 0) && (i1_ceil < n1) && (i2_floor >= 0) && (i2_floor < n2)) { - atomic_sum(img + n1*n2*i0 + n2*i1_ceil + i2_floor, (p[i] * tmp_1 * (1 - tmp_2) * cf)); + atomic_sum(img + n1 * n2 * i0 + n2 * i1_ceil + i2_floor, (p[i] * tmp_1 * (1 - tmp_2) * cf)); } if ((i1_floor >= 0) && (i1_floor < n1) && (i2_ceil >= 0) && (i2_ceil < n2)) { - atomic_sum(img + n1*n2*i0 + n2*i1_floor + i2_ceil, (p[i] * (1 - tmp_1) * tmp_2 * cf)); + atomic_sum(img + n1 * n2 * i0 + n2 * i1_floor + i2_ceil, (p[i] * (1 - tmp_1) * tmp_2 * cf)); } if ((i1_ceil >= 0) && (i1_ceil < n1) && (i2_ceil >= 0) && (i2_ceil < n2)) { - atomic_sum(img + n1*n2*i0 + n2*i1_ceil + i2_ceil, (p[i] * tmp_1 * tmp_2 * cf)); + atomic_sum(img + n1 * n2 * i0 + n2 * i1_ceil + i2_ceil, (p[i] * tmp_1 * tmp_2 * cf)); } } - } - // --------------------------------------------------------------------------------- - if(direction == 1) + } + // --------------------------------------------------------------------------------- + if (direction == 1) { // case where ray is most 
parallel to the 1 axis // we step through the volume along the 1 direction - + // factor for correctiong voxel size and |cos(theta)| - cf = voxsize1/cs1; + cf = voxsize1 / cs1; //--- check where ray enters / leaves cube - istart_f = (xstart1 + t1*d1 - img_origin1) / voxsize1; - iend_f = (xstart1 + t2*d1 - img_origin1) / voxsize1; + istart_f = (xstart1 + t1 * d1 - img_origin1) / voxsize1; + iend_f = (xstart1 + t2 * d1 - img_origin1) / voxsize1; - if (istart_f > iend_f){ - tmp = iend_f; - iend_f = istart_f; + if (istart_f > iend_f) + { + tmp = iend_f; + iend_f = istart_f; istart_f = tmp; } - + istart = (int)floor(istart_f); - iend = (int)ceil(iend_f); - if (istart < 0){istart = 0;} - if (iend >= n1){iend = n1;} + iend = (int)ceil(iend_f); + if (istart < 0) + { + istart = 0; + } + if (iend >= n1) + { + iend = n1; + } // check in which "plane" the start and end points are // we have to do this to avoid that we include voxels // that are "outside" the line segment bewteen xstart and xend - + // !! for these calculations we overwrite the istart_f and iend_f variables !! 
istart_f = (xstart1 - img_origin1) / voxsize1; - iend_f = (xend1 - img_origin1) / voxsize1; + iend_f = (xend1 - img_origin1) / voxsize1; - if (istart_f > iend_f){ - tmp = iend_f; - iend_f = istart_f; + if (istart_f > iend_f) + { + tmp = iend_f; + iend_f = istart_f; istart_f = tmp; } - if (istart < (int)floor(istart_f)){istart = (int)floor(istart_f);} - if (iend >= (int)ceil(iend_f)){iend = (int)ceil(iend_f);} + if (istart < (int)floor(istart_f)) + { + istart = (int)floor(istart_f); + } + if (iend >= (int)ceil(iend_f)) + { + iend = (int)ceil(iend_f); + } //--- - for(i1 = istart; i1 < iend; i1++) + for (i1 = istart; i1 < iend; i1++) { // get the indices where the ray intersects the image plane - x_pr0 = xstart0 + (img_origin1 + i1*voxsize1 - xstart1)*d0 / d1; - x_pr2 = xstart2 + (img_origin1 + i1*voxsize1 - xstart1)*d2 / d1; - - i0_floor = (int)floor((x_pr0 - img_origin0)/voxsize0); - i0_ceil = i0_floor + 1; - - i2_floor = (int)floor((x_pr2 - img_origin2)/voxsize2); - i2_ceil = i2_floor + 1; - + x_pr0 = xstart0 + (img_origin1 + i1 * voxsize1 - xstart1) * d0 / d1; + x_pr2 = xstart2 + (img_origin1 + i1 * voxsize1 - xstart1) * d2 / d1; + + i0_floor = (int)floor((x_pr0 - img_origin0) / voxsize0); + i0_ceil = i0_floor + 1; + + i2_floor = (int)floor((x_pr2 - img_origin2) / voxsize2); + i2_ceil = i2_floor + 1; + // calculate the distances to the floor normalized to [0,1] // for the bilinear interpolation - tmp_0 = (x_pr0 - (i0_floor*voxsize0 + img_origin0)) / voxsize0; - tmp_2 = (x_pr2 - (i2_floor*voxsize2 + img_origin2)) / voxsize2; - - if ((i0_floor >= 0) && (i0_floor < n0) && (i2_floor >= 0) && (i2_floor < n2)) + tmp_0 = (x_pr0 - (i0_floor * voxsize0 + img_origin0)) / voxsize0; + tmp_2 = (x_pr2 - (i2_floor * voxsize2 + img_origin2)) / voxsize2; + + if ((i0_floor >= 0) && (i0_floor < n0) && (i2_floor >= 0) && (i2_floor < n2)) { - atomic_sum(img + n1*n2*i0_floor + n2*i1 + i2_floor, (p[i] * (1 - tmp_0) * (1 - tmp_2) * cf)); + atomic_sum(img + n1 * n2 * i0_floor + n2 * i1 + 
i2_floor, (p[i] * (1 - tmp_0) * (1 - tmp_2) * cf)); } if ((i0_ceil >= 0) && (i0_ceil < n0) && (i2_floor >= 0) && (i2_floor < n2)) { - atomic_sum(img + n1*n2*i0_ceil + n2*i1 + i2_floor, (p[i] * tmp_0 * (1 - tmp_2) * cf)); + atomic_sum(img + n1 * n2 * i0_ceil + n2 * i1 + i2_floor, (p[i] * tmp_0 * (1 - tmp_2) * cf)); } if ((i0_floor >= 0) && (i0_floor < n0) && (i2_ceil >= 0) && (i2_ceil < n2)) { - atomic_sum(img + n1*n2*i0_floor + n2*i1 + i2_ceil, (p[i] * (1 - tmp_0) * tmp_2 * cf)); + atomic_sum(img + n1 * n2 * i0_floor + n2 * i1 + i2_ceil, (p[i] * (1 - tmp_0) * tmp_2 * cf)); } - if((i0_ceil >= 0) && (i0_ceil < n0) && (i2_ceil >= 0) && (i2_ceil < n2)) + if ((i0_ceil >= 0) && (i0_ceil < n0) && (i2_ceil >= 0) && (i2_ceil < n2)) { - atomic_sum(img + n1*n2*i0_ceil + n2*i1 + i2_ceil, (p[i] * tmp_0 * tmp_2 * cf)); + atomic_sum(img + n1 * n2 * i0_ceil + n2 * i1 + i2_ceil, (p[i] * tmp_0 * tmp_2 * cf)); } } } - //--------------------------------------------------------------------------------- + //--------------------------------------------------------------------------------- if (direction == 2) { // case where ray is most parallel to the 2 axis // we step through the volume along the 2 direction - + // factor for correctiong voxel size and |cos(theta)| - cf = voxsize2/cs2; - + cf = voxsize2 / cs2; + //--- check where ray enters / leaves cube - istart_f = (xstart2 + t1*d2 - img_origin2) / voxsize2; - iend_f = (xstart2 + t2*d2 - img_origin2) / voxsize2; + istart_f = (xstart2 + t1 * d2 - img_origin2) / voxsize2; + iend_f = (xstart2 + t2 * d2 - img_origin2) / voxsize2; - if (istart_f > iend_f){ - tmp = iend_f; - iend_f = istart_f; + if (istart_f > iend_f) + { + tmp = iend_f; + iend_f = istart_f; istart_f = tmp; } - + istart = (int)floor(istart_f); - iend = (int)ceil(iend_f); - if (istart < 0){istart = 0;} - if (iend >= n2){iend = n2;} + iend = (int)ceil(iend_f); + if (istart < 0) + { + istart = 0; + } + if (iend >= n2) + { + iend = n2; + } // check in which "plane" the start 
and end points are // we have to do this to avoid that we include voxels // that are "outside" the line segment bewteen xstart and xend - + // !! for these calculations we overwrite the istart_f and iend_f variables !! istart_f = (xstart2 - img_origin2) / voxsize2; - iend_f = (xend2 - img_origin2) / voxsize2; + iend_f = (xend2 - img_origin2) / voxsize2; - if (istart_f > iend_f){ - tmp = iend_f; - iend_f = istart_f; + if (istart_f > iend_f) + { + tmp = iend_f; + iend_f = istart_f; istart_f = tmp; } - if (istart < (int)floor(istart_f)){istart = (int)floor(istart_f);} - if (iend >= (int)ceil(iend_f)){iend = (int)ceil(iend_f);} + if (istart < (int)floor(istart_f)) + { + istart = (int)floor(istart_f); + } + if (iend >= (int)ceil(iend_f)) + { + iend = (int)ceil(iend_f); + } //--- - for(i2 = istart; i2 < iend; i2++) + for (i2 = istart; i2 < iend; i2++) { // get the indices where the ray intersects the image plane - x_pr0 = xstart0 + (img_origin2 + i2*voxsize2 - xstart2)*d0 / d2; - x_pr1 = xstart1 + (img_origin2 + i2*voxsize2 - xstart2)*d1 / d2; - - i0_floor = (int)floor((x_pr0 - img_origin0)/voxsize0); - i0_ceil = i0_floor + 1; - - i1_floor = (int)floor((x_pr1 - img_origin1)/voxsize1); - i1_ceil = i1_floor + 1; - + x_pr0 = xstart0 + (img_origin2 + i2 * voxsize2 - xstart2) * d0 / d2; + x_pr1 = xstart1 + (img_origin2 + i2 * voxsize2 - xstart2) * d1 / d2; + + i0_floor = (int)floor((x_pr0 - img_origin0) / voxsize0); + i0_ceil = i0_floor + 1; + + i1_floor = (int)floor((x_pr1 - img_origin1) / voxsize1); + i1_ceil = i1_floor + 1; + // calculate the distances to the floor normalized to [0,1] // for the bilinear interpolation - tmp_0 = (x_pr0 - (i0_floor*voxsize0 + img_origin0)) / voxsize0; - tmp_1 = (x_pr1 - (i1_floor*voxsize1 + img_origin1)) / voxsize1; - + tmp_0 = (x_pr0 - (i0_floor * voxsize0 + img_origin0)) / voxsize0; + tmp_1 = (x_pr1 - (i1_floor * voxsize1 + img_origin1)) / voxsize1; + if ((i0_floor >= 0) && (i0_floor < n0) && (i1_floor >= 0) && (i1_floor < n1)) { - 
atomic_sum(img + n1*n2*i0_floor + n2*i1_floor + i2, (p[i] * (1 - tmp_0) * (1 - tmp_1) * cf)); + atomic_sum(img + n1 * n2 * i0_floor + n2 * i1_floor + i2, (p[i] * (1 - tmp_0) * (1 - tmp_1) * cf)); } if ((i0_ceil >= 0) && (i0_ceil < n0) && (i1_floor >= 0) && (i1_floor < n1)) { - atomic_sum(img + n1*n2*i0_ceil + n2*i1_floor + i2, (p[i] * tmp_0 * (1 - tmp_1) * cf)); + atomic_sum(img + n1 * n2 * i0_ceil + n2 * i1_floor + i2, (p[i] * tmp_0 * (1 - tmp_1) * cf)); } if ((i0_floor >= 0) && (i0_floor < n0) && (i1_ceil >= 0) && (i1_ceil < n1)) { - atomic_sum(img + n1*n2*i0_floor + n2*i1_ceil + i2, (p[i] * (1 - tmp_0) * tmp_1 * cf)); + atomic_sum(img + n1 * n2 * i0_floor + n2 * i1_ceil + i2, (p[i] * (1 - tmp_0) * tmp_1 * cf)); } if ((i0_ceil >= 0) && (i0_ceil < n0) && (i1_ceil >= 0) && (i1_ceil < n1)) { - atomic_sum(img + n1*n2*i0_ceil + n2*i1_ceil + i2, (p[i] * tmp_0 * tmp_1 * cf)); + atomic_sum(img + n1 * n2 * i0_ceil + n2 * i1_ceil + i2, (p[i] * tmp_0 * tmp_1 * cf)); } } } diff --git a/joseph3d_fwd.cpp b/joseph3d_fwd.cpp index bcd480c..553cd8a 100644 --- a/joseph3d_fwd.cpp +++ b/joseph3d_fwd.cpp @@ -2,20 +2,21 @@ #include "joseph3d_fwd_worker.h" #include "debug.h" -void joseph3d_fwd(const float *xstart, - const float *xend, +void joseph3d_fwd(const float *xstart, + const float *xend, const float *img, - const float *img_origin, - const float *voxsize, + const float *img_origin, + const float *voxsize, float *p, - size_t nlors, + size_t nlors, const int *img_dim, int device_id, - int threadsperblock){ + int threadsperblock) +{ #pragma omp parallel for - for (size_t i = 0; i < nlors; ++i) { + for (size_t i = 0; i < nlors; ++i) + { joseph3d_fwd_worker(i, xstart, xend, img, img_origin, voxsize, p, img_dim); } - } diff --git a/joseph3d_fwd.cu b/joseph3d_fwd.cu index 575e1cb..10e3f75 100644 --- a/joseph3d_fwd.cu +++ b/joseph3d_fwd.cu @@ -5,16 +5,18 @@ #include #include -__global__ void joseph3d_fwd_kernel(const float *xstart, - const float *xend, +__global__ void 
joseph3d_fwd_kernel(const float *xstart, + const float *xend, const float *img, - const float *img_origin, - const float *voxsize, + const float *img_origin, + const float *voxsize, float *p, - size_t nlors, - const int *img_dim){ + size_t nlors, + const int *img_dim) +{ size_t i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < n) { + if (i < n) + { joseph3d_fwd_worker(i, xstart, xend, img, img_origin, voxsize, p, img_dim); } } @@ -23,25 +25,25 @@ __global__ void joseph3d_fwd_kernel(const float *xstart, ////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////// -void joseph3d_fwd(const float *xstart, - const float *xend, +void joseph3d_fwd(const float *xstart, + const float *xend, const float *img, - const float *img_origin, - const float *voxsize, + const float *img_origin, + const float *voxsize, float *p, - size_t nlors, + size_t nlors, const int *img_dim, int device_id, int threadsperblock) { - const float* d_xstart = nullptr; - const float* d_xend = nullptr; - const float* d_img = nullptr; - const float* d_img_origin = nullptr; - const float* d_voxsize = nullptr; - float* d_p = nullptr; - const int* d_img_dim = nullptr; + const float *d_xstart = nullptr; + const float *d_xend = nullptr; + const float *d_img = nullptr; + const float *d_img_origin = nullptr; + const float *d_voxsize = nullptr; + float *d_p = nullptr; + const int *d_img_dim = nullptr; // get pointer attributes of all input and output arrays cudaPointerAttributes xstart_attr; @@ -55,18 +57,21 @@ void joseph3d_fwd(const float *xstart, bool needs_copy_back = false; bool is_cuda_managed_ptr = false; - if (err == cudaSuccess && (xstart_attr.type == cudaMemoryTypeManaged)){ + if (err == cudaSuccess && (xstart_attr.type == cudaMemoryTypeManaged)) + { is_cuda_managed_ptr = true; DEBUG_PRINT("Managed array is on device : %d\n", xstart_attr.device); } // else throw error - else{ + else + { needs_copy_back = true; throw 
std::runtime_error("Unsupported pointer type"); } - if (is_cuda_managed_ptr){ - // all arrays are cuda malloc managed, so no need to copy to the device + if (is_cuda_managed_ptr) + { + // all arrays are cuda malloc managed, so no need to copy to the device d_xstart = xstart; d_xend = xend; d_img = img; @@ -74,7 +79,9 @@ void joseph3d_fwd(const float *xstart, d_voxsize = voxsize; d_p = p; d_img_dim = img_dim; - } else { + } + else + { DEBUG_PRINT("COPYING HOST TO DEVICE"); } @@ -83,16 +90,14 @@ void joseph3d_fwd(const float *xstart, cudaGetDevice(¤t_device_id); DEBUG_PRINT("Using CUDA device: %d\n", current_device_id); - - int num_blocks = (int)((nlors + threadsperblock- 1) / threadsperblock); - joseph3d_fwd_kernel<<>>(d_xstart, d_xend, d_img, - d_img_origin, d_voxsize, - d_p, nlors, d_img_dim); + int num_blocks = (int)((nlors + threadsperblock - 1) / threadsperblock); + joseph3d_fwd_kernel<<>>(d_xstart, d_xend, d_img, + d_img_origin, d_voxsize, + d_p, nlors, d_img_dim); cudaDeviceSynchronize(); - //if (needs_copy_back) { - // cudaMemcpy(array, device_array, size * sizeof(float), cudaMemcpyDeviceToHost); - // cudaFree(device_array); - //} + // if (needs_copy_back) { + // cudaMemcpy(array, device_array, size * sizeof(float), cudaMemcpyDeviceToHost); + // cudaFree(device_array); + // } } - diff --git a/joseph3d_fwd_worker.h b/joseph3d_fwd_worker.h index f4f8f73..c8e402d 100644 --- a/joseph3d_fwd_worker.h +++ b/joseph3d_fwd_worker.h @@ -8,21 +8,22 @@ #endif WORKER_QUALIFIER inline void joseph3d_fwd_worker(size_t i, - const float *xstart, - const float *xend, + const float *xstart, + const float *xend, const float *img, - const float *img_origin, - const float *voxsize, + const float *img_origin, + const float *voxsize, float *p, - const int *img_dim){ + const int *img_dim) +{ int n0 = img_dim[0]; int n1 = img_dim[1]; int n2 = img_dim[2]; - float d0, d1, d2, d0_sq, d1_sq, d2_sq; + float d0, d1, d2, d0_sq, d1_sq, d2_sq; float lsq, cos0_sq, cos1_sq, cos2_sq; - unsigned 
short direction; + unsigned short direction; int i0, i1, i2; int i0_floor, i1_floor, i2_floor; int i0_ceil, i1_ceil, i2_ceil; @@ -31,13 +32,13 @@ WORKER_QUALIFIER inline void joseph3d_fwd_worker(size_t i, float toAdd, cf; - float xstart0 = xstart[i*3 + 0]; - float xstart1 = xstart[i*3 + 1]; - float xstart2 = xstart[i*3 + 2]; + float xstart0 = xstart[i * 3 + 0]; + float xstart1 = xstart[i * 3 + 1]; + float xstart2 = xstart[i * 3 + 2]; - float xend0 = xend[i*3 + 0]; - float xend1 = xend[i*3 + 1]; - float xend2 = xend[i*3 + 2]; + float xend0 = xend[i * 3 + 0]; + float xend1 = xend[i * 3 + 1]; + float xend2 = xend[i * 3 + 2]; float voxsize0 = voxsize[0]; float voxsize1 = voxsize[1]; @@ -50,7 +51,7 @@ WORKER_QUALIFIER inline void joseph3d_fwd_worker(size_t i, unsigned char intersec; float t1, t2; float istart_f, iend_f, tmp; - int istart, iend; + int istart, iend; // test whether the ray between the two detectors is most parallel // with the 0, 1, or 2 axis @@ -60,16 +61,16 @@ WORKER_QUALIFIER inline void joseph3d_fwd_worker(size_t i, //----------- //--- test whether ray and cube intersect - intersec = ray_cube_intersection(xstart0, xstart1, xstart2, - img_origin0 - 1*voxsize0, img_origin1 - 1*voxsize1, img_origin2 - 1*voxsize2, - img_origin0 + n0*voxsize0, img_origin1 + n1*voxsize1, img_origin2 + n2*voxsize2, + intersec = ray_cube_intersection(xstart0, xstart1, xstart2, + img_origin0 - 1 * voxsize0, img_origin1 - 1 * voxsize1, img_origin2 - 1 * voxsize2, + img_origin0 + n0 * voxsize0, img_origin1 + n1 * voxsize1, img_origin2 + n2 * voxsize2, d0, d1, d2, &t1, &t2); if (intersec == 1) { - d0_sq = d0*d0; - d1_sq = d1*d1; - d2_sq = d2*d2; + d0_sq = d0 * d0; + d1_sq = d1 * d1; + d2_sq = d2 * d2; lsq = d0_sq + d1_sq + d2_sq; @@ -98,80 +99,97 @@ WORKER_QUALIFIER inline void joseph3d_fwd_worker(size_t i, // we step through the volume along the 0 direction //--- check where ray enters / leaves cube - istart_f = (xstart0 + t1*d0 - img_origin0) / voxsize0; - iend_f = (xstart0 + 
t2*d0 - img_origin0) / voxsize0; + istart_f = (xstart0 + t1 * d0 - img_origin0) / voxsize0; + iend_f = (xstart0 + t2 * d0 - img_origin0) / voxsize0; - if (istart_f > iend_f){ - tmp = iend_f; - iend_f = istart_f; + if (istart_f > iend_f) + { + tmp = iend_f; + iend_f = istart_f; istart_f = tmp; } - + istart = (int)floor(istart_f); - iend = (int)ceil(iend_f); + iend = (int)ceil(iend_f); - if (istart < 0){istart = 0;} - if (iend >= n0){iend = n0;} + if (istart < 0) + { + istart = 0; + } + if (iend >= n0) + { + iend = n0; + } // check in which "plane" the start and end points are // we have to do this to avoid that we include voxels // that are "outside" the line segment bewteen xstart and xend - + // !! for these calculations we overwrite the istart_f and iend_f variables !! istart_f = (xstart0 - img_origin0) / voxsize0; - iend_f = (xend0 - img_origin0) / voxsize0; + iend_f = (xend0 - img_origin0) / voxsize0; - if (istart_f > iend_f){ - tmp = iend_f; - iend_f = istart_f; + if (istart_f > iend_f) + { + tmp = iend_f; + iend_f = istart_f; istart_f = tmp; } - if (istart < (int)floor(istart_f)){istart = (int)floor(istart_f);} - if (iend >= (int)ceil(iend_f)){iend = (int)ceil(iend_f);} + if (istart < (int)floor(istart_f)) + { + istart = (int)floor(istart_f); + } + if (iend >= (int)ceil(iend_f)) + { + iend = (int)ceil(iend_f); + } //--- - for(i0 = istart; i0 < iend; i0++) + for (i0 = istart; i0 < iend; i0++) { // get the indices where the ray intersects the image plane - x_pr1 = xstart1 + (img_origin0 + i0*voxsize0 - xstart0)*d1 / d0; - x_pr2 = xstart2 + (img_origin0 + i0*voxsize0 - xstart0)*d2 / d0; + x_pr1 = xstart1 + (img_origin0 + i0 * voxsize0 - xstart0) * d1 / d0; + x_pr2 = xstart2 + (img_origin0 + i0 * voxsize0 - xstart0) * d2 / d0; - i1_floor = (int)floor((x_pr1 - img_origin1)/voxsize1); - i1_ceil = i1_floor + 1; + i1_floor = (int)floor((x_pr1 - img_origin1) / voxsize1); + i1_ceil = i1_floor + 1; - i2_floor = (int)floor((x_pr2 - img_origin2)/voxsize2); - i2_ceil = 
i2_floor + 1; + i2_floor = (int)floor((x_pr2 - img_origin2) / voxsize2); + i2_ceil = i2_floor + 1; // calculate the distances to the floor normalized to [0,1] // for the bilinear interpolation - tmp_1 = (x_pr1 - (i1_floor*voxsize1 + img_origin1)) / voxsize1; - tmp_2 = (x_pr2 - (i2_floor*voxsize2 + img_origin2)) / voxsize2; + tmp_1 = (x_pr1 - (i1_floor * voxsize1 + img_origin1)) / voxsize1; + tmp_2 = (x_pr2 - (i2_floor * voxsize2 + img_origin2)) / voxsize2; toAdd = 0; if ((i1_floor >= 0) && (i1_floor < n1) && (i2_floor >= 0) && (i2_floor < n2)) { - toAdd += img[n1*n2*i0 + n2*i1_floor + i2_floor] * (1 - tmp_1) * (1 - tmp_2); + toAdd += img[n1 * n2 * i0 + n2 * i1_floor + i2_floor] * (1 - tmp_1) * (1 - tmp_2); } if ((i1_ceil >= 0) && (i1_ceil < n1) && (i2_floor >= 0) && (i2_floor < n2)) { - toAdd += img[n1*n2*i0 + n2*i1_ceil + i2_floor] * tmp_1 * (1 - tmp_2); + toAdd += img[n1 * n2 * i0 + n2 * i1_ceil + i2_floor] * tmp_1 * (1 - tmp_2); } if ((i1_floor >= 0) && (i1_floor < n1) && (i2_ceil >= 0) && (i2_ceil < n2)) { - toAdd += img[n1*n2*i0 + n2*i1_floor + i2_ceil] * (1 - tmp_1) * tmp_2; + toAdd += img[n1 * n2 * i0 + n2 * i1_floor + i2_ceil] * (1 - tmp_1) * tmp_2; } if ((i1_ceil >= 0) && (i1_ceil < n1) && (i2_ceil >= 0) && (i2_ceil < n2)) { - toAdd += img[n1*n2*i0 + n2*i1_ceil + i2_ceil] * tmp_1 * tmp_2; + toAdd += img[n1 * n2 * i0 + n2 * i1_ceil + i2_ceil] * tmp_1 * tmp_2; } - if(toAdd != 0){p[i] += (cf * toAdd);} + if (toAdd != 0) + { + p[i] += (cf * toAdd); + } } } - //--------------------------------------------------------------------------------- + //--------------------------------------------------------------------------------- if (direction == 1) { cf = voxsize1 / sqrtf(cos1_sq); @@ -180,80 +198,97 @@ WORKER_QUALIFIER inline void joseph3d_fwd_worker(size_t i, // we step through the volume along the 1 direction //--- check where ray enters / leaves cube - istart_f = (xstart1 + t1*d1 - img_origin1) / voxsize1; - iend_f = (xstart1 + t2*d1 - img_origin1) / voxsize1; 
+ istart_f = (xstart1 + t1 * d1 - img_origin1) / voxsize1; + iend_f = (xstart1 + t2 * d1 - img_origin1) / voxsize1; - if (istart_f > iend_f){ - tmp = iend_f; - iend_f = istart_f; + if (istart_f > iend_f) + { + tmp = iend_f; + iend_f = istart_f; istart_f = tmp; } - + istart = (int)floor(istart_f); - iend = (int)ceil(iend_f); + iend = (int)ceil(iend_f); - if (istart < 0){istart = 0;} - if (iend >= n1){iend = n1;} + if (istart < 0) + { + istart = 0; + } + if (iend >= n1) + { + iend = n1; + } // check in which "plane" the start and end points are // we have to do this to avoid that we include voxels // that are "outside" the line segment bewteen xstart and xend - + // !! for these calculations we overwrite the istart_f and iend_f variables !! istart_f = (xstart1 - img_origin1) / voxsize1; - iend_f = (xend1 - img_origin1) / voxsize1; + iend_f = (xend1 - img_origin1) / voxsize1; - if (istart_f > iend_f){ - tmp = iend_f; - iend_f = istart_f; + if (istart_f > iend_f) + { + tmp = iend_f; + iend_f = istart_f; istart_f = tmp; } - if (istart < (int)floor(istart_f)){istart = (int)floor(istart_f);} - if (iend >= (int)ceil(iend_f)){iend = (int)ceil(iend_f);} + if (istart < (int)floor(istart_f)) + { + istart = (int)floor(istart_f); + } + if (iend >= (int)ceil(iend_f)) + { + iend = (int)ceil(iend_f); + } //--- for (i1 = istart; i1 < iend; i1++) { // get the indices where the ray intersects the image plane - x_pr0 = xstart0 + (img_origin1 + i1*voxsize1 - xstart1)*d0 / d1; - x_pr2 = xstart2 + (img_origin1 + i1*voxsize1 - xstart1)*d2 / d1; + x_pr0 = xstart0 + (img_origin1 + i1 * voxsize1 - xstart1) * d0 / d1; + x_pr2 = xstart2 + (img_origin1 + i1 * voxsize1 - xstart1) * d2 / d1; - i0_floor = (int)floor((x_pr0 - img_origin0)/voxsize0); - i0_ceil = i0_floor + 1; + i0_floor = (int)floor((x_pr0 - img_origin0) / voxsize0); + i0_ceil = i0_floor + 1; - i2_floor = (int)floor((x_pr2 - img_origin2)/voxsize2); - i2_ceil = i2_floor + 1; + i2_floor = (int)floor((x_pr2 - img_origin2) / voxsize2); + 
i2_ceil = i2_floor + 1; // calculate the distances to the floor normalized to [0,1] // for the bilinear interpolation - tmp_0 = (x_pr0 - (i0_floor*voxsize0 + img_origin0)) / voxsize0; - tmp_2 = (x_pr2 - (i2_floor*voxsize2 + img_origin2)) / voxsize2; + tmp_0 = (x_pr0 - (i0_floor * voxsize0 + img_origin0)) / voxsize0; + tmp_2 = (x_pr2 - (i2_floor * voxsize2 + img_origin2)) / voxsize2; toAdd = 0; if ((i0_floor >= 0) && (i0_floor < n0) && (i2_floor >= 0) && (i2_floor < n2)) { - toAdd += img[n1*n2*i0_floor + n2*i1 + i2_floor] * (1 - tmp_0) * (1 - tmp_2); + toAdd += img[n1 * n2 * i0_floor + n2 * i1 + i2_floor] * (1 - tmp_0) * (1 - tmp_2); } if ((i0_ceil >= 0) && (i0_ceil < n0) && (i2_floor >= 0) && (i2_floor < n2)) { - toAdd += img[n1*n2*i0_ceil + n2*i1 + i2_floor] * tmp_0 * (1 - tmp_2); + toAdd += img[n1 * n2 * i0_ceil + n2 * i1 + i2_floor] * tmp_0 * (1 - tmp_2); } if ((i0_floor >= 0) && (i0_floor < n0) && (i2_ceil >= 0) && (i2_ceil < n2)) { - toAdd += img[n1*n2*i0_floor + n2*i1 + i2_ceil] * (1 - tmp_0) * tmp_2; + toAdd += img[n1 * n2 * i0_floor + n2 * i1 + i2_ceil] * (1 - tmp_0) * tmp_2; } if ((i0_ceil >= 0) && (i0_ceil < n0) && (i2_ceil >= 0) && (i2_ceil < n2)) { - toAdd += img[n1*n2*i0_ceil + n2*i1 + i2_ceil] * tmp_0 * tmp_2; + toAdd += img[n1 * n2 * i0_ceil + n2 * i1 + i2_ceil] * tmp_0 * tmp_2; } - if(toAdd != 0){p[i] += (cf * toAdd);} + if (toAdd != 0) + { + p[i] += (cf * toAdd); + } } } - //--------------------------------------------------------------------------------- + //--------------------------------------------------------------------------------- if (direction == 2) { cf = voxsize2 / sqrtf(cos2_sq); @@ -262,76 +297,93 @@ WORKER_QUALIFIER inline void joseph3d_fwd_worker(size_t i, // we step through the volume along the 2 direction //--- check where ray enters / leaves cube - istart_f = (xstart2 + t1*d2 - img_origin2) / voxsize2; - iend_f = (xstart2 + t2*d2 - img_origin2) / voxsize2; + istart_f = (xstart2 + t1 * d2 - img_origin2) / voxsize2; + iend_f = 
(xstart2 + t2 * d2 - img_origin2) / voxsize2; - if (istart_f > iend_f){ - tmp = iend_f; - iend_f = istart_f; + if (istart_f > iend_f) + { + tmp = iend_f; + iend_f = istart_f; istart_f = tmp; } - + istart = (int)floor(istart_f); - iend = (int)ceil(iend_f); + iend = (int)ceil(iend_f); - if (istart < 0){istart = 0;} - if (iend >= n2){iend = n2;} + if (istart < 0) + { + istart = 0; + } + if (iend >= n2) + { + iend = n2; + } // check in which "plane" the start and end points are // we have to do this to avoid that we include voxels // that are "outside" the line segment bewteen xstart and xend - + // !! for these calculations we overwrite the istart_f and iend_f variables !! istart_f = (xstart2 - img_origin2) / voxsize2; - iend_f = (xend2 - img_origin2) / voxsize2; + iend_f = (xend2 - img_origin2) / voxsize2; - if (istart_f > iend_f){ - tmp = iend_f; - iend_f = istart_f; + if (istart_f > iend_f) + { + tmp = iend_f; + iend_f = istart_f; istart_f = tmp; } - if (istart < (int)floor(istart_f)){istart = (int)floor(istart_f);} - if (iend >= (int)ceil(iend_f)){iend = (int)ceil(iend_f);} + if (istart < (int)floor(istart_f)) + { + istart = (int)floor(istart_f); + } + if (iend >= (int)ceil(iend_f)) + { + iend = (int)ceil(iend_f); + } //--- - for(i2 = istart; i2 < iend; i2++) + for (i2 = istart; i2 < iend; i2++) { // get the indices where the ray intersects the image plane - x_pr0 = xstart0 + (img_origin2 + i2*voxsize2 - xstart2)*d0 / d2; - x_pr1 = xstart1 + (img_origin2 + i2*voxsize2 - xstart2)*d1 / d2; + x_pr0 = xstart0 + (img_origin2 + i2 * voxsize2 - xstart2) * d0 / d2; + x_pr1 = xstart1 + (img_origin2 + i2 * voxsize2 - xstart2) * d1 / d2; - i0_floor = (int)floor((x_pr0 - img_origin0)/voxsize0); - i0_ceil = i0_floor + 1; + i0_floor = (int)floor((x_pr0 - img_origin0) / voxsize0); + i0_ceil = i0_floor + 1; - i1_floor = (int)floor((x_pr1 - img_origin1)/voxsize1); - i1_ceil = i1_floor + 1; + i1_floor = (int)floor((x_pr1 - img_origin1) / voxsize1); + i1_ceil = i1_floor + 1; // 
calculate the distances to the floor normalized to [0,1] // for the bilinear interpolation - tmp_0 = (x_pr0 - (i0_floor*voxsize0 + img_origin0)) / voxsize0; - tmp_1 = (x_pr1 - (i1_floor*voxsize1 + img_origin1)) / voxsize1; + tmp_0 = (x_pr0 - (i0_floor * voxsize0 + img_origin0)) / voxsize0; + tmp_1 = (x_pr1 - (i1_floor * voxsize1 + img_origin1)) / voxsize1; toAdd = 0; if ((i0_floor >= 0) && (i0_floor < n0) && (i1_floor >= 0) && (i1_floor < n1)) { - toAdd += img[n1*n2*i0_floor + n2*i1_floor + i2] * (1 - tmp_0) * (1 - tmp_1); + toAdd += img[n1 * n2 * i0_floor + n2 * i1_floor + i2] * (1 - tmp_0) * (1 - tmp_1); } if ((i0_ceil >= 0) && (i0_ceil < n0) && (i1_floor >= 0) && (i1_floor < n1)) { - toAdd += img[n1*n2*i0_ceil + n2*i1_floor + i2] * tmp_0 * (1 - tmp_1); + toAdd += img[n1 * n2 * i0_ceil + n2 * i1_floor + i2] * tmp_0 * (1 - tmp_1); } if ((i0_floor >= 0) && (i0_floor < n0) && (i1_ceil >= 0) & (i1_ceil < n1)) { - toAdd += img[n1*n2*i0_floor + n2*i1_ceil + i2] * (1 - tmp_0) * tmp_1; + toAdd += img[n1 * n2 * i0_floor + n2 * i1_ceil + i2] * (1 - tmp_0) * tmp_1; } if ((i0_ceil >= 0) && (i0_ceil < n0) && (i1_ceil >= 0) && (i1_ceil < n1)) { - toAdd += img[n1*n2*i0_ceil + n2*i1_ceil + i2] * tmp_0 * tmp_1; + toAdd += img[n1 * n2 * i0_ceil + n2 * i1_ceil + i2] * tmp_0 * tmp_1; } - if(toAdd != 0){p[i] += (cf * toAdd);} + if (toAdd != 0) + { + p[i] += (cf * toAdd); + } } } } diff --git a/main.cpp b/main.cpp index 641c0b1..aac1d7f 100644 --- a/main.cpp +++ b/main.cpp @@ -3,7 +3,8 @@ #include #include -int main() { +int main() +{ const size_t repetitions = 5; size_t nlors = 10; @@ -18,11 +19,12 @@ int main() { float voxsize[3] = {4, 3, 2}; float img_origin[3]; - for (int i = 0; i < 3; ++i) { + for (int i = 0; i < 3; ++i) + { img_origin[i] = (-(float)img_dim[i] / 2 + 0.5) * voxsize[i]; } - float* img = new float[img_dim[0] * img_dim[1] * img_dim[2]]; + float *img = new float[img_dim[0] * img_dim[1] * img_dim[2]]; // fill the test image for (int i0 = 0; i0 < img_dim[0]; i0++) @@ 
-44,28 +46,28 @@ int main() { float id2 = static_cast(img_dim[2]); float vstart[] = { - 0, -1, 0, // 0 - 0, -1, 0, // 1 - 0, -1, 1, // 2 - 0, -1, 0.5, // 3 - 0, 0, -1, // 4 - -1, 0, 0, // 5 - id0 - 1, -1, 0, // 6 - (shifted 1) + 0, -1, 0, // 0 + 0, -1, 0, // 1 + 0, -1, 1, // 2 + 0, -1, 0.5, // 3 + 0, 0, -1, // 4 + -1, 0, 0, // 5 + id0 - 1, -1, 0, // 6 - (shifted 1) id0 - 1, -1, id2 - 1, // 7 - (shifted 6) - id0 - 1, 0, -1, // 8 - (shifted 4) + id0 - 1, 0, -1, // 8 - (shifted 4) id0 - 1, id1 - 1, -1, // 9 - (shifted 8) }; float vend[] = { - 0, id1, 0, // 0 - 0, id1, 0, // 1 - 0, id1, 1, // 2 - 0, id1, 0.5, // 3 - 0, 0, id2, // 4 - id0, 0, 0, // 5 - id0 - 1, id1, 0, // 6 - (shifted 1) + 0, id1, 0, // 0 + 0, id1, 0, // 1 + 0, id1, 1, // 2 + 0, id1, 0.5, // 3 + 0, 0, id2, // 4 + id0, 0, 0, // 5 + id0 - 1, id1, 0, // 6 - (shifted 1) id0 - 1, id1, id2 - 1, // 7 - (shifted 6) - id0 - 1, 0, id2, // 8 - (shifted 4) + id0 - 1, 0, id2, // 8 - (shifted 4) id0 - 1, id1 - 1, id2, // 9 - (shifted 8) }; @@ -77,7 +79,7 @@ int main() { } // calculate the start and end coordinates in world coordinates - + float *xstart = new float[3 * nlors]; float *xend = new float[3 * nlors]; @@ -103,7 +105,7 @@ int main() { int retval = 0; float eps = 1e-7; - float* expected_fwd_vals = new float[nlors]; + float *expected_fwd_vals = new float[nlors]; // initialize expected_fwd_vals with 0s for (int ir = 0; ir < nlors; ir++) { @@ -185,13 +187,15 @@ int main() { // test the back projection - float* bimg = new float[img_dim[0] * img_dim[1] * img_dim[2]]; - for (size_t i = 0; i < (img_dim[0]*img_dim[1]*img_dim[2]); i++){ + float *bimg = new float[img_dim[0] * img_dim[1] * img_dim[2]]; + for (size_t i = 0; i < (img_dim[0] * img_dim[1] * img_dim[2]); i++) + { bimg[i] = 0; } - float* ones = new float[nlors]; - for (size_t i = 0; i < nlors; i++){ + float *ones = new float[nlors]; + for (size_t i = 0; i < nlors; i++) + { ones[i] = 1; } diff --git a/main.cu b/main.cu index 5bdf872..7437b4c 100644 --- 
a/main.cu +++ b/main.cu @@ -4,7 +4,8 @@ #include #include -int main() { +int main() +{ const size_t repetitions = 5; size_t nlors = 10; @@ -20,25 +21,26 @@ int main() { cudaSetDevice(device_count - 1); - int* img_dim; + int *img_dim; cudaMallocManaged(&img_dim, 3 * sizeof(int)); img_dim[0] = 2; img_dim[1] = 3; img_dim[2] = 4; - float* voxsize; + float *voxsize; cudaMallocManaged(&voxsize, 3 * sizeof(float)); voxsize[0] = 4; voxsize[1] = 3; voxsize[2] = 2; - float* img_origin; + float *img_origin; cudaMallocManaged(&img_origin, 3 * sizeof(float)); - for (int i = 0; i < 3; ++i) { + for (int i = 0; i < 3; ++i) + { img_origin[i] = (-(float)img_dim[i] / 2 + 0.5) * voxsize[i]; } - float* img; + float *img; cudaMallocManaged(&img, (img_dim[0] * img_dim[1] * img_dim[2]) * sizeof(float)); // fill the test image @@ -61,28 +63,28 @@ int main() { float id2 = static_cast(img_dim[2]); float vstart[] = { - 0, -1, 0, // 0 - 0, -1, 0, // 1 - 0, -1, 1, // 2 - 0, -1, 0.5, // 3 - 0, 0, -1, // 4 - -1, 0, 0, // 5 - id0 - 1, -1, 0, // 6 - (shifted 1) + 0, -1, 0, // 0 + 0, -1, 0, // 1 + 0, -1, 1, // 2 + 0, -1, 0.5, // 3 + 0, 0, -1, // 4 + -1, 0, 0, // 5 + id0 - 1, -1, 0, // 6 - (shifted 1) id0 - 1, -1, id2 - 1, // 7 - (shifted 6) - id0 - 1, 0, -1, // 8 - (shifted 4) + id0 - 1, 0, -1, // 8 - (shifted 4) id0 - 1, id1 - 1, -1, // 9 - (shifted 8) }; float vend[] = { - 0, id1, 0, // 0 - 0, id1, 0, // 1 - 0, id1, 1, // 2 - 0, id1, 0.5, // 3 - 0, 0, id2, // 4 - id0, 0, 0, // 5 - id0 - 1, id1, 0, // 6 - (shifted 1) + 0, id1, 0, // 0 + 0, id1, 0, // 1 + 0, id1, 1, // 2 + 0, id1, 0.5, // 3 + 0, 0, id2, // 4 + id0, 0, 0, // 5 + id0 - 1, id1, 0, // 6 - (shifted 1) id0 - 1, id1, id2 - 1, // 7 - (shifted 6) - id0 - 1, 0, id2, // 8 - (shifted 4) + id0 - 1, 0, id2, // 8 - (shifted 4) id0 - 1, id1 - 1, id2, // 9 - (shifted 8) }; @@ -94,11 +96,11 @@ int main() { } // calculate the start and end coordinates in world coordinates - + float *xstart; - cudaMallocManaged(&xstart, (3*nlors) * sizeof(float)); + 
cudaMallocManaged(&xstart, (3 * nlors) * sizeof(float)); float *xend; - cudaMallocManaged(&xend, (3*nlors) * sizeof(float)); + cudaMallocManaged(&xend, (3 * nlors) * sizeof(float)); for (int ir = 0; ir < nlors; ir++) @@ -110,7 +112,6 @@ int main() { } } - float *img_fwd; cudaMallocManaged(&img_fwd, nlors * sizeof(float)); @@ -118,7 +119,6 @@ int main() { // calculate the expected values - ///////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////// @@ -126,7 +126,7 @@ int main() { int retval = 0; float eps = 1e-7; - float* expected_fwd_vals = new float[nlors]; + float *expected_fwd_vals = new float[nlors]; // initialize expected_fwd_vals with 0s for (int ir = 0; ir < nlors; ir++) { @@ -218,4 +218,3 @@ int main() { return retval; } - diff --git a/utils.h b/utils.h index a9e642c..d046b35 100644 --- a/utils.h +++ b/utils.h @@ -1,74 +1,100 @@ #pragma once #include "cuda_compat.h" -WORKER_QUALIFIER inline void atomic_sum(float* target, float value) { +WORKER_QUALIFIER inline void atomic_sum(float *target, float value) +{ #ifdef __CUDA_ARCH__ - atomicAdd(target, value); + atomicAdd(target, value); #else #pragma omp atomic - *target += value; + *target += value; #endif } WORKER_QUALIFIER inline unsigned char ray_cube_intersection(float orig0, - float orig1, - float orig2, - float bounds0_min, - float bounds1_min, - float bounds2_min, - float bounds0_max, - float bounds1_max, - float bounds2_max, - float rdir0, - float rdir1, - float rdir2, - float* t1, - float* t2){ + float orig1, + float orig2, + float bounds0_min, + float bounds1_min, + float bounds2_min, + float bounds0_max, + float bounds1_max, + float bounds2_max, + float rdir0, + float rdir1, + float rdir2, + float *t1, + float *t2) +{ // the inverse of the directional vector // using the inverse of the directional vector and IEEE floating point 
arith standard 754 - // makes sure that 0's in the directional vector are handled correctly - float invdir0 = 1.f/rdir0; - float invdir1 = 1.f/rdir1; - float invdir2 = 1.f/rdir2; - + // makes sure that 0's in the directional vector are handled correctly + float invdir0 = 1.f / rdir0; + float invdir1 = 1.f / rdir1; + float invdir2 = 1.f / rdir2; + unsigned char intersec = 1; - - float t11, t12, t21, t22; - if (invdir0 >= 0){ - *t1 = (bounds0_min - orig0) * invdir0; - *t2 = (bounds0_max - orig0) * invdir0; + float t11, t12, t21, t22; + + if (invdir0 >= 0) + { + *t1 = (bounds0_min - orig0) * invdir0; + *t2 = (bounds0_max - orig0) * invdir0; } - else{ - *t1 = (bounds0_max - orig0) * invdir0; - *t2 = (bounds0_min - orig0) * invdir0; + else + { + *t1 = (bounds0_max - orig0) * invdir0; + *t2 = (bounds0_min - orig0) * invdir0; } - - if (invdir1 >= 0){ - t11 = (bounds1_min - orig1) * invdir1; - t12 = (bounds1_max - orig1) * invdir1; + + if (invdir1 >= 0) + { + t11 = (bounds1_min - orig1) * invdir1; + t12 = (bounds1_max - orig1) * invdir1; } - else{ + else + { t11 = (bounds1_max - orig1) * invdir1; - t12 = (bounds1_min - orig1) * invdir1; + t12 = (bounds1_min - orig1) * invdir1; + } + + if ((*t1 > t12) || (t11 > *t2)) + { + intersec = 0; + } + if (t11 > *t1) + { + *t1 = t11; + } + if (t12 < *t2) + { + *t2 = t12; } - - if ((*t1 > t12) || (t11 > *t2)){intersec = 0;} - if (t11 > *t1){*t1 = t11;} - if (t12 < *t2){*t2 = t12;} - - if (invdir2 >= 0){ - t21 = (bounds2_min - orig2) * invdir2; + + if (invdir2 >= 0) + { + t21 = (bounds2_min - orig2) * invdir2; t22 = (bounds2_max - orig2) * invdir2; - } - else{ - t21 = (bounds2_max - orig2) * invdir2; + } + else + { + t21 = (bounds2_max - orig2) * invdir2; t22 = (bounds2_min - orig2) * invdir2; - } - - if ((*t1 > t22) || (t21 > *t2)){intersec = 0;} - if (t21 > *t1){*t1 = t21;} - if (t22 < *t2){*t2 = t22;} + } + + if ((*t1 > t22) || (t21 > *t2)) + { + intersec = 0; + } + if (t21 > *t1) + { + *t1 = t21; + } + if (t22 < *t2) + { + *t2 = 
t22; + } - return(intersec); + return (intersec); } From d349b1fc697c7e4b7cb149f1ab2ddca33b521a97 Mon Sep 17 00:00:00 2001 From: Georg Schramm Date: Thu, 10 Apr 2025 17:24:07 +0200 Subject: [PATCH 09/35] use __CUDACC__ and fix bug with variable name --- joseph3d_back_worker.h | 2 +- joseph3d_fwd.cu | 2 +- joseph3d_fwd_worker.h | 2 +- main.cu | 1 - utils.h | 2 +- 5 files changed, 4 insertions(+), 5 deletions(-) diff --git a/joseph3d_back_worker.h b/joseph3d_back_worker.h index b7fdb23..75e5751 100644 --- a/joseph3d_back_worker.h +++ b/joseph3d_back_worker.h @@ -3,7 +3,7 @@ #include "utils.h" // we need to import math.h for the floor, ceil, sqrtf if not compiling for CUDA -#ifndef __CUDA_ARCH__ +#ifndef __CUDACC__ #include #endif diff --git a/joseph3d_fwd.cu b/joseph3d_fwd.cu index 10e3f75..8c67f37 100644 --- a/joseph3d_fwd.cu +++ b/joseph3d_fwd.cu @@ -15,7 +15,7 @@ __global__ void joseph3d_fwd_kernel(const float *xstart, const int *img_dim) { size_t i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < n) + if (i < nlors) { joseph3d_fwd_worker(i, xstart, xend, img, img_origin, voxsize, p, img_dim); } diff --git a/joseph3d_fwd_worker.h b/joseph3d_fwd_worker.h index c8e402d..2e9106f 100644 --- a/joseph3d_fwd_worker.h +++ b/joseph3d_fwd_worker.h @@ -3,7 +3,7 @@ #include "utils.h" // we need to import math.h for the floor, ceil, sqrtf if not compiling for CUDA -#ifndef __CUDA_ARCH__ +#ifndef __CUDACC__ #include #endif diff --git a/main.cu b/main.cu index 7437b4c..5bf0d46 100644 --- a/main.cu +++ b/main.cu @@ -6,7 +6,6 @@ int main() { - const size_t repetitions = 5; size_t nlors = 10; // get the number of cuda devices - because we want to run on the last device diff --git a/utils.h b/utils.h index d046b35..f5281ad 100644 --- a/utils.h +++ b/utils.h @@ -3,7 +3,7 @@ WORKER_QUALIFIER inline void atomic_sum(float *target, float value) { -#ifdef __CUDA_ARCH__ +#ifdef __CUDACC__ atomicAdd(target, value); #else #pragma omp atomic From ff616218ef7f2a05b54d44903cf8db0ec8674f0a Mon 
Sep 17 00:00:00 2001 From: Georg Schramm Date: Thu, 10 Apr 2025 17:37:31 +0200 Subject: [PATCH 10/35] add joseph3d_back.cu --- CMakeLists.txt | 2 +- TODO.txt | 5 +++ joseph3d_back.cu | 103 +++++++++++++++++++++++++++++++++++++++++++++++ main.cu | 68 +++++++++++++++++++++++++++++++ 4 files changed, 177 insertions(+), 1 deletion(-) create mode 100644 TODO.txt create mode 100644 joseph3d_back.cu diff --git a/CMakeLists.txt b/CMakeLists.txt index 934f362..d17d3b8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -37,7 +37,7 @@ if(CMAKE_CUDA_COMPILER) enable_language(CUDA) find_package(CUDAToolkit REQUIRED) - add_library(parallelproj SHARED joseph3d_fwd.cu) + add_library(parallelproj SHARED joseph3d_fwd.cu joseph3d_back.cu) target_link_libraries(parallelproj PRIVATE CUDA::cudart) add_executable(main main.cu) diff --git a/TODO.txt b/TODO.txt new file mode 100644 index 0000000..53bf847 --- /dev/null +++ b/TODO.txt @@ -0,0 +1,5 @@ +- managed cases: cudaprefetchasync to device, read / write hints +- host pointer cases +- device array cases + +- test all cases in main.cu \ No newline at end of file diff --git a/joseph3d_back.cu b/joseph3d_back.cu new file mode 100644 index 0000000..18ea918 --- /dev/null +++ b/joseph3d_back.cu @@ -0,0 +1,103 @@ +#include "parallelproj.h" +#include "joseph3d_back_worker.h" +#include "debug.h" +#include +#include +#include + +__global__ void joseph3d_back_kernel(const float *xstart, + const float *xend, + float *img, + const float *img_origin, + const float *voxsize, + const float *p, + size_t nlors, + const int *img_dim) +{ + size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < nlors) + { + joseph3d_back_worker(i, xstart, xend, img, img_origin, voxsize, p, img_dim); + } +} + +////////////////////////////////////////////////////////////////// +////////////////////////////////////////////////////////////////// +////////////////////////////////////////////////////////////////// + +void joseph3d_back(const float *xstart, + const float 
*xend, + float *img, + const float *img_origin, + const float *voxsize, + const float *p, + size_t nlors, + const int *img_dim, + int device_id, + int threadsperblock) +{ + + const float *d_xstart = nullptr; + const float *d_xend = nullptr; + float *d_img = nullptr; + const float *d_img_origin = nullptr; + const float *d_voxsize = nullptr; + const float *d_p = nullptr; + const int *d_img_dim = nullptr; + + // get pointer attributes of all input and output arrays + cudaPointerAttributes xstart_attr; + cudaError_t err = cudaPointerGetAttributes(&xstart_attr, xstart); + ///////////////////////////////////////////////////////////////// + ///////////////////////////////////////////////////////////////// + // TODO get attributes of all other arrays + ///////////////////////////////////////////////////////////////// + ///////////////////////////////////////////////////////////////// + + bool needs_copy_back = false; + bool is_cuda_managed_ptr = false; + + if (err == cudaSuccess && (xstart_attr.type == cudaMemoryTypeManaged)) + { + is_cuda_managed_ptr = true; + DEBUG_PRINT("Managed array is on device : %d\n", xstart_attr.device); + } + // else throw error + else + { + needs_copy_back = true; + throw std::runtime_error("Unsupported pointer type"); + } + + if (is_cuda_managed_ptr) + { + // all arrays are cuda malloc managed, so no need to copy to the device + d_xstart = xstart; + d_xend = xend; + d_img = img; + d_img_origin = img_origin; + d_voxsize = voxsize; + d_p = p; + d_img_dim = img_dim; + } + else + { + DEBUG_PRINT("COPYING HOST TO DEVICE"); + } + + // get and print the current cuda device ID + int current_device_id; + cudaGetDevice(&current_device_id); + DEBUG_PRINT("Using CUDA device: %d\n", current_device_id); + + int num_blocks = (int)((nlors + threadsperblock - 1) / threadsperblock); + joseph3d_back_kernel<<<num_blocks, threadsperblock>>>(d_xstart, d_xend, d_img, + d_img_origin, d_voxsize, + d_p, nlors, d_img_dim); + cudaDeviceSynchronize(); + + // if (needs_copy_back) { + // cudaMemcpy(array, 
device_array, size * sizeof(float), cudaMemcpyDeviceToHost); + // cudaFree(device_array); + // } +} diff --git a/main.cu b/main.cu index 5bf0d46..b6fa349 100644 --- a/main.cu +++ b/main.cu @@ -205,6 +205,71 @@ int main() //////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////// + // test the back projection + + float *bimg; + cudaMallocManaged(&bimg, (img_dim[0] * img_dim[1] * img_dim[2]) * sizeof(float)); + + for (size_t i = 0; i < (img_dim[0] * img_dim[1] * img_dim[2]); i++) + { + bimg[i] = 0; + } + + float *ones; + cudaMallocManaged(&ones, nlors * sizeof(float)); + for (size_t i = 0; i < nlors; i++) + { + ones[i] = 1; + } + + joseph3d_back(xstart, xend, bimg, img_origin, voxsize, ones, nlors, img_dim, 0, 64); + + printf("\nback projection of ones along all rays:\n"); + for (size_t i0 = 0; i0 < img_dim[0]; i0++) + { + for (size_t i1 = 0; i1 < img_dim[1]; i1++) + { + for (size_t i2 = 0; i2 < img_dim[2]; i2++) + { + printf("%.1f ", bimg[img_dim[1] * img_dim[2] * i0 + img_dim[2] * i1 + i2]); + } + printf("\n"); + } + printf("\n"); + } + + // To test whether the back projection is correct, we test if the back projector is the adjoint + // of the forward projector. This is more practical than checking a lot of single voxels in the + // back projected image. + + float inner_product1 = 0; + float inner_product2 = 0; + + for (size_t i = 0; i < (img_dim[0] * img_dim[1] * img_dim[2]); i++) + { + inner_product1 += (img[i] * bimg[i]); + } + + for (size_t ir = 0; ir < nlors; ir++) + { + inner_product2 += (img_fwd[ir] * ones[ir]); + } + + float ip_diff = fabs(inner_product1 - inner_product2); + + if (ip_diff > eps) + { + printf("\n#########################################################################"); + printf("\nback projection test failed. 
back projection seems not to be the adjoint."); + printf("\n %.7e", ip_diff); + printf("\n#########################################################################\n"); + retval = 1; + } + + //////////////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////////// + cudaFree(img_dim); cudaFree(voxsize); cudaFree(img_origin); @@ -213,6 +278,9 @@ int main() cudaFree(xend); cudaFree(img_fwd); + cudaFree(bimg); + cudaFree(ones); + free(expected_fwd_vals); return retval; From 9332113adbbdf5038de081d2bfb5ba43d3af784d Mon Sep 17 00:00:00 2001 From: Georg Schramm Date: Fri, 11 Apr 2025 15:32:21 +0200 Subject: [PATCH 11/35] start refactorization --- joseph3d_back.cu | 6 ++ joseph3d_fwd.cu | 224 ++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 196 insertions(+), 34 deletions(-) diff --git a/joseph3d_back.cu b/joseph3d_back.cu index 18ea918..d959808 100644 --- a/joseph3d_back.cu +++ b/joseph3d_back.cu @@ -37,6 +37,12 @@ void joseph3d_back(const float *xstart, int threadsperblock) { + // Set the CUDA device + if (device_id >= 0) + { + cudaSetDevice(device_id); + } + const float *d_xstart = nullptr; const float *d_xend = nullptr; float *d_img = nullptr; diff --git a/joseph3d_fwd.cu b/joseph3d_fwd.cu index 8c67f37..9e7b950 100644 --- a/joseph3d_fwd.cu +++ b/joseph3d_fwd.cu @@ -36,59 +36,198 @@ void joseph3d_fwd(const float *xstart, int device_id, int threadsperblock) { + // Set the CUDA device + if (device_id >= 0) + { + cudaSetDevice(device_id); + } - const float *d_xstart = nullptr; - const float *d_xend = nullptr; - const float *d_img = nullptr; - const float *d_img_origin = nullptr; - const float *d_voxsize = nullptr; - float *d_p = nullptr; - const int *d_img_dim = nullptr; - - // get pointer attributes of all input and output arrays - cudaPointerAttributes xstart_attr; - cudaError_t err = 
cudaPointerGetAttributes(&xstart_attr, xstart); ///////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////// - // TODO get attributes of all other arrays + // copy arrays to device if needed ///////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////// - bool needs_copy_back = false; - bool is_cuda_managed_ptr = false; + // handle xstart (read only) + float *d_xstart = nullptr; + cudaPointerAttributes xstart_attr; + cudaError_t err_xstart = cudaPointerGetAttributes(&xstart_attr, xstart); + bool free_xstart = false; + if (err_xstart == cudaSuccess && (xstart_attr.type == cudaMemoryTypeManaged)) + { + cudaMemPrefetchAsync(xstart, sizeof(float) * nlors * 3, device_id); + cudaMemAdvise(xstart, sizeof(float) * nlors * 3, cudaMemAdviseSetReadMostly, device_id); + } + + if (err_xstart == cudaSuccess && (xstart_attr.type == cudaMemoryTypeManaged || + xstart_attr.type == cudaMemoryTypeDevice)) + { + d_xstart = const_cast(xstart); + } + else + { + // host pointer case, transfer to device + cudaMalloc(&d_xstart, sizeof(float) * nlors * 3); + cudaMemcpy(d_xstart, xstart, sizeof(float) * nlors * 3, cudaMemcpyHostToDevice); + free_xstart = true; + } + + // handle xend (read only) + float *d_xend = nullptr; + cudaPointerAttributes xend_attr; + cudaError_t err_xend = cudaPointerGetAttributes(&xend_attr, xend); + bool free_xend = false; + if (err_xend == cudaSuccess && xend_attr.type == cudaMemoryTypeManaged) + { + cudaMemPrefetchAsync(xend, sizeof(float) * nlors * 3, device_id); + cudaMemAdvise(xend, sizeof(float) * nlors * 3, cudaMemAdviseSetReadMostly, device_id); + } - if (err == cudaSuccess && (xstart_attr.type == cudaMemoryTypeManaged)) + if (err_xend == cudaSuccess && (xend_attr.type == cudaMemoryTypeManaged || + xend_attr.type == cudaMemoryTypeDevice)) { - is_cuda_managed_ptr = true; - DEBUG_PRINT("Managed array is on device : %d\n", 
xstart_attr.device); + d_xend = const_cast(xend); } - // else throw error else { - needs_copy_back = true; - throw std::runtime_error("Unsupported pointer type"); + // host pointer case, transfer to device + cudaMalloc(&d_xend, sizeof(float) * nlors * 3); + cudaMemcpy(d_xend, xend, sizeof(float) * nlors * 3, cudaMemcpyHostToDevice); + free_xend = true; + } + + // handle img (read only) + float *d_img = nullptr; + cudaPointerAttributes img_attr; + cudaError_t err_img = cudaPointerGetAttributes(&img_attr, img); + bool free_img = false; + if (err_img == cudaSuccess && img_attr.type == cudaMemoryTypeManaged) + { + cudaMemPrefetchAsync(img, sizeof(float) * img_dim[0] * img_dim[1] * img_dim[2], device_id); + cudaMemAdvise(img, sizeof(float) * img_dim[0] * img_dim[1] * img_dim[2], cudaMemAdviseSetReadMostly, device_id); } - if (is_cuda_managed_ptr) + if (err_img == cudaSuccess && (img_attr.type == cudaMemoryTypeManaged || + img_attr.type == cudaMemoryTypeDevice)) { - // all arrays are cuda malloc managed, so no need to copy to the device - d_xstart = xstart; - d_xend = xend; - d_img = img; - d_img_origin = img_origin; - d_voxsize = voxsize; - d_p = p; - d_img_dim = img_dim; + d_img = const_cast(img); } else { - DEBUG_PRINT("COPYING HOST TO DEVICE"); + // host pointer case, transfer to device + cudaMalloc(&d_img, sizeof(float) * img_dim[0] * img_dim[1] * img_dim[2]); + cudaMemcpy(d_img, img, sizeof(float) * img_dim[0] * img_dim[1] * img_dim[2], cudaMemcpyHostToDevice); + free_img = true; + } + + // handle img_origin (read only) + float *d_img_origin = nullptr; + cudaPointerAttributes img_origin_attr; + cudaError_t err_img_origin = cudaPointerGetAttributes(&img_origin_attr, img_origin); + bool free_img_origin = false; + if (err_img_origin == cudaSuccess && img_origin_attr.type == cudaMemoryTypeManaged) + { + cudaMemPrefetchAsync(img_origin, sizeof(float) * 3, device_id); + cudaMemAdvise(img_origin, sizeof(float) * 3, cudaMemAdviseSetReadMostly, device_id); } + if 
(err_img_origin == cudaSuccess && (img_origin_attr.type == cudaMemoryTypeManaged || + img_origin_attr.type == cudaMemoryTypeDevice)) + { + d_img_origin = const_cast(img_origin); + } + else + { + // host pointer case, transfer to device + cudaMalloc(&d_img_origin, sizeof(float) * 3); + cudaMemcpy(d_img_origin, img_origin, sizeof(float) * 3, cudaMemcpyHostToDevice); + free_img_origin = true; + } + + // handle voxsize (read only) + float *d_voxsize = nullptr; + cudaPointerAttributes voxsize_attr; + cudaError_t err_voxsize = cudaPointerGetAttributes(&voxsize_attr, voxsize); + bool free_voxsize = false; + if (err_voxsize == cudaSuccess && voxsize_attr.type == cudaMemoryTypeManaged) + { + cudaMemPrefetchAsync(voxsize, sizeof(float) * 3, device_id); + cudaMemAdvise(voxsize, sizeof(float) * 3, cudaMemAdviseSetReadMostly, device_id); + } + + if (err_voxsize == cudaSuccess && (voxsize_attr.type == cudaMemoryTypeManaged || + voxsize_attr.type == cudaMemoryTypeDevice)) + { + d_voxsize = const_cast(voxsize); + } + else + { + // host pointer case, transfer to device + cudaMalloc(&d_voxsize, sizeof(float) * 3); + cudaMemcpy(d_voxsize, voxsize, sizeof(float) * 3, cudaMemcpyHostToDevice); + free_voxsize = true; + } + + // handle p (write) + float *d_p = nullptr; + cudaPointerAttributes p_attr; + cudaError_t err_p = cudaPointerGetAttributes(&p_attr, p); + bool free_p = false; + if (err_p == cudaSuccess && p_attr.type == cudaMemoryTypeManaged) + { + cudaMemPrefetchAsync(p, sizeof(float) * nlors, device_id); + cudaMemAdvise(p, sizeof(float) * nlors, cudaMemAdviseSetAccessedBy, device_id); + } + + if (err_p == cudaSuccess && (p_attr.type == cudaMemoryTypeManaged || + p_attr.type == cudaMemoryTypeDevice)) + { + d_p = const_cast(p); + } + else + { + // host pointer case, transfer to device + cudaMalloc(&d_p, sizeof(float) * nlors); + cudaMemcpy(d_p, p, sizeof(float) * nlors, cudaMemcpyHostToDevice); + free_p = true; + } + + // handle img_dim (read only) + int *d_img_dim = nullptr; + 
cudaPointerAttributes img_dim_attr; + cudaError_t err_img_dim = cudaPointerGetAttributes(&img_dim_attr, img_dim); + bool free_img_dim = false; + if (err_img_dim == cudaSuccess && img_dim_attr.type == cudaMemoryTypeManaged) + { + cudaMemPrefetchAsync(img_dim, sizeof(int) * 3, device_id); + cudaMemAdvise(img_dim, sizeof(int) * 3, cudaMemAdviseSetReadMostly, device_id); + } + + if (err_img_dim == cudaSuccess && (img_dim_attr.type == cudaMemoryTypeManaged || + img_dim_attr.type == cudaMemoryTypeDevice)) + { + d_img_dim = const_cast<int *>(img_dim); + } + else + { + // host pointer case, transfer to device + cudaMalloc(&d_img_dim, sizeof(int) * 3); + cudaMemcpy(d_img_dim, img_dim, sizeof(int) * 3, cudaMemcpyHostToDevice); + free_img_dim = true; + } + + //////////////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////////// + // launch the kernel + //////////////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////////// + +#ifdef DEBUG // get and print the current cuda device ID int current_device_id; cudaGetDevice(&current_device_id); DEBUG_PRINT("Using CUDA device: %d\n", current_device_id); +#endif int num_blocks = (int)((nlors + threadsperblock - 1) / threadsperblock); joseph3d_fwd_kernel<<<num_blocks, threadsperblock>>>(d_xstart, d_xend, d_img, @@ -96,8 +235,25 @@ void joseph3d_fwd(const float *xstart, d_p, nlors, d_img_dim); cudaDeviceSynchronize(); - // if (needs_copy_back) { - // cudaMemcpy(array, device_array, size * sizeof(float), cudaMemcpyDeviceToHost); - // cudaFree(device_array); - // } + //////////////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////////// + // free device memory if needed + //////////////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////////// + + 
// Free device memory if it was allocated + if (free_xstart) + cudaFree(d_xstart); + if (free_xend) + cudaFree(d_xend); + if (free_img) + cudaFree(d_img); + if (free_img_origin) + cudaFree(d_img_origin); + if (free_voxsize) + cudaFree(d_voxsize); + if (free_p) + cudaFree(d_p); + if (free_img_dim) + cudaFree(d_img_dim); } From 921e126d8b2aaa348766c5de71c287ba15490b3e Mon Sep 17 00:00:00 2001 From: Georg Schramm Date: Fri, 11 Apr 2025 16:00:54 +0200 Subject: [PATCH 12/35] wip --- cuda_utils.cu | 31 +++++++++ cuda_utils.h | 6 ++ joseph3d_fwd.cu | 162 +++++------------------------------------------- 3 files changed, 52 insertions(+), 147 deletions(-) create mode 100644 cuda_utils.cu create mode 100644 cuda_utils.h diff --git a/cuda_utils.cu b/cuda_utils.cu new file mode 100644 index 0000000..4be521a --- /dev/null +++ b/cuda_utils.cu @@ -0,0 +1,31 @@ +#include "cuda_utils.h" + +template +void handle_cuda_input_array(const T *host_ptr, T **device_ptr, size_t size, bool &free_flag, int device_id, cudaMemoryAdvise memory_hint) +{ + cudaPointerAttributes attr; + cudaError_t err = cudaPointerGetAttributes(&attr, host_ptr); + free_flag = false; + + if (err == cudaSuccess && attr.type == cudaMemoryTypeManaged) + { + cudaMemPrefetchAsync(const_cast(static_cast(host_ptr)), size, device_id); + cudaMemAdvise(const_cast(static_cast(host_ptr)), size, memory_hint, device_id); + } + + if (err == cudaSuccess && (attr.type == cudaMemoryTypeManaged || attr.type == cudaMemoryTypeDevice)) + { + *device_ptr = const_cast(host_ptr); + } + else + { + // Host pointer case, transfer to device + cudaMalloc(device_ptr, size); + cudaMemcpy(*device_ptr, host_ptr, size, cudaMemcpyHostToDevice); + free_flag = true; + } +} + +// Explicit template instantiations +template void handle_cuda_input_array(const float *, float **, size_t, bool &, int, cudaMemoryAdvise); +template void handle_cuda_input_array(const int *, int **, size_t, bool &, int, cudaMemoryAdvise); diff --git a/cuda_utils.h b/cuda_utils.h 
new file mode 100644 index 0000000..2accedb --- /dev/null +++ b/cuda_utils.h @@ -0,0 +1,6 @@ +#pragma once + +#include + +template +void handle_cuda_input_array(const T *host_ptr, T **device_ptr, size_t size, bool &free_flag, int device_id, cudaMemoryAdvise memory_hint); diff --git a/joseph3d_fwd.cu b/joseph3d_fwd.cu index 9e7b950..1093983 100644 --- a/joseph3d_fwd.cu +++ b/joseph3d_fwd.cu @@ -1,6 +1,7 @@ #include "parallelproj.h" #include "joseph3d_fwd_worker.h" #include "debug.h" +#include "cuda_utils.h" #include #include #include @@ -48,173 +49,40 @@ void joseph3d_fwd(const float *xstart, ///////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////// - // handle xstart (read only) + // Handle xstart (read mostly) float *d_xstart = nullptr; - cudaPointerAttributes xstart_attr; - cudaError_t err_xstart = cudaPointerGetAttributes(&xstart_attr, xstart); bool free_xstart = false; - if (err_xstart == cudaSuccess && (xstart_attr.type == cudaMemoryTypeManaged)) - { - cudaMemPrefetchAsync(xstart, sizeof(float) * nlors * 3, device_id); - cudaMemAdvise(xstart, sizeof(float) * nlors * 3, cudaMemAdviseSetReadMostly, device_id); - } - - if (err_xstart == cudaSuccess && (xstart_attr.type == cudaMemoryTypeManaged || - xstart_attr.type == cudaMemoryTypeDevice)) - { - d_xstart = const_cast(xstart); - } - else - { - // host pointer case, transfer to device - cudaMalloc(&d_xstart, sizeof(float) * nlors * 3); - cudaMemcpy(d_xstart, xstart, sizeof(float) * nlors * 3, cudaMemcpyHostToDevice); - free_xstart = true; - } + handle_cuda_input_array(xstart, &d_xstart, sizeof(float) * nlors * 3, free_xstart, device_id, cudaMemAdviseSetReadMostly); - // handle xend (read only) + // Handle xend (read mostly) float *d_xend = nullptr; - cudaPointerAttributes xend_attr; - cudaError_t err_xend = cudaPointerGetAttributes(&xend_attr, xend); bool free_xend = false; - if (err_xend == cudaSuccess && xend_attr.type == 
cudaMemoryTypeManaged) - { - cudaMemPrefetchAsync(xend, sizeof(float) * nlors * 3, device_id); - cudaMemAdvise(xend, sizeof(float) * nlors * 3, cudaMemAdviseSetReadMostly, device_id); - } + handle_cuda_input_array(xend, &d_xend, sizeof(float) * nlors * 3, free_xend, device_id, cudaMemAdviseSetReadMostly); - if (err_xend == cudaSuccess && (xend_attr.type == cudaMemoryTypeManaged || - xend_attr.type == cudaMemoryTypeDevice)) - { - d_xend = const_cast(xend); - } - else - { - // host pointer case, transfer to device - cudaMalloc(&d_xend, sizeof(float) * nlors * 3); - cudaMemcpy(d_xend, xend, sizeof(float) * nlors * 3, cudaMemcpyHostToDevice); - free_xend = true; - } - - // handle img (read only) + // Handle img (read mostly) float *d_img = nullptr; - cudaPointerAttributes img_attr; - cudaError_t err_img = cudaPointerGetAttributes(&img_attr, img); bool free_img = false; - if (err_img == cudaSuccess && img_attr.type == cudaMemoryTypeManaged) - { - cudaMemPrefetchAsync(img, sizeof(float) * img_dim[0] * img_dim[1] * img_dim[2], device_id); - cudaMemAdvise(img, sizeof(float) * img_dim[0] * img_dim[1] * img_dim[2], cudaMemAdviseSetReadMostly, device_id); - } - - if (err_img == cudaSuccess && (img_attr.type == cudaMemoryTypeManaged || - img_attr.type == cudaMemoryTypeDevice)) - { - d_img = const_cast(img); - } - else - { - // host pointer case, transfer to device - cudaMalloc(&d_img, sizeof(float) * img_dim[0] * img_dim[1] * img_dim[2]); - cudaMemcpy(d_img, img, sizeof(float) * img_dim[0] * img_dim[1] * img_dim[2], cudaMemcpyHostToDevice); - free_img = true; - } + handle_cuda_input_array(img, &d_img, sizeof(float) * img_dim[0] * img_dim[1] * img_dim[2], free_img, device_id, cudaMemAdviseSetReadMostly); - // handle img_origin (read only) + // Handle img_origin (read mostly) float *d_img_origin = nullptr; - cudaPointerAttributes img_origin_attr; - cudaError_t err_img_origin = cudaPointerGetAttributes(&img_origin_attr, img_origin); bool free_img_origin = false; - if 
(err_img_origin == cudaSuccess && img_origin_attr.type == cudaMemoryTypeManaged) - { - cudaMemPrefetchAsync(img_origin, sizeof(float) * 3, device_id); - cudaMemAdvise(img_origin, sizeof(float) * 3, cudaMemAdviseSetReadMostly, device_id); - } + handle_cuda_input_array(img_origin, &d_img_origin, sizeof(float) * 3, free_img_origin, device_id, cudaMemAdviseSetReadMostly); - if (err_img_origin == cudaSuccess && (img_origin_attr.type == cudaMemoryTypeManaged || - img_origin_attr.type == cudaMemoryTypeDevice)) - { - d_img_origin = const_cast(img_origin); - } - else - { - // host pointer case, transfer to device - cudaMalloc(&d_img_origin, sizeof(float) * 3); - cudaMemcpy(d_img_origin, img_origin, sizeof(float) * 3, cudaMemcpyHostToDevice); - free_img_origin = true; - } - - // handle voxsize (read only) + // Handle voxsize (read mostly) float *d_voxsize = nullptr; - cudaPointerAttributes voxsize_attr; - cudaError_t err_voxsize = cudaPointerGetAttributes(&voxsize_attr, voxsize); bool free_voxsize = false; - if (err_voxsize == cudaSuccess && voxsize_attr.type == cudaMemoryTypeManaged) - { - cudaMemPrefetchAsync(voxsize, sizeof(float) * 3, device_id); - cudaMemAdvise(voxsize, sizeof(float) * 3, cudaMemAdviseSetReadMostly, device_id); - } - - if (err_voxsize == cudaSuccess && (voxsize_attr.type == cudaMemoryTypeManaged || - voxsize_attr.type == cudaMemoryTypeDevice)) - { - d_voxsize = const_cast(voxsize); - } - else - { - // host pointer case, transfer to device - cudaMalloc(&d_voxsize, sizeof(float) * 3); - cudaMemcpy(d_voxsize, voxsize, sizeof(float) * 3, cudaMemcpyHostToDevice); - free_voxsize = true; - } + handle_cuda_input_array(voxsize, &d_voxsize, sizeof(float) * 3, free_voxsize, device_id, cudaMemAdviseSetReadMostly); - // handle p (write) + // Handle p (write access) float *d_p = nullptr; - cudaPointerAttributes p_attr; - cudaError_t err_p = cudaPointerGetAttributes(&p_attr, p); bool free_p = false; - if (err_p == cudaSuccess && p_attr.type == cudaMemoryTypeManaged) - 
{ - cudaMemPrefetchAsync(p, sizeof(float) * nlors, device_id); - cudaMemAdvise(p, sizeof(float) * nlors, cudaMemAdviseSetAccessedBy, device_id); - } + handle_cuda_input_array(p, &d_p, sizeof(float) * nlors, free_p, device_id, cudaMemAdviseSetAccessedBy); - if (err_p == cudaSuccess && (p_attr.type == cudaMemoryTypeManaged || - p_attr.type == cudaMemoryTypeDevice)) - { - d_p = const_cast(p); - } - else - { - // host pointer case, transfer to device - cudaMalloc(&d_p, sizeof(float) * nlors); - cudaMemcpy(d_p, p, sizeof(float) * nlors, cudaMemcpyHostToDevice); - free_p = true; - } - - // handle img_dim (read only) + // Handle img_dim (read mostly) int *d_img_dim = nullptr; - cudaPointerAttributes img_dim_attr; - cudaError_t err_img_dim = cudaPointerGetAttributes(&img_dim_attr, img_dim); bool free_img_dim = false; - if (err_img_dim == cudaSuccess && img_dim_attr.type == cudaMemoryTypeManaged) - { - cudaMemPrefetchAsync(img_dim, sizeof(int) * 3, device_id); - cudaMemAdvise(img_dim, sizeof(int) * 3, cudaMemAdviseSetReadMostly, device_id); - } - - if (err_img_dim == cudaSuccess && (img_dim_attr.type == cudaMemoryTypeManaged || - img_dim_attr.type == cudaMemoryTypeDevice)) - { - d_img_dim = const_cast(img_dim); - } - else - { - // host pointer case, transfer to device - cudaMalloc(&d_img_dim, sizeof(int) * 3); - cudaMemcpy(d_img_dim, img_dim, sizeof(int) * 3, cudaMemcpyHostToDevice); - free_img_dim = true; - } + handle_cuda_input_array(img_dim, &d_img_dim, sizeof(int) * 3, free_img_dim, device_id, cudaMemAdviseSetReadMostly); //////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////// From 01eaed0bfa1f5b53f83745f8ce5aa205d79241e6 Mon Sep 17 00:00:00 2001 From: Georg Schramm Date: Fri, 11 Apr 2025 16:17:44 +0200 Subject: [PATCH 13/35] re-organize project --- CMakeLists.txt | 27 +++++++++++++++---- cuda_compat.h => include/cuda_compat.h | 0 cuda_utils.h => include/cuda_utils.h | 0 
debug.h => include/debug.h | 0 parallelproj.h => include/parallelproj.h | 0 square_array.h => include/square_array.h | 0 square_op.h => include/square_op.h | 0 utils.h => include/utils.h | 0 cuda_utils.cu => src/cuda_utils.cu | 0 joseph3d_back.cpp => src/joseph3d_back.cpp | 0 joseph3d_back.cu => src/joseph3d_back.cu | 0 .../joseph3d_back_worker.h | 0 joseph3d_fwd.cpp => src/joseph3d_fwd.cpp | 0 joseph3d_fwd.cu => src/joseph3d_fwd.cu | 0 .../joseph3d_fwd_worker.h | 0 main.cpp => src/main.cpp | 0 main.cu => src/main.cu | 0 17 files changed, 22 insertions(+), 5 deletions(-) rename cuda_compat.h => include/cuda_compat.h (100%) rename cuda_utils.h => include/cuda_utils.h (100%) rename debug.h => include/debug.h (100%) rename parallelproj.h => include/parallelproj.h (100%) rename square_array.h => include/square_array.h (100%) rename square_op.h => include/square_op.h (100%) rename utils.h => include/utils.h (100%) rename cuda_utils.cu => src/cuda_utils.cu (100%) rename joseph3d_back.cpp => src/joseph3d_back.cpp (100%) rename joseph3d_back.cu => src/joseph3d_back.cu (100%) rename joseph3d_back_worker.h => src/joseph3d_back_worker.h (100%) rename joseph3d_fwd.cpp => src/joseph3d_fwd.cpp (100%) rename joseph3d_fwd.cu => src/joseph3d_fwd.cu (100%) rename joseph3d_fwd_worker.h => src/joseph3d_fwd_worker.h (100%) rename main.cpp => src/main.cpp (100%) rename main.cu => src/main.cu (100%) diff --git a/CMakeLists.txt b/CMakeLists.txt index d17d3b8..f27d71c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -33,20 +33,37 @@ if(DEBUG) add_compile_definitions(DEBUG) endif() + + if(CMAKE_CUDA_COMPILER) + set(LIB_SOURCES + src/joseph3d_fwd.cu + src/joseph3d_back.cu + src/cuda_utils.cu + ) + enable_language(CUDA) find_package(CUDAToolkit REQUIRED) - add_library(parallelproj SHARED joseph3d_fwd.cu joseph3d_back.cu) + add_library(parallelproj SHARED ${LIB_SOURCES}) + target_include_directories(parallelproj PRIVATE ${CMAKE_SOURCE_DIR}/include) target_link_libraries(parallelproj PRIVATE 
CUDA::cudart) - add_executable(main main.cu) - target_include_directories(main PRIVATE ${CUDAToolkit_INCLUDE_DIRS}) + add_executable(main src/main.cu) + target_include_directories(main PRIVATE ${CMAKE_SOURCE_DIR}/include ${CUDAToolkit_INCLUDE_DIRS}) target_link_libraries(main PRIVATE parallelproj CUDA::cudart) else() + set(LIB_SOURCES + src/joseph3d_fwd.cpp + src/joseph3d_back.cpp + ) + find_package(OpenMP REQUIRED) - add_library(parallelproj SHARED joseph3d_fwd.cpp joseph3d_back.cpp) + add_library(parallelproj SHARED ${LIB_SOURCES}) + target_include_directories(parallelproj PRIVATE ${CMAKE_SOURCE_DIR}/include) target_link_libraries(parallelproj PRIVATE OpenMP::OpenMP_CXX) - add_executable(main main.cpp) + + add_executable(main src/main.cpp) + target_include_directories(main PRIVATE ${CMAKE_SOURCE_DIR}/include) target_link_libraries(main PRIVATE parallelproj) endif() diff --git a/cuda_compat.h b/include/cuda_compat.h similarity index 100% rename from cuda_compat.h rename to include/cuda_compat.h diff --git a/cuda_utils.h b/include/cuda_utils.h similarity index 100% rename from cuda_utils.h rename to include/cuda_utils.h diff --git a/debug.h b/include/debug.h similarity index 100% rename from debug.h rename to include/debug.h diff --git a/parallelproj.h b/include/parallelproj.h similarity index 100% rename from parallelproj.h rename to include/parallelproj.h diff --git a/square_array.h b/include/square_array.h similarity index 100% rename from square_array.h rename to include/square_array.h diff --git a/square_op.h b/include/square_op.h similarity index 100% rename from square_op.h rename to include/square_op.h diff --git a/utils.h b/include/utils.h similarity index 100% rename from utils.h rename to include/utils.h diff --git a/cuda_utils.cu b/src/cuda_utils.cu similarity index 100% rename from cuda_utils.cu rename to src/cuda_utils.cu diff --git a/joseph3d_back.cpp b/src/joseph3d_back.cpp similarity index 100% rename from joseph3d_back.cpp rename to 
src/joseph3d_back.cpp diff --git a/joseph3d_back.cu b/src/joseph3d_back.cu similarity index 100% rename from joseph3d_back.cu rename to src/joseph3d_back.cu diff --git a/joseph3d_back_worker.h b/src/joseph3d_back_worker.h similarity index 100% rename from joseph3d_back_worker.h rename to src/joseph3d_back_worker.h diff --git a/joseph3d_fwd.cpp b/src/joseph3d_fwd.cpp similarity index 100% rename from joseph3d_fwd.cpp rename to src/joseph3d_fwd.cpp diff --git a/joseph3d_fwd.cu b/src/joseph3d_fwd.cu similarity index 100% rename from joseph3d_fwd.cu rename to src/joseph3d_fwd.cu diff --git a/joseph3d_fwd_worker.h b/src/joseph3d_fwd_worker.h similarity index 100% rename from joseph3d_fwd_worker.h rename to src/joseph3d_fwd_worker.h diff --git a/main.cpp b/src/main.cpp similarity index 100% rename from main.cpp rename to src/main.cpp diff --git a/main.cu b/src/main.cu similarity index 100% rename from main.cu rename to src/main.cu From 6f7c0815b8cd3578c29c45f0c59d9bb57adf3368 Mon Sep 17 00:00:00 2001 From: Georg Schramm Date: Fri, 11 Apr 2025 16:24:04 +0200 Subject: [PATCH 14/35] refactor joseph3d_back.cu --- src/joseph3d_back.cu | 110 +++++++++++++++++++++++++------------------ 1 file changed, 64 insertions(+), 46 deletions(-) diff --git a/src/joseph3d_back.cu b/src/joseph3d_back.cu index d959808..c090d5d 100644 --- a/src/joseph3d_back.cu +++ b/src/joseph3d_back.cu @@ -1,6 +1,7 @@ #include "parallelproj.h" #include "joseph3d_back_worker.h" #include "debug.h" +#include "cuda_utils.h" #include #include #include @@ -36,65 +37,65 @@ void joseph3d_back(const float *xstart, int device_id, int threadsperblock) { - // Set the CUDA device if (device_id >= 0) { cudaSetDevice(device_id); } - const float *d_xstart = nullptr; - const float *d_xend = nullptr; - float *d_img = nullptr; - const float *d_img_origin = nullptr; - const float *d_voxsize = nullptr; - const float *d_p = nullptr; - const int *d_img_dim = nullptr; - - // get pointer attributes of all input and output arrays - 
cudaPointerAttributes xstart_attr; - cudaError_t err = cudaPointerGetAttributes(&xstart_attr, xstart); ///////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////// - // TODO get attributes of all other arrays + // copy arrays to device if needed ///////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////// - bool needs_copy_back = false; - bool is_cuda_managed_ptr = false; + // Handle xstart (read mostly) + float *d_xstart = nullptr; + bool free_xstart = false; + handle_cuda_input_array(xstart, &d_xstart, sizeof(float) * nlors * 3, free_xstart, device_id, cudaMemAdviseSetReadMostly); - if (err == cudaSuccess && (xstart_attr.type == cudaMemoryTypeManaged)) - { - is_cuda_managed_ptr = true; - DEBUG_PRINT("Managed array is on device : %d\n", xstart_attr.device); - } - // else throw error - else - { - needs_copy_back = true; - throw std::runtime_error("Unsupported pointer type"); - } + // Handle xend (read mostly) + float *d_xend = nullptr; + bool free_xend = false; + handle_cuda_input_array(xend, &d_xend, sizeof(float) * nlors * 3, free_xend, device_id, cudaMemAdviseSetReadMostly); - if (is_cuda_managed_ptr) - { - // all arrays are cuda malloc managed, so no need to copy to the device - d_xstart = xstart; - d_xend = xend; - d_img = img; - d_img_origin = img_origin; - d_voxsize = voxsize; - d_p = p; - d_img_dim = img_dim; - } - else - { - DEBUG_PRINT("COPYING HOST TO DEVICE"); - } + // Handle img (write access) + float *d_img = nullptr; + bool free_img = false; + handle_cuda_input_array(img, &d_img, sizeof(float) * img_dim[0] * img_dim[1] * img_dim[2], free_img, device_id, cudaMemAdviseSetAccessedBy); + + // Handle img_origin (read mostly) + float *d_img_origin = nullptr; + bool free_img_origin = false; + handle_cuda_input_array(img_origin, &d_img_origin, sizeof(float) * 3, free_img_origin, device_id, cudaMemAdviseSetReadMostly); 
+ + // Handle voxsize (read mostly) + float *d_voxsize = nullptr; + bool free_voxsize = false; + handle_cuda_input_array(voxsize, &d_voxsize, sizeof(float) * 3, free_voxsize, device_id, cudaMemAdviseSetReadMostly); + + // Handle p (read mostly) + float *d_p = nullptr; + bool free_p = false; + handle_cuda_input_array(p, &d_p, sizeof(float) * nlors, free_p, device_id, cudaMemAdviseSetReadMostly); + // Handle img_dim (read mostly) + int *d_img_dim = nullptr; + bool free_img_dim = false; + handle_cuda_input_array(img_dim, &d_img_dim, sizeof(int) * 3, free_img_dim, device_id, cudaMemAdviseSetReadMostly); + + //////////////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////////// + // launch the kernel + //////////////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////////// + +#ifdef DEBUG // get and print the current cuda device ID int current_device_id; cudaGetDevice(&current_device_id); DEBUG_PRINT("Using CUDA device: %d\n", current_device_id); +#endif int num_blocks = (int)((nlors + threadsperblock - 1) / threadsperblock); joseph3d_back_kernel<<<num_blocks, threadsperblock>>>(d_xstart, d_xend, d_img, @@ -102,8 +103,25 @@ void joseph3d_back(const float *xstart, d_p, nlors, d_img_dim); cudaDeviceSynchronize(); - // if (needs_copy_back) { - // cudaMemcpy(array, device_array, size * sizeof(float), cudaMemcpyDeviceToHost); - // cudaFree(device_array); - // } -} + //////////////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////////// + // free device memory if needed + //////////////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////////// + + // Free device memory if it was allocated + if (free_xstart) + cudaFree(d_xstart); + if (free_xend) + cudaFree(d_xend); + if
(free_img) + cudaFree(d_img); + if (free_img_origin) + cudaFree(d_img_origin); + if (free_voxsize) + cudaFree(d_voxsize); + if (free_p) + cudaFree(d_p); + if (free_img_dim) + cudaFree(d_img_dim); +} \ No newline at end of file From fd9816c360d7d9567db78594526a07233d277cb6 Mon Sep 17 00:00:00 2001 From: Georg Schramm Date: Fri, 11 Apr 2025 16:26:44 +0200 Subject: [PATCH 15/35] mv tests --- src/main.cpp => tests/test_nontof_fwd_back.cpp | 0 src/main.cu => tests/test_nontof_fwd_back.cu | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename src/main.cpp => tests/test_nontof_fwd_back.cpp (100%) rename src/main.cu => tests/test_nontof_fwd_back.cu (100%) diff --git a/src/main.cpp b/tests/test_nontof_fwd_back.cpp similarity index 100% rename from src/main.cpp rename to tests/test_nontof_fwd_back.cpp diff --git a/src/main.cu b/tests/test_nontof_fwd_back.cu similarity index 100% rename from src/main.cu rename to tests/test_nontof_fwd_back.cu From 5e92e0f90d0f16a9d6e5ef651b1606a24be9bf05 Mon Sep 17 00:00:00 2001 From: Georg Schramm Date: Fri, 11 Apr 2025 16:42:30 +0200 Subject: [PATCH 16/35] build tests --- CMakeLists.txt | 31 +++++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f27d71c..3e6f048 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -49,9 +49,9 @@ if(CMAKE_CUDA_COMPILER) target_include_directories(parallelproj PRIVATE ${CMAKE_SOURCE_DIR}/include) target_link_libraries(parallelproj PRIVATE CUDA::cudart) - add_executable(main src/main.cu) - target_include_directories(main PRIVATE ${CMAKE_SOURCE_DIR}/include ${CUDAToolkit_INCLUDE_DIRS}) - target_link_libraries(main PRIVATE parallelproj CUDA::cudart) + set(TEST_SOURCES + tests/test_nontof_fwd_back.cu + ) else() set(LIB_SOURCES src/joseph3d_fwd.cpp @@ -63,7 +63,26 @@ else() target_include_directories(parallelproj PRIVATE ${CMAKE_SOURCE_DIR}/include) target_link_libraries(parallelproj PRIVATE OpenMP::OpenMP_CXX) - add_executable(main 
src/main.cpp) - target_include_directories(main PRIVATE ${CMAKE_SOURCE_DIR}/include) - target_link_libraries(main PRIVATE parallelproj) + set(TEST_SOURCES + tests/test_nontof_fwd_back.cpp + ) endif() + + +################################################################################ +### BUILD TESTS ################################################################ +################################################################################ + +enable_testing() + +foreach(TEST_SOURCE ${TEST_SOURCES}) + get_filename_component(TEST_NAME ${TEST_SOURCE} NAME_WE) + add_executable(${TEST_NAME} ${TEST_SOURCE}) + target_include_directories(${TEST_NAME} PRIVATE ${CMAKE_SOURCE_DIR}/include) + if(CMAKE_CUDA_COMPILER) + target_link_libraries(${TEST_NAME} PRIVATE parallelproj CUDA::cudart) + else() + target_link_libraries(${TEST_NAME} PRIVATE parallelproj OpenMP::OpenMP_CXX) + endif() + add_test(NAME ${TEST_NAME} COMMAND ${TEST_NAME}) +endforeach() From b299358d063ec56502faf0b83dbc5a6076e979b7 Mon Sep 17 00:00:00 2001 From: Georg Schramm Date: Fri, 11 Apr 2025 16:54:41 +0200 Subject: [PATCH 17/35] move obsolete files --- CMakeLists_square.txt => obsolete/CMakeLists_square.txt | 0 main.py => obsolete/main.py | 0 main_cuda.cpp => obsolete/main_cuda.cpp | 0 main_no_cuda.cpp => obsolete/main_no_cuda.cpp | 0 square_array.cpp => obsolete/square_array.cpp | 0 square_array.cu => obsolete/square_array.cu | 0 utils.py => obsolete/utils.py | 0 7 files changed, 0 insertions(+), 0 deletions(-) rename CMakeLists_square.txt => obsolete/CMakeLists_square.txt (100%) rename main.py => obsolete/main.py (100%) rename main_cuda.cpp => obsolete/main_cuda.cpp (100%) rename main_no_cuda.cpp => obsolete/main_no_cuda.cpp (100%) rename square_array.cpp => obsolete/square_array.cpp (100%) rename square_array.cu => obsolete/square_array.cu (100%) rename utils.py => obsolete/utils.py (100%) diff --git a/CMakeLists_square.txt b/obsolete/CMakeLists_square.txt similarity index 100% rename from 
CMakeLists_square.txt rename to obsolete/CMakeLists_square.txt diff --git a/main.py b/obsolete/main.py similarity index 100% rename from main.py rename to obsolete/main.py diff --git a/main_cuda.cpp b/obsolete/main_cuda.cpp similarity index 100% rename from main_cuda.cpp rename to obsolete/main_cuda.cpp diff --git a/main_no_cuda.cpp b/obsolete/main_no_cuda.cpp similarity index 100% rename from main_no_cuda.cpp rename to obsolete/main_no_cuda.cpp diff --git a/square_array.cpp b/obsolete/square_array.cpp similarity index 100% rename from square_array.cpp rename to obsolete/square_array.cpp diff --git a/square_array.cu b/obsolete/square_array.cu similarity index 100% rename from square_array.cu rename to obsolete/square_array.cu diff --git a/utils.py b/obsolete/utils.py similarity index 100% rename from utils.py rename to obsolete/utils.py From 30589e01b9ab755a0274b1517a413be6e4c992b7 Mon Sep 17 00:00:00 2001 From: Georg Schramm Date: Fri, 11 Apr 2025 17:37:34 +0200 Subject: [PATCH 18/35] add cmake config --- CMakeLists.txt | 64 ++++++++++++++++++++++++++++++- cmake/parallelprojConfig.cmake.in | 9 +++++ 2 files changed, 72 insertions(+), 1 deletion(-) create mode 100644 cmake/parallelprojConfig.cmake.in diff --git a/CMakeLists.txt b/CMakeLists.txt index 3e6f048..0763c78 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,9 +2,15 @@ cmake_minimum_required(VERSION 3.18) project(PARALLELPROJ LANGUAGES C CXX) include(CheckLanguage) +include(GNUInstallDirs) +include(CMakePackageConfigHelpers) + check_language(CUDA) -# Derive version from Git or folder name +################################################################################ +### DERIVE VERSION FROM GIT #################################################### +################################################################################ + if(EXISTS "${CMAKE_SOURCE_DIR}/.git") execute_process( COMMAND git describe --tags --long --dirty @@ -33,6 +39,9 @@ if(DEBUG) add_compile_definitions(DEBUG) endif() 
+################################################################################ +### DEFINE TARGETS ############################################################# +################################################################################ if(CMAKE_CUDA_COMPILER) @@ -44,10 +53,12 @@ if(CMAKE_CUDA_COMPILER) enable_language(CUDA) find_package(CUDAToolkit REQUIRED) + set(PARALLELPROJ_CUDA 1) add_library(parallelproj SHARED ${LIB_SOURCES}) target_include_directories(parallelproj PRIVATE ${CMAKE_SOURCE_DIR}/include) target_link_libraries(parallelproj PRIVATE CUDA::cudart) + target_compile_definitions(parallelproj PUBLIC PARALLELPROJ_CUDA=1) set(TEST_SOURCES tests/test_nontof_fwd_back.cu @@ -59,15 +70,66 @@ else() ) find_package(OpenMP REQUIRED) + set(PARALLELPROJ_CUDA 0) + add_library(parallelproj SHARED ${LIB_SOURCES}) target_include_directories(parallelproj PRIVATE ${CMAKE_SOURCE_DIR}/include) target_link_libraries(parallelproj PRIVATE OpenMP::OpenMP_CXX) + target_compile_definitions(parallelproj PUBLIC PARALLELPROJ_CUDA=0) set(TEST_SOURCES tests/test_nontof_fwd_back.cpp ) endif() +################################################################################ +### INSTALL AND CONFIGS ######################################################## +################################################################################ + +# Install the library and headers +install(TARGETS parallelproj + EXPORT parallelprojTargets + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} + INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} +) + +install(DIRECTORY ${CMAKE_SOURCE_DIR}/include/ + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} +) + +# Export the build tree for other projects +export(EXPORT parallelprojTargets + FILE ${CMAKE_BINARY_DIR}/parallelprojTargets.cmake + NAMESPACE parallelproj:: +) + +# Install the export configuration +install(EXPORT parallelprojTargets + FILE 
parallelprojTargets.cmake + NAMESPACE parallelproj:: + DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/parallelproj +) + +# Create and install a package configuration file +write_basic_package_version_file( + ${CMAKE_BINARY_DIR}/parallelprojConfigVersion.cmake + VERSION ${PROJECT_VERSION} + COMPATIBILITY AnyNewerVersion +) + +configure_file(${CMAKE_SOURCE_DIR}/cmake/parallelprojConfig.cmake.in + ${CMAKE_BINARY_DIR}/parallelprojConfig.cmake + @ONLY +) + +install(FILES + ${CMAKE_BINARY_DIR}/parallelprojConfig.cmake + ${CMAKE_BINARY_DIR}/parallelprojConfigVersion.cmake + DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/parallelproj +) + ################################################################################ ### BUILD TESTS ################################################################ diff --git a/cmake/parallelprojConfig.cmake.in b/cmake/parallelprojConfig.cmake.in new file mode 100644 index 0000000..5e263d9 --- /dev/null +++ b/cmake/parallelprojConfig.cmake.in @@ -0,0 +1,9 @@ +include("${CMAKE_CURRENT_LIST_DIR}/parallelprojTargets.cmake") + +# Include the version file for compatibility checks +include("${CMAKE_CURRENT_LIST_DIR}/parallelprojConfigVersion.cmake") + +set(PARALLELPROJ_CUDA @PARALLELPROJ_CUDA@) +set(PARALLELPROJ_INCLUDE_DIRS "${CMAKE_INSTALL_INCLUDEDIR}") +set(PARALLELPROJ_LIBRARY_DIRS "${CMAKE_INSTALL_LIBDIR}") +set(PARALLELPROJ_VERSION "@PROJECT_VERSION@") \ No newline at end of file From 38ec122bb63563738f20ea454f675a6ce81d7371 Mon Sep 17 00:00:00 2001 From: Georg Schramm Date: Fri, 11 Apr 2025 17:40:22 +0200 Subject: [PATCH 19/35] add windows_export symbols --- CMakeLists.txt | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 0763c78..f3180c3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,6 +7,11 @@ include(CMakePackageConfigHelpers) check_language(CUDA) +# Enable automatic symbol export on Windows for shared libraries +if(WIN32 AND BUILD_SHARED_LIBS) + set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS TRUE) 
+endif() + ################################################################################ ### DERIVE VERSION FROM GIT #################################################### ################################################################################ From 81f8285ea485bf1496e22856e00e569655718b95 Mon Sep 17 00:00:00 2001 From: Georg Schramm Date: Sat, 12 Apr 2025 15:56:24 +0200 Subject: [PATCH 20/35] read test input data from text files --- CMakeLists.txt | 29 ++++++- tests/expected_fwd_vals.txt | 10 +++ tests/file_utils.h | 14 ++++ tests/file_utils.tpp | 40 +++++++++ tests/img.txt | 24 ++++++ tests/test_nontof_fwd_back.cpp | 1 + tests/test_nontof_fwd_back.cu | 145 +++++++-------------------------- tests/vend.txt | 30 +++++++ tests/vstart.txt | 30 +++++++ 9 files changed, 208 insertions(+), 115 deletions(-) create mode 100644 tests/expected_fwd_vals.txt create mode 100644 tests/file_utils.h create mode 100644 tests/file_utils.tpp create mode 100644 tests/img.txt create mode 100644 tests/vend.txt create mode 100644 tests/vstart.txt diff --git a/CMakeLists.txt b/CMakeLists.txt index f3180c3..7650395 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -142,10 +142,37 @@ install(FILES enable_testing() +set(TEST_DATA_DIR ${CMAKE_SOURCE_DIR}/tests) + +# List all the .txt files that need to be copied +set(TEST_DATA_FILES + img.txt + vstart.txt + vend.txt + expected_fwd_vals.txt +) + +# Copy each file to the build directory +foreach(TEST_FILE ${TEST_DATA_FILES}) + add_custom_command( + OUTPUT ${CMAKE_BINARY_DIR}/${TEST_FILE} + COMMAND ${CMAKE_COMMAND} -E copy ${TEST_DATA_DIR}/${TEST_FILE} ${CMAKE_BINARY_DIR}/${TEST_FILE} + DEPENDS ${TEST_DATA_DIR}/${TEST_FILE} + ) + list(APPEND TEST_DATA_OUTPUTS ${CMAKE_BINARY_DIR}/${TEST_FILE}) +endforeach() + +# Add a custom target to ensure the files are copied before running tests +add_custom_target(copy_test_data ALL DEPENDS ${TEST_DATA_OUTPUTS}) + +# Add the directory containing file_utils.h to the include paths 
+include_directories(${CMAKE_SOURCE_DIR}/tests) + foreach(TEST_SOURCE ${TEST_SOURCES}) get_filename_component(TEST_NAME ${TEST_SOURCE} NAME_WE) add_executable(${TEST_NAME} ${TEST_SOURCE}) - target_include_directories(${TEST_NAME} PRIVATE ${CMAKE_SOURCE_DIR}/include) + target_include_directories(${TEST_NAME} PRIVATE ${CMAKE_SOURCE_DIR}/include ${CMAKE_SOURCE_DIR}/tests) + add_dependencies(${TEST_NAME} copy_test_data) # Ensure test data is copied before running tests if(CMAKE_CUDA_COMPILER) target_link_libraries(${TEST_NAME} PRIVATE parallelproj CUDA::cudart) else() diff --git a/tests/expected_fwd_vals.txt b/tests/expected_fwd_vals.txt new file mode 100644 index 0000000..29456bf --- /dev/null +++ b/tests/expected_fwd_vals.txt @@ -0,0 +1,10 @@ +45 +45 +54 +49.5 +20 +56 +153 +180 +116 +180 diff --git a/tests/file_utils.h b/tests/file_utils.h new file mode 100644 index 0000000..e258e6f --- /dev/null +++ b/tests/file_utils.h @@ -0,0 +1,14 @@ +#pragma once + +#include +#include + +// Function to save a 1D array to a text file +template +void saveArrayToFile(const std::string &filename, const T *array, size_t size); + +// Function to read a 1D array from a text file +template +std::vector readArrayFromFile(const std::string &filename); + +#include "file_utils.tpp" // Include the template implementation diff --git a/tests/file_utils.tpp b/tests/file_utils.tpp new file mode 100644 index 0000000..26f9cee --- /dev/null +++ b/tests/file_utils.tpp @@ -0,0 +1,40 @@ +#include +#include +#include + +// Function to save a 1D array to a text file +template +void saveArrayToFile(const std::string &filename, const T *array, size_t size) +{ + std::ofstream file(filename); + if (!file.is_open()) + { + std::cerr << "Error: Could not open file " << filename << " for writing.\n"; + return; + } + for (size_t i = 0; i < size; ++i) + { + file << array[i] << "\n"; + } + file.close(); +} + +// Function to read a 1D array from a text file +template +std::vector readArrayFromFile(const std::string 
&filename) +{ + std::ifstream file(filename); + if (!file.is_open()) + { + std::cerr << "Error: Could not open file " << filename << " for reading.\n"; + return {}; + } + std::vector array; + T value; + while (file >> value) + { + array.push_back(value); + } + file.close(); + return array; +} diff --git a/tests/img.txt b/tests/img.txt new file mode 100644 index 0000000..7ed2844 --- /dev/null +++ b/tests/img.txt @@ -0,0 +1,24 @@ +1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 diff --git a/tests/test_nontof_fwd_back.cpp b/tests/test_nontof_fwd_back.cpp index aac1d7f..9e07908 100644 --- a/tests/test_nontof_fwd_back.cpp +++ b/tests/test_nontof_fwd_back.cpp @@ -1,4 +1,5 @@ #include "parallelproj.h" +#include "file_utils.h" #include #include #include diff --git a/tests/test_nontof_fwd_back.cu b/tests/test_nontof_fwd_back.cu index b6fa349..ec8bf2e 100644 --- a/tests/test_nontof_fwd_back.cu +++ b/tests/test_nontof_fwd_back.cu @@ -1,8 +1,10 @@ #include "parallelproj.h" +#include "file_utils.h" #include #include #include #include +#include int main() { @@ -39,53 +41,25 @@ int main() img_origin[i] = (-(float)img_dim[i] / 2 + 0.5) * voxsize[i]; } + // read the image from file and convert into a cuda managed array + std::vector img_from_file = readArrayFromFile("img.txt"); float *img; cudaMallocManaged(&img, (img_dim[0] * img_dim[1] * img_dim[2]) * sizeof(float)); - - // fill the test image - for (int i0 = 0; i0 < img_dim[0]; i0++) - { - for (int i1 = 0; i1 < img_dim[1]; i1++) - { - for (int i2 = 0; i2 < img_dim[2]; i2++) - { - img[img_dim[1] * img_dim[2] * i0 + img_dim[2] * i1 + i2] = float(img_dim[1] * img_dim[2] * i0 + img_dim[2] * i1 + i2 + 1); - printf("%.1f ", img[img_dim[1] * img_dim[2] * i0 + img_dim[2] * i1 + i2]); - } - printf("\n"); - } - printf("\n"); - } - - float id0 = static_cast(img_dim[0]); - float id1 = static_cast(img_dim[1]); - float id2 = static_cast(img_dim[2]); - - float vstart[] = { - 0, -1, 0, // 0 - 0, -1, 0, // 1 
- 0, -1, 1, // 2 - 0, -1, 0.5, // 3 - 0, 0, -1, // 4 - -1, 0, 0, // 5 - id0 - 1, -1, 0, // 6 - (shifted 1) - id0 - 1, -1, id2 - 1, // 7 - (shifted 6) - id0 - 1, 0, -1, // 8 - (shifted 4) - id0 - 1, id1 - 1, -1, // 9 - (shifted 8) - }; - - float vend[] = { - 0, id1, 0, // 0 - 0, id1, 0, // 1 - 0, id1, 1, // 2 - 0, id1, 0.5, // 3 - 0, 0, id2, // 4 - id0, 0, 0, // 5 - id0 - 1, id1, 0, // 6 - (shifted 1) - id0 - 1, id1, id2 - 1, // 7 - (shifted 6) - id0 - 1, 0, id2, // 8 - (shifted 4) - id0 - 1, id1 - 1, id2, // 9 - (shifted 8) - }; + memcpy(img, img_from_file.data(), img_from_file.size() * sizeof(float)); + + // read the ray start coordinates in voxel coordinates + // reast vstart from vstart.txt and convert into a cuda managed array + std::vector vstart_from_file = readArrayFromFile("vstart.txt"); + float *vstart; + cudaMallocManaged(&vstart, (3 * nlors) * sizeof(float)); + memcpy(vstart, vstart_from_file.data(), vstart_from_file.size() * sizeof(float)); + + // read the ray end coordinates in voxel coordinates + // read vend from vend.txt and convert into a cuda managed array + std::vector vend_from_file = readArrayFromFile("vend.txt"); + float *vend; + cudaMallocManaged(&vend, (3 * nlors) * sizeof(float)); + memcpy(vend, vend_from_file.data(), vend_from_file.size() * sizeof(float)); for (int ir = 0; ir < nlors; ir++) { @@ -95,20 +69,20 @@ int main() } // calculate the start and end coordinates in world coordinates - float *xstart; cudaMallocManaged(&xstart, (3 * nlors) * sizeof(float)); float *xend; cudaMallocManaged(&xend, (3 * nlors) * sizeof(float)); for (int ir = 0; ir < nlors; ir++) - { - for (int j = 0; j < 3; j++) - { - xstart[ir * 3 + j] = img_origin[j] + vstart[ir * 3 + j] * voxsize[j]; - xend[ir * 3 + j] = img_origin[j] + vend[ir * 3 + j] * voxsize[j]; - } + xstart[ir * 3 + 0] = img_origin[0] + vstart[ir * 3 + 0] * voxsize[0]; + xstart[ir * 3 + 1] = img_origin[1] + vstart[ir * 3 + 1] * voxsize[1]; + xstart[ir * 3 + 2] = img_origin[2] + vstart[ir * 3 + 2] * 
voxsize[2]; + + xend[ir * 3 + 0] = img_origin[0] + vend[ir * 3 + 0] * voxsize[0]; + xend[ir * 3 + 1] = img_origin[1] + vend[ir * 3 + 1] * voxsize[1]; + xend[ir * 3 + 2] = img_origin[2] + vend[ir * 3 + 2] * voxsize[2]; } float *img_fwd; @@ -116,8 +90,6 @@ int main() joseph3d_fwd(xstart, xend, img, img_origin, voxsize, img_fwd, nlors, img_dim, 0, 64); - // calculate the expected values - ///////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////// @@ -125,64 +97,11 @@ int main() int retval = 0; float eps = 1e-7; - float *expected_fwd_vals = new float[nlors]; - // initialize expected_fwd_vals with 0s - for (int ir = 0; ir < nlors; ir++) - { - expected_fwd_vals[ir] = 0; - } - - for (int i1 = 0; i1 < img_dim[1]; i1++) - { - expected_fwd_vals[0] += img[0 * img_dim[1] * img_dim[2] + i1 * img_dim[2] + 0] * voxsize[1]; - } - - expected_fwd_vals[1] = expected_fwd_vals[0]; - - // calculate the expected value of ray2 from [0,-1,1] to [0,last+1,1] - for (int i1 = 0; i1 < img_dim[1]; i1++) - { - expected_fwd_vals[2] += img[0 * img_dim[1] * img_dim[2] + i1 * img_dim[2] + 1] * voxsize[1]; - } - - // calculate the expected value of ray3 from [0,-1,0.5] to [0,last+1,0.5] - expected_fwd_vals[3] = 0.5 * (expected_fwd_vals[0] + expected_fwd_vals[2]); - - // calculate the expected value of ray4 from [0,0,-1] to [0,0,last+1] - for (int i2 = 0; i2 < img_dim[2]; i2++) - { - expected_fwd_vals[4] += img[0 * img_dim[1] * img_dim[2] + 0 * img_dim[2] + i2] * voxsize[2]; - } - - // calculate the expected value of ray5 from [-1,0,0] to [last+1,0,0] - for (int i0 = 0; i0 < img_dim[0]; i0++) - { - expected_fwd_vals[5] += img[i0 * img_dim[1] * img_dim[2] + 0 * img_dim[2] + 0] * voxsize[0]; - } - - // calculate the expected value of rays6 from [img_dim[0]-1,-1,0] to [img_dim[0]-1,last+1,0] - for (int i1 = 0; i1 < img_dim[1]; i1++) 
- { - expected_fwd_vals[6] += img[(img_dim[0] - 1) * img_dim[1] * img_dim[2] + i1 * img_dim[2] + 0] * voxsize[1]; - } - - // calculate the expected value of rays7 from [img_dim[0]-1,-1,img_dim[2]-1] to [img_dim[0]-1,last+1,img_dim[2]-1] - for (int i1 = 0; i1 < img_dim[1]; i1++) - { - expected_fwd_vals[7] += img[(img_dim[0] - 1) * img_dim[1] * img_dim[2] + i1 * img_dim[2] + (img_dim[2] - 1)] * voxsize[1]; - } - - // calculate the expected value of ray4 from [img_dim[0]-1,0,-1] to [img_dim[0]-1,0,last+1] - for (int i2 = 0; i2 < img_dim[2]; i2++) - { - expected_fwd_vals[8] += img[(img_dim[0] - 1) * img_dim[1] * img_dim[2] + 0 * img_dim[2] + i2] * voxsize[2]; - } - - // calculate the expected value of ray4 from [img_dim[0]-1,0,-1] to [img_dim[0]-1,0,last+1] - for (int i2 = 0; i2 < img_dim[2]; i2++) - { - expected_fwd_vals[9] += img[(img_dim[0] - 1) * img_dim[1] * img_dim[2] + (img_dim[1] - 1) * img_dim[2] + i2] * voxsize[2]; - } + // read the expected_fwd_vals from expected_fwd_vals.txt and convert into a cuda managed array + std::vector expected_fwd_vals_from_file = readArrayFromFile("expected_fwd_vals.txt"); + float *expected_fwd_vals; + cudaMallocManaged(&expected_fwd_vals, (nlors) * sizeof(float)); + memcpy(expected_fwd_vals, expected_fwd_vals_from_file.data(), expected_fwd_vals_from_file.size() * sizeof(float)); // check if we got the expected results float fwd_diff = 0; @@ -281,7 +200,5 @@ int main() cudaFree(bimg); cudaFree(ones); - free(expected_fwd_vals); - return retval; } diff --git a/tests/vend.txt b/tests/vend.txt new file mode 100644 index 0000000..0421680 --- /dev/null +++ b/tests/vend.txt @@ -0,0 +1,30 @@ +0 +3 +0 +0 +3 +0 +0 +3 +1 +0 +3 +0.5 +0 +0 +4 +2 +0 +0 +1 +3 +0 +1 +3 +3 +1 +0 +4 +1 +2 +4 diff --git a/tests/vstart.txt b/tests/vstart.txt new file mode 100644 index 0000000..5965472 --- /dev/null +++ b/tests/vstart.txt @@ -0,0 +1,30 @@ +0 +-1 +0 +0 +-1 +0 +0 +-1 +1 +0 +-1 +0.5 +0 +0 +-1 +-1 +0 +0 +1 +-1 +0 +1 +-1 +3 +1 +0 +-1 +1 +2 +-1 From 
b27d29b05427f7aee54ba355ef82b512e61f9a5d Mon Sep 17 00:00:00 2001 From: Georg Schramm Date: Sat, 12 Apr 2025 16:08:34 +0200 Subject: [PATCH 21/35] wip tests --- tests/test_nontof_fwd_back.cpp | 193 +++++++-------------------------- tests/test_nontof_fwd_back.cu | 4 +- 2 files changed, 40 insertions(+), 157 deletions(-) diff --git a/tests/test_nontof_fwd_back.cpp b/tests/test_nontof_fwd_back.cpp index 9e07908..e88ac5b 100644 --- a/tests/test_nontof_fwd_back.cpp +++ b/tests/test_nontof_fwd_back.cpp @@ -3,12 +3,10 @@ #include #include #include +#include int main() { - const size_t repetitions = 5; - size_t nlors = 10; - //////////////////////////////////////////////////////// // OpenMP managed memory use case //////////////////////////////////////////////////////// @@ -16,7 +14,6 @@ int main() std::cout << "OpenMP use case\n"; int img_dim[3] = {2, 3, 4}; - float voxsize[3] = {4, 3, 2}; float img_origin[3]; @@ -25,148 +22,53 @@ int main() img_origin[i] = (-(float)img_dim[i] / 2 + 0.5) * voxsize[i]; } - float *img = new float[img_dim[0] * img_dim[1] * img_dim[2]]; + // Read the image from file + std::vector img_from_file = readArrayFromFile("img.txt"); - // fill the test image - for (int i0 = 0; i0 < img_dim[0]; i0++) - { - for (int i1 = 0; i1 < img_dim[1]; i1++) - { - for (int i2 = 0; i2 < img_dim[2]; i2++) - { - img[img_dim[1] * img_dim[2] * i0 + img_dim[2] * i1 + i2] = float(img_dim[1] * img_dim[2] * i0 + img_dim[2] * i1 + i2 + 1); - printf("%.1f ", img[img_dim[1] * img_dim[2] * i0 + img_dim[2] * i1 + i2]); - } - printf("\n"); - } - printf("\n"); - } + // Read the ray start coordinates from file + std::vector vstart_from_file = readArrayFromFile("vstart.txt"); - float id0 = static_cast(img_dim[0]); - float id1 = static_cast(img_dim[1]); - float id2 = static_cast(img_dim[2]); + // Read the ray end coordinates from file + std::vector vend_from_file = readArrayFromFile("vend.txt"); - float vstart[] = { - 0, -1, 0, // 0 - 0, -1, 0, // 1 - 0, -1, 1, // 2 - 0, -1, 0.5, // 
3 - 0, 0, -1, // 4 - -1, 0, 0, // 5 - id0 - 1, -1, 0, // 6 - (shifted 1) - id0 - 1, -1, id2 - 1, // 7 - (shifted 6) - id0 - 1, 0, -1, // 8 - (shifted 4) - id0 - 1, id1 - 1, -1, // 9 - (shifted 8) - }; + size_t nlors = vstart_from_file.size() / 3; - float vend[] = { - 0, id1, 0, // 0 - 0, id1, 0, // 1 - 0, id1, 1, // 2 - 0, id1, 0.5, // 3 - 0, 0, id2, // 4 - id0, 0, 0, // 5 - id0 - 1, id1, 0, // 6 - (shifted 1) - id0 - 1, id1, id2 - 1, // 7 - (shifted 6) - id0 - 1, 0, id2, // 8 - (shifted 4) - id0 - 1, id1 - 1, id2, // 9 - (shifted 8) - }; + // Calculate the start and end coordinates in world coordinates + std::vector xstart(3 * nlors); + std::vector xend(3 * nlors); for (int ir = 0; ir < nlors; ir++) { - printf("test ray %d\n", ir); - printf("start voxel num .: %.1f %.1f %.1f\n", vstart[ir * 3 + 0], vstart[ir * 3 + 1], vstart[ir * 3 + 2]); - printf("end voxel num .: %.1f %.1f %.1f\n", vend[ir * 3 + 0], vend[ir * 3 + 1], vend[ir * 3 + 2]); - } - - // calculate the start and end coordinates in world coordinates - - float *xstart = new float[3 * nlors]; - float *xend = new float[3 * nlors]; + xstart[ir * 3 + 0] = img_origin[0] + vstart_from_file[ir * 3 + 0] * voxsize[0]; + xstart[ir * 3 + 1] = img_origin[1] + vstart_from_file[ir * 3 + 1] * voxsize[1]; + xstart[ir * 3 + 2] = img_origin[2] + vstart_from_file[ir * 3 + 2] * voxsize[2]; - for (int ir = 0; ir < nlors; ir++) - - { - for (int j = 0; j < 3; j++) - { - xstart[ir * 3 + j] = img_origin[j] + vstart[ir * 3 + j] * voxsize[j]; - xend[ir * 3 + j] = img_origin[j] + vend[ir * 3 + j] * voxsize[j]; - } + xend[ir * 3 + 0] = img_origin[0] + vend_from_file[ir * 3 + 0] * voxsize[0]; + xend[ir * 3 + 1] = img_origin[1] + vend_from_file[ir * 3 + 1] * voxsize[1]; + xend[ir * 3 + 2] = img_origin[2] + vend_from_file[ir * 3 + 2] * voxsize[2]; } - float *img_fwd = new float[nlors]; - joseph3d_fwd(xstart, xend, img, img_origin, voxsize, img_fwd, nlors, img_dim, 0, 64); + // Allocate memory for forward projection results + std::vector 
img_fwd(nlors); + + // Perform forward projection + joseph3d_fwd( + xstart.data(), xend.data(), img_from_file.data(), + img_origin, voxsize, img_fwd.data(), + nlors, img_dim, 0, 64); ///////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////// - // calculate the expected values + // Read the expected forward values from file + std::vector expected_fwd_vals = readArrayFromFile("expected_fwd_vals.txt"); - int retval = 0; + // Check if we got the expected results + float fwd_diff = 0; float eps = 1e-7; + int retval = 0; - float *expected_fwd_vals = new float[nlors]; - // initialize expected_fwd_vals with 0s - for (int ir = 0; ir < nlors; ir++) - { - expected_fwd_vals[ir] = 0; - } - - for (int i1 = 0; i1 < img_dim[1]; i1++) - { - expected_fwd_vals[0] += img[0 * img_dim[1] * img_dim[2] + i1 * img_dim[2] + 0] * voxsize[1]; - } - - expected_fwd_vals[1] = expected_fwd_vals[0]; - - // calculate the expected value of ray2 from [0,-1,1] to [0,last+1,1] - for (int i1 = 0; i1 < img_dim[1]; i1++) - { - expected_fwd_vals[2] += img[0 * img_dim[1] * img_dim[2] + i1 * img_dim[2] + 1] * voxsize[1]; - } - - // calculate the expected value of ray3 from [0,-1,0.5] to [0,last+1,0.5] - expected_fwd_vals[3] = 0.5 * (expected_fwd_vals[0] + expected_fwd_vals[2]); - - // calculate the expected value of ray4 from [0,0,-1] to [0,0,last+1] - for (int i2 = 0; i2 < img_dim[2]; i2++) - { - expected_fwd_vals[4] += img[0 * img_dim[1] * img_dim[2] + 0 * img_dim[2] + i2] * voxsize[2]; - } - - // calculate the expected value of ray5 from [-1,0,0] to [last+1,0,0] - for (int i0 = 0; i0 < img_dim[0]; i0++) - { - expected_fwd_vals[5] += img[i0 * img_dim[1] * img_dim[2] + 0 * img_dim[2] + 0] * voxsize[0]; - } - - // calculate the expected value of rays6 from [img_dim[0]-1,-1,0] to [img_dim[0]-1,last+1,0] - for (int i1 = 0; i1 < 
img_dim[1]; i1++) - { - expected_fwd_vals[6] += img[(img_dim[0] - 1) * img_dim[1] * img_dim[2] + i1 * img_dim[2] + 0] * voxsize[1]; - } - - // calculate the expected value of rays7 from [img_dim[0]-1,-1,img_dim[2]-1] to [img_dim[0]-1,last+1,img_dim[2]-1] - for (int i1 = 0; i1 < img_dim[1]; i1++) - { - expected_fwd_vals[7] += img[(img_dim[0] - 1) * img_dim[1] * img_dim[2] + i1 * img_dim[2] + (img_dim[2] - 1)] * voxsize[1]; - } - - // calculate the expected value of ray4 from [img_dim[0]-1,0,-1] to [img_dim[0]-1,0,last+1] - for (int i2 = 0; i2 < img_dim[2]; i2++) - { - expected_fwd_vals[8] += img[(img_dim[0] - 1) * img_dim[1] * img_dim[2] + 0 * img_dim[2] + i2] * voxsize[2]; - } - - // calculate the expected value of ray4 from [img_dim[0]-1,0,-1] to [img_dim[0]-1,0,last+1] - for (int i2 = 0; i2 < img_dim[2]; i2++) - { - expected_fwd_vals[9] += img[(img_dim[0] - 1) * img_dim[1] * img_dim[2] + (img_dim[1] - 1) * img_dim[2] + i2] * voxsize[2]; - } - - // check if we got the expected results - float fwd_diff = 0; printf("\nforward projection test\n"); for (int ir = 0; ir < nlors; ir++) { @@ -186,21 +88,14 @@ int main() //////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////// - // test the back projection - - float *bimg = new float[img_dim[0] * img_dim[1] * img_dim[2]]; - for (size_t i = 0; i < (img_dim[0] * img_dim[1] * img_dim[2]); i++) - { - bimg[i] = 0; - } - - float *ones = new float[nlors]; - for (size_t i = 0; i < nlors; i++) - { - ones[i] = 1; - } + // Test the back projection + std::vector bimg(img_dim[0] * img_dim[1] * img_dim[2], 0.0f); + std::vector ones(nlors, 1.0f); - joseph3d_back(xstart, xend, bimg, img_origin, voxsize, ones, nlors, img_dim); + joseph3d_back( + xstart.data(), xend.data(), bimg.data(), + img_origin, voxsize, ones.data(), + nlors, img_dim); printf("\nback projection of ones along all rays:\n"); for (size_t i0 = 0; i0 < img_dim[0]; i0++) @@ 
-225,7 +120,7 @@ int main() for (size_t i = 0; i < (img_dim[0] * img_dim[1] * img_dim[2]); i++) { - inner_product1 += (img[i] * bimg[i]); + inner_product1 += (img_from_file[i] * bimg[i]); } for (size_t ir = 0; ir < nlors; ir++) @@ -244,17 +139,5 @@ int main() retval = 1; } - //////////////////////////////////////////////////////////////////////////// - //////////////////////////////////////////////////////////////////////////// - //////////////////////////////////////////////////////////////////////////// - free(img); - free(xstart); - free(xend); - free(img_fwd); - free(expected_fwd_vals); - - free(bimg); - free(ones); - return retval; } diff --git a/tests/test_nontof_fwd_back.cu b/tests/test_nontof_fwd_back.cu index ec8bf2e..f944204 100644 --- a/tests/test_nontof_fwd_back.cu +++ b/tests/test_nontof_fwd_back.cu @@ -8,8 +8,6 @@ int main() { - size_t nlors = 10; - // get the number of cuda devices - because we want to run on the last device int device_count; cudaGetDeviceCount(&device_count); @@ -50,6 +48,8 @@ int main() // read the ray start coordinates in voxel coordinates // reast vstart from vstart.txt and convert into a cuda managed array std::vector vstart_from_file = readArrayFromFile("vstart.txt"); + size_t nlors = vstart_from_file.size() / 3; + float *vstart; cudaMallocManaged(&vstart, (3 * nlors) * sizeof(float)); memcpy(vstart, vstart_from_file.data(), vstart_from_file.size() * sizeof(float)); From facf7be219f2bcbbfa9425dbcfe8f897ec14e41d Mon Sep 17 00:00:00 2001 From: Georg Schramm Date: Sat, 12 Apr 2025 16:11:03 +0200 Subject: [PATCH 22/35] free all cuda managed arrays --- tests/test_nontof_fwd_back.cu | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/test_nontof_fwd_back.cu b/tests/test_nontof_fwd_back.cu index f944204..bec2107 100644 --- a/tests/test_nontof_fwd_back.cu +++ b/tests/test_nontof_fwd_back.cu @@ -193,10 +193,12 @@ int main() cudaFree(voxsize); cudaFree(img_origin); cudaFree(img); + cudaFree(vstart); + 
cudaFree(vend); cudaFree(xstart); cudaFree(xend); cudaFree(img_fwd); - + cudaFree(expected_fwd_vals); cudaFree(bimg); cudaFree(ones); From 99f0d5ac246ffd31211ec2a4d0d24cfb54e5779b Mon Sep 17 00:00:00 2001 From: Georg Schramm Date: Sat, 12 Apr 2025 17:09:07 +0200 Subject: [PATCH 23/35] simplify example --- tests/test_nontof_fwd_back.cpp | 53 ++++++++++++++-------------------- 1 file changed, 22 insertions(+), 31 deletions(-) diff --git a/tests/test_nontof_fwd_back.cpp b/tests/test_nontof_fwd_back.cpp index e88ac5b..df6d5df 100644 --- a/tests/test_nontof_fwd_back.cpp +++ b/tests/test_nontof_fwd_back.cpp @@ -4,6 +4,7 @@ #include #include #include +#include int main() { @@ -13,25 +14,25 @@ int main() std::cout << "OpenMP use case\n"; - int img_dim[3] = {2, 3, 4}; - float voxsize[3] = {4, 3, 2}; + std::vector img_dim = {2, 3, 4}; + std::vector voxsize = {4.0f, 3.0f, 2.0f}; - float img_origin[3]; + std::vector img_origin(3); for (int i = 0; i < 3; ++i) { - img_origin[i] = (-(float)img_dim[i] / 2 + 0.5) * voxsize[i]; + img_origin[i] = (-(float)img_dim[i] / 2 + 0.5f) * voxsize[i]; } // Read the image from file - std::vector img_from_file = readArrayFromFile("img.txt"); + std::vector img = readArrayFromFile("img.txt"); // Read the ray start coordinates from file - std::vector vstart_from_file = readArrayFromFile("vstart.txt"); + std::vector vstart = readArrayFromFile("vstart.txt"); // Read the ray end coordinates from file - std::vector vend_from_file = readArrayFromFile("vend.txt"); + std::vector vend = readArrayFromFile("vend.txt"); - size_t nlors = vstart_from_file.size() / 3; + size_t nlors = vstart.size() / 3; // Calculate the start and end coordinates in world coordinates std::vector xstart(3 * nlors); @@ -39,13 +40,13 @@ int main() for (int ir = 0; ir < nlors; ir++) { - xstart[ir * 3 + 0] = img_origin[0] + vstart_from_file[ir * 3 + 0] * voxsize[0]; - xstart[ir * 3 + 1] = img_origin[1] + vstart_from_file[ir * 3 + 1] * voxsize[1]; - xstart[ir * 3 + 2] = img_origin[2] + 
vstart_from_file[ir * 3 + 2] * voxsize[2]; + xstart[ir * 3 + 0] = img_origin[0] + vstart[ir * 3 + 0] * voxsize[0]; + xstart[ir * 3 + 1] = img_origin[1] + vstart[ir * 3 + 1] * voxsize[1]; + xstart[ir * 3 + 2] = img_origin[2] + vstart[ir * 3 + 2] * voxsize[2]; - xend[ir * 3 + 0] = img_origin[0] + vend_from_file[ir * 3 + 0] * voxsize[0]; - xend[ir * 3 + 1] = img_origin[1] + vend_from_file[ir * 3 + 1] * voxsize[1]; - xend[ir * 3 + 2] = img_origin[2] + vend_from_file[ir * 3 + 2] * voxsize[2]; + xend[ir * 3 + 0] = img_origin[0] + vend[ir * 3 + 0] * voxsize[0]; + xend[ir * 3 + 1] = img_origin[1] + vend[ir * 3 + 1] * voxsize[1]; + xend[ir * 3 + 2] = img_origin[2] + vend[ir * 3 + 2] * voxsize[2]; } // Allocate memory for forward projection results @@ -53,9 +54,9 @@ int main() // Perform forward projection joseph3d_fwd( - xstart.data(), xend.data(), img_from_file.data(), - img_origin, voxsize, img_fwd.data(), - nlors, img_dim, 0, 64); + xstart.data(), xend.data(), img.data(), + img_origin.data(), voxsize.data(), img_fwd.data(), + nlors, img_dim.data(), 0, 64); ///////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////// @@ -94,8 +95,8 @@ int main() joseph3d_back( xstart.data(), xend.data(), bimg.data(), - img_origin, voxsize, ones.data(), - nlors, img_dim); + img_origin.data(), voxsize.data(), ones.data(), + nlors, img_dim.data()); printf("\nback projection of ones along all rays:\n"); for (size_t i0 = 0; i0 < img_dim[0]; i0++) @@ -115,18 +116,8 @@ int main() // of the forward projector. This is more practical than checking a lot of single voxels in the // back projected image. 
- float inner_product1 = 0; - float inner_product2 = 0; - - for (size_t i = 0; i < (img_dim[0] * img_dim[1] * img_dim[2]); i++) - { - inner_product1 += (img_from_file[i] * bimg[i]); - } - - for (size_t ir = 0; ir < nlors; ir++) - { - inner_product2 += (img_fwd[ir] * ones[ir]); - } + float inner_product1 = std::inner_product(img.begin(), img.end(), bimg.begin(), 0.0f); + float inner_product2 = std::inner_product(img_fwd.begin(), img_fwd.end(), ones.begin(), 0.0f); float ip_diff = fabs(inner_product1 - inner_product2); From 7bb90bd83f0178f0b25f5a44132fcf13d750b9cd Mon Sep 17 00:00:00 2001 From: Georg Schramm Date: Sat, 12 Apr 2025 17:20:31 +0200 Subject: [PATCH 24/35] copy results back to host and simply cuda test (host case so far) --- src/joseph3d_back.cu | 4 + src/joseph3d_fwd.cu | 3 + tests/test_nontof_fwd_back.cu | 152 +++++++++------------------------- 3 files changed, 47 insertions(+), 112 deletions(-) diff --git a/src/joseph3d_back.cu b/src/joseph3d_back.cu index c090d5d..378d387 100644 --- a/src/joseph3d_back.cu +++ b/src/joseph3d_back.cu @@ -115,7 +115,11 @@ void joseph3d_back(const float *xstart, if (free_xend) cudaFree(d_xend); if (free_img) + { + // Copy the result back to the host + cudaMemcpy(const_cast(img), d_img, sizeof(float) * img_dim[0] * img_dim[1] * img_dim[2], cudaMemcpyDeviceToHost); cudaFree(d_img); + } if (free_img_origin) cudaFree(d_img_origin); if (free_voxsize) diff --git a/src/joseph3d_fwd.cu b/src/joseph3d_fwd.cu index 1093983..a5444c4 100644 --- a/src/joseph3d_fwd.cu +++ b/src/joseph3d_fwd.cu @@ -121,7 +121,10 @@ void joseph3d_fwd(const float *xstart, if (free_voxsize) cudaFree(d_voxsize); if (free_p) + { + cudaMemcpy(const_cast(p), d_p, sizeof(float) * nlors, cudaMemcpyDeviceToHost); cudaFree(d_p); + } if (free_img_dim) cudaFree(d_img_dim); } diff --git a/tests/test_nontof_fwd_back.cu b/tests/test_nontof_fwd_back.cu index bec2107..2c309e7 100644 --- a/tests/test_nontof_fwd_back.cu +++ b/tests/test_nontof_fwd_back.cu @@ -1,78 +1,42 @@ 
#include "parallelproj.h" #include "file_utils.h" #include -#include #include #include -#include +#include +#include int main() { - // get the number of cuda devices - because we want to run on the last device - int device_count; - cudaGetDeviceCount(&device_count); - //////////////////////////////////////////////////////// - // CUDA memory managed use case + // Host array use case //////////////////////////////////////////////////////// - std::cout << "CUDA managed memory use case\n"; - - cudaSetDevice(device_count - 1); + std::cout << "Host array use case\n"; - int *img_dim; - cudaMallocManaged(&img_dim, 3 * sizeof(int)); - img_dim[0] = 2; - img_dim[1] = 3; - img_dim[2] = 4; + std::vector img_dim = {2, 3, 4}; + std::vector voxsize = {4.0f, 3.0f, 2.0f}; - float *voxsize; - cudaMallocManaged(&voxsize, 3 * sizeof(float)); - voxsize[0] = 4; - voxsize[1] = 3; - voxsize[2] = 2; - - float *img_origin; - cudaMallocManaged(&img_origin, 3 * sizeof(float)); + std::vector img_origin(3); for (int i = 0; i < 3; ++i) { - img_origin[i] = (-(float)img_dim[i] / 2 + 0.5) * voxsize[i]; + img_origin[i] = (-(float)img_dim[i] / 2 + 0.5f) * voxsize[i]; } - // read the image from file and convert into a cuda managed array - std::vector img_from_file = readArrayFromFile("img.txt"); - float *img; - cudaMallocManaged(&img, (img_dim[0] * img_dim[1] * img_dim[2]) * sizeof(float)); - memcpy(img, img_from_file.data(), img_from_file.size() * sizeof(float)); - - // read the ray start coordinates in voxel coordinates - // reast vstart from vstart.txt and convert into a cuda managed array - std::vector vstart_from_file = readArrayFromFile("vstart.txt"); - size_t nlors = vstart_from_file.size() / 3; + // Read the image from file + std::vector img = readArrayFromFile("img.txt"); - float *vstart; - cudaMallocManaged(&vstart, (3 * nlors) * sizeof(float)); - memcpy(vstart, vstart_from_file.data(), vstart_from_file.size() * sizeof(float)); + // Read the ray start coordinates from file + std::vector 
vstart = readArrayFromFile("vstart.txt"); - // read the ray end coordinates in voxel coordinates - // read vend from vend.txt and convert into a cuda managed array - std::vector vend_from_file = readArrayFromFile("vend.txt"); - float *vend; - cudaMallocManaged(&vend, (3 * nlors) * sizeof(float)); - memcpy(vend, vend_from_file.data(), vend_from_file.size() * sizeof(float)); + // Read the ray end coordinates from file + std::vector vend = readArrayFromFile("vend.txt"); - for (int ir = 0; ir < nlors; ir++) - { - printf("test ray %d\n", ir); - printf("start voxel num .: %.1f %.1f %.1f\n", vstart[ir * 3 + 0], vstart[ir * 3 + 1], vstart[ir * 3 + 2]); - printf("end voxel num .: %.1f %.1f %.1f\n", vend[ir * 3 + 0], vend[ir * 3 + 1], vend[ir * 3 + 2]); - } + size_t nlors = vstart.size() / 3; - // calculate the start and end coordinates in world coordinates - float *xstart; - cudaMallocManaged(&xstart, (3 * nlors) * sizeof(float)); - float *xend; - cudaMallocManaged(&xend, (3 * nlors) * sizeof(float)); + // Calculate the start and end coordinates in world coordinates + std::vector xstart(3 * nlors); + std::vector xend(3 * nlors); for (int ir = 0; ir < nlors; ir++) { @@ -85,26 +49,27 @@ int main() xend[ir * 3 + 2] = img_origin[2] + vend[ir * 3 + 2] * voxsize[2]; } - float *img_fwd; - cudaMallocManaged(&img_fwd, nlors * sizeof(float)); + // Allocate memory for forward projection results + std::vector img_fwd(nlors); - joseph3d_fwd(xstart, xend, img, img_origin, voxsize, img_fwd, nlors, img_dim, 0, 64); + // Perform forward projection + joseph3d_fwd( + xstart.data(), xend.data(), img.data(), + img_origin.data(), voxsize.data(), img_fwd.data(), + nlors, img_dim.data(), 0, 64); ///////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////// - int retval = 0; - float eps = 1e-7; - - // read the expected_fwd_vals 
from expected_fwd_vals.txt and convert into a cuda managed array - std::vector expected_fwd_vals_from_file = readArrayFromFile("expected_fwd_vals.txt"); - float *expected_fwd_vals; - cudaMallocManaged(&expected_fwd_vals, (nlors) * sizeof(float)); - memcpy(expected_fwd_vals, expected_fwd_vals_from_file.data(), expected_fwd_vals_from_file.size() * sizeof(float)); + // Read the expected forward values from file + std::vector expected_fwd_vals = readArrayFromFile("expected_fwd_vals.txt"); - // check if we got the expected results + // Check if we got the expected results float fwd_diff = 0; + float eps = 1e-7; + int retval = 0; + printf("\nforward projection test\n"); for (int ir = 0; ir < nlors; ir++) { @@ -124,24 +89,14 @@ int main() //////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////// - // test the back projection - - float *bimg; - cudaMallocManaged(&bimg, (img_dim[0] * img_dim[1] * img_dim[2]) * sizeof(float)); + // Test the back projection + std::vector bimg(img_dim[0] * img_dim[1] * img_dim[2], 0.0f); + std::vector ones(nlors, 1.0f); - for (size_t i = 0; i < (img_dim[0] * img_dim[1] * img_dim[2]); i++) - { - bimg[i] = 0; - } - - float *ones; - cudaMallocManaged(&ones, nlors * sizeof(float)); - for (size_t i = 0; i < nlors; i++) - { - ones[i] = 1; - } - - joseph3d_back(xstart, xend, bimg, img_origin, voxsize, ones, nlors, img_dim, 0, 64); + joseph3d_back( + xstart.data(), xend.data(), bimg.data(), + img_origin.data(), voxsize.data(), ones.data(), + nlors, img_dim.data()); printf("\nback projection of ones along all rays:\n"); for (size_t i0 = 0; i0 < img_dim[0]; i0++) @@ -161,18 +116,8 @@ int main() // of the forward projector. This is more practical than checking a lot of single voxels in the // back projected image. 
- float inner_product1 = 0; - float inner_product2 = 0; - - for (size_t i = 0; i < (img_dim[0] * img_dim[1] * img_dim[2]); i++) - { - inner_product1 += (img[i] * bimg[i]); - } - - for (size_t ir = 0; ir < nlors; ir++) - { - inner_product2 += (img_fwd[ir] * ones[ir]); - } + float inner_product1 = std::inner_product(img.begin(), img.end(), bimg.begin(), 0.0f); + float inner_product2 = std::inner_product(img_fwd.begin(), img_fwd.end(), ones.begin(), 0.0f); float ip_diff = fabs(inner_product1 - inner_product2); @@ -185,22 +130,5 @@ int main() retval = 1; } - //////////////////////////////////////////////////////////////////////////// - //////////////////////////////////////////////////////////////////////////// - //////////////////////////////////////////////////////////////////////////// - - cudaFree(img_dim); - cudaFree(voxsize); - cudaFree(img_origin); - cudaFree(img); - cudaFree(vstart); - cudaFree(vend); - cudaFree(xstart); - cudaFree(xend); - cudaFree(img_fwd); - cudaFree(expected_fwd_vals); - cudaFree(bimg); - cudaFree(ones); - return retval; } From 23add1621786ccca26f8e4d501d523c4e0c38def Mon Sep 17 00:00:00 2001 From: Georg Schramm Date: Sat, 12 Apr 2025 17:30:15 +0200 Subject: [PATCH 25/35] add forward test in 3 modes --- tests/test_nontof_fwd_back.cu | 211 ++++++++++++++++++++++++---------- 1 file changed, 151 insertions(+), 60 deletions(-) diff --git a/tests/test_nontof_fwd_back.cu b/tests/test_nontof_fwd_back.cu index 2c309e7..3ba2636 100644 --- a/tests/test_nontof_fwd_back.cu +++ b/tests/test_nontof_fwd_back.cu @@ -5,15 +5,31 @@ #include #include #include +#include + +void test_host_arrays(); +void test_cuda_managed_arrays(); +void test_cuda_device_arrays(); int main() { - //////////////////////////////////////////////////////// - // Host array use case - //////////////////////////////////////////////////////// + std::cout << "Testing joseph3d_fwd and joseph3d_back with different memory modes:\n"; + + std::cout << "\n--- Testing with Host Arrays ---\n"; + 
test_host_arrays(); + + std::cout << "\n--- Testing with CUDA-Managed Arrays ---\n"; + test_cuda_managed_arrays(); + + std::cout << "\n--- Testing with CUDA Device Arrays ---\n"; + test_cuda_device_arrays(); - std::cout << "Host array use case\n"; + return 0; +} +void test_host_arrays() +{ + // Host array test (same as your current implementation) std::vector img_dim = {2, 3, 4}; std::vector voxsize = {4.0f, 3.0f, 2.0f}; @@ -23,18 +39,11 @@ int main() img_origin[i] = (-(float)img_dim[i] / 2 + 0.5f) * voxsize[i]; } - // Read the image from file std::vector img = readArrayFromFile("img.txt"); - - // Read the ray start coordinates from file std::vector vstart = readArrayFromFile("vstart.txt"); - - // Read the ray end coordinates from file std::vector vend = readArrayFromFile("vend.txt"); - size_t nlors = vstart.size() / 3; - // Calculate the start and end coordinates in world coordinates std::vector xstart(3 * nlors); std::vector xend(3 * nlors); @@ -49,86 +58,168 @@ int main() xend[ir * 3 + 2] = img_origin[2] + vend[ir * 3 + 2] * voxsize[2]; } - // Allocate memory for forward projection results std::vector img_fwd(nlors); - - // Perform forward projection joseph3d_fwd( xstart.data(), xend.data(), img.data(), img_origin.data(), voxsize.data(), img_fwd.data(), nlors, img_dim.data(), 0, 64); - ///////////////////////////////////////////////////////////////////////////// - ///////////////////////////////////////////////////////////////////////////// - ///////////////////////////////////////////////////////////////////////////// - - // Read the expected forward values from file std::vector expected_fwd_vals = readArrayFromFile("expected_fwd_vals.txt"); - - // Check if we got the expected results float fwd_diff = 0; float eps = 1e-7; - int retval = 0; - printf("\nforward projection test\n"); for (int ir = 0; ir < nlors; ir++) { - printf("test ray %d: fwd projected: %.7e expected: %.7e\n", ir, img_fwd[ir], expected_fwd_vals[ir]); + fwd_diff = std::abs(img_fwd[ir] - 
expected_fwd_vals[ir]); + if (fwd_diff > eps) + { + std::cerr << "Host array test failed for ray " << ir << "\n"; + return; + } + } +} + +void test_cuda_managed_arrays() +{ + // CUDA-managed array test + int img_dim[3] = {2, 3, 4}; + float voxsize[3] = {4.0f, 3.0f, 2.0f}; + + float *img_origin; + cudaMallocManaged(&img_origin, 3 * sizeof(float)); + for (int i = 0; i < 3; ++i) + { + img_origin[i] = (-(float)img_dim[i] / 2 + 0.5f) * voxsize[i]; + } + + std::vector img_from_file = readArrayFromFile("img.txt"); + float *img; + cudaMallocManaged(&img, img_from_file.size() * sizeof(float)); + std::copy(img_from_file.begin(), img_from_file.end(), img); + + std::vector vstart_from_file = readArrayFromFile("vstart.txt"); + float *vstart; + cudaMallocManaged(&vstart, vstart_from_file.size() * sizeof(float)); + std::copy(vstart_from_file.begin(), vstart_from_file.end(), vstart); + + std::vector vend_from_file = readArrayFromFile("vend.txt"); + float *vend; + cudaMallocManaged(&vend, vend_from_file.size() * sizeof(float)); + std::copy(vend_from_file.begin(), vend_from_file.end(), vend); + size_t nlors = vstart_from_file.size() / 3; + + float *xstart, *xend; + cudaMallocManaged(&xstart, 3 * nlors * sizeof(float)); + cudaMallocManaged(&xend, 3 * nlors * sizeof(float)); + + for (int ir = 0; ir < nlors; ir++) + { + for (int j = 0; j < 3; j++) + { + xstart[ir * 3 + j] = img_origin[j] + vstart[ir * 3 + j] * voxsize[j]; + xend[ir * 3 + j] = img_origin[j] + vend[ir * 3 + j] * voxsize[j]; + } + } + + float *img_fwd; + cudaMallocManaged(&img_fwd, nlors * sizeof(float)); + joseph3d_fwd(xstart, xend, img, img_origin, voxsize, img_fwd, nlors, img_dim, 0, 64); + + cudaDeviceSynchronize(); + + std::vector expected_fwd_vals = readArrayFromFile("expected_fwd_vals.txt"); + float fwd_diff = 0; + float eps = 1e-7; + + for (int ir = 0; ir < nlors; ir++) + { fwd_diff = std::abs(img_fwd[ir] - expected_fwd_vals[ir]); if (fwd_diff > eps) { - 
printf("\n################################################################################"); - printf("\nabs(fwd projected - expected value) = %.2e for ray%d above tolerance %.2e", fwd_diff, ir, eps); - printf("\n################################################################################\n"); - retval = 1; + std::cerr << "CUDA-managed array test failed for ray " << ir << "\n"; + return; } } - //////////////////////////////////////////////////////////////////////////// - //////////////////////////////////////////////////////////////////////////// - //////////////////////////////////////////////////////////////////////////// + cudaFree(img_origin); + cudaFree(img); + cudaFree(vstart); + cudaFree(vend); + cudaFree(xstart); + cudaFree(xend); + cudaFree(img_fwd); +} - // Test the back projection - std::vector bimg(img_dim[0] * img_dim[1] * img_dim[2], 0.0f); - std::vector ones(nlors, 1.0f); +void test_cuda_device_arrays() +{ + // CUDA device array test + int img_dim[3] = {2, 3, 4}; + float voxsize[3] = {4.0f, 3.0f, 2.0f}; - joseph3d_back( - xstart.data(), xend.data(), bimg.data(), - img_origin.data(), voxsize.data(), ones.data(), - nlors, img_dim.data()); + float img_origin[3]; + for (int i = 0; i < 3; ++i) + { + img_origin[i] = (-(float)img_dim[i] / 2 + 0.5f) * voxsize[i]; + } + + std::vector img_from_file = readArrayFromFile("img.txt"); + float *img; + cudaMalloc(&img, img_from_file.size() * sizeof(float)); + cudaMemcpy(img, img_from_file.data(), img_from_file.size() * sizeof(float), cudaMemcpyHostToDevice); + + std::vector vstart_from_file = readArrayFromFile("vstart.txt"); + float *vstart; + cudaMalloc(&vstart, vstart_from_file.size() * sizeof(float)); + cudaMemcpy(vstart, vstart_from_file.data(), vstart_from_file.size() * sizeof(float), cudaMemcpyHostToDevice); - printf("\nback projection of ones along all rays:\n"); - for (size_t i0 = 0; i0 < img_dim[0]; i0++) + std::vector vend_from_file = readArrayFromFile("vend.txt"); + float *vend; + cudaMalloc(&vend, 
vend_from_file.size() * sizeof(float)); + cudaMemcpy(vend, vend_from_file.data(), vend_from_file.size() * sizeof(float), cudaMemcpyHostToDevice); + + size_t nlors = vstart_from_file.size() / 3; + + float *xstart, *xend; + cudaMalloc(&xstart, 3 * nlors * sizeof(float)); + cudaMalloc(&xend, 3 * nlors * sizeof(float)); + + for (int ir = 0; ir < nlors; ir++) { - for (size_t i1 = 0; i1 < img_dim[1]; i1++) + for (int j = 0; j < 3; j++) { - for (size_t i2 = 0; i2 < img_dim[2]; i2++) - { - printf("%.1f ", bimg[img_dim[1] * img_dim[2] * i0 + img_dim[2] * i1 + i2]); - } - printf("\n"); + float xstart_val = img_origin[j] + vstart_from_file[ir * 3 + j] * voxsize[j]; + float xend_val = img_origin[j] + vend_from_file[ir * 3 + j] * voxsize[j]; + cudaMemcpy(&xstart[ir * 3 + j], &xstart_val, sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(&xend[ir * 3 + j], &xend_val, sizeof(float), cudaMemcpyHostToDevice); } - printf("\n"); } - // To test whether the back projection is correct, we test if the back projector is the adjoint - // of the forward projector. This is more practical than checking a lot of single voxels in the - // back projected image. + float *img_fwd; + cudaMalloc(&img_fwd, nlors * sizeof(float)); + joseph3d_fwd(xstart, xend, img, img_origin, voxsize, img_fwd, nlors, img_dim, 0, 64); - float inner_product1 = std::inner_product(img.begin(), img.end(), bimg.begin(), 0.0f); - float inner_product2 = std::inner_product(img_fwd.begin(), img_fwd.end(), ones.begin(), 0.0f); + std::vector img_fwd_host(nlors); + cudaMemcpy(img_fwd_host.data(), img_fwd, nlors * sizeof(float), cudaMemcpyDeviceToHost); - float ip_diff = fabs(inner_product1 - inner_product2); + std::vector expected_fwd_vals = readArrayFromFile("expected_fwd_vals.txt"); + float fwd_diff = 0; + float eps = 1e-7; - if (ip_diff > eps) + for (int ir = 0; ir < nlors; ir++) { - printf("\n#########################################################################"); - printf("\nback projection test failed. 
back projection seems not to be the adjoint."); - printf("\n %.7e", ip_diff); - printf("\n#########################################################################\n"); - retval = 1; + fwd_diff = std::abs(img_fwd_host[ir] - expected_fwd_vals[ir]); + if (fwd_diff > eps) + { + std::cerr << "CUDA device array test failed for ray " << ir << "\n"; + return; + } } - return retval; + cudaFree(img); + cudaFree(vstart); + cudaFree(vend); + cudaFree(xstart); + cudaFree(xend); + cudaFree(img_fwd); } From b252684f06d654275ede3317bda67a5ccdfda7a7 Mon Sep 17 00:00:00 2001 From: Georg Schramm Date: Sat, 12 Apr 2025 17:44:11 +0200 Subject: [PATCH 26/35] add back projection test for all 3 modes --- tests/test_nontof_fwd_back.cu | 156 +++++++++++++++++++++++++++++++++- 1 file changed, 154 insertions(+), 2 deletions(-) diff --git a/tests/test_nontof_fwd_back.cu b/tests/test_nontof_fwd_back.cu index 3ba2636..cf9bec3 100644 --- a/tests/test_nontof_fwd_back.cu +++ b/tests/test_nontof_fwd_back.cu @@ -77,6 +77,51 @@ void test_host_arrays() return; } } + + //////////////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////////// + + // Test the back projection using the defintion of the adjoint operator + std::vector bimg(img_dim[0] * img_dim[1] * img_dim[2], 0.0f); + std::vector ones(nlors, 1.0f); + + joseph3d_back( + xstart.data(), xend.data(), bimg.data(), + img_origin.data(), voxsize.data(), ones.data(), + nlors, img_dim.data()); + + printf("\nback projection of ones along all rays:\n"); + for (size_t i0 = 0; i0 < img_dim[0]; i0++) + { + for (size_t i1 = 0; i1 < img_dim[1]; i1++) + { + for (size_t i2 = 0; i2 < img_dim[2]; i2++) + { + printf("%.1f ", bimg[img_dim[1] * img_dim[2] * i0 + img_dim[2] * i1 + i2]); + } + printf("\n"); + } + printf("\n"); + } + + // To test whether the back projection is correct, we test if the back 
projector is the adjoint + // of the forward projector. This is more practical than checking a lot of single voxels in the + // back projected image. + + float inner_product1 = std::inner_product(img.begin(), img.end(), bimg.begin(), 0.0f); + float inner_product2 = std::inner_product(img_fwd.begin(), img_fwd.end(), ones.begin(), 0.0f); + + float ip_diff = fabs(inner_product1 - inner_product2); + + if (ip_diff > eps) + { + printf("\n#########################################################################"); + printf("\nback projection test failed. back projection seems not to be the adjoint."); + printf("\n %.7e", ip_diff); + printf("\n#########################################################################\n"); + std::cerr << "Back projection via adjointness test failed.\n"; + } } void test_cuda_managed_arrays() @@ -126,8 +171,6 @@ void test_cuda_managed_arrays() cudaMallocManaged(&img_fwd, nlors * sizeof(float)); joseph3d_fwd(xstart, xend, img, img_origin, voxsize, img_fwd, nlors, img_dim, 0, 64); - cudaDeviceSynchronize(); - std::vector expected_fwd_vals = readArrayFromFile("expected_fwd_vals.txt"); float fwd_diff = 0; float eps = 1e-7; @@ -142,6 +185,56 @@ void test_cuda_managed_arrays() } } + // Test the back projection + float *bimg; + cudaMallocManaged(&bimg, img_dim[0] * img_dim[1] * img_dim[2] * sizeof(float)); + std::fill(bimg, bimg + (img_dim[0] * img_dim[1] * img_dim[2]), 0.0f); + + float *ones; + cudaMallocManaged(&ones, nlors * sizeof(float)); + std::fill(ones, ones + nlors, 1.0f); + + joseph3d_back(xstart, xend, bimg, img_origin, voxsize, ones, nlors, img_dim); + + printf("\nCUDA-managed back projection of ones along all rays:\n"); + for (size_t i0 = 0; i0 < img_dim[0]; i0++) + { + for (size_t i1 = 0; i1 < img_dim[1]; i1++) + { + for (size_t i2 = 0; i2 < img_dim[2]; i2++) + { + printf("%.1f ", bimg[img_dim[1] * img_dim[2] * i0 + img_dim[2] * i1 + i2]); + } + printf("\n"); + } + printf("\n"); + } + + // Validate the back projection using adjointness 
+ float inner_product1 = 0.0f; + float inner_product2 = 0.0f; + + for (size_t i = 0; i < img_from_file.size(); i++) + { + inner_product1 += img[i] * bimg[i]; + } + + for (size_t ir = 0; ir < nlors; ir++) + { + inner_product2 += img_fwd[ir] * ones[ir]; + } + + float ip_diff = fabs(inner_product1 - inner_product2); + + if (ip_diff > eps) + { + std::cerr << "CUDA-managed array back projection test failed: adjointness property violated.\n"; + } + else + { + std::cout << "CUDA-managed array back projection test passed.\n"; + } + cudaFree(img_origin); cudaFree(img); cudaFree(vstart); @@ -149,6 +242,8 @@ void test_cuda_managed_arrays() cudaFree(xstart); cudaFree(xend); cudaFree(img_fwd); + cudaFree(bimg); + cudaFree(ones); } void test_cuda_device_arrays() @@ -216,10 +311,67 @@ void test_cuda_device_arrays() } } + // Test the back projection + float *bimg; + cudaMalloc(&bimg, img_dim[0] * img_dim[1] * img_dim[2] * sizeof(float)); + cudaMemset(bimg, 0, img_dim[0] * img_dim[1] * img_dim[2] * sizeof(float)); + + float *ones; + cudaMalloc(&ones, nlors * sizeof(float)); + cudaMemset(ones, 0, nlors * sizeof(float)); + std::vector ones_host(nlors, 1.0f); + cudaMemcpy(ones, ones_host.data(), nlors * sizeof(float), cudaMemcpyHostToDevice); + + joseph3d_back(xstart, xend, bimg, img_origin, voxsize, ones, nlors, img_dim); + + std::vector bimg_host(img_dim[0] * img_dim[1] * img_dim[2]); + cudaMemcpy(bimg_host.data(), bimg, bimg_host.size() * sizeof(float), cudaMemcpyDeviceToHost); + + printf("\nCUDA device back projection of ones along all rays:\n"); + for (size_t i0 = 0; i0 < img_dim[0]; i0++) + { + for (size_t i1 = 0; i1 < img_dim[1]; i1++) + { + for (size_t i2 = 0; i2 < img_dim[2]; i2++) + { + printf("%.1f ", bimg_host[img_dim[1] * img_dim[2] * i0 + img_dim[2] * i1 + i2]); + } + printf("\n"); + } + printf("\n"); + } + + // Validate the back projection using adjointness + float inner_product1 = 0.0f; + float inner_product2 = 0.0f; + + for (size_t i = 0; i < img_from_file.size(); 
i++) + { + inner_product1 += img_from_file[i] * bimg_host[i]; + } + + for (size_t ir = 0; ir < nlors; ir++) + { + inner_product2 += img_fwd_host[ir] * ones_host[ir]; + } + + float ip_diff = fabs(inner_product1 - inner_product2); + + if (ip_diff > eps) + { + std::cerr << "CUDA device array back projection test failed: adjointness property violated.\n"; + } + else + { + std::cout << "CUDA device array back projection test passed.\n"; + } + cudaFree(img); cudaFree(vstart); cudaFree(vend); cudaFree(xstart); cudaFree(xend); cudaFree(img_fwd); + cudaFree(bimg); + cudaFree(ones); } From 6616d9553360c5e47d9ea6d25d84c8d38e48f6ed Mon Sep 17 00:00:00 2001 From: Georg Schramm Date: Sat, 12 Apr 2025 19:42:03 +0200 Subject: [PATCH 27/35] use single source for host array tests --- CMakeLists.txt | 3 +- tests/test_nontof_fwd_back.cpp | 31 +++--- ...d_back.cu => test_nontof_fwd_back_cuda.cu} | 103 ------------------ 3 files changed, 20 insertions(+), 117 deletions(-) rename tests/{test_nontof_fwd_back.cu => test_nontof_fwd_back_cuda.cu} (68%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7650395..93b7347 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -66,7 +66,8 @@ if(CMAKE_CUDA_COMPILER) target_compile_definitions(parallelproj PUBLIC PARALLELPROJ_CUDA=1) set(TEST_SOURCES - tests/test_nontof_fwd_back.cu + tests/test_nontof_fwd_back.cpp + tests/test_nontof_fwd_back_cuda.cu ) else() set(LIB_SOURCES diff --git a/tests/test_nontof_fwd_back.cpp b/tests/test_nontof_fwd_back.cpp index df6d5df..dad7072 100644 --- a/tests/test_nontof_fwd_back.cpp +++ b/tests/test_nontof_fwd_back.cpp @@ -9,10 +9,14 @@ int main() { //////////////////////////////////////////////////////// - // OpenMP managed memory use case + // host array test cases //////////////////////////////////////////////////////// - std::cout << "OpenMP use case\n"; +#ifdef __CUDACC__ + std::cout << "CUDA host array test\n"; +#else + std::cout << "OpenMP test\n"; +#endif std::vector img_dim = {2, 3, 4}; std::vector voxsize = 
{4.0f, 3.0f, 2.0f}; @@ -68,7 +72,6 @@ int main() // Check if we got the expected results float fwd_diff = 0; float eps = 1e-7; - int retval = 0; printf("\nforward projection test\n"); for (int ir = 0; ir < nlors; ir++) @@ -78,10 +81,12 @@ int main() fwd_diff = std::abs(img_fwd[ir] - expected_fwd_vals[ir]); if (fwd_diff > eps) { - printf("\n################################################################################"); - printf("\nabs(fwd projected - expected value) = %.2e for ray%d above tolerance %.2e", fwd_diff, ir, eps); - printf("\n################################################################################\n"); - retval = 1; + std::cerr << "Forward projection test failed.\n"; + std::cerr << "Difference: " << fwd_diff << "\n"; + std::cerr << "Expected: " << expected_fwd_vals[ir] << "\n"; + std::cerr << "Actual: " << img_fwd[ir] << "\n"; + std::cerr << "Tolerance: " << eps << "\n"; + std::cerr << "Ray index: " << ir << "\n"; } } @@ -123,12 +128,12 @@ int main() if (ip_diff > eps) { - printf("\n#########################################################################"); - printf("\nback projection test failed. 
back projection seems not to be the adjoint."); - printf("\n %.7e", ip_diff); - printf("\n#########################################################################\n"); - retval = 1; + std::cerr << "Back projection test failed.\n"; + std::cerr << "Inner product 1: " << inner_product1 << "\n"; + std::cerr << "Inner product 2: " << inner_product2 << "\n"; + std::cerr << "Difference: " << ip_diff << "\n"; + std::cerr << "Tolerance: " << eps << "\n"; } - return retval; + return 0; } diff --git a/tests/test_nontof_fwd_back.cu b/tests/test_nontof_fwd_back_cuda.cu similarity index 68% rename from tests/test_nontof_fwd_back.cu rename to tests/test_nontof_fwd_back_cuda.cu index cf9bec3..82c1592 100644 --- a/tests/test_nontof_fwd_back.cu +++ b/tests/test_nontof_fwd_back_cuda.cu @@ -7,17 +7,11 @@ #include #include -void test_host_arrays(); void test_cuda_managed_arrays(); void test_cuda_device_arrays(); int main() { - std::cout << "Testing joseph3d_fwd and joseph3d_back with different memory modes:\n"; - - std::cout << "\n--- Testing with Host Arrays ---\n"; - test_host_arrays(); - std::cout << "\n--- Testing with CUDA-Managed Arrays ---\n"; test_cuda_managed_arrays(); @@ -27,103 +21,6 @@ int main() return 0; } -void test_host_arrays() -{ - // Host array test (same as your current implementation) - std::vector img_dim = {2, 3, 4}; - std::vector voxsize = {4.0f, 3.0f, 2.0f}; - - std::vector img_origin(3); - for (int i = 0; i < 3; ++i) - { - img_origin[i] = (-(float)img_dim[i] / 2 + 0.5f) * voxsize[i]; - } - - std::vector img = readArrayFromFile("img.txt"); - std::vector vstart = readArrayFromFile("vstart.txt"); - std::vector vend = readArrayFromFile("vend.txt"); - size_t nlors = vstart.size() / 3; - - std::vector xstart(3 * nlors); - std::vector xend(3 * nlors); - - for (int ir = 0; ir < nlors; ir++) - { - xstart[ir * 3 + 0] = img_origin[0] + vstart[ir * 3 + 0] * voxsize[0]; - xstart[ir * 3 + 1] = img_origin[1] + vstart[ir * 3 + 1] * voxsize[1]; - xstart[ir * 3 + 2] = 
img_origin[2] + vstart[ir * 3 + 2] * voxsize[2]; - - xend[ir * 3 + 0] = img_origin[0] + vend[ir * 3 + 0] * voxsize[0]; - xend[ir * 3 + 1] = img_origin[1] + vend[ir * 3 + 1] * voxsize[1]; - xend[ir * 3 + 2] = img_origin[2] + vend[ir * 3 + 2] * voxsize[2]; - } - - std::vector img_fwd(nlors); - joseph3d_fwd( - xstart.data(), xend.data(), img.data(), - img_origin.data(), voxsize.data(), img_fwd.data(), - nlors, img_dim.data(), 0, 64); - - std::vector expected_fwd_vals = readArrayFromFile("expected_fwd_vals.txt"); - float fwd_diff = 0; - float eps = 1e-7; - - for (int ir = 0; ir < nlors; ir++) - { - fwd_diff = std::abs(img_fwd[ir] - expected_fwd_vals[ir]); - if (fwd_diff > eps) - { - std::cerr << "Host array test failed for ray " << ir << "\n"; - return; - } - } - - //////////////////////////////////////////////////////////////////////////// - //////////////////////////////////////////////////////////////////////////// - //////////////////////////////////////////////////////////////////////////// - - // Test the back projection using the defintion of the adjoint operator - std::vector bimg(img_dim[0] * img_dim[1] * img_dim[2], 0.0f); - std::vector ones(nlors, 1.0f); - - joseph3d_back( - xstart.data(), xend.data(), bimg.data(), - img_origin.data(), voxsize.data(), ones.data(), - nlors, img_dim.data()); - - printf("\nback projection of ones along all rays:\n"); - for (size_t i0 = 0; i0 < img_dim[0]; i0++) - { - for (size_t i1 = 0; i1 < img_dim[1]; i1++) - { - for (size_t i2 = 0; i2 < img_dim[2]; i2++) - { - printf("%.1f ", bimg[img_dim[1] * img_dim[2] * i0 + img_dim[2] * i1 + i2]); - } - printf("\n"); - } - printf("\n"); - } - - // To test whether the back projection is correct, we test if the back projector is the adjoint - // of the forward projector. This is more practical than checking a lot of single voxels in the - // back projected image. 
- - float inner_product1 = std::inner_product(img.begin(), img.end(), bimg.begin(), 0.0f); - float inner_product2 = std::inner_product(img_fwd.begin(), img_fwd.end(), ones.begin(), 0.0f); - - float ip_diff = fabs(inner_product1 - inner_product2); - - if (ip_diff > eps) - { - printf("\n#########################################################################"); - printf("\nback projection test failed. back projection seems not to be the adjoint."); - printf("\n %.7e", ip_diff); - printf("\n#########################################################################\n"); - std::cerr << "Back projection via adjointness test failed.\n"; - } -} - void test_cuda_managed_arrays() { // CUDA-managed array test From f04f74fd52d199239faf23c36fa8618f71ac731e Mon Sep 17 00:00:00 2001 From: Georg Schramm Date: Sat, 12 Apr 2025 19:48:38 +0200 Subject: [PATCH 28/35] use h_, d_, cm_ prefixes --- tests/test_nontof_fwd_back_cuda.cu | 216 ++++++++++++++--------------- 1 file changed, 108 insertions(+), 108 deletions(-) diff --git a/tests/test_nontof_fwd_back_cuda.cu b/tests/test_nontof_fwd_back_cuda.cu index 82c1592..cc9aabc 100644 --- a/tests/test_nontof_fwd_back_cuda.cu +++ b/tests/test_nontof_fwd_back_cuda.cu @@ -24,57 +24,57 @@ int main() void test_cuda_managed_arrays() { // CUDA-managed array test - int img_dim[3] = {2, 3, 4}; - float voxsize[3] = {4.0f, 3.0f, 2.0f}; + int h_img_dim[3] = {2, 3, 4}; + float h_voxsize[3] = {4.0f, 3.0f, 2.0f}; - float *img_origin; - cudaMallocManaged(&img_origin, 3 * sizeof(float)); + float *cm_img_origin; + cudaMallocManaged(&cm_img_origin, 3 * sizeof(float)); for (int i = 0; i < 3; ++i) { - img_origin[i] = (-(float)img_dim[i] / 2 + 0.5f) * voxsize[i]; + cm_img_origin[i] = (-(float)h_img_dim[i] / 2 + 0.5f) * h_voxsize[i]; } - std::vector img_from_file = readArrayFromFile("img.txt"); - float *img; - cudaMallocManaged(&img, img_from_file.size() * sizeof(float)); - std::copy(img_from_file.begin(), img_from_file.end(), img); + std::vector h_img_from_file 
= readArrayFromFile("img.txt"); + float *cm_img; + cudaMallocManaged(&cm_img, h_img_from_file.size() * sizeof(float)); + std::copy(h_img_from_file.begin(), h_img_from_file.end(), cm_img); - std::vector vstart_from_file = readArrayFromFile("vstart.txt"); - float *vstart; - cudaMallocManaged(&vstart, vstart_from_file.size() * sizeof(float)); - std::copy(vstart_from_file.begin(), vstart_from_file.end(), vstart); + std::vector h_vstart_from_file = readArrayFromFile("vstart.txt"); + float *cm_vstart; + cudaMallocManaged(&cm_vstart, h_vstart_from_file.size() * sizeof(float)); + std::copy(h_vstart_from_file.begin(), h_vstart_from_file.end(), cm_vstart); - std::vector vend_from_file = readArrayFromFile("vend.txt"); - float *vend; - cudaMallocManaged(&vend, vend_from_file.size() * sizeof(float)); - std::copy(vend_from_file.begin(), vend_from_file.end(), vend); + std::vector h_vend_from_file = readArrayFromFile("vend.txt"); + float *cm_vend; + cudaMallocManaged(&cm_vend, h_vend_from_file.size() * sizeof(float)); + std::copy(h_vend_from_file.begin(), h_vend_from_file.end(), cm_vend); - size_t nlors = vstart_from_file.size() / 3; + size_t nlors = h_vstart_from_file.size() / 3; - float *xstart, *xend; - cudaMallocManaged(&xstart, 3 * nlors * sizeof(float)); - cudaMallocManaged(&xend, 3 * nlors * sizeof(float)); + float *cm_xstart, *cm_xend; + cudaMallocManaged(&cm_xstart, 3 * nlors * sizeof(float)); + cudaMallocManaged(&cm_xend, 3 * nlors * sizeof(float)); for (int ir = 0; ir < nlors; ir++) { for (int j = 0; j < 3; j++) { - xstart[ir * 3 + j] = img_origin[j] + vstart[ir * 3 + j] * voxsize[j]; - xend[ir * 3 + j] = img_origin[j] + vend[ir * 3 + j] * voxsize[j]; + cm_xstart[ir * 3 + j] = cm_img_origin[j] + cm_vstart[ir * 3 + j] * h_voxsize[j]; + cm_xend[ir * 3 + j] = cm_img_origin[j] + cm_vend[ir * 3 + j] * h_voxsize[j]; } } - float *img_fwd; - cudaMallocManaged(&img_fwd, nlors * sizeof(float)); - joseph3d_fwd(xstart, xend, img, img_origin, voxsize, img_fwd, nlors, img_dim, 0, 
64); + float *cm_img_fwd; + cudaMallocManaged(&cm_img_fwd, nlors * sizeof(float)); + joseph3d_fwd(cm_xstart, cm_xend, cm_img, cm_img_origin, h_voxsize, cm_img_fwd, nlors, h_img_dim, 0, 64); - std::vector expected_fwd_vals = readArrayFromFile("expected_fwd_vals.txt"); + std::vector h_expected_fwd_vals = readArrayFromFile("expected_fwd_vals.txt"); float fwd_diff = 0; float eps = 1e-7; for (int ir = 0; ir < nlors; ir++) { - fwd_diff = std::abs(img_fwd[ir] - expected_fwd_vals[ir]); + fwd_diff = std::abs(cm_img_fwd[ir] - h_expected_fwd_vals[ir]); if (fwd_diff > eps) { std::cerr << "CUDA-managed array test failed for ray " << ir << "\n"; @@ -83,24 +83,24 @@ void test_cuda_managed_arrays() } // Test the back projection - float *bimg; - cudaMallocManaged(&bimg, img_dim[0] * img_dim[1] * img_dim[2] * sizeof(float)); - std::fill(bimg, bimg + (img_dim[0] * img_dim[1] * img_dim[2]), 0.0f); + float *cm_bimg; + cudaMallocManaged(&cm_bimg, h_img_dim[0] * h_img_dim[1] * h_img_dim[2] * sizeof(float)); + std::fill(cm_bimg, cm_bimg + (h_img_dim[0] * h_img_dim[1] * h_img_dim[2]), 0.0f); - float *ones; - cudaMallocManaged(&ones, nlors * sizeof(float)); - std::fill(ones, ones + nlors, 1.0f); + float *cm_ones; + cudaMallocManaged(&cm_ones, nlors * sizeof(float)); + std::fill(cm_ones, cm_ones + nlors, 1.0f); - joseph3d_back(xstart, xend, bimg, img_origin, voxsize, ones, nlors, img_dim); + joseph3d_back(cm_xstart, cm_xend, cm_bimg, cm_img_origin, h_voxsize, cm_ones, nlors, h_img_dim); printf("\nCUDA-managed back projection of ones along all rays:\n"); - for (size_t i0 = 0; i0 < img_dim[0]; i0++) + for (size_t i0 = 0; i0 < h_img_dim[0]; i0++) { - for (size_t i1 = 0; i1 < img_dim[1]; i1++) + for (size_t i1 = 0; i1 < h_img_dim[1]; i1++) { - for (size_t i2 = 0; i2 < img_dim[2]; i2++) + for (size_t i2 = 0; i2 < h_img_dim[2]; i2++) { - printf("%.1f ", bimg[img_dim[1] * img_dim[2] * i0 + img_dim[2] * i1 + i2]); + printf("%.1f ", cm_bimg[h_img_dim[1] * h_img_dim[2] * i0 + h_img_dim[2] * i1 + i2]); 
} printf("\n"); } @@ -111,14 +111,14 @@ void test_cuda_managed_arrays() float inner_product1 = 0.0f; float inner_product2 = 0.0f; - for (size_t i = 0; i < img_from_file.size(); i++) + for (size_t i = 0; i < h_img_from_file.size(); i++) { - inner_product1 += img[i] * bimg[i]; + inner_product1 += cm_img[i] * cm_bimg[i]; } for (size_t ir = 0; ir < nlors; ir++) { - inner_product2 += img_fwd[ir] * ones[ir]; + inner_product2 += cm_img_fwd[ir] * cm_ones[ir]; } float ip_diff = fabs(inner_product1 - inner_product2); @@ -132,75 +132,75 @@ void test_cuda_managed_arrays() std::cout << "CUDA-managed array back projection test passed.\n"; } - cudaFree(img_origin); - cudaFree(img); - cudaFree(vstart); - cudaFree(vend); - cudaFree(xstart); - cudaFree(xend); - cudaFree(img_fwd); - cudaFree(bimg); - cudaFree(ones); + cudaFree(cm_img_origin); + cudaFree(cm_img); + cudaFree(cm_vstart); + cudaFree(cm_vend); + cudaFree(cm_xstart); + cudaFree(cm_xend); + cudaFree(cm_img_fwd); + cudaFree(cm_bimg); + cudaFree(cm_ones); } void test_cuda_device_arrays() { // CUDA device array test - int img_dim[3] = {2, 3, 4}; - float voxsize[3] = {4.0f, 3.0f, 2.0f}; + int h_img_dim[3] = {2, 3, 4}; + float h_voxsize[3] = {4.0f, 3.0f, 2.0f}; - float img_origin[3]; + float h_img_origin[3]; for (int i = 0; i < 3; ++i) { - img_origin[i] = (-(float)img_dim[i] / 2 + 0.5f) * voxsize[i]; + h_img_origin[i] = (-(float)h_img_dim[i] / 2 + 0.5f) * h_voxsize[i]; } - std::vector img_from_file = readArrayFromFile("img.txt"); - float *img; - cudaMalloc(&img, img_from_file.size() * sizeof(float)); - cudaMemcpy(img, img_from_file.data(), img_from_file.size() * sizeof(float), cudaMemcpyHostToDevice); + std::vector h_img_from_file = readArrayFromFile("img.txt"); + float *d_img; + cudaMalloc(&d_img, h_img_from_file.size() * sizeof(float)); + cudaMemcpy(d_img, h_img_from_file.data(), h_img_from_file.size() * sizeof(float), cudaMemcpyHostToDevice); - std::vector vstart_from_file = readArrayFromFile("vstart.txt"); - float *vstart; - 
cudaMalloc(&vstart, vstart_from_file.size() * sizeof(float)); - cudaMemcpy(vstart, vstart_from_file.data(), vstart_from_file.size() * sizeof(float), cudaMemcpyHostToDevice); + std::vector h_vstart_from_file = readArrayFromFile("vstart.txt"); + float *d_vstart; + cudaMalloc(&d_vstart, h_vstart_from_file.size() * sizeof(float)); + cudaMemcpy(d_vstart, h_vstart_from_file.data(), h_vstart_from_file.size() * sizeof(float), cudaMemcpyHostToDevice); - std::vector vend_from_file = readArrayFromFile("vend.txt"); - float *vend; - cudaMalloc(&vend, vend_from_file.size() * sizeof(float)); - cudaMemcpy(vend, vend_from_file.data(), vend_from_file.size() * sizeof(float), cudaMemcpyHostToDevice); + std::vector h_vend_from_file = readArrayFromFile("vend.txt"); + float *d_vend; + cudaMalloc(&d_vend, h_vend_from_file.size() * sizeof(float)); + cudaMemcpy(d_vend, h_vend_from_file.data(), h_vend_from_file.size() * sizeof(float), cudaMemcpyHostToDevice); - size_t nlors = vstart_from_file.size() / 3; + size_t nlors = h_vstart_from_file.size() / 3; - float *xstart, *xend; - cudaMalloc(&xstart, 3 * nlors * sizeof(float)); - cudaMalloc(&xend, 3 * nlors * sizeof(float)); + float *d_xstart, *d_xend; + cudaMalloc(&d_xstart, 3 * nlors * sizeof(float)); + cudaMalloc(&d_xend, 3 * nlors * sizeof(float)); for (int ir = 0; ir < nlors; ir++) { for (int j = 0; j < 3; j++) { - float xstart_val = img_origin[j] + vstart_from_file[ir * 3 + j] * voxsize[j]; - float xend_val = img_origin[j] + vend_from_file[ir * 3 + j] * voxsize[j]; - cudaMemcpy(&xstart[ir * 3 + j], &xstart_val, sizeof(float), cudaMemcpyHostToDevice); - cudaMemcpy(&xend[ir * 3 + j], &xend_val, sizeof(float), cudaMemcpyHostToDevice); + float xstart_val = h_img_origin[j] + h_vstart_from_file[ir * 3 + j] * h_voxsize[j]; + float xend_val = h_img_origin[j] + h_vend_from_file[ir * 3 + j] * h_voxsize[j]; + cudaMemcpy(&d_xstart[ir * 3 + j], &xstart_val, sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(&d_xend[ir * 3 + j], &xend_val, 
sizeof(float), cudaMemcpyHostToDevice); } } - float *img_fwd; - cudaMalloc(&img_fwd, nlors * sizeof(float)); - joseph3d_fwd(xstart, xend, img, img_origin, voxsize, img_fwd, nlors, img_dim, 0, 64); + float *d_img_fwd; + cudaMalloc(&d_img_fwd, nlors * sizeof(float)); + joseph3d_fwd(d_xstart, d_xend, d_img, h_img_origin, h_voxsize, d_img_fwd, nlors, h_img_dim, 0, 64); - std::vector img_fwd_host(nlors); - cudaMemcpy(img_fwd_host.data(), img_fwd, nlors * sizeof(float), cudaMemcpyDeviceToHost); + std::vector h_img_fwd_host(nlors); + cudaMemcpy(h_img_fwd_host.data(), d_img_fwd, nlors * sizeof(float), cudaMemcpyDeviceToHost); - std::vector expected_fwd_vals = readArrayFromFile("expected_fwd_vals.txt"); + std::vector h_expected_fwd_vals = readArrayFromFile("expected_fwd_vals.txt"); float fwd_diff = 0; float eps = 1e-7; for (int ir = 0; ir < nlors; ir++) { - fwd_diff = std::abs(img_fwd_host[ir] - expected_fwd_vals[ir]); + fwd_diff = std::abs(h_img_fwd_host[ir] - h_expected_fwd_vals[ir]); if (fwd_diff > eps) { std::cerr << "CUDA device array test failed for ray " << ir << "\n"; @@ -209,29 +209,29 @@ void test_cuda_device_arrays() } // Test the back projection - float *bimg; - cudaMalloc(&bimg, img_dim[0] * img_dim[1] * img_dim[2] * sizeof(float)); - cudaMemset(bimg, 0, img_dim[0] * img_dim[1] * img_dim[2] * sizeof(float)); + float *d_bimg; + cudaMalloc(&d_bimg, h_img_dim[0] * h_img_dim[1] * h_img_dim[2] * sizeof(float)); + cudaMemset(d_bimg, 0, h_img_dim[0] * h_img_dim[1] * h_img_dim[2] * sizeof(float)); - float *ones; - cudaMalloc(&ones, nlors * sizeof(float)); - cudaMemset(ones, 0, nlors * sizeof(float)); - std::vector ones_host(nlors, 1.0f); - cudaMemcpy(ones, ones_host.data(), nlors * sizeof(float), cudaMemcpyHostToDevice); + float *d_ones; + cudaMalloc(&d_ones, nlors * sizeof(float)); + cudaMemset(d_ones, 0, nlors * sizeof(float)); + std::vector h_ones_host(nlors, 1.0f); + cudaMemcpy(d_ones, h_ones_host.data(), nlors * sizeof(float), cudaMemcpyHostToDevice); - 
joseph3d_back(xstart, xend, bimg, img_origin, voxsize, ones, nlors, img_dim); + joseph3d_back(d_xstart, d_xend, d_bimg, h_img_origin, h_voxsize, d_ones, nlors, h_img_dim); - std::vector bimg_host(img_dim[0] * img_dim[1] * img_dim[2]); - cudaMemcpy(bimg_host.data(), bimg, bimg_host.size() * sizeof(float), cudaMemcpyDeviceToHost); + std::vector h_bimg_host(h_img_dim[0] * h_img_dim[1] * h_img_dim[2]); + cudaMemcpy(h_bimg_host.data(), d_bimg, h_bimg_host.size() * sizeof(float), cudaMemcpyDeviceToHost); printf("\nCUDA device back projection of ones along all rays:\n"); - for (size_t i0 = 0; i0 < img_dim[0]; i0++) + for (size_t i0 = 0; i0 < h_img_dim[0]; i0++) { - for (size_t i1 = 0; i1 < img_dim[1]; i1++) + for (size_t i1 = 0; i1 < h_img_dim[1]; i1++) { - for (size_t i2 = 0; i2 < img_dim[2]; i2++) + for (size_t i2 = 0; i2 < h_img_dim[2]; i2++) { - printf("%.1f ", bimg_host[img_dim[1] * img_dim[2] * i0 + img_dim[2] * i1 + i2]); + printf("%.1f ", h_bimg_host[h_img_dim[1] * h_img_dim[2] * i0 + h_img_dim[2] * i1 + i2]); } printf("\n"); } @@ -242,14 +242,14 @@ void test_cuda_device_arrays() float inner_product1 = 0.0f; float inner_product2 = 0.0f; - for (size_t i = 0; i < img_from_file.size(); i++) + for (size_t i = 0; i < h_img_from_file.size(); i++) { - inner_product1 += img_from_file[i] * bimg_host[i]; + inner_product1 += h_img_from_file[i] * h_bimg_host[i]; } for (size_t ir = 0; ir < nlors; ir++) { - inner_product2 += img_fwd_host[ir] * ones_host[ir]; + inner_product2 += h_img_fwd_host[ir] * h_ones_host[ir]; } float ip_diff = fabs(inner_product1 - inner_product2); @@ -263,12 +263,12 @@ void test_cuda_device_arrays() std::cout << "CUDA device array back projection test passed.\n"; } - cudaFree(img); - cudaFree(vstart); - cudaFree(vend); - cudaFree(xstart); - cudaFree(xend); - cudaFree(img_fwd); - cudaFree(bimg); - cudaFree(ones); + cudaFree(d_img); + cudaFree(d_vstart); + cudaFree(d_vend); + cudaFree(d_xstart); + cudaFree(d_xend); + cudaFree(d_img_fwd); + 
cudaFree(d_bimg); + cudaFree(d_ones); } From 19e7bcffa707771ebca736e900af7a8286ad711e Mon Sep 17 00:00:00 2001 From: Georg Schramm Date: Sat, 12 Apr 2025 19:53:40 +0200 Subject: [PATCH 29/35] remove _from_file --- tests/test_nontof_fwd_back_cuda.cu | 69 +++++++++++++++--------------- 1 file changed, 34 insertions(+), 35 deletions(-) diff --git a/tests/test_nontof_fwd_back_cuda.cu b/tests/test_nontof_fwd_back_cuda.cu index cc9aabc..c65a5e5 100644 --- a/tests/test_nontof_fwd_back_cuda.cu +++ b/tests/test_nontof_fwd_back_cuda.cu @@ -34,22 +34,22 @@ void test_cuda_managed_arrays() cm_img_origin[i] = (-(float)h_img_dim[i] / 2 + 0.5f) * h_voxsize[i]; } - std::vector h_img_from_file = readArrayFromFile("img.txt"); + std::vector h_img = readArrayFromFile("img.txt"); float *cm_img; - cudaMallocManaged(&cm_img, h_img_from_file.size() * sizeof(float)); - std::copy(h_img_from_file.begin(), h_img_from_file.end(), cm_img); + cudaMallocManaged(&cm_img, h_img.size() * sizeof(float)); + std::copy(h_img.begin(), h_img.end(), cm_img); - std::vector h_vstart_from_file = readArrayFromFile("vstart.txt"); + std::vector h_vstart = readArrayFromFile("vstart.txt"); float *cm_vstart; - cudaMallocManaged(&cm_vstart, h_vstart_from_file.size() * sizeof(float)); - std::copy(h_vstart_from_file.begin(), h_vstart_from_file.end(), cm_vstart); + cudaMallocManaged(&cm_vstart, h_vstart.size() * sizeof(float)); + std::copy(h_vstart.begin(), h_vstart.end(), cm_vstart); - std::vector h_vend_from_file = readArrayFromFile("vend.txt"); + std::vector h_vend = readArrayFromFile("vend.txt"); float *cm_vend; - cudaMallocManaged(&cm_vend, h_vend_from_file.size() * sizeof(float)); - std::copy(h_vend_from_file.begin(), h_vend_from_file.end(), cm_vend); + cudaMallocManaged(&cm_vend, h_vend.size() * sizeof(float)); + std::copy(h_vend.begin(), h_vend.end(), cm_vend); - size_t nlors = h_vstart_from_file.size() / 3; + size_t nlors = h_vstart.size() / 3; float *cm_xstart, *cm_xend; cudaMallocManaged(&cm_xstart, 3 * nlors 
* sizeof(float)); @@ -111,7 +111,7 @@ void test_cuda_managed_arrays() float inner_product1 = 0.0f; float inner_product2 = 0.0f; - for (size_t i = 0; i < h_img_from_file.size(); i++) + for (size_t i = 0; i < h_img.size(); i++) { inner_product1 += cm_img[i] * cm_bimg[i]; } @@ -155,22 +155,22 @@ void test_cuda_device_arrays() h_img_origin[i] = (-(float)h_img_dim[i] / 2 + 0.5f) * h_voxsize[i]; } - std::vector h_img_from_file = readArrayFromFile("img.txt"); + std::vector h_img = readArrayFromFile("img.txt"); float *d_img; - cudaMalloc(&d_img, h_img_from_file.size() * sizeof(float)); - cudaMemcpy(d_img, h_img_from_file.data(), h_img_from_file.size() * sizeof(float), cudaMemcpyHostToDevice); + cudaMalloc(&d_img, h_img.size() * sizeof(float)); + cudaMemcpy(d_img, h_img.data(), h_img.size() * sizeof(float), cudaMemcpyHostToDevice); - std::vector h_vstart_from_file = readArrayFromFile("vstart.txt"); + std::vector h_vstart = readArrayFromFile("vstart.txt"); float *d_vstart; - cudaMalloc(&d_vstart, h_vstart_from_file.size() * sizeof(float)); - cudaMemcpy(d_vstart, h_vstart_from_file.data(), h_vstart_from_file.size() * sizeof(float), cudaMemcpyHostToDevice); + cudaMalloc(&d_vstart, h_vstart.size() * sizeof(float)); + cudaMemcpy(d_vstart, h_vstart.data(), h_vstart.size() * sizeof(float), cudaMemcpyHostToDevice); - std::vector h_vend_from_file = readArrayFromFile("vend.txt"); + std::vector h_vend = readArrayFromFile("vend.txt"); float *d_vend; - cudaMalloc(&d_vend, h_vend_from_file.size() * sizeof(float)); - cudaMemcpy(d_vend, h_vend_from_file.data(), h_vend_from_file.size() * sizeof(float), cudaMemcpyHostToDevice); + cudaMalloc(&d_vend, h_vend.size() * sizeof(float)); + cudaMemcpy(d_vend, h_vend.data(), h_vend.size() * sizeof(float), cudaMemcpyHostToDevice); - size_t nlors = h_vstart_from_file.size() / 3; + size_t nlors = h_vstart.size() / 3; float *d_xstart, *d_xend; cudaMalloc(&d_xstart, 3 * nlors * sizeof(float)); @@ -180,8 +180,8 @@ void test_cuda_device_arrays() { for (int 
j = 0; j < 3; j++) { - float xstart_val = h_img_origin[j] + h_vstart_from_file[ir * 3 + j] * h_voxsize[j]; - float xend_val = h_img_origin[j] + h_vend_from_file[ir * 3 + j] * h_voxsize[j]; + float xstart_val = h_img_origin[j] + h_vstart[ir * 3 + j] * h_voxsize[j]; + float xend_val = h_img_origin[j] + h_vend[ir * 3 + j] * h_voxsize[j]; cudaMemcpy(&d_xstart[ir * 3 + j], &xstart_val, sizeof(float), cudaMemcpyHostToDevice); cudaMemcpy(&d_xend[ir * 3 + j], &xend_val, sizeof(float), cudaMemcpyHostToDevice); } @@ -191,8 +191,8 @@ void test_cuda_device_arrays() cudaMalloc(&d_img_fwd, nlors * sizeof(float)); joseph3d_fwd(d_xstart, d_xend, d_img, h_img_origin, h_voxsize, d_img_fwd, nlors, h_img_dim, 0, 64); - std::vector h_img_fwd_host(nlors); - cudaMemcpy(h_img_fwd_host.data(), d_img_fwd, nlors * sizeof(float), cudaMemcpyDeviceToHost); + std::vector h_img_fwd(nlors); + cudaMemcpy(h_img_fwd.data(), d_img_fwd, nlors * sizeof(float), cudaMemcpyDeviceToHost); std::vector h_expected_fwd_vals = readArrayFromFile("expected_fwd_vals.txt"); float fwd_diff = 0; @@ -200,7 +200,7 @@ void test_cuda_device_arrays() for (int ir = 0; ir < nlors; ir++) { - fwd_diff = std::abs(h_img_fwd_host[ir] - h_expected_fwd_vals[ir]); + fwd_diff = std::abs(h_img_fwd[ir] - h_expected_fwd_vals[ir]); if (fwd_diff > eps) { std::cerr << "CUDA device array test failed for ray " << ir << "\n"; @@ -215,14 +215,13 @@ void test_cuda_device_arrays() float *d_ones; cudaMalloc(&d_ones, nlors * sizeof(float)); - cudaMemset(d_ones, 0, nlors * sizeof(float)); - std::vector h_ones_host(nlors, 1.0f); - cudaMemcpy(d_ones, h_ones_host.data(), nlors * sizeof(float), cudaMemcpyHostToDevice); + std::vector h_ones(nlors, 1.0f); + cudaMemcpy(d_ones, h_ones.data(), nlors * sizeof(float), cudaMemcpyHostToDevice); joseph3d_back(d_xstart, d_xend, d_bimg, h_img_origin, h_voxsize, d_ones, nlors, h_img_dim); - std::vector h_bimg_host(h_img_dim[0] * h_img_dim[1] * h_img_dim[2]); - cudaMemcpy(h_bimg_host.data(), d_bimg, 
h_bimg_host.size() * sizeof(float), cudaMemcpyDeviceToHost); + std::vector h_bimg(h_img_dim[0] * h_img_dim[1] * h_img_dim[2]); + cudaMemcpy(h_bimg.data(), d_bimg, h_bimg.size() * sizeof(float), cudaMemcpyDeviceToHost); printf("\nCUDA device back projection of ones along all rays:\n"); for (size_t i0 = 0; i0 < h_img_dim[0]; i0++) @@ -231,7 +230,7 @@ void test_cuda_device_arrays() { for (size_t i2 = 0; i2 < h_img_dim[2]; i2++) { - printf("%.1f ", h_bimg_host[h_img_dim[1] * h_img_dim[2] * i0 + h_img_dim[2] * i1 + i2]); + printf("%.1f ", h_bimg[h_img_dim[1] * h_img_dim[2] * i0 + h_img_dim[2] * i1 + i2]); } printf("\n"); } @@ -242,14 +241,14 @@ void test_cuda_device_arrays() float inner_product1 = 0.0f; float inner_product2 = 0.0f; - for (size_t i = 0; i < h_img_from_file.size(); i++) + for (size_t i = 0; i < h_img.size(); i++) { - inner_product1 += h_img_from_file[i] * h_bimg_host[i]; + inner_product1 += h_img[i] * h_bimg[i]; } for (size_t ir = 0; ir < nlors; ir++) { - inner_product2 += h_img_fwd_host[ir] * h_ones_host[ir]; + inner_product2 += h_img_fwd[ir] * h_ones[ir]; } float ip_diff = fabs(inner_product1 - inner_product2); From 7d0a41de64bfb2570fb2ccd7dbc548c05fa63deb Mon Sep 17 00:00:00 2001 From: Georg Schramm Date: Sun, 13 Apr 2025 09:53:19 +0200 Subject: [PATCH 30/35] correct return values of tests --- src/joseph3d_back.cu | 7 +- src/joseph3d_fwd.cu | 6 +- tests/test_nontof_fwd_back.cpp | 14 ++-- tests/test_nontof_fwd_back_cuda.cu | 111 ++++++++++++++++++++--------- 4 files changed, 96 insertions(+), 42 deletions(-) diff --git a/src/joseph3d_back.cu b/src/joseph3d_back.cu index 378d387..e20db17 100644 --- a/src/joseph3d_back.cu +++ b/src/joseph3d_back.cu @@ -60,9 +60,12 @@ void joseph3d_back(const float *xstart, handle_cuda_input_array(xend, &d_xend, sizeof(float) * nlors * 3, free_xend, device_id, cudaMemAdviseSetReadMostly); // Handle img (write access) + int h_img_dim[3]; + cudaMemcpy(h_img_dim, img_dim, sizeof(int) * 3, cudaMemcpyDeviceToHost); + size_t 
img_size = sizeof(float) * h_img_dim[0] * h_img_dim[1] * h_img_dim[2]; float *d_img = nullptr; bool free_img = false; - handle_cuda_input_array(img, &d_img, sizeof(float) * img_dim[0] * img_dim[1] * img_dim[2], free_img, device_id, cudaMemAdviseSetAccessedBy); + handle_cuda_input_array(img, &d_img, img_size, free_img, device_id, cudaMemAdviseSetAccessedBy); // Handle img_origin (read mostly) float *d_img_origin = nullptr; @@ -117,7 +120,7 @@ void joseph3d_back(const float *xstart, if (free_img) { // Copy the result back to the host - cudaMemcpy(const_cast(img), d_img, sizeof(float) * img_dim[0] * img_dim[1] * img_dim[2], cudaMemcpyDeviceToHost); + cudaMemcpy(const_cast(img), d_img, img_size, cudaMemcpyDeviceToHost); cudaFree(d_img); } if (free_img_origin) diff --git a/src/joseph3d_fwd.cu b/src/joseph3d_fwd.cu index a5444c4..7585cb1 100644 --- a/src/joseph3d_fwd.cu +++ b/src/joseph3d_fwd.cu @@ -60,9 +60,13 @@ void joseph3d_fwd(const float *xstart, handle_cuda_input_array(xend, &d_xend, sizeof(float) * nlors * 3, free_xend, device_id, cudaMemAdviseSetReadMostly); // Handle img (read mostly) + // Copy img_dim to the host if needed to calculate the size + int h_img_dim[3]; + cudaMemcpy(h_img_dim, img_dim, sizeof(int) * 3, cudaMemcpyDeviceToHost); + size_t img_size = sizeof(float) * h_img_dim[0] * h_img_dim[1] * h_img_dim[2]; float *d_img = nullptr; bool free_img = false; - handle_cuda_input_array(img, &d_img, sizeof(float) * img_dim[0] * img_dim[1] * img_dim[2], free_img, device_id, cudaMemAdviseSetReadMostly); + handle_cuda_input_array(img, &d_img, img_size, free_img, device_id, cudaMemAdviseSetReadMostly); // Handle img_origin (read mostly) float *d_img_origin = nullptr; diff --git a/tests/test_nontof_fwd_back.cpp b/tests/test_nontof_fwd_back.cpp index dad7072..32e0856 100644 --- a/tests/test_nontof_fwd_back.cpp +++ b/tests/test_nontof_fwd_back.cpp @@ -12,11 +12,9 @@ int main() // host array test cases //////////////////////////////////////////////////////// -#ifdef 
__CUDACC__ - std::cout << "CUDA host array test\n"; -#else - std::cout << "OpenMP test\n"; -#endif + bool all_tests_passed = true; + + std::cout << "Host array test\n"; std::vector img_dim = {2, 3, 4}; std::vector voxsize = {4.0f, 3.0f, 2.0f}; @@ -87,6 +85,8 @@ int main() std::cerr << "Actual: " << img_fwd[ir] << "\n"; std::cerr << "Tolerance: " << eps << "\n"; std::cerr << "Ray index: " << ir << "\n"; + + all_tests_passed = false; } } @@ -133,7 +133,9 @@ int main() std::cerr << "Inner product 2: " << inner_product2 << "\n"; std::cerr << "Difference: " << ip_diff << "\n"; std::cerr << "Tolerance: " << eps << "\n"; + + all_tests_passed = false; } - return 0; + return all_tests_passed ? 0 : 1; } diff --git a/tests/test_nontof_fwd_back_cuda.cu b/tests/test_nontof_fwd_back_cuda.cu index c65a5e5..2041cbe 100644 --- a/tests/test_nontof_fwd_back_cuda.cu +++ b/tests/test_nontof_fwd_back_cuda.cu @@ -7,31 +7,69 @@ #include #include -void test_cuda_managed_arrays(); -void test_cuda_device_arrays(); +void test_cuda_managed_arrays(int device_id, int threadsperblock); +void test_cuda_device_arrays(int device_id, int threadsperblock); int main() { - std::cout << "\n--- Testing with CUDA-Managed Arrays ---\n"; - test_cuda_managed_arrays(); + bool all_tests_passed = true; - std::cout << "\n--- Testing with CUDA Device Arrays ---\n"; - test_cuda_device_arrays(); + int device_count; + cudaGetDeviceCount(&device_count); - return 0; + for (int i = 0; i < device_count; i++) + { + cudaDeviceProp prop; + cudaGetDeviceProperties(&prop, i); + std::cout << "Device " << i << ": " << prop.name << "\n"; + + try + { + test_cuda_managed_arrays(i, 64); + } + catch (const std::exception &e) + { + std::cerr << "CUDA-managed array test failed on device " << i << ": " << e.what() << "\n"; + all_tests_passed = false; + } + + std::cout << "\n--- Testing with CUDA Device Arrays ---\n"; + try + { + test_cuda_device_arrays(i, 64); + } + catch (const std::exception &e) + { + std::cerr << "CUDA device array 
test failed on device " << i << ": " << e.what() << "\n"; + all_tests_passed = false; + } + } + + return all_tests_passed ? 0 : 1; } -void test_cuda_managed_arrays() +void test_cuda_managed_arrays(int device_id, int threadsperblock) { + cudaSetDevice(device_id); + // CUDA-managed array test - int h_img_dim[3] = {2, 3, 4}; - float h_voxsize[3] = {4.0f, 3.0f, 2.0f}; + int *cm_img_dim; + cudaMallocManaged(&cm_img_dim, 3 * sizeof(int)); + cm_img_dim[0] = 2; + cm_img_dim[1] = 3; + cm_img_dim[2] = 4; + + float *cm_voxsize; + cudaMallocManaged(&cm_voxsize, 3 * sizeof(float)); + cm_voxsize[0] = 4.0f; + cm_voxsize[1] = 3.0f; + cm_voxsize[2] = 2.0f; float *cm_img_origin; cudaMallocManaged(&cm_img_origin, 3 * sizeof(float)); for (int i = 0; i < 3; ++i) { - cm_img_origin[i] = (-(float)h_img_dim[i] / 2 + 0.5f) * h_voxsize[i]; + cm_img_origin[i] = (-(float)cm_img_dim[i] / 2 + 0.5f) * cm_voxsize[i]; } std::vector h_img = readArrayFromFile("img.txt"); @@ -59,14 +97,14 @@ void test_cuda_managed_arrays() { for (int j = 0; j < 3; j++) { - cm_xstart[ir * 3 + j] = cm_img_origin[j] + cm_vstart[ir * 3 + j] * h_voxsize[j]; - cm_xend[ir * 3 + j] = cm_img_origin[j] + cm_vend[ir * 3 + j] * h_voxsize[j]; + cm_xstart[ir * 3 + j] = cm_img_origin[j] + cm_vstart[ir * 3 + j] * cm_voxsize[j]; + cm_xend[ir * 3 + j] = cm_img_origin[j] + cm_vend[ir * 3 + j] * cm_voxsize[j]; } } float *cm_img_fwd; cudaMallocManaged(&cm_img_fwd, nlors * sizeof(float)); - joseph3d_fwd(cm_xstart, cm_xend, cm_img, cm_img_origin, h_voxsize, cm_img_fwd, nlors, h_img_dim, 0, 64); + joseph3d_fwd(cm_xstart, cm_xend, cm_img, cm_img_origin, cm_voxsize, cm_img_fwd, nlors, cm_img_dim, device_id, threadsperblock); std::vector h_expected_fwd_vals = readArrayFromFile("expected_fwd_vals.txt"); float fwd_diff = 0; @@ -84,23 +122,23 @@ void test_cuda_managed_arrays() // Test the back projection float *cm_bimg; - cudaMallocManaged(&cm_bimg, h_img_dim[0] * h_img_dim[1] * h_img_dim[2] * sizeof(float)); - std::fill(cm_bimg, cm_bimg + 
(h_img_dim[0] * h_img_dim[1] * h_img_dim[2]), 0.0f); + cudaMallocManaged(&cm_bimg, cm_img_dim[0] * cm_img_dim[1] * cm_img_dim[2] * sizeof(float)); + std::fill(cm_bimg, cm_bimg + (cm_img_dim[0] * cm_img_dim[1] * cm_img_dim[2]), 0.0f); float *cm_ones; cudaMallocManaged(&cm_ones, nlors * sizeof(float)); std::fill(cm_ones, cm_ones + nlors, 1.0f); - joseph3d_back(cm_xstart, cm_xend, cm_bimg, cm_img_origin, h_voxsize, cm_ones, nlors, h_img_dim); + joseph3d_back(cm_xstart, cm_xend, cm_bimg, cm_img_origin, cm_voxsize, cm_ones, nlors, cm_img_dim, device_id, threadsperblock); printf("\nCUDA-managed back projection of ones along all rays:\n"); - for (size_t i0 = 0; i0 < h_img_dim[0]; i0++) + for (size_t i0 = 0; i0 < cm_img_dim[0]; i0++) { - for (size_t i1 = 0; i1 < h_img_dim[1]; i1++) + for (size_t i1 = 0; i1 < cm_img_dim[1]; i1++) { - for (size_t i2 = 0; i2 < h_img_dim[2]; i2++) + for (size_t i2 = 0; i2 < cm_img_dim[2]; i2++) { - printf("%.1f ", cm_bimg[h_img_dim[1] * h_img_dim[2] * i0 + h_img_dim[2] * i1 + i2]); + printf("%.1f ", cm_bimg[cm_img_dim[1] * cm_img_dim[2] * i0 + cm_img_dim[2] * i1 + i2]); } printf("\n"); } @@ -127,11 +165,9 @@ void test_cuda_managed_arrays() { std::cerr << "CUDA-managed array back projection test failed: adjointness property violated.\n"; } - else - { - std::cout << "CUDA-managed array back projection test passed.\n"; - } + cudaFree(cm_img_dim); + cudaFree(cm_voxsize); cudaFree(cm_img_origin); cudaFree(cm_img); cudaFree(cm_vstart); @@ -143,17 +179,29 @@ void test_cuda_managed_arrays() cudaFree(cm_ones); } -void test_cuda_device_arrays() +void test_cuda_device_arrays(int device_id, int threadsperblock) { + cudaSetDevice(device_id); + // CUDA device array test + int *d_img_dim; + cudaMalloc(&d_img_dim, 3 * sizeof(int)); int h_img_dim[3] = {2, 3, 4}; + cudaMemcpy(d_img_dim, h_img_dim, 3 * sizeof(int), cudaMemcpyHostToDevice); + + float *d_voxsize; + cudaMalloc(&d_voxsize, 3 * sizeof(float)); float h_voxsize[3] = {4.0f, 3.0f, 2.0f}; + 
cudaMemcpy(d_voxsize, h_voxsize, 3 * sizeof(float), cudaMemcpyHostToDevice); float h_img_origin[3]; for (int i = 0; i < 3; ++i) { h_img_origin[i] = (-(float)h_img_dim[i] / 2 + 0.5f) * h_voxsize[i]; } + float *d_img_origin; + cudaMalloc(&d_img_origin, 3 * sizeof(float)); + cudaMemcpy(d_img_origin, h_img_origin, 3 * sizeof(float), cudaMemcpyHostToDevice); std::vector h_img = readArrayFromFile("img.txt"); float *d_img; @@ -186,10 +234,9 @@ void test_cuda_device_arrays() cudaMemcpy(&d_xend[ir * 3 + j], &xend_val, sizeof(float), cudaMemcpyHostToDevice); } } - float *d_img_fwd; cudaMalloc(&d_img_fwd, nlors * sizeof(float)); - joseph3d_fwd(d_xstart, d_xend, d_img, h_img_origin, h_voxsize, d_img_fwd, nlors, h_img_dim, 0, 64); + joseph3d_fwd(d_xstart, d_xend, d_img, d_img_origin, d_voxsize, d_img_fwd, nlors, d_img_dim, device_id, threadsperblock); std::vector h_img_fwd(nlors); cudaMemcpy(h_img_fwd.data(), d_img_fwd, nlors * sizeof(float), cudaMemcpyDeviceToHost); @@ -218,7 +265,7 @@ void test_cuda_device_arrays() std::vector h_ones(nlors, 1.0f); cudaMemcpy(d_ones, h_ones.data(), nlors * sizeof(float), cudaMemcpyHostToDevice); - joseph3d_back(d_xstart, d_xend, d_bimg, h_img_origin, h_voxsize, d_ones, nlors, h_img_dim); + joseph3d_back(d_xstart, d_xend, d_bimg, d_img_origin, d_voxsize, d_ones, nlors, d_img_dim, device_id, threadsperblock); std::vector h_bimg(h_img_dim[0] * h_img_dim[1] * h_img_dim[2]); cudaMemcpy(h_bimg.data(), d_bimg, h_bimg.size() * sizeof(float), cudaMemcpyDeviceToHost); @@ -257,11 +304,9 @@ void test_cuda_device_arrays() { std::cerr << "CUDA device array back projection test failed: adjointness property violated.\n"; } - else - { - std::cout << "CUDA device array back projection test passed.\n"; - } + cudaFree(d_img_dim); + cudaFree(d_voxsize); cudaFree(d_img); cudaFree(d_vstart); cudaFree(d_vend); From 502a4a60bdd2ba4e0514aed5eaf426a280ad01ff Mon Sep 17 00:00:00 2001 From: Georg Schramm Date: Mon, 14 Apr 2025 10:26:41 +0200 Subject: [PATCH 31/35] add 
size_t nvoxels to facilitate img transfer --- include/parallelproj.h | 48 ++++++++++++++++-------------- src/joseph3d_back.cpp | 1 + src/joseph3d_back.cu | 8 ++--- src/joseph3d_fwd.cpp | 1 + src/joseph3d_fwd.cu | 9 ++---- tests/test_nontof_fwd_back.cpp | 6 ++-- tests/test_nontof_fwd_back_cuda.cu | 13 +++++--- 7 files changed, 47 insertions(+), 39 deletions(-) diff --git a/include/parallelproj.h b/include/parallelproj.h index e554942..2ec6a5f 100644 --- a/include/parallelproj.h +++ b/include/parallelproj.h @@ -2,38 +2,42 @@ #include #ifdef __cplusplus -extern "C" { +extern "C" +{ #endif -void joseph3d_fwd(const float *xstart, - const float *xend, - const float *img, - const float *img_origin, - const float *voxsize, - float *p, - size_t nlors, - const int *img_dim, - int device_id = 0, - int threadsperblock = 64); + void joseph3d_fwd(const float *xstart, + const float *xend, + const float *img, + const float *img_origin, + const float *voxsize, + float *p, + size_t nvoxels, + size_t nlors, + const int *img_dim, + int device_id = 0, + int threadsperblock = 64); #ifdef __cplusplus } #endif #ifdef __cplusplus -extern "C" { +extern "C" +{ #endif -void joseph3d_back(const float *xstart, - const float *xend, - float *img, - const float *img_origin, - const float *voxsize, - const float *p, - size_t nlors, - const int *img_dim, - int device_id = 0, - int threadsperblock = 64); + void joseph3d_back(const float *xstart, + const float *xend, + float *img, + const float *img_origin, + const float *voxsize, + const float *p, + size_t nvoxels, + size_t nlors, + const int *img_dim, + int device_id = 0, + int threadsperblock = 64); #ifdef __cplusplus } diff --git a/src/joseph3d_back.cpp b/src/joseph3d_back.cpp index f63d8da..bf5d037 100644 --- a/src/joseph3d_back.cpp +++ b/src/joseph3d_back.cpp @@ -8,6 +8,7 @@ void joseph3d_back(const float *xstart, const float *img_origin, const float *voxsize, const float *p, + size_t nvoxels, size_t nlors, const int *img_dim, int device_id, 
diff --git a/src/joseph3d_back.cu b/src/joseph3d_back.cu index e20db17..e8bae57 100644 --- a/src/joseph3d_back.cu +++ b/src/joseph3d_back.cu @@ -32,6 +32,7 @@ void joseph3d_back(const float *xstart, const float *img_origin, const float *voxsize, const float *p, + size_t nvoxels, size_t nlors, const int *img_dim, int device_id, @@ -60,12 +61,9 @@ void joseph3d_back(const float *xstart, handle_cuda_input_array(xend, &d_xend, sizeof(float) * nlors * 3, free_xend, device_id, cudaMemAdviseSetReadMostly); // Handle img (write access) - int h_img_dim[3]; - cudaMemcpy(h_img_dim, img_dim, sizeof(int) * 3, cudaMemcpyDeviceToHost); - size_t img_size = sizeof(float) * h_img_dim[0] * h_img_dim[1] * h_img_dim[2]; float *d_img = nullptr; bool free_img = false; - handle_cuda_input_array(img, &d_img, img_size, free_img, device_id, cudaMemAdviseSetAccessedBy); + handle_cuda_input_array(img, &d_img, sizeof(float) * nvoxels, free_img, device_id, cudaMemAdviseSetAccessedBy); // Handle img_origin (read mostly) float *d_img_origin = nullptr; @@ -120,7 +118,7 @@ void joseph3d_back(const float *xstart, if (free_img) { // Copy the result back to the host - cudaMemcpy(const_cast(img), d_img, img_size, cudaMemcpyDeviceToHost); + cudaMemcpy(img, d_img, sizeof(float) * nvoxels, cudaMemcpyDeviceToHost); cudaFree(d_img); } if (free_img_origin) diff --git a/src/joseph3d_fwd.cpp b/src/joseph3d_fwd.cpp index 553cd8a..e7bbd04 100644 --- a/src/joseph3d_fwd.cpp +++ b/src/joseph3d_fwd.cpp @@ -8,6 +8,7 @@ void joseph3d_fwd(const float *xstart, const float *img_origin, const float *voxsize, float *p, + size_t nvoxels, size_t nlors, const int *img_dim, int device_id, diff --git a/src/joseph3d_fwd.cu b/src/joseph3d_fwd.cu index 7585cb1..7305c5e 100644 --- a/src/joseph3d_fwd.cu +++ b/src/joseph3d_fwd.cu @@ -32,6 +32,7 @@ void joseph3d_fwd(const float *xstart, const float *img_origin, const float *voxsize, float *p, + size_t nvoxels, size_t nlors, const int *img_dim, int device_id, @@ -60,13 +61,9 @@ void 
joseph3d_fwd(const float *xstart, handle_cuda_input_array(xend, &d_xend, sizeof(float) * nlors * 3, free_xend, device_id, cudaMemAdviseSetReadMostly); // Handle img (read mostly) - // Copy img_dim to the host if needed to calculate the size - int h_img_dim[3]; - cudaMemcpy(h_img_dim, img_dim, sizeof(int) * 3, cudaMemcpyDeviceToHost); - size_t img_size = sizeof(float) * h_img_dim[0] * h_img_dim[1] * h_img_dim[2]; float *d_img = nullptr; bool free_img = false; - handle_cuda_input_array(img, &d_img, img_size, free_img, device_id, cudaMemAdviseSetReadMostly); + handle_cuda_input_array(img, &d_img, sizeof(float) * nvoxels, free_img, device_id, cudaMemAdviseSetReadMostly); // Handle img_origin (read mostly) float *d_img_origin = nullptr; @@ -126,7 +123,7 @@ void joseph3d_fwd(const float *xstart, cudaFree(d_voxsize); if (free_p) { - cudaMemcpy(const_cast(p), d_p, sizeof(float) * nlors, cudaMemcpyDeviceToHost); + cudaMemcpy(p, d_p, sizeof(float) * nlors, cudaMemcpyDeviceToHost); cudaFree(d_p); } if (free_img_dim) diff --git a/tests/test_nontof_fwd_back.cpp b/tests/test_nontof_fwd_back.cpp index 32e0856..781184a 100644 --- a/tests/test_nontof_fwd_back.cpp +++ b/tests/test_nontof_fwd_back.cpp @@ -54,10 +54,12 @@ int main() // Allocate memory for forward projection results std::vector img_fwd(nlors); + size_t nvoxels = img_dim[0] * img_dim[1] * img_dim[2]; + // Perform forward projection joseph3d_fwd( xstart.data(), xend.data(), img.data(), - img_origin.data(), voxsize.data(), img_fwd.data(), + img_origin.data(), voxsize.data(), img_fwd.data(), nvoxels, nlors, img_dim.data(), 0, 64); ///////////////////////////////////////////////////////////////////////////// @@ -100,7 +102,7 @@ int main() joseph3d_back( xstart.data(), xend.data(), bimg.data(), - img_origin.data(), voxsize.data(), ones.data(), + img_origin.data(), voxsize.data(), ones.data(), nvoxels, nlors, img_dim.data()); printf("\nback projection of ones along all rays:\n"); diff --git 
a/tests/test_nontof_fwd_back_cuda.cu b/tests/test_nontof_fwd_back_cuda.cu index 2041cbe..2371f6e 100644 --- a/tests/test_nontof_fwd_back_cuda.cu +++ b/tests/test_nontof_fwd_back_cuda.cu @@ -25,6 +25,7 @@ int main() try { + std::cout << "\n--- Testing with CUDA Managed Arrays ---\n"; test_cuda_managed_arrays(i, 64); } catch (const std::exception &e) @@ -36,6 +37,7 @@ int main() std::cout << "\n--- Testing with CUDA Device Arrays ---\n"; try { + std::cout << "\n--- Testing with CUDA Device Arrays ---\n"; test_cuda_device_arrays(i, 64); } catch (const std::exception &e) @@ -72,6 +74,8 @@ void test_cuda_managed_arrays(int device_id, int threadsperblock) cm_img_origin[i] = (-(float)cm_img_dim[i] / 2 + 0.5f) * cm_voxsize[i]; } + size_t nvoxels = cm_img_dim[0] * cm_img_dim[1] * cm_img_dim[2]; + std::vector h_img = readArrayFromFile("img.txt"); float *cm_img; cudaMallocManaged(&cm_img, h_img.size() * sizeof(float)); @@ -104,7 +108,7 @@ void test_cuda_managed_arrays(int device_id, int threadsperblock) float *cm_img_fwd; cudaMallocManaged(&cm_img_fwd, nlors * sizeof(float)); - joseph3d_fwd(cm_xstart, cm_xend, cm_img, cm_img_origin, cm_voxsize, cm_img_fwd, nlors, cm_img_dim, device_id, threadsperblock); + joseph3d_fwd(cm_xstart, cm_xend, cm_img, cm_img_origin, cm_voxsize, cm_img_fwd, nvoxels, nlors, cm_img_dim, device_id, threadsperblock); std::vector h_expected_fwd_vals = readArrayFromFile("expected_fwd_vals.txt"); float fwd_diff = 0; @@ -129,7 +133,7 @@ void test_cuda_managed_arrays(int device_id, int threadsperblock) cudaMallocManaged(&cm_ones, nlors * sizeof(float)); std::fill(cm_ones, cm_ones + nlors, 1.0f); - joseph3d_back(cm_xstart, cm_xend, cm_bimg, cm_img_origin, cm_voxsize, cm_ones, nlors, cm_img_dim, device_id, threadsperblock); + joseph3d_back(cm_xstart, cm_xend, cm_bimg, cm_img_origin, cm_voxsize, cm_ones, nvoxels, nlors, cm_img_dim, device_id, threadsperblock); printf("\nCUDA-managed back projection of ones along all rays:\n"); for (size_t i0 = 0; i0 < 
cm_img_dim[0]; i0++) @@ -187,6 +191,7 @@ void test_cuda_device_arrays(int device_id, int threadsperblock) int *d_img_dim; cudaMalloc(&d_img_dim, 3 * sizeof(int)); int h_img_dim[3] = {2, 3, 4}; + size_t nvoxels = h_img_dim[0] * h_img_dim[1] * h_img_dim[2]; cudaMemcpy(d_img_dim, h_img_dim, 3 * sizeof(int), cudaMemcpyHostToDevice); float *d_voxsize; @@ -236,7 +241,7 @@ void test_cuda_device_arrays(int device_id, int threadsperblock) } float *d_img_fwd; cudaMalloc(&d_img_fwd, nlors * sizeof(float)); - joseph3d_fwd(d_xstart, d_xend, d_img, d_img_origin, d_voxsize, d_img_fwd, nlors, d_img_dim, device_id, threadsperblock); + joseph3d_fwd(d_xstart, d_xend, d_img, d_img_origin, d_voxsize, d_img_fwd, nvoxels, nlors, d_img_dim, device_id, threadsperblock); std::vector h_img_fwd(nlors); cudaMemcpy(h_img_fwd.data(), d_img_fwd, nlors * sizeof(float), cudaMemcpyDeviceToHost); @@ -265,7 +270,7 @@ void test_cuda_device_arrays(int device_id, int threadsperblock) std::vector h_ones(nlors, 1.0f); cudaMemcpy(d_ones, h_ones.data(), nlors * sizeof(float), cudaMemcpyHostToDevice); - joseph3d_back(d_xstart, d_xend, d_bimg, d_img_origin, d_voxsize, d_ones, nlors, d_img_dim, device_id, threadsperblock); + joseph3d_back(d_xstart, d_xend, d_bimg, d_img_origin, d_voxsize, d_ones, nvoxels, nlors, d_img_dim, device_id, threadsperblock); std::vector h_bimg(h_img_dim[0] * h_img_dim[1] * h_img_dim[2]); cudaMemcpy(h_bimg.data(), d_bimg, h_bimg.size() * sizeof(float), cudaMemcpyDeviceToHost); From aa8a8859dfc1e0c359d0330eed228520301477b8 Mon Sep 17 00:00:00 2001 From: Georg Schramm Date: Mon, 14 Apr 2025 11:31:33 +0200 Subject: [PATCH 32/35] add more cases for handle_cuda_input_array --- include/cuda_utils.h | 7 ++++- src/cuda_utils.cu | 62 +++++++++++++++++++++++++++++++++++++++----- 2 files changed, 62 insertions(+), 7 deletions(-) diff --git a/include/cuda_utils.h b/include/cuda_utils.h index 2accedb..7a51609 100644 --- a/include/cuda_utils.h +++ b/include/cuda_utils.h @@ -2,5 +2,10 @@ #include +// 
Overload for constant input_ptr (const T*) template -void handle_cuda_input_array(const T *host_ptr, T **device_ptr, size_t size, bool &free_flag, int device_id, cudaMemoryAdvise memory_hint); +void handle_cuda_input_array(const T *input_ptr, T **device_ptr, size_t size, bool &free_flag, int device_id, cudaMemoryAdvise memory_hint); + +// Overload for non-constant input_ptr (T*) +template +void handle_cuda_input_array(T *input_ptr, T **device_ptr, size_t size, bool &free_flag, int device_id, cudaMemoryAdvise memory_hint); diff --git a/src/cuda_utils.cu b/src/cuda_utils.cu index 4be521a..becc197 100644 --- a/src/cuda_utils.cu +++ b/src/cuda_utils.cu @@ -1,31 +1,81 @@ #include "cuda_utils.h" +#include +// Overload for constant input_ptr (const T*) template -void handle_cuda_input_array(const T *host_ptr, T **device_ptr, size_t size, bool &free_flag, int device_id, cudaMemoryAdvise memory_hint) +void handle_cuda_input_array(const T *input_ptr, T **device_ptr, size_t size, bool &free_flag, int device_id, cudaMemoryAdvise memory_hint) { cudaPointerAttributes attr; - cudaError_t err = cudaPointerGetAttributes(&attr, host_ptr); + cudaError_t err = cudaPointerGetAttributes(&attr, input_ptr); free_flag = false; if (err == cudaSuccess && attr.type == cudaMemoryTypeManaged) { - cudaMemPrefetchAsync(const_cast(static_cast(host_ptr)), size, device_id); - cudaMemAdvise(const_cast(static_cast(host_ptr)), size, memory_hint, device_id); + // Prefetch and advise for managed memory + cudaMemPrefetchAsync(const_cast(static_cast(input_ptr)), size, device_id); + cudaMemAdvise(const_cast(static_cast(input_ptr)), size, memory_hint, device_id); } if (err == cudaSuccess && (attr.type == cudaMemoryTypeManaged || attr.type == cudaMemoryTypeDevice)) { - *device_ptr = const_cast(host_ptr); + // Assign managed or device pointer + *device_ptr = const_cast(input_ptr); } else { // Host pointer case, transfer to device cudaMalloc(device_ptr, size); - cudaMemcpy(*device_ptr, host_ptr, size, 
cudaMemcpyHostToDevice); + cudaMemcpy(*device_ptr, input_ptr, size, cudaMemcpyHostToDevice); + free_flag = true; + } +} + +// Overload for non-constant input_ptr (T*) (not const) +template +void handle_cuda_input_array(T *input_ptr, T **device_ptr, size_t size, bool &free_flag, int device_id, cudaMemoryAdvise memory_hint) +{ + cudaPointerAttributes attr; + cudaError_t err = cudaPointerGetAttributes(&attr, input_ptr); + free_flag = false; + + if (err == cudaSuccess && attr.type == cudaMemoryTypeManaged) + { + // Prefetch and advise for managed memory + cudaMemPrefetchAsync(input_ptr, size, device_id); + cudaMemAdvise(input_ptr, size, memory_hint, device_id); + } + + if (err == cudaSuccess && (attr.type == cudaMemoryTypeManaged || attr.type == cudaMemoryTypeDevice)) + { + // Assign managed or device pointer + *device_ptr = input_ptr; + } + else + { + // Host pointer case, transfer to device + cudaMalloc(device_ptr, size); + cudaMemcpy(*device_ptr, input_ptr, size, cudaMemcpyHostToDevice); free_flag = true; } } // Explicit template instantiations +template void handle_cuda_input_array(const double *, double **, size_t, bool &, int, cudaMemoryAdvise); +template void handle_cuda_input_array(double *, double **, size_t, bool &, int, cudaMemoryAdvise); template void handle_cuda_input_array(const float *, float **, size_t, bool &, int, cudaMemoryAdvise); +template void handle_cuda_input_array(float *, float **, size_t, bool &, int, cudaMemoryAdvise); + template void handle_cuda_input_array(const int *, int **, size_t, bool &, int, cudaMemoryAdvise); +template void handle_cuda_input_array(int *, int **, size_t, bool &, int, cudaMemoryAdvise); +template void handle_cuda_input_array(const unsigned int *, unsigned int **, size_t, bool &, int, cudaMemoryAdvise); +template void handle_cuda_input_array(unsigned int *, unsigned int **, size_t, bool &, int, cudaMemoryAdvise); +template void handle_cuda_input_array(const size_t *, size_t **, size_t, bool &, int, cudaMemoryAdvise); 
+template void handle_cuda_input_array(size_t *, size_t **, size_t, bool &, int, cudaMemoryAdvise); + +template void handle_cuda_input_array(const char *, char **, size_t, bool &, int, cudaMemoryAdvise); +template void handle_cuda_input_array(char *, char **, size_t, bool &, int, cudaMemoryAdvise); +template void handle_cuda_input_array(const unsigned char *, unsigned char **, size_t, bool &, int, cudaMemoryAdvise); +template void handle_cuda_input_array(unsigned char *, unsigned char **, size_t, bool &, int, cudaMemoryAdvise); + +template void handle_cuda_input_array(const bool *, bool **, size_t, bool &, int, cudaMemoryAdvise); +template void handle_cuda_input_array(bool *, bool **, size_t, bool &, int, cudaMemoryAdvise); From 0fd48deec5d85e15a5e2f919031a1e1508703df4 Mon Sep 17 00:00:00 2001 From: Georg Schramm Date: Mon, 14 Apr 2025 17:43:24 +0200 Subject: [PATCH 33/35] wip python interface using pybind11 --- CMakeLists.txt | 42 ++++++++ src/parallelproj_pybind.cpp | 190 ++++++++++++++++++++++++++++++++++++ tests/test.py | 53 ++++++++++ 3 files changed, 285 insertions(+) create mode 100644 src/parallelproj_pybind.cpp create mode 100644 tests/test.py diff --git a/CMakeLists.txt b/CMakeLists.txt index 93b7347..adeaf04 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,6 +1,14 @@ cmake_minimum_required(VERSION 3.18) project(PARALLELPROJ LANGUAGES C CXX) +# Set the C++ standard +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +# Add an option to build the Python interface +option(BUILD_PYTHON_INTERFACE "Build the Python interface using pybind11" OFF) + + include(CheckLanguage) include(GNUInstallDirs) include(CMakePackageConfigHelpers) @@ -88,6 +96,40 @@ else() ) endif() +################################################################################ +### BUILD THE PYTHON INTERFACE ################################################# +################################################################################ + +# Build the Python interface 
if enabled +if(BUILD_PYTHON_INTERFACE) + # Enable the use of FindPython in pybind11 + set(PYBIND11_FINDPYTHON ON) + # Find pybind11 + find_package(pybind11 REQUIRED) + + find_path(DLPACK_INCLUDE_DIR + NAMES dlpack/dlpack.h + DOC "Path to the DLPack include directory") + + # Check if DLPack was found + if (NOT DLPACK_INCLUDE_DIR) + message(FATAL_ERROR "DLPack not found. Please install DLPack via conda or provide the path to dlpack.h.") + else() + message(STATUS "DLPack found at: ${DLPACK_INCLUDE_DIR}") + endif() + + + # Add the Python module + pybind11_add_module(parallelproj_pybind + src/parallelproj_pybind.cpp + ) + + target_include_directories(parallelproj_pybind PRIVATE ${CMAKE_SOURCE_DIR}/include) + # Link the core library to the Python module + target_link_libraries(parallelproj_pybind PRIVATE parallelproj) +endif() + + ################################################################################ ### INSTALL AND CONFIGS ######################################################## ################################################################################ diff --git a/src/parallelproj_pybind.cpp b/src/parallelproj_pybind.cpp new file mode 100644 index 0000000..f505566 --- /dev/null +++ b/src/parallelproj_pybind.cpp @@ -0,0 +1,190 @@ +#include +#include +#include +#include +#include "parallelproj.h" + +namespace py = pybind11; + +// Helper function to extract raw pointer and shape +template +std::pair> extract_pointer_and_shape(py::object array) +{ + T *raw_ptr = nullptr; + std::vector shape; + + // Handle arrays using the __dlpack__ protocol (default) + if (py::hasattr(array, "__dlpack__")) + { + // Call the __dlpack__ method to get a DLPack tensor + py::capsule dlpack_capsule = array.attr("__dlpack__")(); + + // Extract the DLManagedTensor from the capsule + auto *managed_tensor = static_cast( + PyCapsule_GetPointer(dlpack_capsule.ptr(), "dltensor")); + + if (!managed_tensor) + { + throw std::runtime_error("Failed to extract DLManagedTensor from 
PyCapsule."); + } + + // Access the DLTensor from the DLManagedTensor + DLTensor dltensor = managed_tensor->dl_tensor; + + // Ensure the data type matches + if (dltensor.dtype.code != kDLFloat || dltensor.dtype.bits != sizeof(T) * 8) + { + throw std::invalid_argument("DLPack tensor has an incompatible data type."); + } + + // Get the raw pointer and shape + raw_ptr = reinterpret_cast(dltensor.data); + shape = std::vector(dltensor.shape, dltensor.shape + dltensor.ndim); + } + // Handle NumPy arrays + else if (py::isinstance>(array)) + { + auto numpy_array = array.cast>(); + raw_ptr = numpy_array.mutable_data(); + shape = std::vector(numpy_array.shape(), numpy_array.shape() + numpy_array.ndim()); + } + // Handle arrays using the __cuda_array_interface__ (e.g. cupy or pytorch gpu tensors) + else if (py::hasattr(array, "__cuda_array_interface__")) + { + auto cuda_interface = array.attr("__cuda_array_interface__"); + raw_ptr = reinterpret_cast(cuda_interface["data"].cast>().first); + shape = cuda_interface["shape"].cast>(); + } + // Handle arrays using the __array_interface__ (Python Array API or array_api_strict) + else + { + throw std::invalid_argument("Unsupported array type. 
Must have __dlpack__, __cuda_array_interface__ or be numpy."); + } + + return {raw_ptr, shape}; +} + +// Wrapper for joseph3d_fwd +void joseph3d_fwd_py(py::object xstart, + py::object xend, + py::object img, + py::object img_origin, + py::object voxsize, + py::object p, + int device_id = 0, + int threadsperblock = 64) +{ + // Extract raw pointers and shapes + auto [xstart_ptr, xstart_shape] = extract_pointer_and_shape(xstart); + auto [xend_ptr, xend_shape] = extract_pointer_and_shape(xend); + auto [img_ptr, img_shape] = extract_pointer_and_shape(img); + auto [img_origin_ptr, img_origin_shape] = extract_pointer_and_shape(img_origin); + auto [voxsize_ptr, voxsize_shape] = extract_pointer_and_shape(voxsize); + auto [p_ptr, p_shape] = extract_pointer_and_shape(p); + + // Validate shapes + if (xstart_shape.size() < 2 || xstart_shape[1] != 3) + { + throw std::invalid_argument("xstart must have at least 2 dims and shape (..., 3)"); + } + if (xend_shape.size() < 2 || xend_shape[1] != 3) + { + throw std::invalid_argument("xend must have at least 2 dims and shape (..., 3)"); + } + if (img_shape.size() != 3) + { + throw std::invalid_argument("img must be a 3D array"); + } + // Validate that p.shape == xstart.shape[:-1] + if (p_shape.size() != xstart_shape.size() - 1 || + !std::equal(p_shape.begin(), p_shape.end(), xstart_shape.begin())) + { + throw std::invalid_argument("p must have a shape equal to xstart.shape[:-1]"); + } + if (img_origin_shape.size() != 1 || img_origin_shape[0] != 3) + { + throw std::invalid_argument("img_origin must be a 1D array with 3 elements"); + } + if (voxsize_shape.size() != 1 || voxsize_shape[0] != 3) + { + throw std::invalid_argument("voxsize must be a 1D array with 3 elements"); + } + + // Calculate nlors using xstart_shape (multiply shape except the last dimension) + size_t nlors = std::accumulate(xstart_shape.begin(), xstart_shape.end() - 1, 1, std::multiplies()); + int img_dim[3] = {static_cast(img_shape[0]), static_cast(img_shape[1]), 
static_cast(img_shape[2])}; + size_t nvoxels = img_dim[0] * img_dim[1] * img_dim[2]; + + // Call the C++ function + joseph3d_fwd(xstart_ptr, xend_ptr, img_ptr, img_origin_ptr, voxsize_ptr, p_ptr, nvoxels, nlors, img_dim, device_id, threadsperblock); +} + +// Wrapper for joseph3d_back +void joseph3d_back_py(py::object xstart, + py::object xend, + py::object img, + py::object img_origin, + py::object voxsize, + py::object p, + int device_id = 0, + int threadsperblock = 64) +{ + // Extract raw pointers and shapes + auto [xstart_ptr, xstart_shape] = extract_pointer_and_shape(xstart); + auto [xend_ptr, xend_shape] = extract_pointer_and_shape(xend); + auto [img_ptr, img_shape] = extract_pointer_and_shape(img); + auto [img_origin_ptr, img_origin_shape] = extract_pointer_and_shape(img_origin); + auto [voxsize_ptr, voxsize_shape] = extract_pointer_and_shape(voxsize); + auto [p_ptr, p_shape] = extract_pointer_and_shape(p); + + // Validate shapes + if (xstart_shape.size() < 2 || xstart_shape[1] != 3) + { + throw std::invalid_argument("xstart must have at least 2 dims and shape (..., 3)"); + } + if (xend_shape.size() < 2 || xend_shape[1] != 3) + { + throw std::invalid_argument("xend must have at least 2 dims and shape (..., 3)"); + } + if (img_shape.size() != 3) + { + throw std::invalid_argument("img must be a 3D array"); + } + // Validate that p.shape == xstart.shape[:-1] + if (p_shape.size() != xstart_shape.size() - 1 || + !std::equal(p_shape.begin(), p_shape.end(), xstart_shape.begin())) + { + throw std::invalid_argument("p must have a shape equal to xstart.shape[:-1]"); + } + if (img_origin_shape.size() != 1 || img_origin_shape[0] != 3) + { + throw std::invalid_argument("img_origin must be a 1D array with 3 elements"); + } + if (voxsize_shape.size() != 1 || voxsize_shape[0] != 3) + { + throw std::invalid_argument("voxsize must be a 1D array with 3 elements"); + } + + // Calculate nlors using xstart_shape (multiply shape except the last dimension) + size_t nlors = 
std::accumulate(xstart_shape.begin(), xstart_shape.end() - 1, 1, std::multiplies()); + + int img_dim[3] = {static_cast(img_shape[0]), static_cast(img_shape[1]), static_cast(img_shape[2])}; + size_t nvoxels = img_dim[0] * img_dim[1] * img_dim[2]; + + // Call the C++ function + joseph3d_back(xstart_ptr, xend_ptr, img_ptr, img_origin_ptr, voxsize_ptr, p_ptr, nvoxels, nlors, img_dim, device_id, threadsperblock); +} + +// Pybind11 module definition +PYBIND11_MODULE(parallelproj_pybind, m) +{ + m.doc() = "Python bindings for parallelproj"; + + m.def("joseph3d_fwd", &joseph3d_fwd_py, "Forward projection", + py::arg("xstart"), py::arg("xend"), py::arg("img"), py::arg("img_origin"), + py::arg("voxsize"), py::arg("p"), py::arg("device_id") = 0, py::arg("threadsperblock") = 64); + + m.def("joseph3d_back", &joseph3d_back_py, "Back projection", + py::arg("xstart"), py::arg("xend"), py::arg("img"), py::arg("img_origin"), + py::arg("voxsize"), py::arg("p"), py::arg("device_id") = 0, py::arg("threadsperblock") = 64); +} diff --git a/tests/test.py b/tests/test.py new file mode 100644 index 0000000..b5585e8 --- /dev/null +++ b/tests/test.py @@ -0,0 +1,53 @@ +import numpy as np +import array_api_strict as xp +import parallelproj_pybind as pp + +img_dim = (2, 3, 4) + +# img_dim = xp.asarray([2, 3, 4], dtype=xp.int32) +voxsize = xp.asarray([4.0, 3.0, 2.0], dtype=xp.float32) + +img_origin = (-0.5 * xp.asarray(img_dim, dtype=xp.float32) + 0.5) * voxsize + +# Read the image from file +img = xp.reshape(xp.asarray(np.loadtxt("img.txt", dtype=np.float32)), img_dim) + +# Read the ray start and coordinates from file +vstart = xp.reshape(xp.asarray(np.loadtxt("vstart.txt", dtype=np.float32)), (2, 5, 3)) +vend = xp.reshape(xp.asarray(np.loadtxt("vend.txt", dtype=np.float32)), (2, 5, 3)) + +# Calculate the start and end coordinates in world coordinates +xstart = vstart * voxsize + img_origin +xend = vend * voxsize + img_origin + +# Allocate memory for forward projection results +img_fwd = 
xp.zeros(xstart.shape[0], dtype=xp.float32) + +# Perform forward projection +pp.joseph3d_fwd(xstart, xend, img, img_origin, voxsize, img_fwd) + +# Read the expected forward values from file +expected_fwd_vals = xp.reshape( + xp.asarray(np.loadtxt("expected_fwd_vals.txt", dtype=np.float32)), img_fwd.shape +) + +# Check if we got the expected results +eps = 1e-7 + +assert ( + xp.max(xp.abs(img_fwd - expected_fwd_vals)) < eps +), "Forward projection test failed." + +# Test the back projection +bimg = xp.zeros(img_dim, dtype=xp.float32) +ones = xp.ones(img_fwd.shape, dtype=xp.float32) +pp.joseph3d_back(xstart, xend, bimg, img_origin, voxsize, ones) + +print(bimg) + +ip1 = float(xp.sum(img * bimg)) +ip2 = float(xp.sum(img_fwd * ones)) + +print(ip1, ip2) + +assert abs(ip1 - ip2) / abs(ip1) < eps, "Back projection test failed." From b12b9d45f8bc5790034778df89fa070e0661766d Mon Sep 17 00:00:00 2001 From: Georg Schramm Date: Mon, 14 Apr 2025 17:56:53 +0200 Subject: [PATCH 34/35] add test for python interface --- src/parallelproj_pybind.cpp | 8 ++--- tests/test.py | 53 ----------------------------- tests/test_nontof_fwd_back.py | 63 +++++++++++++++++++++++++++++++++++ 3 files changed, 67 insertions(+), 57 deletions(-) delete mode 100644 tests/test.py create mode 100644 tests/test_nontof_fwd_back.py diff --git a/src/parallelproj_pybind.cpp b/src/parallelproj_pybind.cpp index f505566..7aeb7d0 100644 --- a/src/parallelproj_pybind.cpp +++ b/src/parallelproj_pybind.cpp @@ -83,11 +83,11 @@ void joseph3d_fwd_py(py::object xstart, auto [p_ptr, p_shape] = extract_pointer_and_shape(p); // Validate shapes - if (xstart_shape.size() < 2 || xstart_shape[1] != 3) + if (xstart_shape.size() < 2 || xstart_shape.back() != 3) { throw std::invalid_argument("xstart must have at least 2 dims and shape (..., 3)"); } - if (xend_shape.size() < 2 || xend_shape[1] != 3) + if (xend_shape.size() < 2 || xend_shape.back() != 3) { throw std::invalid_argument("xend must have at least 2 dims and shape (..., 
3)"); } @@ -138,11 +138,11 @@ void joseph3d_back_py(py::object xstart, auto [p_ptr, p_shape] = extract_pointer_and_shape(p); // Validate shapes - if (xstart_shape.size() < 2 || xstart_shape[1] != 3) + if (xstart_shape.size() < 2 || xstart_shape.back() != 3) { throw std::invalid_argument("xstart must have at least 2 dims and shape (..., 3)"); } - if (xend_shape.size() < 2 || xend_shape[1] != 3) + if (xend_shape.size() < 2 || xend_shape.back() != 3) { throw std::invalid_argument("xend must have at least 2 dims and shape (..., 3)"); } diff --git a/tests/test.py b/tests/test.py deleted file mode 100644 index b5585e8..0000000 --- a/tests/test.py +++ /dev/null @@ -1,53 +0,0 @@ -import numpy as np -import array_api_strict as xp -import parallelproj_pybind as pp - -img_dim = (2, 3, 4) - -# img_dim = xp.asarray([2, 3, 4], dtype=xp.int32) -voxsize = xp.asarray([4.0, 3.0, 2.0], dtype=xp.float32) - -img_origin = (-0.5 * xp.asarray(img_dim, dtype=xp.float32) + 0.5) * voxsize - -# Read the image from file -img = xp.reshape(xp.asarray(np.loadtxt("img.txt", dtype=np.float32)), img_dim) - -# Read the ray start and coordinates from file -vstart = xp.reshape(xp.asarray(np.loadtxt("vstart.txt", dtype=np.float32)), (2, 5, 3)) -vend = xp.reshape(xp.asarray(np.loadtxt("vend.txt", dtype=np.float32)), (2, 5, 3)) - -# Calculate the start and end coordinates in world coordinates -xstart = vstart * voxsize + img_origin -xend = vend * voxsize + img_origin - -# Allocate memory for forward projection results -img_fwd = xp.zeros(xstart.shape[0], dtype=xp.float32) - -# Perform forward projection -pp.joseph3d_fwd(xstart, xend, img, img_origin, voxsize, img_fwd) - -# Read the expected forward values from file -expected_fwd_vals = xp.reshape( - xp.asarray(np.loadtxt("expected_fwd_vals.txt", dtype=np.float32)), img_fwd.shape -) - -# Check if we got the expected results -eps = 1e-7 - -assert ( - xp.max(xp.abs(img_fwd - expected_fwd_vals)) < eps -), "Forward projection test failed." 
- -# Test the back projection -bimg = xp.zeros(img_dim, dtype=xp.float32) -ones = xp.ones(img_fwd.shape, dtype=xp.float32) -pp.joseph3d_back(xstart, xend, bimg, img_origin, voxsize, ones) - -print(bimg) - -ip1 = float(xp.sum(img * bimg)) -ip2 = float(xp.sum(img_fwd * ones)) - -print(ip1, ip2) - -assert abs(ip1 - ip2) / abs(ip1) < eps, "Back projection test failed." diff --git a/tests/test_nontof_fwd_back.py b/tests/test_nontof_fwd_back.py new file mode 100644 index 0000000..0b1bcb5 --- /dev/null +++ b/tests/test_nontof_fwd_back.py @@ -0,0 +1,63 @@ +import numpy as np +import array_api_strict as xp +import parallelproj_pybind as pp +import pytest + + +@pytest.fixture +def setup_data(): + """Fixture to set up test data.""" + img_dim = (2, 3, 4) + voxsize = xp.asarray([4.0, 3.0, 2.0], dtype=xp.float32) + img_origin = (-0.5 * xp.asarray(img_dim, dtype=xp.float32) + 0.5) * voxsize + + # Read the image from file + img = xp.reshape(xp.asarray(np.loadtxt("img.txt", dtype=np.float32)), img_dim) + + # Read the ray start and end coordinates from file + vstart = xp.reshape( + xp.asarray(np.loadtxt("vstart.txt", dtype=np.float32)), (2, 5, 3) + ) + vend = xp.reshape(xp.asarray(np.loadtxt("vend.txt", dtype=np.float32)), (2, 5, 3)) + + # Calculate the start and end coordinates in world coordinates + xstart = vstart * voxsize + img_origin + xend = vend * voxsize + img_origin + + return img_dim, voxsize, img_origin, img, xstart, xend + + +def test_forward_and_back_projection(setup_data): + """Test the forward projection.""" + img_dim, voxsize, img_origin, img, xstart, xend = setup_data + + # Allocate memory for forward projection results + img_fwd = xp.zeros(xstart.shape[:-1], dtype=xp.float32) + + # Perform forward projection + pp.joseph3d_fwd(xstart, xend, img, img_origin, voxsize, img_fwd) + + # Read the expected forward values from file + expected_fwd_vals = xp.reshape( + xp.asarray(np.loadtxt("expected_fwd_vals.txt", dtype=np.float32)), img_fwd.shape + ) + + # Check if we got 
the expected results + eps = 1e-7 + assert ( + xp.max(xp.abs(img_fwd - expected_fwd_vals)) < eps + ), "Forward projection test failed." + + # Allocate memory for back projection results + bimg = xp.zeros(img_dim, dtype=xp.float32) + ones = xp.ones(xstart.shape[:-1], dtype=xp.float32) + + # Perform back projection + pp.joseph3d_back(xstart, xend, bimg, img_origin, voxsize, ones) + + # Check the results + ip1 = float(xp.sum(img * bimg)) + ip2 = float(xp.sum(img_fwd * ones)) + + eps = 1e-7 + assert abs(ip1 - ip2) / abs(ip1) < eps, "Back projection test failed." From a5f3a4dcc9a1933a2641cd23c146f30b8e60da15 Mon Sep 17 00:00:00 2001 From: Georg Schramm Date: Mon, 14 Apr 2025 18:28:02 +0200 Subject: [PATCH 35/35] add python tests to CMakeLists.txt --- CMakeLists.txt | 105 +++++++++++++++++++++++++++---------------------- 1 file changed, 59 insertions(+), 46 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index adeaf04..fb4e470 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -100,6 +100,8 @@ endif() ### BUILD THE PYTHON INTERFACE ################################################# ################################################################################ +enable_testing() + # Build the Python interface if enabled if(BUILD_PYTHON_INTERFACE) # Enable the use of FindPython in pybind11 @@ -118,7 +120,6 @@ if(BUILD_PYTHON_INTERFACE) message(STATUS "DLPack found at: ${DLPACK_INCLUDE_DIR}") endif() - # Add the Python module pybind11_add_module(parallelproj_pybind src/parallelproj_pybind.cpp @@ -130,6 +131,63 @@ if(BUILD_PYTHON_INTERFACE) endif() + + +################################################################################ +### BUILD TESTS ################################################################ +################################################################################ + +set(TEST_DATA_DIR ${CMAKE_SOURCE_DIR}/tests) + +# List all the .txt files that need to be copied +set(TEST_DATA_FILES + img.txt + vstart.txt + vend.txt + 
expected_fwd_vals.txt +) + +# Copy each file to the build directory +foreach(TEST_FILE ${TEST_DATA_FILES}) + add_custom_command( + OUTPUT ${CMAKE_BINARY_DIR}/${TEST_FILE} + COMMAND ${CMAKE_COMMAND} -E copy ${TEST_DATA_DIR}/${TEST_FILE} ${CMAKE_BINARY_DIR}/${TEST_FILE} + DEPENDS ${TEST_DATA_DIR}/${TEST_FILE} + ) + list(APPEND TEST_DATA_OUTPUTS ${CMAKE_BINARY_DIR}/${TEST_FILE}) +endforeach() + +# Add a custom target to ensure the files are copied before running tests +add_custom_target(copy_test_data ALL DEPENDS ${TEST_DATA_OUTPUTS}) + +# Add the directory containing file_utils.h to the include paths +include_directories(${CMAKE_SOURCE_DIR}/tests) + +foreach(TEST_SOURCE ${TEST_SOURCES}) + get_filename_component(TEST_NAME ${TEST_SOURCE} NAME_WE) + add_executable(${TEST_NAME} ${TEST_SOURCE}) + target_include_directories(${TEST_NAME} PRIVATE ${CMAKE_SOURCE_DIR}/include ${CMAKE_SOURCE_DIR}/tests) + add_dependencies(${TEST_NAME} copy_test_data) # Ensure test data is copied before running tests + if(CMAKE_CUDA_COMPILER) + target_link_libraries(${TEST_NAME} PRIVATE parallelproj CUDA::cudart) + else() + target_link_libraries(${TEST_NAME} PRIVATE parallelproj OpenMP::OpenMP_CXX) + endif() + add_test(NAME ${TEST_NAME} COMMAND ${TEST_NAME}) +endforeach() + +if(BUILD_PYTHON_INTERFACE) + ### ADD THE PYTHON INTERFACE TESTS ### + find_program(PYTEST_EXECUTABLE pytest REQUIRED) + add_test(NAME python_interface_test COMMAND pytest ${CMAKE_SOURCE_DIR}/tests) + + # Set PYTHONPATH to include the build directory + set_tests_properties(python_interface_test PROPERTIES + ENVIRONMENT "PYTHONPATH=${CMAKE_BINARY_DIR}:$ENV{PYTHONPATH}" + ) +endif() + + ################################################################################ ### INSTALL AND CONFIGS ######################################################## ################################################################################ @@ -178,48 +236,3 @@ install(FILES DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/parallelproj ) - 
-################################################################################ -### BUILD TESTS ################################################################ -################################################################################ - -enable_testing() - -set(TEST_DATA_DIR ${CMAKE_SOURCE_DIR}/tests) - -# List all the .txt files that need to be copied -set(TEST_DATA_FILES - img.txt - vstart.txt - vend.txt - expected_fwd_vals.txt -) - -# Copy each file to the build directory -foreach(TEST_FILE ${TEST_DATA_FILES}) - add_custom_command( - OUTPUT ${CMAKE_BINARY_DIR}/${TEST_FILE} - COMMAND ${CMAKE_COMMAND} -E copy ${TEST_DATA_DIR}/${TEST_FILE} ${CMAKE_BINARY_DIR}/${TEST_FILE} - DEPENDS ${TEST_DATA_DIR}/${TEST_FILE} - ) - list(APPEND TEST_DATA_OUTPUTS ${CMAKE_BINARY_DIR}/${TEST_FILE}) -endforeach() - -# Add a custom target to ensure the files are copied before running tests -add_custom_target(copy_test_data ALL DEPENDS ${TEST_DATA_OUTPUTS}) - -# Add the directory containing file_utils.h to the include paths -include_directories(${CMAKE_SOURCE_DIR}/tests) - -foreach(TEST_SOURCE ${TEST_SOURCES}) - get_filename_component(TEST_NAME ${TEST_SOURCE} NAME_WE) - add_executable(${TEST_NAME} ${TEST_SOURCE}) - target_include_directories(${TEST_NAME} PRIVATE ${CMAKE_SOURCE_DIR}/include ${CMAKE_SOURCE_DIR}/tests) - add_dependencies(${TEST_NAME} copy_test_data) # Ensure test data is copied before running tests - if(CMAKE_CUDA_COMPILER) - target_link_libraries(${TEST_NAME} PRIVATE parallelproj CUDA::cudart) - else() - target_link_libraries(${TEST_NAME} PRIVATE parallelproj OpenMP::OpenMP_CXX) - endif() - add_test(NAME ${TEST_NAME} COMMAND ${TEST_NAME}) -endforeach()