From 3f9b6449b57d98004a731771dba595d3a851f2e2 Mon Sep 17 00:00:00 2001
From: Georg Schramm <georg@Georgs-MacBook-Pro-208.local>
Date: Wed, 9 Apr 2025 18:19:49 +0100
Subject: [PATCH 01/35] add support when cuda absent

---
 CMakeLists.txt        |   7 +-
 atomic_sum.h          |   3 +-
 cuda_compat.h         |   5 +-
 joseph3d_fwd.cpp      |  21 +++
 joseph3d_fwd.cu       | 402 ++----------------------------------------
 joseph3d_fwd_worker.h | 401 +++++++++++++++++++++++++++++++++++++++++
 parallelproj.h        |   2 +-
 7 files changed, 439 insertions(+), 402 deletions(-)
 create mode 100644 joseph3d_fwd.cpp
 create mode 100644 joseph3d_fwd_worker.h
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3e63938..2ee254a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -44,9 +44,8 @@ if(CMAKE_CUDA_COMPILER)
     target_include_directories(main PRIVATE ${CUDAToolkit_INCLUDE_DIRS})
     target_link_libraries(main PRIVATE parallelproj CUDA::cudart)
 else()
-    #find_package(OpenMP REQUIRED)
-    #add_library(parallelproj SHARED parallelproj.cpp)
-    #target_link_libraries(parallelproj PRIVATE OpenMP::OpenMP_CXX)
+    find_package(OpenMP REQUIRED)
+    add_library(parallelproj SHARED joseph3d_fwd.cpp)
+    target_link_libraries(parallelproj PRIVATE OpenMP::OpenMP_CXX)
     #add_executable(main main_no_cuda.cpp)
 endif()
-
diff --git a/atomic_sum.h b/atomic_sum.h
index a3848e0..dc08883 100644
--- a/atomic_sum.h
+++ b/atomic_sum.h
@@ -1,7 +1,7 @@
 #pragma once
 #include "cuda_compat.h"
 
-CUDA_HOST_DEVICE inline void atomic_sum(float* target, float value) {
+WORKER_QUALIFIER inline void atomic_sum(float* target, float value) {
 #ifdef __CUDA_ARCH__
     atomicAdd(target, value);
 #else
@@ -9,4 +9,3 @@ CUDA_HOST_DEVICE inline void atomic_sum(float* target, float value) {
     *target += value;
 #endif
 }
-
diff --git a/cuda_compat.h b/cuda_compat.h
index 9205bf2..21451f1 100644
--- a/cuda_compat.h
+++ b/cuda_compat.h
@@ -1,8 +1,7 @@
 #pragma once
 
 #ifdef __CUDACC__
-  #define CUDA_HOST_DEVICE __host__ __device__
+  #define WORKER_QUALIFIER __device__
 #else
-  #define CUDA_HOST_DEVICE
+  #define WORKER_QUALIFIER
 #endif
-
diff --git a/joseph3d_fwd.cpp b/joseph3d_fwd.cpp
new file mode 100644
index 0000000..bcd480c
--- /dev/null
+++ b/joseph3d_fwd.cpp
@@ -0,0 +1,21 @@
+#include "parallelproj.h"
+#include "joseph3d_fwd_worker.h"
+#include "debug.h"
+
+void joseph3d_fwd(const float *xstart, 
+                  const float *xend, 
+                  const float *img,
+                  const float *img_origin, 
+                  const float *voxsize, 
+                  float *p,
+                  size_t nlors, 
+                  const int *img_dim,
+                  int device_id,
+                  int threadsperblock){
+    // OpenMP build of joseph3d_fwd: one joseph3d_fwd_worker call per LOR index; device_id and threadsperblock are not read here.
+#pragma omp parallel for
+    for (size_t i = 0; i < nlors; ++i) {
+        joseph3d_fwd_worker(i, xstart, xend, img, img_origin, voxsize, p, img_dim); // the worker only writes p[i]
+    }
+                    
+}
diff --git a/joseph3d_fwd.cu b/joseph3d_fwd.cu
index 3e72a27..575e1cb 100644
--- a/joseph3d_fwd.cu
+++ b/joseph3d_fwd.cu
@@ -1,417 +1,35 @@
 #include "parallelproj.h"
+#include "joseph3d_fwd_worker.h"
 #include "debug.h"
 #include <cuda_runtime.h>
 #include <iostream>
 #include <stdexcept>
 
-__device__ unsigned char ray_cube_intersection_cuda(float orig0,
-                                                    float orig1,
-                                                    float orig2,
-                                                    float bounds0_min,
-                                                    float bounds1_min,
-                                                    float bounds2_min,
-                                                    float bounds0_max,
-                                                    float bounds1_max,
-                                                    float bounds2_max,
-                                                    float rdir0,
-                                                    float rdir1,
-                                                    float rdir2,
-                                                    float* t1,
-                                                    float* t2){
-  // the inverse of the directional vector
-  // using the inverse of the directional vector and IEEE floating point arith standard 754
-  // makes sure that 0's in the directional vector are handled correctly 
-  float invdir0 = 1.f/rdir0;
-  float invdir1 = 1.f/rdir1;
-  float invdir2 = 1.f/rdir2;
-  
-  unsigned char intersec = 1;
-  
-  float t11, t12, t21, t22; 
-
-  if (invdir0 >= 0){
-    *t1  = (bounds0_min - orig0) * invdir0;
-    *t2  = (bounds0_max - orig0) * invdir0; 
-  }
-  else{
-    *t1  = (bounds0_max - orig0) * invdir0;
-    *t2  = (bounds0_min - orig0) * invdir0;
-  }
-  
-  if (invdir1 >= 0){
-    t11 = (bounds1_min - orig1) * invdir1; 
-    t12 = (bounds1_max - orig1) * invdir1; 
-  }
-  else{
-    t11 = (bounds1_max - orig1) * invdir1;
-    t12 = (bounds1_min - orig1) * invdir1; 
-  }
-  
-  if ((*t1 > t12) || (t11 > *t2)){intersec = 0;}
-  if (t11 > *t1){*t1 = t11;}
-  if (t12 < *t2){*t2 = t12;}
-  
-  if (invdir2 >= 0){
-    t21 = (bounds2_min - orig2) * invdir2; 
-    t22 = (bounds2_max - orig2) * invdir2;
-  } 
-  else{
-    t21 = (bounds2_max - orig2) * invdir2; 
-    t22 = (bounds2_min - orig2) * invdir2;
-  } 
-  
-  if ((*t1 > t22) || (t21 > *t2)){intersec = 0;}
-  if (t21 > *t1){*t1 = t21;}
-  if (t22 < *t2){*t2 = t22;} 
-
-  return(intersec);
-}
-
-
 __global__ void joseph3d_fwd_kernel(const float *xstart, 
                                     const float *xend, 
                                     const float *img,
                                     const float *img_origin, 
                                     const float *voxsize, 
                                     float *p,
-                                    long long nlors, 
-                                    const int *img_dim)
-{
-  long long i = blockDim.x * blockIdx.x + threadIdx.x;
-
-  if(i < nlors)
-  {
-    int n0 = img_dim[0];
-    int n1 = img_dim[1];
-    int n2 = img_dim[2];
-
-    float d0, d1, d2, d0_sq, d1_sq, d2_sq; 
-    float lsq, cos0_sq, cos1_sq, cos2_sq;
-    unsigned short direction; 
-    int i0, i1, i2;
-    int i0_floor, i1_floor, i2_floor;
-    int i0_ceil, i1_ceil, i2_ceil;
-    float x_pr0, x_pr1, x_pr2;
-    float tmp_0, tmp_1, tmp_2;
-
-    float toAdd, cf;
-
-    float xstart0 = xstart[i*3 + 0];
-    float xstart1 = xstart[i*3 + 1];
-    float xstart2 = xstart[i*3 + 2];
-
-    float xend0 = xend[i*3 + 0];
-    float xend1 = xend[i*3 + 1];
-    float xend2 = xend[i*3 + 2];
-
-    float voxsize0 = voxsize[0];
-    float voxsize1 = voxsize[1];
-    float voxsize2 = voxsize[2];
-
-    float img_origin0 = img_origin[0];
-    float img_origin1 = img_origin[1];
-    float img_origin2 = img_origin[2];
-
-    unsigned char intersec;
-    float t1, t2;
-    float istart_f, iend_f, tmp;
-    int   istart, iend;
-
-    // test whether the ray between the two detectors is most parallel
-    // with the 0, 1, or 2 axis
-    d0 = xend0 - xstart0;
-    d1 = xend1 - xstart1;
-    d2 = xend2 - xstart2;
-
-    //-----------
-    //--- test whether ray and cube intersect
-    intersec = ray_cube_intersection_cuda(xstart0, xstart1, xstart2, 
-                                          img_origin0 - 1*voxsize0, img_origin1 - 1*voxsize1, img_origin2 - 1*voxsize2,
-                                          img_origin0 + n0*voxsize0, img_origin1 + n1*voxsize1, img_origin2 + n2*voxsize2,
-                                          d0, d1, d2, &t1, &t2);
-
-    if (intersec == 1)
-    {
-      d0_sq = d0*d0;
-      d1_sq = d1*d1;
-      d2_sq = d2*d2;
-
-      lsq = d0_sq + d1_sq + d2_sq;
-
-      cos0_sq = d0_sq / lsq;
-      cos1_sq = d1_sq / lsq;
-      cos2_sq = d2_sq / lsq;
-
-      direction = 0;
-      if ((cos1_sq >= cos0_sq) && (cos1_sq >= cos2_sq))
-      {
-        direction = 1;
-      }
-      else
-      {
-        if ((cos2_sq >= cos0_sq) && (cos2_sq >= cos1_sq))
-        {
-          direction = 2;
-        }
-      }
- 
-      if (direction == 0)
-      {
-        cf = voxsize0 / sqrtf(cos0_sq);
-
-        // case where ray is most parallel to the 0 axis
-        // we step through the volume along the 0 direction
-
-        //--- check where ray enters / leaves cube
-        istart_f = (xstart0 + t1*d0 - img_origin0) / voxsize0;
-        iend_f   = (xstart0 + t2*d0 - img_origin0) / voxsize0;
-
-        if (istart_f > iend_f){
-          tmp      = iend_f;
-          iend_f   = istart_f;
-          istart_f = tmp;
-        }
-    
-        istart = (int)floor(istart_f);
-        iend   = (int)ceil(iend_f);
-
-        if (istart < 0){istart = 0;}
-        if (iend >= n0){iend = n0;}
-
-        // check in which "plane" the start and end points are
-        // we have to do this to avoid that we include voxels
-        // that are "outside" the line segment bewteen xstart and xend
-        
-        // !! for these calculations we overwrite the istart_f and iend_f variables !!
-        istart_f = (xstart0 - img_origin0) / voxsize0;
-        iend_f   = (xend0   - img_origin0) / voxsize0;
-
-        if (istart_f > iend_f){
-          tmp      = iend_f;
-          iend_f   = istart_f;
-          istart_f = tmp;
-        }
-
-        if (istart < (int)floor(istart_f)){istart = (int)floor(istart_f);}
-        if (iend >= (int)ceil(iend_f)){iend = (int)ceil(iend_f);}
-        //---
-
-        for(i0 = istart; i0 < iend; i0++)
-        {
-          // get the indices where the ray intersects the image plane
-          x_pr1 = xstart1 + (img_origin0 + i0*voxsize0 - xstart0)*d1 / d0;
-          x_pr2 = xstart2 + (img_origin0 + i0*voxsize0 - xstart0)*d2 / d0;
-  
-          i1_floor = (int)floor((x_pr1 - img_origin1)/voxsize1);
-          i1_ceil  = i1_floor + 1;
-  
-          i2_floor = (int)floor((x_pr2 - img_origin2)/voxsize2);
-          i2_ceil  = i2_floor + 1; 
-  
-          // calculate the distances to the floor normalized to [0,1]
-          // for the bilinear interpolation
-          tmp_1 = (x_pr1 - (i1_floor*voxsize1 + img_origin1)) / voxsize1;
-          tmp_2 = (x_pr2 - (i2_floor*voxsize2 + img_origin2)) / voxsize2;
-
-          toAdd = 0;
-
-          if ((i1_floor >= 0) && (i1_floor < n1) && (i2_floor >= 0) && (i2_floor < n2))
-          {
-            toAdd += img[n1*n2*i0 + n2*i1_floor + i2_floor] * (1 - tmp_1) * (1 - tmp_2);
-          }
-          if ((i1_ceil >= 0) && (i1_ceil < n1) && (i2_floor >= 0) && (i2_floor < n2))
-          {
-            toAdd += img[n1*n2*i0 + n2*i1_ceil + i2_floor] * tmp_1 * (1 - tmp_2);
-          }
-          if ((i1_floor >= 0) && (i1_floor < n1) && (i2_ceil >= 0) && (i2_ceil < n2))
-          {
-            toAdd += img[n1*n2*i0 + n2*i1_floor + i2_ceil] * (1 - tmp_1) * tmp_2;
-          }
-          if ((i1_ceil >= 0) && (i1_ceil < n1) && (i2_ceil >= 0) && (i2_ceil < n2))
-          {
-            toAdd += img[n1*n2*i0 + n2*i1_ceil + i2_ceil] * tmp_1 * tmp_2;
-          }
-
-          if(toAdd != 0){p[i] += (cf * toAdd);}
-        }
-      }
-
-      //--------------------------------------------------------------------------------- 
-      if (direction == 1)
-      {
-        cf = voxsize1 / sqrtf(cos1_sq);
-
-        // case where ray is most parallel to the 1 axis
-        // we step through the volume along the 1 direction
-
-        //--- check where ray enters / leaves cube
-        istart_f = (xstart1 + t1*d1 - img_origin1) / voxsize1;
-        iend_f   = (xstart1 + t2*d1 - img_origin1) / voxsize1;
-
-        if (istart_f > iend_f){
-          tmp      = iend_f;
-          iend_f   = istart_f;
-          istart_f = tmp;
-        }
-    
-        istart = (int)floor(istart_f);
-        iend   = (int)ceil(iend_f);
-
-        if (istart < 0){istart = 0;}
-        if (iend >= n1){iend = n1;}
-
-        // check in which "plane" the start and end points are
-        // we have to do this to avoid that we include voxels
-        // that are "outside" the line segment bewteen xstart and xend
-        
-        // !! for these calculations we overwrite the istart_f and iend_f variables !!
-        istart_f = (xstart1 - img_origin1) / voxsize1;
-        iend_f   = (xend1   - img_origin1) / voxsize1;
-
-        if (istart_f > iend_f){
-          tmp      = iend_f;
-          iend_f   = istart_f;
-          istart_f = tmp;
-        }
-
-        if (istart < (int)floor(istart_f)){istart = (int)floor(istart_f);}
-        if (iend >= (int)ceil(iend_f)){iend = (int)ceil(iend_f);}
-        //---
-
-        for (i1 = istart; i1 < iend; i1++)
-        {
-          // get the indices where the ray intersects the image plane
-          x_pr0 = xstart0 + (img_origin1 + i1*voxsize1 - xstart1)*d0 / d1;
-          x_pr2 = xstart2 + (img_origin1 + i1*voxsize1 - xstart1)*d2 / d1;
-  
-          i0_floor = (int)floor((x_pr0 - img_origin0)/voxsize0);
-          i0_ceil  = i0_floor + 1; 
-  
-          i2_floor = (int)floor((x_pr2 - img_origin2)/voxsize2);
-          i2_ceil  = i2_floor + 1;
-  
-          // calculate the distances to the floor normalized to [0,1]
-          // for the bilinear interpolation
-          tmp_0 = (x_pr0 - (i0_floor*voxsize0 + img_origin0)) / voxsize0;
-          tmp_2 = (x_pr2 - (i2_floor*voxsize2 + img_origin2)) / voxsize2;
-
-          toAdd = 0;
-
-          if ((i0_floor >= 0) && (i0_floor < n0) && (i2_floor >= 0) && (i2_floor < n2))
-          {
-            toAdd += img[n1*n2*i0_floor + n2*i1 + i2_floor] * (1 - tmp_0) * (1 - tmp_2);
-          }
-          if ((i0_ceil >= 0) && (i0_ceil < n0) && (i2_floor >= 0) && (i2_floor < n2))
-          {
-            toAdd += img[n1*n2*i0_ceil + n2*i1 + i2_floor] * tmp_0 * (1 - tmp_2);
-          }
-          if ((i0_floor >= 0) && (i0_floor < n0) && (i2_ceil >= 0) && (i2_ceil < n2))
-          {
-            toAdd += img[n1*n2*i0_floor + n2*i1 + i2_ceil] * (1 - tmp_0) * tmp_2;
-          }
-          if ((i0_ceil >= 0) && (i0_ceil < n0) && (i2_ceil >= 0) && (i2_ceil < n2))
-          {
-            toAdd += img[n1*n2*i0_ceil + n2*i1 + i2_ceil] * tmp_0 * tmp_2;
-          }
-
-          if(toAdd != 0){p[i] += (cf * toAdd);}
-        }
-      }
-
-      //--------------------------------------------------------------------------------- 
-      if (direction == 2)
-      {
-        cf = voxsize2 / sqrtf(cos2_sq);
-
-        // case where ray is most parallel to the 2 axis
-        // we step through the volume along the 2 direction
-
-        //--- check where ray enters / leaves cube
-        istart_f = (xstart2 + t1*d2 - img_origin2) / voxsize2;
-        iend_f   = (xstart2 + t2*d2 - img_origin2) / voxsize2;
-
-        if (istart_f > iend_f){
-          tmp      = iend_f;
-          iend_f   = istart_f;
-          istart_f = tmp;
-        }
-    
-        istart = (int)floor(istart_f);
-        iend   = (int)ceil(iend_f);
-
-        if (istart < 0){istart = 0;}
-        if (iend >= n2){iend = n2;}
-
-        // check in which "plane" the start and end points are
-        // we have to do this to avoid that we include voxels
-        // that are "outside" the line segment bewteen xstart and xend
-        
-        // !! for these calculations we overwrite the istart_f and iend_f variables !!
-        istart_f = (xstart2 - img_origin2) / voxsize2;
-        iend_f   = (xend2   - img_origin2) / voxsize2;
-
-        if (istart_f > iend_f){
-          tmp      = iend_f;
-          iend_f   = istart_f;
-          istart_f = tmp;
-        }
-
-        if (istart < (int)floor(istart_f)){istart = (int)floor(istart_f);}
-        if (iend >= (int)ceil(iend_f)){iend = (int)ceil(iend_f);}
-        //---
-
-        for(i2 = istart; i2 < iend; i2++)
-        {
-          // get the indices where the ray intersects the image plane
-          x_pr0 = xstart0 + (img_origin2 + i2*voxsize2 - xstart2)*d0 / d2;
-          x_pr1 = xstart1 + (img_origin2 + i2*voxsize2 - xstart2)*d1 / d2;
-  
-          i0_floor = (int)floor((x_pr0 - img_origin0)/voxsize0);
-          i0_ceil  = i0_floor + 1;
-  
-          i1_floor = (int)floor((x_pr1 - img_origin1)/voxsize1);
-          i1_ceil  = i1_floor + 1; 
-  
-          // calculate the distances to the floor normalized to [0,1]
-          // for the bilinear interpolation
-          tmp_0 = (x_pr0 - (i0_floor*voxsize0 + img_origin0)) / voxsize0;
-          tmp_1 = (x_pr1 - (i1_floor*voxsize1 + img_origin1)) / voxsize1;
-
-          toAdd = 0;
-
-          if ((i0_floor >= 0) && (i0_floor < n0) && (i1_floor >= 0) && (i1_floor < n1))
-          {
-            toAdd += img[n1*n2*i0_floor + n2*i1_floor + i2] * (1 - tmp_0) * (1 - tmp_1);
-          }
-          if ((i0_ceil >= 0) && (i0_ceil < n0) && (i1_floor >= 0) && (i1_floor < n1))
-          {
-            toAdd += img[n1*n2*i0_ceil + n2*i1_floor + i2] * tmp_0 * (1 - tmp_1);
-          }
-          if ((i0_floor >= 0) && (i0_floor < n0) && (i1_ceil >= 0) & (i1_ceil < n1))
-          {
-            toAdd += img[n1*n2*i0_floor + n2*i1_ceil + i2] * (1 - tmp_0) * tmp_1;
-          }
-          if ((i0_ceil >= 0) && (i0_ceil < n0) && (i1_ceil >= 0) && (i1_ceil < n1))
-          {
-            toAdd += img[n1*n2*i0_ceil + n2*i1_ceil + i2] * tmp_0 * tmp_1;
-          }
-
-          if(toAdd != 0){p[i] += (cf * toAdd);}
-        }
-      }
+                                    size_t nlors, 
+                                    const int *img_dim){
+    size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x; // widen first: the 32-bit product can wrap for very large grids
+    if (i < nlors) { // tail guard: the last block may contain more threads than there are LORs left
+        joseph3d_fwd_worker(i, xstart, xend, img, img_origin, voxsize, p, img_dim);
     }
-  }
 }
 
+//////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////
 
-extern "C" 
 void joseph3d_fwd(const float *xstart, 
                   const float *xend, 
                   const float *img,
                   const float *img_origin, 
                   const float *voxsize, 
                   float *p,
-                  long long nlors, 
+                  size_t nlors, 
                   const int *img_dim,
                   int device_id,
                   int threadsperblock)
diff --git a/joseph3d_fwd_worker.h b/joseph3d_fwd_worker.h
new file mode 100644
index 0000000..b97892f
--- /dev/null
+++ b/joseph3d_fwd_worker.h
@@ -0,0 +1,401 @@
+#pragma once
+#include "cuda_compat.h"
+#include "atomic_sum.h"
+
+// we need to import math.h for the floor, ceil, sqrtf if not compiling for CUDA
+#ifndef __CUDA_ARCH__
+#include <math.h>
+#endif
+
+WORKER_QUALIFIER inline unsigned char ray_cube_intersection(float orig0,
+                                                            float orig1,
+                                                            float orig2,
+                                                            float bounds0_min,
+                                                            float bounds1_min,
+                                                            float bounds2_min,
+                                                            float bounds0_max,
+                                                            float bounds1_max,
+                                                            float bounds2_max,
+                                                            float rdir0,
+                                                            float rdir1,
+                                                            float rdir2,
+                                                            float* t1,
+                                                            float* t2){ // 'inline': defined in a header, must be safe to include from several translation units
+  // Slab test of the ray orig + t*rdir against the box [bounds*_min, bounds*_max]: returns 1 if they
+  // intersect (0 otherwise) and writes the entry / exit ray parameters to *t1 / *t2.
+  // Using the inverse of rdir together with IEEE 754 infinities makes sure 0's in rdir are handled.
+  float invdir0 = 1.f/rdir0;
+  float invdir1 = 1.f/rdir1;
+  float invdir2 = 1.f/rdir2;
+  
+  unsigned char intersec = 1;
+  
+  float t11, t12, t21, t22; 
+
+  if (invdir0 >= 0){
+    *t1  = (bounds0_min - orig0) * invdir0;
+    *t2  = (bounds0_max - orig0) * invdir0; 
+  }
+  else{
+    *t1  = (bounds0_max - orig0) * invdir0;
+    *t2  = (bounds0_min - orig0) * invdir0;
+  }
+  
+  if (invdir1 >= 0){
+    t11 = (bounds1_min - orig1) * invdir1; 
+    t12 = (bounds1_max - orig1) * invdir1; 
+  }
+  else{
+    t11 = (bounds1_max - orig1) * invdir1;
+    t12 = (bounds1_min - orig1) * invdir1; 
+  }
+  
+  if ((*t1 > t12) || (t11 > *t2)){intersec = 0;}
+  if (t11 > *t1){*t1 = t11;}
+  if (t12 < *t2){*t2 = t12;}
+  
+  if (invdir2 >= 0){
+    t21 = (bounds2_min - orig2) * invdir2; 
+    t22 = (bounds2_max - orig2) * invdir2;
+  } 
+  else{
+    t21 = (bounds2_max - orig2) * invdir2; 
+    t22 = (bounds2_min - orig2) * invdir2;
+  } 
+  
+  if ((*t1 > t22) || (t21 > *t2)){intersec = 0;}
+  if (t21 > *t1){*t1 = t21;}
+  if (t22 < *t2){*t2 = t22;} 
+
+  return(intersec);
+}
+
+WORKER_QUALIFIER inline void joseph3d_fwd_worker(size_t i,
+                                                 const float *xstart, 
+                                                 const float *xend, 
+                                                 const float *img,
+                                                 const float *img_origin, 
+                                                 const float *voxsize, 
+                                                 float *p,
+                                                 const int *img_dim){
+  // Forward-projects LOR i (xstart[3*i..3*i+2] -> xend[3*i..3*i+2]) through img and adds the result to p[i].
+  int n0 = img_dim[0];
+  int n1 = img_dim[1];
+  int n2 = img_dim[2];
+
+  float d0, d1, d2, d0_sq, d1_sq, d2_sq; 
+  float lsq, cos0_sq, cos1_sq, cos2_sq;
+  unsigned short direction; 
+  int i0, i1, i2;
+  int i0_floor, i1_floor, i2_floor;
+  int i0_ceil, i1_ceil, i2_ceil;
+  float x_pr0, x_pr1, x_pr2;
+  float tmp_0, tmp_1, tmp_2;
+
+  float toAdd, cf;
+
+  float xstart0 = xstart[i*3 + 0];
+  float xstart1 = xstart[i*3 + 1];
+  float xstart2 = xstart[i*3 + 2];
+
+  float xend0 = xend[i*3 + 0];
+  float xend1 = xend[i*3 + 1];
+  float xend2 = xend[i*3 + 2];
+
+  float voxsize0 = voxsize[0];
+  float voxsize1 = voxsize[1];
+  float voxsize2 = voxsize[2];
+
+  float img_origin0 = img_origin[0];
+  float img_origin1 = img_origin[1];
+  float img_origin2 = img_origin[2];
+
+  unsigned char intersec;
+  float t1, t2;
+  float istart_f, iend_f, tmp;
+  int   istart, iend;
+
+  // test whether the ray between the two detectors is most parallel
+  // with the 0, 1, or 2 axis
+  d0 = xend0 - xstart0;
+  d1 = xend1 - xstart1;
+  d2 = xend2 - xstart2;
+
+  //-----------
+  //--- test whether ray and cube intersect
+  intersec = ray_cube_intersection(xstart0, xstart1, xstart2, 
+                                   img_origin0 - 1*voxsize0, img_origin1 - 1*voxsize1, img_origin2 - 1*voxsize2,
+                                   img_origin0 + n0*voxsize0, img_origin1 + n1*voxsize1, img_origin2 + n2*voxsize2,
+                                   d0, d1, d2, &t1, &t2);
+
+  if (intersec == 1)
+  {
+    d0_sq = d0*d0;
+    d1_sq = d1*d1;
+    d2_sq = d2*d2;
+
+    lsq = d0_sq + d1_sq + d2_sq;
+
+    cos0_sq = d0_sq / lsq;
+    cos1_sq = d1_sq / lsq;
+    cos2_sq = d2_sq / lsq;
+
+    direction = 0;
+    if ((cos1_sq >= cos0_sq) && (cos1_sq >= cos2_sq))
+    {
+      direction = 1;
+    }
+    else
+    {
+      if ((cos2_sq >= cos0_sq) && (cos2_sq >= cos1_sq))
+      {
+        direction = 2;
+      }
+    }
+
+    if (direction == 0)
+    {
+      cf = voxsize0 / sqrtf(cos0_sq);
+
+      // case where ray is most parallel to the 0 axis
+      // we step through the volume along the 0 direction
+
+      //--- check where ray enters / leaves cube
+      istart_f = (xstart0 + t1*d0 - img_origin0) / voxsize0;
+      iend_f   = (xstart0 + t2*d0 - img_origin0) / voxsize0;
+
+      if (istart_f > iend_f){
+        tmp      = iend_f;
+        iend_f   = istart_f;
+        istart_f = tmp;
+      }
+  
+      istart = (int)floor(istart_f);
+      iend   = (int)ceil(iend_f);
+
+      if (istart < 0){istart = 0;}
+      if (iend >= n0){iend = n0;}
+
+      // check in which "plane" the start and end points are
+      // we have to do this to avoid that we include voxels
+      // that are "outside" the line segment between xstart and xend
+      
+      // !! for these calculations we overwrite the istart_f and iend_f variables !!
+      istart_f = (xstart0 - img_origin0) / voxsize0;
+      iend_f   = (xend0   - img_origin0) / voxsize0;
+
+      if (istart_f > iend_f){
+        tmp      = iend_f;
+        iend_f   = istart_f;
+        istart_f = tmp;
+      }
+
+      if (istart < (int)floor(istart_f)){istart = (int)floor(istart_f);}
+      if (iend >= (int)ceil(iend_f)){iend = (int)ceil(iend_f);}
+      //---
+
+      for(i0 = istart; i0 < iend; i0++)
+      {
+        // get the indices where the ray intersects the image plane
+        x_pr1 = xstart1 + (img_origin0 + i0*voxsize0 - xstart0)*d1 / d0;
+        x_pr2 = xstart2 + (img_origin0 + i0*voxsize0 - xstart0)*d2 / d0;
+
+        i1_floor = (int)floor((x_pr1 - img_origin1)/voxsize1);
+        i1_ceil  = i1_floor + 1;
+
+        i2_floor = (int)floor((x_pr2 - img_origin2)/voxsize2);
+        i2_ceil  = i2_floor + 1; 
+
+        // calculate the distances to the floor normalized to [0,1]
+        // for the bilinear interpolation
+        tmp_1 = (x_pr1 - (i1_floor*voxsize1 + img_origin1)) / voxsize1;
+        tmp_2 = (x_pr2 - (i2_floor*voxsize2 + img_origin2)) / voxsize2;
+
+        toAdd = 0;
+
+        if ((i1_floor >= 0) && (i1_floor < n1) && (i2_floor >= 0) && (i2_floor < n2))
+        {
+          toAdd += img[n1*n2*i0 + n2*i1_floor + i2_floor] * (1 - tmp_1) * (1 - tmp_2);
+        }
+        if ((i1_ceil >= 0) && (i1_ceil < n1) && (i2_floor >= 0) && (i2_floor < n2))
+        {
+          toAdd += img[n1*n2*i0 + n2*i1_ceil + i2_floor] * tmp_1 * (1 - tmp_2);
+        }
+        if ((i1_floor >= 0) && (i1_floor < n1) && (i2_ceil >= 0) && (i2_ceil < n2))
+        {
+          toAdd += img[n1*n2*i0 + n2*i1_floor + i2_ceil] * (1 - tmp_1) * tmp_2;
+        }
+        if ((i1_ceil >= 0) && (i1_ceil < n1) && (i2_ceil >= 0) && (i2_ceil < n2))
+        {
+          toAdd += img[n1*n2*i0 + n2*i1_ceil + i2_ceil] * tmp_1 * tmp_2;
+        }
+
+        if(toAdd != 0){p[i] += (cf * toAdd);}
+      }
+    }
+
+    //--------------------------------------------------------------------------------- 
+    if (direction == 1)
+    {
+      cf = voxsize1 / sqrtf(cos1_sq);
+
+      // case where ray is most parallel to the 1 axis
+      // we step through the volume along the 1 direction
+
+      //--- check where ray enters / leaves cube
+      istart_f = (xstart1 + t1*d1 - img_origin1) / voxsize1;
+      iend_f   = (xstart1 + t2*d1 - img_origin1) / voxsize1;
+
+      if (istart_f > iend_f){
+        tmp      = iend_f;
+        iend_f   = istart_f;
+        istart_f = tmp;
+      }
+  
+      istart = (int)floor(istart_f);
+      iend   = (int)ceil(iend_f);
+
+      if (istart < 0){istart = 0;}
+      if (iend >= n1){iend = n1;}
+
+      // check in which "plane" the start and end points are
+      // we have to do this to avoid that we include voxels
+      // that are "outside" the line segment between xstart and xend
+      
+      // !! for these calculations we overwrite the istart_f and iend_f variables !!
+      istart_f = (xstart1 - img_origin1) / voxsize1;
+      iend_f   = (xend1   - img_origin1) / voxsize1;
+
+      if (istart_f > iend_f){
+        tmp      = iend_f;
+        iend_f   = istart_f;
+        istart_f = tmp;
+      }
+
+      if (istart < (int)floor(istart_f)){istart = (int)floor(istart_f);}
+      if (iend >= (int)ceil(iend_f)){iend = (int)ceil(iend_f);}
+      //---
+
+      for (i1 = istart; i1 < iend; i1++)
+      {
+        // get the indices where the ray intersects the image plane
+        x_pr0 = xstart0 + (img_origin1 + i1*voxsize1 - xstart1)*d0 / d1;
+        x_pr2 = xstart2 + (img_origin1 + i1*voxsize1 - xstart1)*d2 / d1;
+
+        i0_floor = (int)floor((x_pr0 - img_origin0)/voxsize0);
+        i0_ceil  = i0_floor + 1; 
+
+        i2_floor = (int)floor((x_pr2 - img_origin2)/voxsize2);
+        i2_ceil  = i2_floor + 1;
+
+        // calculate the distances to the floor normalized to [0,1]
+        // for the bilinear interpolation
+        tmp_0 = (x_pr0 - (i0_floor*voxsize0 + img_origin0)) / voxsize0;
+        tmp_2 = (x_pr2 - (i2_floor*voxsize2 + img_origin2)) / voxsize2;
+
+        toAdd = 0;
+
+        if ((i0_floor >= 0) && (i0_floor < n0) && (i2_floor >= 0) && (i2_floor < n2))
+        {
+          toAdd += img[n1*n2*i0_floor + n2*i1 + i2_floor] * (1 - tmp_0) * (1 - tmp_2);
+        }
+        if ((i0_ceil >= 0) && (i0_ceil < n0) && (i2_floor >= 0) && (i2_floor < n2))
+        {
+          toAdd += img[n1*n2*i0_ceil + n2*i1 + i2_floor] * tmp_0 * (1 - tmp_2);
+        }
+        if ((i0_floor >= 0) && (i0_floor < n0) && (i2_ceil >= 0) && (i2_ceil < n2))
+        {
+          toAdd += img[n1*n2*i0_floor + n2*i1 + i2_ceil] * (1 - tmp_0) * tmp_2;
+        }
+        if ((i0_ceil >= 0) && (i0_ceil < n0) && (i2_ceil >= 0) && (i2_ceil < n2))
+        {
+          toAdd += img[n1*n2*i0_ceil + n2*i1 + i2_ceil] * tmp_0 * tmp_2;
+        }
+
+        if(toAdd != 0){p[i] += (cf * toAdd);}
+      }
+    }
+
+    //--------------------------------------------------------------------------------- 
+    if (direction == 2)
+    {
+      cf = voxsize2 / sqrtf(cos2_sq);
+
+      // case where ray is most parallel to the 2 axis
+      // we step through the volume along the 2 direction
+
+      //--- check where ray enters / leaves cube
+      istart_f = (xstart2 + t1*d2 - img_origin2) / voxsize2;
+      iend_f   = (xstart2 + t2*d2 - img_origin2) / voxsize2;
+
+      if (istart_f > iend_f){
+        tmp      = iend_f;
+        iend_f   = istart_f;
+        istart_f = tmp;
+      }
+  
+      istart = (int)floor(istart_f);
+      iend   = (int)ceil(iend_f);
+
+      if (istart < 0){istart = 0;}
+      if (iend >= n2){iend = n2;}
+
+      // check in which "plane" the start and end points are
+      // we have to do this to avoid that we include voxels
+      // that are "outside" the line segment between xstart and xend
+      
+      // !! for these calculations we overwrite the istart_f and iend_f variables !!
+      istart_f = (xstart2 - img_origin2) / voxsize2;
+      iend_f   = (xend2   - img_origin2) / voxsize2;
+
+      if (istart_f > iend_f){
+        tmp      = iend_f;
+        iend_f   = istart_f;
+        istart_f = tmp;
+      }
+
+      if (istart < (int)floor(istart_f)){istart = (int)floor(istart_f);}
+      if (iend >= (int)ceil(iend_f)){iend = (int)ceil(iend_f);}
+      //---
+
+      for(i2 = istart; i2 < iend; i2++)
+      {
+        // get the indices where the ray intersects the image plane
+        x_pr0 = xstart0 + (img_origin2 + i2*voxsize2 - xstart2)*d0 / d2;
+        x_pr1 = xstart1 + (img_origin2 + i2*voxsize2 - xstart2)*d1 / d2;
+
+        i0_floor = (int)floor((x_pr0 - img_origin0)/voxsize0);
+        i0_ceil  = i0_floor + 1;
+
+        i1_floor = (int)floor((x_pr1 - img_origin1)/voxsize1);
+        i1_ceil  = i1_floor + 1; 
+
+        // calculate the distances to the floor normalized to [0,1]
+        // for the bilinear interpolation
+        tmp_0 = (x_pr0 - (i0_floor*voxsize0 + img_origin0)) / voxsize0;
+        tmp_1 = (x_pr1 - (i1_floor*voxsize1 + img_origin1)) / voxsize1;
+
+        toAdd = 0;
+
+        if ((i0_floor >= 0) && (i0_floor < n0) && (i1_floor >= 0) && (i1_floor < n1))
+        {
+          toAdd += img[n1*n2*i0_floor + n2*i1_floor + i2] * (1 - tmp_0) * (1 - tmp_1);
+        }
+        if ((i0_ceil >= 0) && (i0_ceil < n0) && (i1_floor >= 0) && (i1_floor < n1))
+        {
+          toAdd += img[n1*n2*i0_ceil + n2*i1_floor + i2] * tmp_0 * (1 - tmp_1);
+        }
+        if ((i0_floor >= 0) && (i0_floor < n0) && (i1_ceil >= 0) && (i1_ceil < n1))
+        {
+          toAdd += img[n1*n2*i0_floor + n2*i1_ceil + i2] * (1 - tmp_0) * tmp_1;
+        }
+        if ((i0_ceil >= 0) && (i0_ceil < n0) && (i1_ceil >= 0) && (i1_ceil < n1))
+        {
+          toAdd += img[n1*n2*i0_ceil + n2*i1_ceil + i2] * tmp_0 * tmp_1;
+        }
+
+        if(toAdd != 0){p[i] += (cf * toAdd);}
+      }
+    }
+  }
+}
diff --git a/parallelproj.h b/parallelproj.h
index bc4b7b8..ad98e1a 100644
--- a/parallelproj.h
+++ b/parallelproj.h
@@ -11,7 +11,7 @@ void joseph3d_fwd(const float *xstart,
                   const float *img_origin, 
                   const float *voxsize, 
                   float *p,
-                  long long nlors, 
+                  size_t nlors, 
                   const int *img_dim,
                   int device_id = 0,
                   int threadsperblock = 64);

From 0b8de6f88908825a9a319d81e654d5e4120b9db5 Mon Sep 17 00:00:00 2001
From: Georg Schramm <georg@Georgs-MacBook-Pro-208.local>
Date: Wed, 9 Apr 2025 18:41:01 +0100
Subject: [PATCH 02/35] add openmp example

---
 CMakeLists.txt         |   5 +-
 main.cpp               | 194 +++++++++++++++++++++++++++++++++++++++++
 main_pp.cpp => main.cu |  46 ++++------
 3 files changed, 216 insertions(+), 29 deletions(-)
 create mode 100644 main.cpp
 rename main_pp.cpp => main.cu (85%)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2ee254a..e7ec75a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -40,12 +40,13 @@ if(CMAKE_CUDA_COMPILER)
     add_library(parallelproj SHARED joseph3d_fwd.cu)
     target_link_libraries(parallelproj PRIVATE CUDA::cudart)
 
-    add_executable(main main_pp.cpp)
+    add_executable(main main.cu)
     target_include_directories(main PRIVATE ${CUDAToolkit_INCLUDE_DIRS})
     target_link_libraries(main PRIVATE parallelproj CUDA::cudart)
 else()
     find_package(OpenMP REQUIRED)
     add_library(parallelproj SHARED joseph3d_fwd.cpp)
     target_link_libraries(parallelproj PRIVATE OpenMP::OpenMP_CXX)
-    #add_executable(main main_no_cuda.cpp)
+    add_executable(main main.cpp)
+    target_link_libraries(main PRIVATE parallelproj)
 endif()
diff --git a/main.cpp b/main.cpp
new file mode 100644
index 0000000..4f0bb76
--- /dev/null
+++ b/main.cpp
@@ -0,0 +1,194 @@
+#include "parallelproj.h"
+#include <iostream>
+#include <chrono>
+#include <cmath>
+
+int main() {
+    const size_t repetitions = 5;
+    size_t nlors = 10;
+
+    // Self-test of the OpenMP (non-CUDA) build: forward project nlors axis-parallel
+    // test rays through a small 2x3x4 image with joseph3d_fwd() and compare each
+    // result with a hand-computed line sum (voxel values times voxel size).
+
+    std::cout << "OpenMP use case\n";
+
+    int img_dim[3] = {2, 3, 4};
+
+    float voxsize[3] = {4, 3, 2};
+
+    float img_origin[3];
+    for (int i = 0; i < 3; ++i) {
+        img_origin[i] = (-(float)img_dim[i] / 2 + 0.5) * voxsize[i];
+    }
+
+    float* img = static_cast<float*>(malloc(sizeof(float) * img_dim[0] * img_dim[1] * img_dim[2]));
+    if (img == nullptr) { std::cerr << "allocation failed\n"; return 1; }
+    // fill the test image
+    for (int i0 = 0; i0 < img_dim[0]; i0++)
+    {
+        for (int i1 = 0; i1 < img_dim[1]; i1++)
+        {
+            for (int i2 = 0; i2 < img_dim[2]; i2++)
+            {
+                img[img_dim[1] * img_dim[2] * i0 + img_dim[2] * i1 + i2] = float(img_dim[1] * img_dim[2] * i0 + img_dim[2] * i1 + i2 + 1);
+                printf("%.1f ", img[img_dim[1] * img_dim[2] * i0 + img_dim[2] * i1 + i2]);
+            }
+            printf("\n");
+        }
+        printf("\n");
+    }
+
+    float id0 = static_cast<float>(img_dim[0]);
+    float id1 = static_cast<float>(img_dim[1]);
+    float id2 = static_cast<float>(img_dim[2]);
+
+    float vstart[] = {
+        0, -1, 0,           // 0
+        0, -1, 0,           // 1
+        0, -1, 1,           // 2
+        0, -1, 0.5,         // 3
+        0, 0, -1,           // 4
+        -1, 0, 0,           // 5
+        id0 - 1, -1, 0,      // 6 - (shifted 1)
+        id0 - 1, -1, id2 - 1, // 7 - (shifted 6)
+        id0 - 1, 0, -1,      // 8 - (shifted 4)
+        id0 - 1, id1 - 1, -1, // 9 - (shifted 8)
+    };
+
+    float vend[] = {
+        0, id1, 0,           // 0
+        0, id1, 0,           // 1
+        0, id1, 1,           // 2
+        0, id1, 0.5,         // 3
+        0, 0, id2,           // 4
+        id0, 0, 0,           // 5
+        id0 - 1, id1, 0,      // 6 - (shifted 1)
+        id0 - 1, id1, id2 - 1, // 7 - (shifted 6)
+        id0 - 1, 0, id2,      // 8 - (shifted 4)
+        id0 - 1, id1 - 1, id2, // 9 - (shifted 8)
+    };
+
+    for (int ir = 0; ir < nlors; ir++)
+    {
+        printf("test ray %d\n", ir);
+        printf("start voxel num .: %.1f %.1f %.1f\n", vstart[ir * 3 + 0], vstart[ir * 3 + 1], vstart[ir * 3 + 2]);
+        printf("end   voxel num .: %.1f %.1f %.1f\n", vend[ir * 3 + 0], vend[ir * 3 + 1], vend[ir * 3 + 2]);
+    }
+
+    // calculate the start and end coordinates in world coordinates
+    // (buffers come from malloc, not new[], because they are released with free() below)
+    float *xstart = static_cast<float*>(malloc(sizeof(float) * 3 * nlors));
+    float *xend = static_cast<float*>(malloc(sizeof(float) * 3 * nlors));
+    if (xstart == nullptr || xend == nullptr) { std::cerr << "allocation failed\n"; return 1; }
+    for (int ir = 0; ir < nlors; ir++)
+
+    {
+        for (int j = 0; j < 3; j++)
+        {
+            xstart[ir * 3 + j] = img_origin[j] + vstart[ir * 3 + j] * voxsize[j];
+            xend[ir * 3 + j] = img_origin[j] + vend[ir * 3 + j] * voxsize[j];
+        }
+    }
+
+    float *img_fwd = static_cast<float*>(malloc(sizeof(float) * nlors));
+    if (img_fwd == nullptr) { std::cerr << "allocation failed\n"; return 1; }
+    joseph3d_fwd(xstart, xend, img, img_origin, voxsize, img_fwd, nlors, img_dim, 0, 64);
+    /////////////////////////////////////////////////////////////////////////////
+    /////////////////////////////////////////////////////////////////////////////
+    /////////////////////////////////////////////////////////////////////////////
+
+    // calculate the expected values
+
+    int retval = 0;
+    float eps = 1e-7;
+
+    float* expected_fwd_vals = static_cast<float*>(malloc(sizeof(float) * nlors));
+    if (expected_fwd_vals == nullptr) { std::cerr << "allocation failed\n"; return 1; }
+    for (int ir = 0; ir < nlors; ir++)  // initialize expected_fwd_vals with 0s
+    {
+        expected_fwd_vals[ir] = 0;
+    }
+
+    for (int i1 = 0; i1 < img_dim[1]; i1++)
+    {
+        expected_fwd_vals[0] += img[0 * img_dim[1] * img_dim[2] + i1 * img_dim[2] + 0] * voxsize[1];
+    }
+
+    expected_fwd_vals[1] = expected_fwd_vals[0];
+
+    // calculate the expected value of ray2 from [0,-1,1] to [0,last+1,1]
+    for (int i1 = 0; i1 < img_dim[1]; i1++)
+    {
+        expected_fwd_vals[2] += img[0 * img_dim[1] * img_dim[2] + i1 * img_dim[2] + 1] * voxsize[1];
+    }
+
+    // calculate the expected value of ray3 from [0,-1,0.5] to [0,last+1,0.5]
+    expected_fwd_vals[3] = 0.5 * (expected_fwd_vals[0] + expected_fwd_vals[2]);
+
+    // calculate the expected value of ray4 from [0,0,-1] to [0,0,last+1]
+    for (int i2 = 0; i2 < img_dim[2]; i2++)
+    {
+        expected_fwd_vals[4] += img[0 * img_dim[1] * img_dim[2] + 0 * img_dim[2] + i2] * voxsize[2];
+    }
+
+    // calculate the expected value of ray5 from [-1,0,0] to [last+1,0,0]
+    for (int i0 = 0; i0 < img_dim[0]; i0++)
+    {
+        expected_fwd_vals[5] += img[i0 * img_dim[1] * img_dim[2] + 0 * img_dim[2] + 0] * voxsize[0];
+    }
+
+    // calculate the expected value of ray6 from [img_dim[0]-1,-1,0] to [img_dim[0]-1,last+1,0]
+    for (int i1 = 0; i1 < img_dim[1]; i1++)
+    {
+        expected_fwd_vals[6] += img[(img_dim[0] - 1) * img_dim[1] * img_dim[2] + i1 * img_dim[2] + 0] * voxsize[1];
+    }
+
+    // calculate the expected value of ray7 from [img_dim[0]-1,-1,img_dim[2]-1] to [img_dim[0]-1,last+1,img_dim[2]-1]
+    for (int i1 = 0; i1 < img_dim[1]; i1++)
+    {
+        expected_fwd_vals[7] += img[(img_dim[0] - 1) * img_dim[1] * img_dim[2] + i1 * img_dim[2] + (img_dim[2] - 1)] * voxsize[1];
+    }
+
+    // calculate the expected value of ray8 from [img_dim[0]-1,0,-1] to [img_dim[0]-1,0,last+1]
+    for (int i2 = 0; i2 < img_dim[2]; i2++)
+    {
+        expected_fwd_vals[8] += img[(img_dim[0] - 1) * img_dim[1] * img_dim[2] + 0 * img_dim[2] + i2] * voxsize[2];
+    }
+
+    // calculate the expected value of ray9 from [img_dim[0]-1,img_dim[1]-1,-1] to [img_dim[0]-1,img_dim[1]-1,last+1]
+    for (int i2 = 0; i2 < img_dim[2]; i2++)
+    {
+        expected_fwd_vals[9] += img[(img_dim[0] - 1) * img_dim[1] * img_dim[2] + (img_dim[1] - 1) * img_dim[2] + i2] * voxsize[2];
+    }
+
+    // check if we got the expected results
+    float fwd_diff = 0;
+    printf("\nforward projection test\n");
+    for (int ir = 0; ir < nlors; ir++)
+    {
+        printf("test ray %d: fwd projected: %.7e expected: %.7e\n", ir, img_fwd[ir], expected_fwd_vals[ir]);
+
+        fwd_diff = std::abs(img_fwd[ir] - expected_fwd_vals[ir]);
+        if (fwd_diff > eps)
+        {
+            printf("\n################################################################################");
+            printf("\nabs(fwd projected - expected value) = %.2e for ray%d above tolerance %.2e", fwd_diff, ir, eps);
+            printf("\n################################################################################\n");
+            retval = 1;
+        }
+    }
+
+    ////////////////////////////////////////////////////////////////////////////
+    // release the malloc'ed buffers
+    ////////////////////////////////////////////////////////////////////////////
+
+    free(img);
+    free(xstart);
+    free(xend);
+    free(img_fwd);
+    free(expected_fwd_vals);
+
+    return 0;
+}
+
diff --git a/main_pp.cpp b/main.cu
similarity index 85%
rename from main_pp.cpp
rename to main.cu
index 32b8339..fcc6e35 100644
--- a/main_pp.cpp
+++ b/main.cu
@@ -4,21 +4,9 @@
 #include <chrono>
 #include <cmath>
 
-void print_array(const char* label, float* array, size_t size) {
-    std::cout << label << ": ";
-    // print max 10 elements
-    size_t print_size = (size > 10) ? 10 : size;
-    for (size_t i = 0; i < print_size; ++i)
-        std::cout << array[i] << " ";
-    // print ellipses if size > 10 and the last element
-    if (size > 10)
-        std::cout << "... " << array[size - 1];
-    std::cout << "\n";
-}
-
 int main() {
     const size_t repetitions = 5;
-    long long nlors = 10;
+    size_t nlors = 10;
 
     // get the number of cuda devices - because we want to run on the last device
     int device_count;
@@ -68,6 +56,10 @@ int main() {
         printf("\n");
     }
 
+    float id0 = static_cast<float>(img_dim[0]);
+    float id1 = static_cast<float>(img_dim[1]);
+    float id2 = static_cast<float>(img_dim[2]);
+
     float vstart[] = {
         0, -1, 0,           // 0
         0, -1, 0,           // 1
@@ -75,23 +67,23 @@ int main() {
         0, -1, 0.5,         // 3
         0, 0, -1,           // 4
         -1, 0, 0,           // 5
-        img_dim[0] - 1, -1, 0,      // 6 - (shifted 1)
-        img_dim[0] - 1, -1, img_dim[2] - 1, // 7 - (shifted 6)
-        img_dim[0] - 1, 0, -1,      // 8 - (shifted 4)
-        img_dim[0] - 1, img_dim[1] - 1, -1, // 9 - (shifted 8)
+        id0 - 1, -1, 0,      // 6 - (shifted 1)
+        id0 - 1, -1, id2 - 1, // 7 - (shifted 6)
+        id0 - 1, 0, -1,      // 8 - (shifted 4)
+        id0 - 1, id1 - 1, -1, // 9 - (shifted 8)
     };
 
     float vend[] = {
-        0, img_dim[1], 0,           // 0
-        0, img_dim[1], 0,           // 1
-        0, img_dim[1], 1,           // 2
-        0, img_dim[1], 0.5,         // 3
-        0, 0, img_dim[2],           // 4
-        img_dim[0], 0, 0,           // 5
-        img_dim[0] - 1, img_dim[1], 0,      // 6 - (shifted 1)
-        img_dim[0] - 1, img_dim[1], img_dim[2] - 1, // 7 - (shifted 6)
-        img_dim[0] - 1, 0, img_dim[2],      // 8 - (shifted 4)
-        img_dim[0] - 1, img_dim[1] - 1, img_dim[2], // 9 - (shifted 8)
+        0, id1, 0,           // 0
+        0, id1, 0,           // 1
+        0, id1, 1,           // 2
+        0, id1, 0.5,         // 3
+        0, 0, id2,           // 4
+        id0, 0, 0,           // 5
+        id0 - 1, id1, 0,      // 6 - (shifted 1)
+        id0 - 1, id1, id2 - 1, // 7 - (shifted 6)
+        id0 - 1, 0, id2,      // 8 - (shifted 4)
+        id0 - 1, id1 - 1, id2, // 9 - (shifted 8)
     };
 
     for (int ir = 0; ir < nlors; ir++)

From 8d1793d201349905b0fd89fc7254a1605bdaca5b Mon Sep 17 00:00:00 2001
From: Georg Schramm <georg@Georgs-MacBook-Pro-208.local>
Date: Wed, 9 Apr 2025 18:49:07 +0100
Subject: [PATCH 03/35] return retval

---
 main.cpp | 2 +-
 main.cu  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/main.cpp b/main.cpp
index 4f0bb76..d48a01c 100644
--- a/main.cpp
+++ b/main.cpp
@@ -189,6 +189,6 @@ int main() {
     free(img_fwd);
     free(expected_fwd_vals);
 
-    return 0;
+    return retval;
 }
 
diff --git a/main.cu b/main.cu
index fcc6e35..5bdf872 100644
--- a/main.cu
+++ b/main.cu
@@ -216,6 +216,6 @@ int main() {
 
     free(expected_fwd_vals);
 
-    return 0;
+    return retval;
 }
 

From 28a65cde0c64c1b962ea8f84e8c77f95ddf5cd80 Mon Sep 17 00:00:00 2001
From: Georg Schramm <georg@Georgs-MacBook-Pro-208.local>
Date: Wed, 9 Apr 2025 18:51:42 +0100
Subject: [PATCH 04/35] mv ray cube intersection to utils

---
 joseph3d_fwd_worker.h | 64 +-----------------------------------------
 utils.h               | 65 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 66 insertions(+), 63 deletions(-)
 create mode 100644 utils.h

diff --git a/joseph3d_fwd_worker.h b/joseph3d_fwd_worker.h
index b97892f..a506d61 100644
--- a/joseph3d_fwd_worker.h
+++ b/joseph3d_fwd_worker.h
@@ -1,75 +1,13 @@
 #pragma once
 #include "cuda_compat.h"
 #include "atomic_sum.h"
+#include "utils.h"
 
 // we need to import math.h for the floor, ceil, sqrtf if not compiling for CUDA
 #ifndef __CUDA_ARCH__
 #include <math.h>
 #endif
 
-WORKER_QUALIFIER unsigned char ray_cube_intersection(float orig0,
-                                                     float orig1,
-                                                     float orig2,
-                                                     float bounds0_min,
-                                                     float bounds1_min,
-                                                     float bounds2_min,
-                                                     float bounds0_max,
-                                                     float bounds1_max,
-                                                     float bounds2_max,
-                                                     float rdir0,
-                                                     float rdir1,
-                                                     float rdir2,
-                                                     float* t1,
-                                                     float* t2){
-  // the inverse of the directional vector
-  // using the inverse of the directional vector and IEEE floating point arith standard 754
-  // makes sure that 0's in the directional vector are handled correctly 
-  float invdir0 = 1.f/rdir0;
-  float invdir1 = 1.f/rdir1;
-  float invdir2 = 1.f/rdir2;
-  
-  unsigned char intersec = 1;
-  
-  float t11, t12, t21, t22; 
-
-  if (invdir0 >= 0){
-    *t1  = (bounds0_min - orig0) * invdir0;
-    *t2  = (bounds0_max - orig0) * invdir0; 
-  }
-  else{
-    *t1  = (bounds0_max - orig0) * invdir0;
-    *t2  = (bounds0_min - orig0) * invdir0;
-  }
-  
-  if (invdir1 >= 0){
-    t11 = (bounds1_min - orig1) * invdir1; 
-    t12 = (bounds1_max - orig1) * invdir1; 
-  }
-  else{
-    t11 = (bounds1_max - orig1) * invdir1;
-    t12 = (bounds1_min - orig1) * invdir1; 
-  }
-  
-  if ((*t1 > t12) || (t11 > *t2)){intersec = 0;}
-  if (t11 > *t1){*t1 = t11;}
-  if (t12 < *t2){*t2 = t12;}
-  
-  if (invdir2 >= 0){
-    t21 = (bounds2_min - orig2) * invdir2; 
-    t22 = (bounds2_max - orig2) * invdir2;
-  } 
-  else{
-    t21 = (bounds2_max - orig2) * invdir2; 
-    t22 = (bounds2_min - orig2) * invdir2;
-  } 
-  
-  if ((*t1 > t22) || (t21 > *t2)){intersec = 0;}
-  if (t21 > *t1){*t1 = t21;}
-  if (t22 < *t2){*t2 = t22;} 
-
-  return(intersec);
-}
-
 WORKER_QUALIFIER inline void joseph3d_fwd_worker(size_t i,
                                                  const float *xstart, 
                                                  const float *xend, 
diff --git a/utils.h b/utils.h
new file mode 100644
index 0000000..69ea5ce
--- /dev/null
+++ b/utils.h
@@ -0,0 +1,65 @@
+#pragma once
+#include "cuda_compat.h"
+
+WORKER_QUALIFIER inline unsigned char ray_cube_intersection(float orig0,
+                                                            float orig1,
+                                                            float orig2,
+                                                            float bounds0_min,
+                                                            float bounds1_min,
+                                                            float bounds2_min,
+                                                            float bounds0_max,
+                                                            float bounds1_max,
+                                                            float bounds2_max,
+                                                            float rdir0,
+                                                            float rdir1,
+                                                            float rdir2,
+                                                            float* t1,
+                                                            float* t2){
+  // Slab test of the ray orig + t*rdir against the axis-aligned box [bounds_min, bounds_max]:
+  // returns 1 if the per-axis t intervals overlap, else 0; *t1 / *t2 receive the largest entry /
+  // smallest exit parameter (written in both cases). 1/rdir relies on IEEE 754 for rdir == 0.
+  float invdir0 = 1.f/rdir0;
+  float invdir1 = 1.f/rdir1;
+  float invdir2 = 1.f/rdir2;
+  // (inline: this header-defined function is included by more than one translation unit)
+  unsigned char intersec = 1;
+  
+  float t11, t12, t21, t22; 
+  // axis 0: order the two slab parameters according to the sign of the direction
+  if (invdir0 >= 0){
+    *t1  = (bounds0_min - orig0) * invdir0;
+    *t2  = (bounds0_max - orig0) * invdir0; 
+  }
+  else{
+    *t1  = (bounds0_max - orig0) * invdir0;
+    *t2  = (bounds0_min - orig0) * invdir0;
+  }
+  // axis 1
+  if (invdir1 >= 0){
+    t11 = (bounds1_min - orig1) * invdir1; 
+    t12 = (bounds1_max - orig1) * invdir1; 
+  }
+  else{
+    t11 = (bounds1_max - orig1) * invdir1;
+    t12 = (bounds1_min - orig1) * invdir1; 
+  }
+  // no overlap between the axis-0 and axis-1 intervals -> miss; then shrink [*t1, *t2]
+  if ((*t1 > t12) || (t11 > *t2)){intersec = 0;}
+  if (t11 > *t1){*t1 = t11;}
+  if (t12 < *t2){*t2 = t12;}
+  // axis 2
+  if (invdir2 >= 0){
+    t21 = (bounds2_min - orig2) * invdir2; 
+    t22 = (bounds2_max - orig2) * invdir2;
+  } 
+  else{
+    t21 = (bounds2_max - orig2) * invdir2; 
+    t22 = (bounds2_min - orig2) * invdir2;
+  } 
+  // same overlap test and shrink step against the axis-2 interval
+  if ((*t1 > t22) || (t21 > *t2)){intersec = 0;}
+  if (t21 > *t1){*t1 = t21;}
+  if (t22 < *t2){*t2 = t22;} 
+
+  return(intersec);
+}

From a79b17181371737a277e7549a239b2a389f443a9 Mon Sep 17 00:00:00 2001
From: Georg Schramm <georg@Georgs-MacBook-Pro-208.local>
Date: Wed, 9 Apr 2025 21:47:30 +0200
Subject: [PATCH 05/35] add back kernel

---
 CMakeLists.txt                     |   2 +-
 joseph3d_back.cpp                  |  21 ++
 joseph3d_back_worker.h             | 329 +++++++++++++++++++++++++++++
 joseph3d_fwd_worker.h              |   3 +-
 utils.h => ray_cube_intersection.h |   0
 5 files changed, 352 insertions(+), 3 deletions(-)
 create mode 100644 joseph3d_back.cpp
 create mode 100644 joseph3d_back_worker.h
 rename utils.h => ray_cube_intersection.h (100%)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index e7ec75a..934f362 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -45,7 +45,7 @@ if(CMAKE_CUDA_COMPILER)
     target_link_libraries(main PRIVATE parallelproj CUDA::cudart)
 else()
     find_package(OpenMP REQUIRED)
-    add_library(parallelproj SHARED joseph3d_fwd.cpp)
+    add_library(parallelproj SHARED joseph3d_fwd.cpp joseph3d_back.cpp)
     target_link_libraries(parallelproj PRIVATE OpenMP::OpenMP_CXX)
     add_executable(main main.cpp)
     target_link_libraries(main PRIVATE parallelproj)
diff --git a/joseph3d_back.cpp b/joseph3d_back.cpp
new file mode 100644
index 0000000..a58e92b
--- /dev/null
+++ b/joseph3d_back.cpp
@@ -0,0 +1,21 @@
+#include "parallelproj.h"
+#include "joseph3d_back_worker.h"
+#include "debug.h"
+
+void joseph3d_back(const float *xstart,
+                   const float *xend,
+                   float *img,
+                   const float *img_origin,
+                   const float *voxsize,
+                   const float *p,
+                   size_t nlors,
+                   const int *img_dim,
+                   int device_id,
+                   int threadsperblock)
+{
+    // Host (OpenMP) back projection: each ray index 0..nlors-1 is handed to joseph3d_back_worker.
+    // device_id and threadsperblock are accepted but not used in this function.
+#pragma omp parallel for
+    for (size_t lor = 0; lor < nlors; lor++)
+        joseph3d_back_worker(lor, xstart, xend, img, img_origin, voxsize, p, img_dim);
+}
diff --git a/joseph3d_back_worker.h b/joseph3d_back_worker.h
new file mode 100644
index 0000000..93c96f0
--- /dev/null
+++ b/joseph3d_back_worker.h
@@ -0,0 +1,329 @@
+#pragma once
+#include "cuda_compat.h"
+#include "atomic_sum.h"
+#include "ray_cube_intersection.h"
+
+// we need to import math.h for the floor, ceil, sqrtf if not compiling for CUDA
+#ifndef __CUDA_ARCH__
+#include <math.h>
+#endif
+
+WORKER_QUALIFIER inline void joseph3d_back_worker(size_t i, // index of the ray handled by this call
+                                                 const float *xstart, // read at [3*i + 0..2]: start point of ray i
+                                                 const float *xend, // read at [3*i + 0..2]: end point of ray i
+                                                 float *img, // accumulated into at n1*n2*i0 + n2*i1 + i2
+                                                 const float *img_origin, // 3 values: coordinate of voxel index 0 per axis
+                                                 const float *voxsize, // 3 values: voxel spacing per axis
+                                                 const float *p, // p[i] is the value spread along ray i
+                                                 const int *img_dim){ // 3 values: n0, n1, n2
+  // Back projects p[i] along the segment xstart -> xend of ray i into img (via atomic_sum).
+  int n0 = img_dim[0];
+  int n1 = img_dim[1];
+  int n2 = img_dim[2];
+
+  float voxsize0 = voxsize[0];
+  float voxsize1 = voxsize[1];
+  float voxsize2 = voxsize[2];
+
+  float img_origin0 = img_origin[0];
+  float img_origin1 = img_origin[1];
+  float img_origin2 = img_origin[2];
+  // rays with p[i] == 0 add nothing to img and are skipped entirely
+  if(p[i] != 0)
+  {
+
+    float d0, d1, d2, d0_sq, d1_sq, d2_sq;
+    float cs0, cs1, cs2, cf; 
+    float lsq, cos0_sq, cos1_sq, cos2_sq;
+    unsigned short direction; 
+    int i0, i1, i2;
+    int i0_floor, i1_floor, i2_floor;
+    int i0_ceil, i1_ceil, i2_ceil;
+    float x_pr0, x_pr1, x_pr2;
+    float tmp_0, tmp_1, tmp_2;
+
+    float xstart0 = xstart[i*3 + 0];
+    float xstart1 = xstart[i*3 + 1];
+    float xstart2 = xstart[i*3 + 2];
+
+    float xend0 = xend[i*3 + 0];
+    float xend1 = xend[i*3 + 1];
+    float xend2 = xend[i*3 + 2];
+
+    unsigned char intersec;
+    float t1, t2;
+    float istart_f, iend_f, tmp;
+    int   istart, iend;
+
+    // direction vector of the ray (end - start); used below to test whether
+    // the ray is most parallel with the 0, 1, or 2 axis
+    d0    = xend0 - xstart0;
+    d1    = xend1 - xstart1;
+    d2    = xend2 - xstart2;
+  
+    //-----------
+    //--- test whether ray and cube intersect (t1 / t2: ray parameters where it enters / leaves)
+    intersec = ray_cube_intersection(xstart0, xstart1, xstart2, 
+                                   img_origin0 - 1*voxsize0, img_origin1 - 1*voxsize1, img_origin2 - 1*voxsize2,
+                                   img_origin0 + n0*voxsize0, img_origin1 + n1*voxsize1, img_origin2 + n2*voxsize2,
+                                   d0, d1, d2, &t1, &t2);
+
+    if (intersec == 1)
+    {
+      d0_sq = d0*d0; 
+      d1_sq = d1*d1;
+      d2_sq = d2*d2;
+      
+      lsq = d0_sq + d1_sq + d2_sq; // squared ray length; NOTE(review): 0 if xstart == xend, cosines below become NaN
+      
+      cos0_sq = d0_sq / lsq;
+      cos1_sq = d1_sq / lsq;
+      cos2_sq = d2_sq / lsq;
+
+      cs0 = sqrtf(cos0_sq); 
+      cs1 = sqrtf(cos1_sq); 
+      cs2 = sqrtf(cos2_sq); 
+      
+      direction = 0; // axis with the largest squared cosine; on ties the later test wins (2 over 1 over 0)
+      if ((cos1_sq >= cos0_sq) && (cos1_sq >= cos2_sq))
+      {
+        direction = 1;
+      }
+      if ((cos2_sq >= cos0_sq) && (cos2_sq >= cos1_sq))
+      {
+        direction = 2;
+      }
+
+      if(direction == 0)
+      {
+        // case where ray is most parallel to the 0 axis
+        // we step through the volume along the 0 direction
+
+        // factor for correcting voxel size and |cos(theta)|
+        cf = voxsize0/cs0;
+
+        //--- check where ray enters / leaves cube
+        istart_f = (xstart0 + t1*d0 - img_origin0) / voxsize0;
+        iend_f   = (xstart0 + t2*d0 - img_origin0) / voxsize0;
+
+        if (istart_f > iend_f){
+          tmp      = iend_f;
+          iend_f   = istart_f;
+          istart_f = tmp;
+        }
+  
+        istart = (int)floor(istart_f);
+        iend   = (int)ceil(iend_f);
+        if (istart < 0){istart = 0;}
+        if (iend >= n0){iend = n0;}
+
+        // check in which "plane" the start and end points are
+        // we have to do this to avoid that we include voxels
+        // that are "outside" the line segment between xstart and xend
+        
+        // !! for these calculations we overwrite the istart_f and iend_f variables !!
+        istart_f = (xstart0 - img_origin0) / voxsize0;
+        iend_f   = (xend0   - img_origin0) / voxsize0;
+
+        if (istart_f > iend_f){
+          tmp      = iend_f;
+          iend_f   = istart_f;
+          istart_f = tmp;
+        }
+
+        if (istart < (int)floor(istart_f)){istart = (int)floor(istart_f);}
+        if (iend >= (int)ceil(iend_f)){iend = (int)ceil(iend_f);}
+        //---
+
+        for(i0 = istart; i0 < iend; i0++)
+        {
+          // get the indices where the ray intersects the image plane
+          x_pr1 = xstart1 + (img_origin0 + i0*voxsize0 - xstart0)*d1 / d0;
+          x_pr2 = xstart2 + (img_origin0 + i0*voxsize0 - xstart0)*d2 / d0;
+  
+          i1_floor = (int)floor((x_pr1 - img_origin1)/voxsize1);
+          i1_ceil  = i1_floor + 1; 
+  
+          i2_floor = (int)floor((x_pr2 - img_origin2)/voxsize2);
+          i2_ceil  = i2_floor + 1; 
+  
+          // calculate the distances to the floor normalized to [0,1]
+          // for the bilinear interpolation
+          tmp_1 = (x_pr1 - (i1_floor*voxsize1 + img_origin1)) / voxsize1;
+          tmp_2 = (x_pr2 - (i2_floor*voxsize2 + img_origin2)) / voxsize2;
+  
+          if ((i1_floor >= 0) && (i1_floor < n1) && (i2_floor >= 0) && (i2_floor < n2))
+          {
+            atomic_sum(img + n1*n2*i0 + n2*i1_floor + i2_floor, (p[i] * (1 - tmp_1) * (1 - tmp_2) * cf));
+          }
+          if ((i1_ceil >= 0) && (i1_ceil < n1) && (i2_floor >= 0) && (i2_floor < n2))
+          {
+            atomic_sum(img + n1*n2*i0 + n2*i1_ceil + i2_floor, (p[i] * tmp_1 * (1 - tmp_2) * cf));
+          }
+          if ((i1_floor >= 0) && (i1_floor < n1) && (i2_ceil >= 0) && (i2_ceil < n2))
+          {
+            atomic_sum(img + n1*n2*i0 + n2*i1_floor + i2_ceil, (p[i] * (1 - tmp_1) * tmp_2 * cf));
+          }
+          if ((i1_ceil >= 0) && (i1_ceil < n1) && (i2_ceil >= 0) && (i2_ceil < n2))
+          {
+            atomic_sum(img + n1*n2*i0 + n2*i1_ceil + i2_ceil, (p[i] * tmp_1 * tmp_2 * cf));
+          }
+        }
+      }  
+      // --------------------------------------------------------------------------------- 
+      if(direction == 1)
+      {
+        // case where ray is most parallel to the 1 axis
+        // we step through the volume along the 1 direction
+  
+        // factor for correcting voxel size and |cos(theta)|
+        cf = voxsize1/cs1;
+
+        //--- check where ray enters / leaves cube
+        istart_f = (xstart1 + t1*d1 - img_origin1) / voxsize1;
+        iend_f   = (xstart1 + t2*d1 - img_origin1) / voxsize1;
+
+        if (istart_f > iend_f){
+          tmp      = iend_f;
+          iend_f   = istart_f;
+          istart_f = tmp;
+        }
+  
+        istart = (int)floor(istart_f);
+        iend   = (int)ceil(iend_f);
+        if (istart < 0){istart = 0;}
+        if (iend >= n1){iend = n1;}
+
+        // check in which "plane" the start and end points are
+        // we have to do this to avoid that we include voxels
+        // that are "outside" the line segment between xstart and xend
+        
+        // !! for these calculations we overwrite the istart_f and iend_f variables !!
+        istart_f = (xstart1 - img_origin1) / voxsize1;
+        iend_f   = (xend1   - img_origin1) / voxsize1;
+
+        if (istart_f > iend_f){
+          tmp      = iend_f;
+          iend_f   = istart_f;
+          istart_f = tmp;
+        }
+
+        if (istart < (int)floor(istart_f)){istart = (int)floor(istart_f);}
+        if (iend >= (int)ceil(iend_f)){iend = (int)ceil(iend_f);}
+        //---
+
+        for(i1 = istart; i1 < iend; i1++)
+        {
+          // get the indices where the ray intersects the image plane
+          x_pr0 = xstart0 + (img_origin1 + i1*voxsize1 - xstart1)*d0 / d1;
+          x_pr2 = xstart2 + (img_origin1 + i1*voxsize1 - xstart1)*d2 / d1;
+  
+          i0_floor = (int)floor((x_pr0 - img_origin0)/voxsize0);
+          i0_ceil  = i0_floor + 1; 
+  
+          i2_floor = (int)floor((x_pr2 - img_origin2)/voxsize2);
+          i2_ceil  = i2_floor + 1; 
+  
+          // calculate the distances to the floor normalized to [0,1]
+          // for the bilinear interpolation
+          tmp_0 = (x_pr0 - (i0_floor*voxsize0 + img_origin0)) / voxsize0;
+          tmp_2 = (x_pr2 - (i2_floor*voxsize2 + img_origin2)) / voxsize2;
+  
+          if ((i0_floor >= 0) && (i0_floor < n0) && (i2_floor >= 0) && (i2_floor < n2)) 
+          {
+            atomic_sum(img + n1*n2*i0_floor + n2*i1 + i2_floor, (p[i] * (1 - tmp_0) * (1 - tmp_2) * cf));
+          }
+          if ((i0_ceil >= 0) && (i0_ceil < n0) && (i2_floor >= 0) && (i2_floor < n2))
+          {
+            atomic_sum(img + n1*n2*i0_ceil + n2*i1 + i2_floor, (p[i] * tmp_0 * (1 - tmp_2) * cf));
+          }
+          if ((i0_floor >= 0) && (i0_floor < n0) && (i2_ceil >= 0) && (i2_ceil < n2))
+          {
+            atomic_sum(img + n1*n2*i0_floor + n2*i1 + i2_ceil, (p[i] * (1 - tmp_0) * tmp_2 * cf));
+          }
+          if((i0_ceil >= 0) && (i0_ceil < n0) && (i2_ceil >= 0) && (i2_ceil < n2))
+          {
+            atomic_sum(img + n1*n2*i0_ceil + n2*i1 + i2_ceil, (p[i] * tmp_0 * tmp_2 * cf));
+          }
+        }
+      }
+      //--------------------------------------------------------------------------------- 
+      if (direction == 2)
+      {
+        // case where ray is most parallel to the 2 axis
+        // we step through the volume along the 2 direction
+  
+        // factor for correcting voxel size and |cos(theta)|
+        cf = voxsize2/cs2;
+  
+        //--- check where ray enters / leaves cube
+        istart_f = (xstart2 + t1*d2 - img_origin2) / voxsize2;
+        iend_f   = (xstart2 + t2*d2 - img_origin2) / voxsize2;
+
+        if (istart_f > iend_f){
+          tmp      = iend_f;
+          iend_f   = istart_f;
+          istart_f = tmp;
+        }
+  
+        istart = (int)floor(istart_f);
+        iend   = (int)ceil(iend_f);
+        if (istart < 0){istart = 0;}
+        if (iend >= n2){iend = n2;}
+
+        // check in which "plane" the start and end points are
+        // we have to do this to avoid that we include voxels
+        // that are "outside" the line segment between xstart and xend
+        
+        // !! for these calculations we overwrite the istart_f and iend_f variables !!
+        istart_f = (xstart2 - img_origin2) / voxsize2;
+        iend_f   = (xend2   - img_origin2) / voxsize2;
+
+        if (istart_f > iend_f){
+          tmp      = iend_f;
+          iend_f   = istart_f;
+          istart_f = tmp;
+        }
+
+        if (istart < (int)floor(istart_f)){istart = (int)floor(istart_f);}
+        if (iend >= (int)ceil(iend_f)){iend = (int)ceil(iend_f);}
+        //---
+
+        for(i2 = istart; i2 < iend; i2++)
+        {
+          // get the indices where the ray intersects the image plane
+          x_pr0 = xstart0 + (img_origin2 + i2*voxsize2 - xstart2)*d0 / d2;
+          x_pr1 = xstart1 + (img_origin2 + i2*voxsize2 - xstart2)*d1 / d2;
+  
+          i0_floor = (int)floor((x_pr0 - img_origin0)/voxsize0);
+          i0_ceil  = i0_floor + 1; 
+  
+          i1_floor = (int)floor((x_pr1 - img_origin1)/voxsize1);
+          i1_ceil  = i1_floor + 1; 
+  
+          // calculate the distances to the floor normalized to [0,1]
+          // for the bilinear interpolation
+          tmp_0 = (x_pr0 - (i0_floor*voxsize0 + img_origin0)) / voxsize0;
+          tmp_1 = (x_pr1 - (i1_floor*voxsize1 + img_origin1)) / voxsize1;
+  
+          if ((i0_floor >= 0) && (i0_floor < n0) && (i1_floor >= 0) && (i1_floor < n1))
+          {
+            atomic_sum(img + n1*n2*i0_floor +  n2*i1_floor + i2, (p[i] * (1 - tmp_0) * (1 - tmp_1) * cf));
+          }
+          if ((i0_ceil >= 0) && (i0_ceil < n0) && (i1_floor >= 0) && (i1_floor < n1))
+          {
+            atomic_sum(img + n1*n2*i0_ceil + n2*i1_floor + i2, (p[i] * tmp_0 * (1 - tmp_1) * cf));
+          }
+          if ((i0_floor >= 0) && (i0_floor < n0) && (i1_ceil >= 0) && (i1_ceil < n1))
+          {
+            atomic_sum(img + n1*n2*i0_floor + n2*i1_ceil + i2, (p[i] * (1 - tmp_0) * tmp_1 * cf));
+          }
+          if ((i0_ceil >= 0) && (i0_ceil < n0) && (i1_ceil >= 0) && (i1_ceil < n1))
+          {
+            atomic_sum(img + n1*n2*i0_ceil + n2*i1_ceil + i2, (p[i] * tmp_0 * tmp_1 * cf));
+          }
+        }
+      }
+    }
+  }
+}
diff --git a/joseph3d_fwd_worker.h b/joseph3d_fwd_worker.h
index a506d61..a8b3fe4 100644
--- a/joseph3d_fwd_worker.h
+++ b/joseph3d_fwd_worker.h
@@ -1,7 +1,6 @@
 #pragma once
 #include "cuda_compat.h"
-#include "atomic_sum.h"
-#include "utils.h"
+#include "ray_cube_intersection.h"
 
 // we need to import math.h for the floor, ceil, sqrtf if not compiling for CUDA
 #ifndef __CUDA_ARCH__
diff --git a/utils.h b/ray_cube_intersection.h
similarity index 100%
rename from utils.h
rename to ray_cube_intersection.h

From 86fc2d03ad537bcddebdfcfcff27faebc1995638 Mon Sep 17 00:00:00 2001
From: Georg Schramm <georg@Georgs-MacBook-Pro-208.local>
Date: Wed, 9 Apr 2025 21:53:17 +0200
Subject: [PATCH 06/35] rename files

---
 atomic_sum.h                       | 11 -----------
 joseph3d_back_worker.h             |  3 +--
 joseph3d_fwd_worker.h              |  2 +-
 square_op.h                        |  2 +-
 ray_cube_intersection.h => utils.h | 11 ++++++++++-
 5 files changed, 13 insertions(+), 16 deletions(-)
 delete mode 100644 atomic_sum.h
 rename ray_cube_intersection.h => utils.h (89%)

diff --git a/atomic_sum.h b/atomic_sum.h
deleted file mode 100644
index dc08883..0000000
--- a/atomic_sum.h
+++ /dev/null
@@ -1,11 +0,0 @@
-#pragma once
-#include "cuda_compat.h"
-
-WORKER_QUALIFIER inline void atomic_sum(float* target, float value) {
-#ifdef __CUDA_ARCH__
-    atomicAdd(target, value);
-#else
-#pragma omp atomic
-    *target += value;
-#endif
-}
diff --git a/joseph3d_back_worker.h b/joseph3d_back_worker.h
index 93c96f0..f3c7500 100644
--- a/joseph3d_back_worker.h
+++ b/joseph3d_back_worker.h
@@ -1,7 +1,6 @@
 #pragma once
 #include "cuda_compat.h"
-#include "atomic_sum.h"
-#include "ray_cube_intersection.h"
+#include "utils.h"
 
 // we need to import math.h for the floor, ceil, sqrtf if not compiling for CUDA
 #ifndef __CUDA_ARCH__
diff --git a/joseph3d_fwd_worker.h b/joseph3d_fwd_worker.h
index a8b3fe4..f4f8f73 100644
--- a/joseph3d_fwd_worker.h
+++ b/joseph3d_fwd_worker.h
@@ -1,6 +1,6 @@
 #pragma once
 #include "cuda_compat.h"
-#include "ray_cube_intersection.h"
+#include "utils.h"
 
 // we need to import math.h for the floor, ceil, sqrtf if not compiling for CUDA
 #ifndef __CUDA_ARCH__
diff --git a/square_op.h b/square_op.h
index 2eda75b..78d9b65 100644
--- a/square_op.h
+++ b/square_op.h
@@ -1,6 +1,6 @@
 #pragma once
 #include "cuda_compat.h"
-#include "atomic_sum.h"
+#include "utils.h"
 
 CUDA_HOST_DEVICE inline void compute_and_accumulate(float* array, size_t idx, float* result_sum) {
     array[idx] += 1;
diff --git a/ray_cube_intersection.h b/utils.h
similarity index 89%
rename from ray_cube_intersection.h
rename to utils.h
index 69ea5ce..a9e642c 100644
--- a/ray_cube_intersection.h
+++ b/utils.h
@@ -1,7 +1,16 @@
 #pragma once
 #include "cuda_compat.h"
 
-WORKER_QUALIFIER unsigned char ray_cube_intersection(float orig0,
+WORKER_QUALIFIER inline void atomic_sum(float* target, float value) {
+#ifdef __CUDA_ARCH__
+    atomicAdd(target, value);
+#else
+#pragma omp atomic
+    *target += value;
+#endif
+}
+
+WORKER_QUALIFIER inline unsigned char ray_cube_intersection(float orig0,
                                                      float orig1,
                                                      float orig2,
                                                      float bounds0_min,

From 10269350b8a0098d3941749ed01df80770b50290 Mon Sep 17 00:00:00 2001
From: Georg Schramm <georg@Georgs-MacBook-Pro-208.local>
Date: Wed, 9 Apr 2025 22:10:57 +0200
Subject: [PATCH 07/35] add back projection test to cpp file

---
 main.cpp       | 63 +++++++++++++++++++++++++++++++++++++++++++++++++-
 parallelproj.h | 29 +++++++++++++++++++----
 2 files changed, 86 insertions(+), 6 deletions(-)

diff --git a/main.cpp b/main.cpp
index d48a01c..641c0b1 100644
--- a/main.cpp
+++ b/main.cpp
@@ -183,12 +183,73 @@ int main() {
     ////////////////////////////////////////////////////////////////////////////
     ////////////////////////////////////////////////////////////////////////////
 
+    // test the back projection
+
+    float* bimg = new float[img_dim[0] * img_dim[1] * img_dim[2]];
+    for (size_t i = 0; i < (img_dim[0]*img_dim[1]*img_dim[2]); i++){
+        bimg[i] = 0;
+    }
+
+    float* ones = new float[nlors];
+    for (size_t i = 0; i < nlors; i++){
+        ones[i] = 1;
+    }
+
+    joseph3d_back(xstart, xend, bimg, img_origin, voxsize, ones, nlors, img_dim);
+
+    printf("\nback projection of ones along all rays:\n");
+    for (size_t i0 = 0; i0 < img_dim[0]; i0++)
+    {
+        for (size_t i1 = 0; i1 < img_dim[1]; i1++)
+        {
+            for (size_t i2 = 0; i2 < img_dim[2]; i2++)
+            {
+                printf("%.1f ", bimg[img_dim[1] * img_dim[2] * i0 + img_dim[2] * i1 + i2]);
+            }
+            printf("\n");
+        }
+        printf("\n");
+    }
+
+    // To test whether the back projection is correct, we test if the back projector is the adjoint
+    // of the forward projector. This is more practical than checking a lot of single voxels in the
+    // back projected image.
+
+    float inner_product1 = 0;
+    float inner_product2 = 0;
+
+    for (size_t i = 0; i < (img_dim[0] * img_dim[1] * img_dim[2]); i++)
+    {
+        inner_product1 += (img[i] * bimg[i]);
+    }
+
+    for (size_t ir = 0; ir < nlors; ir++)
+    {
+        inner_product2 += (img_fwd[ir] * ones[ir]);
+    }
+
+    float ip_diff = fabs(inner_product1 - inner_product2);
+
+    if (ip_diff > eps)
+    {
+        printf("\n#########################################################################");
+        printf("\nback projection test failed. back projection seems not to be the adjoint.");
+        printf("\n %.7e", ip_diff);
+        printf("\n#########################################################################\n");
+        retval = 1;
+    }
+
+    ////////////////////////////////////////////////////////////////////////////
+    ////////////////////////////////////////////////////////////////////////////
+    ////////////////////////////////////////////////////////////////////////////
     free(img);
     free(xstart);
     free(xend);
     free(img_fwd);
     free(expected_fwd_vals);
 
+    free(bimg);
+    free(ones);
+
     return retval;
 }
-
diff --git a/parallelproj.h b/parallelproj.h
index ad98e1a..e554942 100644
--- a/parallelproj.h
+++ b/parallelproj.h
@@ -5,13 +5,13 @@
 extern "C" {
 #endif
 
-void joseph3d_fwd(const float *xstart, 
-                  const float *xend, 
+void joseph3d_fwd(const float *xstart,
+                  const float *xend,
                   const float *img,
-                  const float *img_origin, 
-                  const float *voxsize, 
+                  const float *img_origin,
+                  const float *voxsize,
                   float *p,
-                  size_t nlors, 
+                  size_t nlors,
                   const int *img_dim,
                   int device_id = 0,
                   int threadsperblock = 64);
@@ -19,3 +19,22 @@ void joseph3d_fwd(const float *xstart,
 #ifdef __cplusplus
 }
 #endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void joseph3d_back(const float *xstart,
+                   const float *xend,
+                   float *img,
+                   const float *img_origin, 
+                   const float *voxsize, 
+                   const float *p,
+                   size_t nlors, 
+                   const int *img_dim,
+                   int device_id = 0,
+                   int threadsperblock = 64);
+
+#ifdef __cplusplus
+}
+#endif

From 3cc377a05a2a1bda2bf3ab76895f836862ca2dde Mon Sep 17 00:00:00 2001
From: Georg Schramm <georg@Georgs-MacBook-Pro-208.local>
Date: Wed, 9 Apr 2025 22:18:06 +0200
Subject: [PATCH 08/35] reformat code

---
 cuda_compat.h          |   4 +-
 debug.h                |  13 +-
 joseph3d_back.cpp      |  25 +--
 joseph3d_back_worker.h | 335 +++++++++++++++++++++++------------------
 joseph3d_fwd.cpp       |  17 ++-
 joseph3d_fwd.cu        |  73 ++++-----
 joseph3d_fwd_worker.h  | 274 +++++++++++++++++++--------------
 main.cpp               |  54 ++++---
 main.cu                |  57 ++++---
 utils.h                | 128 +++++++++-------
 10 files changed, 559 insertions(+), 421 deletions(-)

diff --git a/cuda_compat.h b/cuda_compat.h
index 21451f1..9b3752c 100644
--- a/cuda_compat.h
+++ b/cuda_compat.h
@@ -1,7 +1,7 @@
 #pragma once
 
 #ifdef __CUDACC__
-  #define WORKER_QUALIFIER __device__
+#define WORKER_QUALIFIER __device__
 #else
-  #define WORKER_QUALIFIER
+#define WORKER_QUALIFIER
 #endif
diff --git a/debug.h b/debug.h
index 9535f81..fe5244e 100644
--- a/debug.h
+++ b/debug.h
@@ -4,11 +4,18 @@
 #include <cstdio>
 
 // Safe printf wrapper
-#define DPRINTF(...) do { printf(__VA_ARGS__); } while (0)
+#define DPRINTF(...)         \
+    do                       \
+    {                        \
+        printf(__VA_ARGS__); \
+    } while (0)
 
 // Conditional debug print
 #ifdef DEBUG
-    #define DEBUG_PRINT(...) DPRINTF(__VA_ARGS__)
+#define DEBUG_PRINT(...) DPRINTF(__VA_ARGS__)
 #else
-    #define DEBUG_PRINT(...) do {} while (0)
+#define DEBUG_PRINT(...) \
+    do                   \
+    {                    \
+    } while (0)
 #endif
diff --git a/joseph3d_back.cpp b/joseph3d_back.cpp
index a58e92b..f63d8da 100644
--- a/joseph3d_back.cpp
+++ b/joseph3d_back.cpp
@@ -2,20 +2,21 @@
 #include "joseph3d_back_worker.h"
 #include "debug.h"
 
-void joseph3d_back(const float *xstart, 
-                  const float *xend, 
-                  float *img,
-                  const float *img_origin, 
-                  const float *voxsize, 
-                  const float *p,
-                  size_t nlors, 
-                  const int *img_dim,
-                  int device_id,
-                  int threadsperblock){
+void joseph3d_back(const float *xstart,
+                   const float *xend,
+                   float *img,
+                   const float *img_origin,
+                   const float *voxsize,
+                   const float *p,
+                   size_t nlors,
+                   const int *img_dim,
+                   int device_id,
+                   int threadsperblock)
+{
 
 #pragma omp parallel for
-    for (size_t i = 0; i < nlors; ++i) {
+    for (size_t i = 0; i < nlors; ++i)
+    {
         joseph3d_back_worker(i, xstart, xend, img, img_origin, voxsize, p, img_dim);
     }
-                    
 }
diff --git a/joseph3d_back_worker.h b/joseph3d_back_worker.h
index f3c7500..b7fdb23 100644
--- a/joseph3d_back_worker.h
+++ b/joseph3d_back_worker.h
@@ -8,13 +8,14 @@
 #endif
 
 WORKER_QUALIFIER inline void joseph3d_back_worker(size_t i,
-                                                 const float *xstart, 
-                                                 const float *xend, 
-                                                 float *img,
-                                                 const float *img_origin, 
-                                                 const float *voxsize, 
-                                                 const float *p,
-                                                 const int *img_dim){
+                                                  const float *xstart,
+                                                  const float *xend,
+                                                  float *img,
+                                                  const float *img_origin,
+                                                  const float *voxsize,
+                                                  const float *p,
+                                                  const int *img_dim)
+{
 
   int n0 = img_dim[0];
   int n1 = img_dim[1];
@@ -28,61 +29,61 @@ WORKER_QUALIFIER inline void joseph3d_back_worker(size_t i,
   float img_origin1 = img_origin[1];
   float img_origin2 = img_origin[2];
 
-  if(p[i] != 0)
+  if (p[i] != 0)
   {
 
     float d0, d1, d2, d0_sq, d1_sq, d2_sq;
-    float cs0, cs1, cs2, cf; 
+    float cs0, cs1, cs2, cf;
     float lsq, cos0_sq, cos1_sq, cos2_sq;
-    unsigned short direction; 
+    unsigned short direction;
     int i0, i1, i2;
     int i0_floor, i1_floor, i2_floor;
     int i0_ceil, i1_ceil, i2_ceil;
     float x_pr0, x_pr1, x_pr2;
     float tmp_0, tmp_1, tmp_2;
 
-    float xstart0 = xstart[i*3 + 0];
-    float xstart1 = xstart[i*3 + 1];
-    float xstart2 = xstart[i*3 + 2];
+    float xstart0 = xstart[i * 3 + 0];
+    float xstart1 = xstart[i * 3 + 1];
+    float xstart2 = xstart[i * 3 + 2];
 
-    float xend0 = xend[i*3 + 0];
-    float xend1 = xend[i*3 + 1];
-    float xend2 = xend[i*3 + 2];
+    float xend0 = xend[i * 3 + 0];
+    float xend1 = xend[i * 3 + 1];
+    float xend2 = xend[i * 3 + 2];
 
     unsigned char intersec;
     float t1, t2;
     float istart_f, iend_f, tmp;
-    int   istart, iend;
+    int istart, iend;
 
     // test whether the ray between the two detectors is most parallel
     // with the 0, 1, or 2 axis
-    d0    = xend0 - xstart0;
-    d1    = xend1 - xstart1;
-    d2    = xend2 - xstart2;
-  
+    d0 = xend0 - xstart0;
+    d1 = xend1 - xstart1;
+    d2 = xend2 - xstart2;
+
     //-----------
     //--- test whether ray and cube intersect
-    intersec = ray_cube_intersection(xstart0, xstart1, xstart2, 
-                                   img_origin0 - 1*voxsize0, img_origin1 - 1*voxsize1, img_origin2 - 1*voxsize2,
-                                   img_origin0 + n0*voxsize0, img_origin1 + n1*voxsize1, img_origin2 + n2*voxsize2,
-                                   d0, d1, d2, &t1, &t2);
+    intersec = ray_cube_intersection(xstart0, xstart1, xstart2,
+                                     img_origin0 - 1 * voxsize0, img_origin1 - 1 * voxsize1, img_origin2 - 1 * voxsize2,
+                                     img_origin0 + n0 * voxsize0, img_origin1 + n1 * voxsize1, img_origin2 + n2 * voxsize2,
+                                     d0, d1, d2, &t1, &t2);
 
     if (intersec == 1)
     {
-      d0_sq = d0*d0; 
-      d1_sq = d1*d1;
-      d2_sq = d2*d2;
-      
+      d0_sq = d0 * d0;
+      d1_sq = d1 * d1;
+      d2_sq = d2 * d2;
+
       lsq = d0_sq + d1_sq + d2_sq;
-      
+
       cos0_sq = d0_sq / lsq;
       cos1_sq = d1_sq / lsq;
       cos2_sq = d2_sq / lsq;
 
-      cs0 = sqrtf(cos0_sq); 
-      cs1 = sqrtf(cos1_sq); 
-      cs2 = sqrtf(cos2_sq); 
-      
+      cs0 = sqrtf(cos0_sq);
+      cs1 = sqrtf(cos1_sq);
+      cs2 = sqrtf(cos2_sq);
+
       direction = 0;
       if ((cos1_sq >= cos0_sq) && (cos1_sq >= cos2_sq))
       {
@@ -93,233 +94,275 @@ WORKER_QUALIFIER inline void joseph3d_back_worker(size_t i,
         direction = 2;
       }
 
-      if(direction == 0)
+      if (direction == 0)
       {
         // case where ray is most parallel to the 0 axis
         // we step through the volume along the 0 direction
 
         // factor for correctiong voxel size and |cos(theta)|
-        cf = voxsize0/cs0;
+        cf = voxsize0 / cs0;
 
         //--- check where ray enters / leaves cube
-        istart_f = (xstart0 + t1*d0 - img_origin0) / voxsize0;
-        iend_f   = (xstart0 + t2*d0 - img_origin0) / voxsize0;
+        istart_f = (xstart0 + t1 * d0 - img_origin0) / voxsize0;
+        iend_f = (xstart0 + t2 * d0 - img_origin0) / voxsize0;
 
-        if (istart_f > iend_f){
-          tmp      = iend_f;
-          iend_f   = istart_f;
+        if (istart_f > iend_f)
+        {
+          tmp = iend_f;
+          iend_f = istart_f;
           istart_f = tmp;
         }
-  
+
         istart = (int)floor(istart_f);
-        iend   = (int)ceil(iend_f);
-        if (istart < 0){istart = 0;}
-        if (iend >= n0){iend = n0;}
+        iend = (int)ceil(iend_f);
+        if (istart < 0)
+        {
+          istart = 0;
+        }
+        if (iend >= n0)
+        {
+          iend = n0;
+        }
 
         // check in which "plane" the start and end points are
         // we have to do this to avoid that we include voxels
         // that are "outside" the line segment bewteen xstart and xend
-        
+
         // !! for these calculations we overwrite the istart_f and iend_f variables !!
         istart_f = (xstart0 - img_origin0) / voxsize0;
-        iend_f   = (xend0   - img_origin0) / voxsize0;
+        iend_f = (xend0 - img_origin0) / voxsize0;
 
-        if (istart_f > iend_f){
-          tmp      = iend_f;
-          iend_f   = istart_f;
+        if (istart_f > iend_f)
+        {
+          tmp = iend_f;
+          iend_f = istart_f;
           istart_f = tmp;
         }
 
-        if (istart < (int)floor(istart_f)){istart = (int)floor(istart_f);}
-        if (iend >= (int)ceil(iend_f)){iend = (int)ceil(iend_f);}
+        if (istart < (int)floor(istart_f))
+        {
+          istart = (int)floor(istart_f);
+        }
+        if (iend >= (int)ceil(iend_f))
+        {
+          iend = (int)ceil(iend_f);
+        }
         //---
 
-        for(i0 = istart; i0 < iend; i0++)
+        for (i0 = istart; i0 < iend; i0++)
         {
           // get the indices where the ray intersects the image plane
-          x_pr1 = xstart1 + (img_origin0 + i0*voxsize0 - xstart0)*d1 / d0;
-          x_pr2 = xstart2 + (img_origin0 + i0*voxsize0 - xstart0)*d2 / d0;
-  
-          i1_floor = (int)floor((x_pr1 - img_origin1)/voxsize1);
-          i1_ceil  = i1_floor + 1; 
-  
-          i2_floor = (int)floor((x_pr2 - img_origin2)/voxsize2);
-          i2_ceil  = i2_floor + 1; 
-  
+          x_pr1 = xstart1 + (img_origin0 + i0 * voxsize0 - xstart0) * d1 / d0;
+          x_pr2 = xstart2 + (img_origin0 + i0 * voxsize0 - xstart0) * d2 / d0;
+
+          i1_floor = (int)floor((x_pr1 - img_origin1) / voxsize1);
+          i1_ceil = i1_floor + 1;
+
+          i2_floor = (int)floor((x_pr2 - img_origin2) / voxsize2);
+          i2_ceil = i2_floor + 1;
+
           // calculate the distances to the floor normalized to [0,1]
           // for the bilinear interpolation
-          tmp_1 = (x_pr1 - (i1_floor*voxsize1 + img_origin1)) / voxsize1;
-          tmp_2 = (x_pr2 - (i2_floor*voxsize2 + img_origin2)) / voxsize2;
-  
+          tmp_1 = (x_pr1 - (i1_floor * voxsize1 + img_origin1)) / voxsize1;
+          tmp_2 = (x_pr2 - (i2_floor * voxsize2 + img_origin2)) / voxsize2;
+
           if ((i1_floor >= 0) && (i1_floor < n1) && (i2_floor >= 0) && (i2_floor < n2))
           {
-            atomic_sum(img + n1*n2*i0 + n2*i1_floor + i2_floor, (p[i] * (1 - tmp_1) * (1 - tmp_2) * cf));
+            atomic_sum(img + n1 * n2 * i0 + n2 * i1_floor + i2_floor, (p[i] * (1 - tmp_1) * (1 - tmp_2) * cf));
           }
           if ((i1_ceil >= 0) && (i1_ceil < n1) && (i2_floor >= 0) && (i2_floor < n2))
           {
-            atomic_sum(img + n1*n2*i0 + n2*i1_ceil + i2_floor, (p[i] * tmp_1 * (1 - tmp_2) * cf));
+            atomic_sum(img + n1 * n2 * i0 + n2 * i1_ceil + i2_floor, (p[i] * tmp_1 * (1 - tmp_2) * cf));
           }
           if ((i1_floor >= 0) && (i1_floor < n1) && (i2_ceil >= 0) && (i2_ceil < n2))
           {
-            atomic_sum(img + n1*n2*i0 + n2*i1_floor + i2_ceil, (p[i] * (1 - tmp_1) * tmp_2 * cf));
+            atomic_sum(img + n1 * n2 * i0 + n2 * i1_floor + i2_ceil, (p[i] * (1 - tmp_1) * tmp_2 * cf));
           }
           if ((i1_ceil >= 0) && (i1_ceil < n1) && (i2_ceil >= 0) && (i2_ceil < n2))
           {
-            atomic_sum(img + n1*n2*i0 + n2*i1_ceil + i2_ceil, (p[i] * tmp_1 * tmp_2 * cf));
+            atomic_sum(img + n1 * n2 * i0 + n2 * i1_ceil + i2_ceil, (p[i] * tmp_1 * tmp_2 * cf));
           }
         }
-      }  
-      // --------------------------------------------------------------------------------- 
-      if(direction == 1)
+      }
+      // ---------------------------------------------------------------------------------
+      if (direction == 1)
       {
         // case where ray is most parallel to the 1 axis
         // we step through the volume along the 1 direction
-  
+
         // factor for correctiong voxel size and |cos(theta)|
-        cf = voxsize1/cs1;
+        cf = voxsize1 / cs1;
 
         //--- check where ray enters / leaves cube
-        istart_f = (xstart1 + t1*d1 - img_origin1) / voxsize1;
-        iend_f   = (xstart1 + t2*d1 - img_origin1) / voxsize1;
+        istart_f = (xstart1 + t1 * d1 - img_origin1) / voxsize1;
+        iend_f = (xstart1 + t2 * d1 - img_origin1) / voxsize1;
 
-        if (istart_f > iend_f){
-          tmp      = iend_f;
-          iend_f   = istart_f;
+        if (istart_f > iend_f)
+        {
+          tmp = iend_f;
+          iend_f = istart_f;
           istart_f = tmp;
         }
-  
+
         istart = (int)floor(istart_f);
-        iend   = (int)ceil(iend_f);
-        if (istart < 0){istart = 0;}
-        if (iend >= n1){iend = n1;}
+        iend = (int)ceil(iend_f);
+        if (istart < 0)
+        {
+          istart = 0;
+        }
+        if (iend >= n1)
+        {
+          iend = n1;
+        }
 
         // check in which "plane" the start and end points are
         // we have to do this to avoid that we include voxels
         // that are "outside" the line segment bewteen xstart and xend
-        
+
         // !! for these calculations we overwrite the istart_f and iend_f variables !!
         istart_f = (xstart1 - img_origin1) / voxsize1;
-        iend_f   = (xend1   - img_origin1) / voxsize1;
+        iend_f = (xend1 - img_origin1) / voxsize1;
 
-        if (istart_f > iend_f){
-          tmp      = iend_f;
-          iend_f   = istart_f;
+        if (istart_f > iend_f)
+        {
+          tmp = iend_f;
+          iend_f = istart_f;
           istart_f = tmp;
         }
 
-        if (istart < (int)floor(istart_f)){istart = (int)floor(istart_f);}
-        if (iend >= (int)ceil(iend_f)){iend = (int)ceil(iend_f);}
+        if (istart < (int)floor(istart_f))
+        {
+          istart = (int)floor(istart_f);
+        }
+        if (iend >= (int)ceil(iend_f))
+        {
+          iend = (int)ceil(iend_f);
+        }
         //---
 
-        for(i1 = istart; i1 < iend; i1++)
+        for (i1 = istart; i1 < iend; i1++)
         {
           // get the indices where the ray intersects the image plane
-          x_pr0 = xstart0 + (img_origin1 + i1*voxsize1 - xstart1)*d0 / d1;
-          x_pr2 = xstart2 + (img_origin1 + i1*voxsize1 - xstart1)*d2 / d1;
-  
-          i0_floor = (int)floor((x_pr0 - img_origin0)/voxsize0);
-          i0_ceil  = i0_floor + 1; 
-  
-          i2_floor = (int)floor((x_pr2 - img_origin2)/voxsize2);
-          i2_ceil  = i2_floor + 1; 
-  
+          x_pr0 = xstart0 + (img_origin1 + i1 * voxsize1 - xstart1) * d0 / d1;
+          x_pr2 = xstart2 + (img_origin1 + i1 * voxsize1 - xstart1) * d2 / d1;
+
+          i0_floor = (int)floor((x_pr0 - img_origin0) / voxsize0);
+          i0_ceil = i0_floor + 1;
+
+          i2_floor = (int)floor((x_pr2 - img_origin2) / voxsize2);
+          i2_ceil = i2_floor + 1;
+
           // calculate the distances to the floor normalized to [0,1]
           // for the bilinear interpolation
-          tmp_0 = (x_pr0 - (i0_floor*voxsize0 + img_origin0)) / voxsize0;
-          tmp_2 = (x_pr2 - (i2_floor*voxsize2 + img_origin2)) / voxsize2;
-  
-          if ((i0_floor >= 0) && (i0_floor < n0) && (i2_floor >= 0) && (i2_floor < n2)) 
+          tmp_0 = (x_pr0 - (i0_floor * voxsize0 + img_origin0)) / voxsize0;
+          tmp_2 = (x_pr2 - (i2_floor * voxsize2 + img_origin2)) / voxsize2;
+
+          if ((i0_floor >= 0) && (i0_floor < n0) && (i2_floor >= 0) && (i2_floor < n2))
           {
-            atomic_sum(img + n1*n2*i0_floor + n2*i1 + i2_floor, (p[i] * (1 - tmp_0) * (1 - tmp_2) * cf));
+            atomic_sum(img + n1 * n2 * i0_floor + n2 * i1 + i2_floor, (p[i] * (1 - tmp_0) * (1 - tmp_2) * cf));
           }
           if ((i0_ceil >= 0) && (i0_ceil < n0) && (i2_floor >= 0) && (i2_floor < n2))
           {
-            atomic_sum(img + n1*n2*i0_ceil + n2*i1 + i2_floor, (p[i] * tmp_0 * (1 - tmp_2) * cf));
+            atomic_sum(img + n1 * n2 * i0_ceil + n2 * i1 + i2_floor, (p[i] * tmp_0 * (1 - tmp_2) * cf));
           }
           if ((i0_floor >= 0) && (i0_floor < n0) && (i2_ceil >= 0) && (i2_ceil < n2))
           {
-            atomic_sum(img + n1*n2*i0_floor + n2*i1 + i2_ceil, (p[i] * (1 - tmp_0) * tmp_2 * cf));
+            atomic_sum(img + n1 * n2 * i0_floor + n2 * i1 + i2_ceil, (p[i] * (1 - tmp_0) * tmp_2 * cf));
           }
-          if((i0_ceil >= 0) && (i0_ceil < n0) && (i2_ceil >= 0) && (i2_ceil < n2))
+          if ((i0_ceil >= 0) && (i0_ceil < n0) && (i2_ceil >= 0) && (i2_ceil < n2))
           {
-            atomic_sum(img + n1*n2*i0_ceil + n2*i1 + i2_ceil, (p[i] * tmp_0 * tmp_2 * cf));
+            atomic_sum(img + n1 * n2 * i0_ceil + n2 * i1 + i2_ceil, (p[i] * tmp_0 * tmp_2 * cf));
           }
         }
       }
-      //--------------------------------------------------------------------------------- 
+      //---------------------------------------------------------------------------------
       if (direction == 2)
       {
         // case where ray is most parallel to the 2 axis
         // we step through the volume along the 2 direction
-  
+
         // factor for correctiong voxel size and |cos(theta)|
-        cf = voxsize2/cs2;
-  
+        cf = voxsize2 / cs2;
+
         //--- check where ray enters / leaves cube
-        istart_f = (xstart2 + t1*d2 - img_origin2) / voxsize2;
-        iend_f   = (xstart2 + t2*d2 - img_origin2) / voxsize2;
+        istart_f = (xstart2 + t1 * d2 - img_origin2) / voxsize2;
+        iend_f = (xstart2 + t2 * d2 - img_origin2) / voxsize2;
 
-        if (istart_f > iend_f){
-          tmp      = iend_f;
-          iend_f   = istart_f;
+        if (istart_f > iend_f)
+        {
+          tmp = iend_f;
+          iend_f = istart_f;
           istart_f = tmp;
         }
-  
+
         istart = (int)floor(istart_f);
-        iend   = (int)ceil(iend_f);
-        if (istart < 0){istart = 0;}
-        if (iend >= n2){iend = n2;}
+        iend = (int)ceil(iend_f);
+        if (istart < 0)
+        {
+          istart = 0;
+        }
+        if (iend >= n2)
+        {
+          iend = n2;
+        }
 
         // check in which "plane" the start and end points are
         // we have to do this to avoid that we include voxels
         // that are "outside" the line segment bewteen xstart and xend
-        
+
         // !! for these calculations we overwrite the istart_f and iend_f variables !!
         istart_f = (xstart2 - img_origin2) / voxsize2;
-        iend_f   = (xend2   - img_origin2) / voxsize2;
+        iend_f = (xend2 - img_origin2) / voxsize2;
 
-        if (istart_f > iend_f){
-          tmp      = iend_f;
-          iend_f   = istart_f;
+        if (istart_f > iend_f)
+        {
+          tmp = iend_f;
+          iend_f = istart_f;
           istart_f = tmp;
         }
 
-        if (istart < (int)floor(istart_f)){istart = (int)floor(istart_f);}
-        if (iend >= (int)ceil(iend_f)){iend = (int)ceil(iend_f);}
+        if (istart < (int)floor(istart_f))
+        {
+          istart = (int)floor(istart_f);
+        }
+        if (iend >= (int)ceil(iend_f))
+        {
+          iend = (int)ceil(iend_f);
+        }
         //---
 
-        for(i2 = istart; i2 < iend; i2++)
+        for (i2 = istart; i2 < iend; i2++)
         {
           // get the indices where the ray intersects the image plane
-          x_pr0 = xstart0 + (img_origin2 + i2*voxsize2 - xstart2)*d0 / d2;
-          x_pr1 = xstart1 + (img_origin2 + i2*voxsize2 - xstart2)*d1 / d2;
-  
-          i0_floor = (int)floor((x_pr0 - img_origin0)/voxsize0);
-          i0_ceil  = i0_floor + 1; 
-  
-          i1_floor = (int)floor((x_pr1 - img_origin1)/voxsize1);
-          i1_ceil  = i1_floor + 1; 
-  
+          x_pr0 = xstart0 + (img_origin2 + i2 * voxsize2 - xstart2) * d0 / d2;
+          x_pr1 = xstart1 + (img_origin2 + i2 * voxsize2 - xstart2) * d1 / d2;
+
+          i0_floor = (int)floor((x_pr0 - img_origin0) / voxsize0);
+          i0_ceil = i0_floor + 1;
+
+          i1_floor = (int)floor((x_pr1 - img_origin1) / voxsize1);
+          i1_ceil = i1_floor + 1;
+
           // calculate the distances to the floor normalized to [0,1]
           // for the bilinear interpolation
-          tmp_0 = (x_pr0 - (i0_floor*voxsize0 + img_origin0)) / voxsize0;
-          tmp_1 = (x_pr1 - (i1_floor*voxsize1 + img_origin1)) / voxsize1;
-  
+          tmp_0 = (x_pr0 - (i0_floor * voxsize0 + img_origin0)) / voxsize0;
+          tmp_1 = (x_pr1 - (i1_floor * voxsize1 + img_origin1)) / voxsize1;
+
           if ((i0_floor >= 0) && (i0_floor < n0) && (i1_floor >= 0) && (i1_floor < n1))
           {
-            atomic_sum(img + n1*n2*i0_floor +  n2*i1_floor + i2, (p[i] * (1 - tmp_0) * (1 - tmp_1) * cf));
+            atomic_sum(img + n1 * n2 * i0_floor + n2 * i1_floor + i2, (p[i] * (1 - tmp_0) * (1 - tmp_1) * cf));
           }
           if ((i0_ceil >= 0) && (i0_ceil < n0) && (i1_floor >= 0) && (i1_floor < n1))
           {
-            atomic_sum(img + n1*n2*i0_ceil + n2*i1_floor + i2, (p[i] * tmp_0 * (1 - tmp_1) * cf));
+            atomic_sum(img + n1 * n2 * i0_ceil + n2 * i1_floor + i2, (p[i] * tmp_0 * (1 - tmp_1) * cf));
           }
           if ((i0_floor >= 0) && (i0_floor < n0) && (i1_ceil >= 0) && (i1_ceil < n1))
           {
-            atomic_sum(img + n1*n2*i0_floor + n2*i1_ceil + i2, (p[i] * (1 - tmp_0) * tmp_1 * cf));
+            atomic_sum(img + n1 * n2 * i0_floor + n2 * i1_ceil + i2, (p[i] * (1 - tmp_0) * tmp_1 * cf));
           }
           if ((i0_ceil >= 0) && (i0_ceil < n0) && (i1_ceil >= 0) && (i1_ceil < n1))
           {
-            atomic_sum(img + n1*n2*i0_ceil + n2*i1_ceil + i2, (p[i] * tmp_0 * tmp_1 * cf));
+            atomic_sum(img + n1 * n2 * i0_ceil + n2 * i1_ceil + i2, (p[i] * tmp_0 * tmp_1 * cf));
           }
         }
       }
diff --git a/joseph3d_fwd.cpp b/joseph3d_fwd.cpp
index bcd480c..553cd8a 100644
--- a/joseph3d_fwd.cpp
+++ b/joseph3d_fwd.cpp
@@ -2,20 +2,21 @@
 #include "joseph3d_fwd_worker.h"
 #include "debug.h"
 
-void joseph3d_fwd(const float *xstart, 
-                  const float *xend, 
+void joseph3d_fwd(const float *xstart,
+                  const float *xend,
                   const float *img,
-                  const float *img_origin, 
-                  const float *voxsize, 
+                  const float *img_origin,
+                  const float *voxsize,
                   float *p,
-                  size_t nlors, 
+                  size_t nlors,
                   const int *img_dim,
                   int device_id,
-                  int threadsperblock){
+                  int threadsperblock)
+{
 
 #pragma omp parallel for
-    for (size_t i = 0; i < nlors; ++i) {
+    for (size_t i = 0; i < nlors; ++i)
+    {
         joseph3d_fwd_worker(i, xstart, xend, img, img_origin, voxsize, p, img_dim);
     }
-                    
 }
diff --git a/joseph3d_fwd.cu b/joseph3d_fwd.cu
index 575e1cb..10e3f75 100644
--- a/joseph3d_fwd.cu
+++ b/joseph3d_fwd.cu
@@ -5,16 +5,18 @@
 #include <iostream>
 #include <stdexcept>
 
-__global__ void joseph3d_fwd_kernel(const float *xstart, 
-                                    const float *xend, 
+__global__ void joseph3d_fwd_kernel(const float *xstart,
+                                    const float *xend,
                                     const float *img,
-                                    const float *img_origin, 
-                                    const float *voxsize, 
+                                    const float *img_origin,
+                                    const float *voxsize,
                                     float *p,
-                                    size_t nlors, 
-                                    const int *img_dim){
+                                    size_t nlors,
+                                    const int *img_dim)
+{
     size_t i = blockIdx.x * blockDim.x + threadIdx.x;
-    if (i < n) {
+    if (i < n)
+    {
         joseph3d_fwd_worker(i, xstart, xend, img, img_origin, voxsize, p, img_dim);
     }
 }
@@ -23,25 +25,25 @@ __global__ void joseph3d_fwd_kernel(const float *xstart,
 //////////////////////////////////////////////////////////////////
 //////////////////////////////////////////////////////////////////
 
-void joseph3d_fwd(const float *xstart, 
-                  const float *xend, 
+void joseph3d_fwd(const float *xstart,
+                  const float *xend,
                   const float *img,
-                  const float *img_origin, 
-                  const float *voxsize, 
+                  const float *img_origin,
+                  const float *voxsize,
                   float *p,
-                  size_t nlors, 
+                  size_t nlors,
                   const int *img_dim,
                   int device_id,
                   int threadsperblock)
 {
 
-    const float* d_xstart = nullptr;
-    const float* d_xend = nullptr;
-    const float* d_img = nullptr;
-    const float* d_img_origin = nullptr;
-    const float* d_voxsize = nullptr;
-    float* d_p = nullptr;
-    const int* d_img_dim = nullptr;
+    const float *d_xstart = nullptr;
+    const float *d_xend = nullptr;
+    const float *d_img = nullptr;
+    const float *d_img_origin = nullptr;
+    const float *d_voxsize = nullptr;
+    float *d_p = nullptr;
+    const int *d_img_dim = nullptr;
 
     // get pointer attributes of all input and output arrays
     cudaPointerAttributes xstart_attr;
@@ -55,18 +57,21 @@ void joseph3d_fwd(const float *xstart,
     bool needs_copy_back = false;
     bool is_cuda_managed_ptr = false;
 
-    if (err == cudaSuccess && (xstart_attr.type == cudaMemoryTypeManaged)){
+    if (err == cudaSuccess && (xstart_attr.type == cudaMemoryTypeManaged))
+    {
         is_cuda_managed_ptr = true;
         DEBUG_PRINT("Managed array is on device : %d\n", xstart_attr.device);
     }
     // else throw error
-    else{
+    else
+    {
         needs_copy_back = true;
         throw std::runtime_error("Unsupported pointer type");
     }
 
-    if (is_cuda_managed_ptr){
-    // all arrays are cuda malloc managed, so no need to copy to the device
+    if (is_cuda_managed_ptr)
+    {
+        // all arrays are cuda malloc managed, so no need to copy to the device
         d_xstart = xstart;
         d_xend = xend;
         d_img = img;
@@ -74,7 +79,9 @@ void joseph3d_fwd(const float *xstart,
         d_voxsize = voxsize;
         d_p = p;
         d_img_dim = img_dim;
-    } else {
+    }
+    else
+    {
         DEBUG_PRINT("COPYING HOST TO DEVICE");
     }
 
@@ -83,16 +90,14 @@ void joseph3d_fwd(const float *xstart,
     cudaGetDevice(&current_device_id);
     DEBUG_PRINT("Using CUDA device: %d\n", current_device_id);
 
-
-    int num_blocks = (int)((nlors + threadsperblock- 1) / threadsperblock);
-    joseph3d_fwd_kernel<<<num_blocks,threadsperblock>>>(d_xstart, d_xend, d_img, 
-                                         d_img_origin, d_voxsize, 
-                                         d_p, nlors, d_img_dim);
+    int num_blocks = (int)((nlors + threadsperblock - 1) / threadsperblock);
+    joseph3d_fwd_kernel<<<num_blocks, threadsperblock>>>(d_xstart, d_xend, d_img,
+                                                         d_img_origin, d_voxsize,
+                                                         d_p, nlors, d_img_dim);
     cudaDeviceSynchronize();
 
-    //if (needs_copy_back) {
-    //    cudaMemcpy(array, device_array, size * sizeof(float), cudaMemcpyDeviceToHost);
-    //    cudaFree(device_array);
-    //}
+    // if (needs_copy_back) {
+    //     cudaMemcpy(array, device_array, size * sizeof(float), cudaMemcpyDeviceToHost);
+    //     cudaFree(device_array);
+    // }
 }
-
diff --git a/joseph3d_fwd_worker.h b/joseph3d_fwd_worker.h
index f4f8f73..c8e402d 100644
--- a/joseph3d_fwd_worker.h
+++ b/joseph3d_fwd_worker.h
@@ -8,21 +8,22 @@
 #endif
 
 WORKER_QUALIFIER inline void joseph3d_fwd_worker(size_t i,
-                                                 const float *xstart, 
-                                                 const float *xend, 
+                                                 const float *xstart,
+                                                 const float *xend,
                                                  const float *img,
-                                                 const float *img_origin, 
-                                                 const float *voxsize, 
+                                                 const float *img_origin,
+                                                 const float *voxsize,
                                                  float *p,
-                                                 const int *img_dim){
+                                                 const int *img_dim)
+{
 
   int n0 = img_dim[0];
   int n1 = img_dim[1];
   int n2 = img_dim[2];
 
-  float d0, d1, d2, d0_sq, d1_sq, d2_sq; 
+  float d0, d1, d2, d0_sq, d1_sq, d2_sq;
   float lsq, cos0_sq, cos1_sq, cos2_sq;
-  unsigned short direction; 
+  unsigned short direction;
   int i0, i1, i2;
   int i0_floor, i1_floor, i2_floor;
   int i0_ceil, i1_ceil, i2_ceil;
@@ -31,13 +32,13 @@ WORKER_QUALIFIER inline void joseph3d_fwd_worker(size_t i,
 
   float toAdd, cf;
 
-  float xstart0 = xstart[i*3 + 0];
-  float xstart1 = xstart[i*3 + 1];
-  float xstart2 = xstart[i*3 + 2];
+  float xstart0 = xstart[i * 3 + 0];
+  float xstart1 = xstart[i * 3 + 1];
+  float xstart2 = xstart[i * 3 + 2];
 
-  float xend0 = xend[i*3 + 0];
-  float xend1 = xend[i*3 + 1];
-  float xend2 = xend[i*3 + 2];
+  float xend0 = xend[i * 3 + 0];
+  float xend1 = xend[i * 3 + 1];
+  float xend2 = xend[i * 3 + 2];
 
   float voxsize0 = voxsize[0];
   float voxsize1 = voxsize[1];
@@ -50,7 +51,7 @@ WORKER_QUALIFIER inline void joseph3d_fwd_worker(size_t i,
   unsigned char intersec;
   float t1, t2;
   float istart_f, iend_f, tmp;
-  int   istart, iend;
+  int istart, iend;
 
   // test whether the ray between the two detectors is most parallel
   // with the 0, 1, or 2 axis
@@ -60,16 +61,16 @@ WORKER_QUALIFIER inline void joseph3d_fwd_worker(size_t i,
 
   //-----------
   //--- test whether ray and cube intersect
-  intersec = ray_cube_intersection(xstart0, xstart1, xstart2, 
-                                   img_origin0 - 1*voxsize0, img_origin1 - 1*voxsize1, img_origin2 - 1*voxsize2,
-                                   img_origin0 + n0*voxsize0, img_origin1 + n1*voxsize1, img_origin2 + n2*voxsize2,
+  intersec = ray_cube_intersection(xstart0, xstart1, xstart2,
+                                   img_origin0 - 1 * voxsize0, img_origin1 - 1 * voxsize1, img_origin2 - 1 * voxsize2,
+                                   img_origin0 + n0 * voxsize0, img_origin1 + n1 * voxsize1, img_origin2 + n2 * voxsize2,
                                    d0, d1, d2, &t1, &t2);
 
   if (intersec == 1)
   {
-    d0_sq = d0*d0;
-    d1_sq = d1*d1;
-    d2_sq = d2*d2;
+    d0_sq = d0 * d0;
+    d1_sq = d1 * d1;
+    d2_sq = d2 * d2;
 
     lsq = d0_sq + d1_sq + d2_sq;
 
@@ -98,80 +99,97 @@ WORKER_QUALIFIER inline void joseph3d_fwd_worker(size_t i,
       // we step through the volume along the 0 direction
 
       //--- check where ray enters / leaves cube
-      istart_f = (xstart0 + t1*d0 - img_origin0) / voxsize0;
-      iend_f   = (xstart0 + t2*d0 - img_origin0) / voxsize0;
+      istart_f = (xstart0 + t1 * d0 - img_origin0) / voxsize0;
+      iend_f = (xstart0 + t2 * d0 - img_origin0) / voxsize0;
 
-      if (istart_f > iend_f){
-        tmp      = iend_f;
-        iend_f   = istart_f;
+      if (istart_f > iend_f)
+      {
+        tmp = iend_f;
+        iend_f = istart_f;
         istart_f = tmp;
       }
-  
+
       istart = (int)floor(istart_f);
-      iend   = (int)ceil(iend_f);
+      iend = (int)ceil(iend_f);
 
-      if (istart < 0){istart = 0;}
-      if (iend >= n0){iend = n0;}
+      if (istart < 0)
+      {
+        istart = 0;
+      }
+      if (iend >= n0)
+      {
+        iend = n0;
+      }
 
       // check in which "plane" the start and end points are
       // we have to do this to avoid that we include voxels
       // that are "outside" the line segment bewteen xstart and xend
-      
+
       // !! for these calculations we overwrite the istart_f and iend_f variables !!
       istart_f = (xstart0 - img_origin0) / voxsize0;
-      iend_f   = (xend0   - img_origin0) / voxsize0;
+      iend_f = (xend0 - img_origin0) / voxsize0;
 
-      if (istart_f > iend_f){
-        tmp      = iend_f;
-        iend_f   = istart_f;
+      if (istart_f > iend_f)
+      {
+        tmp = iend_f;
+        iend_f = istart_f;
         istart_f = tmp;
       }
 
-      if (istart < (int)floor(istart_f)){istart = (int)floor(istart_f);}
-      if (iend >= (int)ceil(iend_f)){iend = (int)ceil(iend_f);}
+      if (istart < (int)floor(istart_f))
+      {
+        istart = (int)floor(istart_f);
+      }
+      if (iend >= (int)ceil(iend_f))
+      {
+        iend = (int)ceil(iend_f);
+      }
       //---
 
-      for(i0 = istart; i0 < iend; i0++)
+      for (i0 = istart; i0 < iend; i0++)
       {
         // get the indices where the ray intersects the image plane
-        x_pr1 = xstart1 + (img_origin0 + i0*voxsize0 - xstart0)*d1 / d0;
-        x_pr2 = xstart2 + (img_origin0 + i0*voxsize0 - xstart0)*d2 / d0;
+        x_pr1 = xstart1 + (img_origin0 + i0 * voxsize0 - xstart0) * d1 / d0;
+        x_pr2 = xstart2 + (img_origin0 + i0 * voxsize0 - xstart0) * d2 / d0;
 
-        i1_floor = (int)floor((x_pr1 - img_origin1)/voxsize1);
-        i1_ceil  = i1_floor + 1;
+        i1_floor = (int)floor((x_pr1 - img_origin1) / voxsize1);
+        i1_ceil = i1_floor + 1;
 
-        i2_floor = (int)floor((x_pr2 - img_origin2)/voxsize2);
-        i2_ceil  = i2_floor + 1; 
+        i2_floor = (int)floor((x_pr2 - img_origin2) / voxsize2);
+        i2_ceil = i2_floor + 1;
 
         // calculate the distances to the floor normalized to [0,1]
         // for the bilinear interpolation
-        tmp_1 = (x_pr1 - (i1_floor*voxsize1 + img_origin1)) / voxsize1;
-        tmp_2 = (x_pr2 - (i2_floor*voxsize2 + img_origin2)) / voxsize2;
+        tmp_1 = (x_pr1 - (i1_floor * voxsize1 + img_origin1)) / voxsize1;
+        tmp_2 = (x_pr2 - (i2_floor * voxsize2 + img_origin2)) / voxsize2;
 
         toAdd = 0;
 
         if ((i1_floor >= 0) && (i1_floor < n1) && (i2_floor >= 0) && (i2_floor < n2))
         {
-          toAdd += img[n1*n2*i0 + n2*i1_floor + i2_floor] * (1 - tmp_1) * (1 - tmp_2);
+          toAdd += img[n1 * n2 * i0 + n2 * i1_floor + i2_floor] * (1 - tmp_1) * (1 - tmp_2);
         }
         if ((i1_ceil >= 0) && (i1_ceil < n1) && (i2_floor >= 0) && (i2_floor < n2))
         {
-          toAdd += img[n1*n2*i0 + n2*i1_ceil + i2_floor] * tmp_1 * (1 - tmp_2);
+          toAdd += img[n1 * n2 * i0 + n2 * i1_ceil + i2_floor] * tmp_1 * (1 - tmp_2);
         }
         if ((i1_floor >= 0) && (i1_floor < n1) && (i2_ceil >= 0) && (i2_ceil < n2))
         {
-          toAdd += img[n1*n2*i0 + n2*i1_floor + i2_ceil] * (1 - tmp_1) * tmp_2;
+          toAdd += img[n1 * n2 * i0 + n2 * i1_floor + i2_ceil] * (1 - tmp_1) * tmp_2;
         }
         if ((i1_ceil >= 0) && (i1_ceil < n1) && (i2_ceil >= 0) && (i2_ceil < n2))
         {
-          toAdd += img[n1*n2*i0 + n2*i1_ceil + i2_ceil] * tmp_1 * tmp_2;
+          toAdd += img[n1 * n2 * i0 + n2 * i1_ceil + i2_ceil] * tmp_1 * tmp_2;
         }
 
-        if(toAdd != 0){p[i] += (cf * toAdd);}
+        if (toAdd != 0)
+        {
+          p[i] += (cf * toAdd);
+        }
       }
     }
 
-    //--------------------------------------------------------------------------------- 
+    //---------------------------------------------------------------------------------
     if (direction == 1)
     {
       cf = voxsize1 / sqrtf(cos1_sq);
@@ -180,80 +198,97 @@ WORKER_QUALIFIER inline void joseph3d_fwd_worker(size_t i,
       // we step through the volume along the 1 direction
 
       //--- check where ray enters / leaves cube
-      istart_f = (xstart1 + t1*d1 - img_origin1) / voxsize1;
-      iend_f   = (xstart1 + t2*d1 - img_origin1) / voxsize1;
+      istart_f = (xstart1 + t1 * d1 - img_origin1) / voxsize1;
+      iend_f = (xstart1 + t2 * d1 - img_origin1) / voxsize1;
 
-      if (istart_f > iend_f){
-        tmp      = iend_f;
-        iend_f   = istart_f;
+      if (istart_f > iend_f)
+      {
+        tmp = iend_f;
+        iend_f = istart_f;
         istart_f = tmp;
       }
-  
+
       istart = (int)floor(istart_f);
-      iend   = (int)ceil(iend_f);
+      iend = (int)ceil(iend_f);
 
-      if (istart < 0){istart = 0;}
-      if (iend >= n1){iend = n1;}
+      if (istart < 0)
+      {
+        istart = 0;
+      }
+      if (iend >= n1)
+      {
+        iend = n1;
+      }
 
       // check in which "plane" the start and end points are
       // we have to do this to avoid that we include voxels
       // that are "outside" the line segment bewteen xstart and xend
-      
+
       // !! for these calculations we overwrite the istart_f and iend_f variables !!
       istart_f = (xstart1 - img_origin1) / voxsize1;
-      iend_f   = (xend1   - img_origin1) / voxsize1;
+      iend_f = (xend1 - img_origin1) / voxsize1;
 
-      if (istart_f > iend_f){
-        tmp      = iend_f;
-        iend_f   = istart_f;
+      if (istart_f > iend_f)
+      {
+        tmp = iend_f;
+        iend_f = istart_f;
         istart_f = tmp;
       }
 
-      if (istart < (int)floor(istart_f)){istart = (int)floor(istart_f);}
-      if (iend >= (int)ceil(iend_f)){iend = (int)ceil(iend_f);}
+      if (istart < (int)floor(istart_f))
+      {
+        istart = (int)floor(istart_f);
+      }
+      if (iend >= (int)ceil(iend_f))
+      {
+        iend = (int)ceil(iend_f);
+      }
       //---
 
       for (i1 = istart; i1 < iend; i1++)
       {
         // get the indices where the ray intersects the image plane
-        x_pr0 = xstart0 + (img_origin1 + i1*voxsize1 - xstart1)*d0 / d1;
-        x_pr2 = xstart2 + (img_origin1 + i1*voxsize1 - xstart1)*d2 / d1;
+        x_pr0 = xstart0 + (img_origin1 + i1 * voxsize1 - xstart1) * d0 / d1;
+        x_pr2 = xstart2 + (img_origin1 + i1 * voxsize1 - xstart1) * d2 / d1;
 
-        i0_floor = (int)floor((x_pr0 - img_origin0)/voxsize0);
-        i0_ceil  = i0_floor + 1; 
+        i0_floor = (int)floor((x_pr0 - img_origin0) / voxsize0);
+        i0_ceil = i0_floor + 1;
 
-        i2_floor = (int)floor((x_pr2 - img_origin2)/voxsize2);
-        i2_ceil  = i2_floor + 1;
+        i2_floor = (int)floor((x_pr2 - img_origin2) / voxsize2);
+        i2_ceil = i2_floor + 1;
 
         // calculate the distances to the floor normalized to [0,1]
         // for the bilinear interpolation
-        tmp_0 = (x_pr0 - (i0_floor*voxsize0 + img_origin0)) / voxsize0;
-        tmp_2 = (x_pr2 - (i2_floor*voxsize2 + img_origin2)) / voxsize2;
+        tmp_0 = (x_pr0 - (i0_floor * voxsize0 + img_origin0)) / voxsize0;
+        tmp_2 = (x_pr2 - (i2_floor * voxsize2 + img_origin2)) / voxsize2;
 
         toAdd = 0;
 
         if ((i0_floor >= 0) && (i0_floor < n0) && (i2_floor >= 0) && (i2_floor < n2))
         {
-          toAdd += img[n1*n2*i0_floor + n2*i1 + i2_floor] * (1 - tmp_0) * (1 - tmp_2);
+          toAdd += img[n1 * n2 * i0_floor + n2 * i1 + i2_floor] * (1 - tmp_0) * (1 - tmp_2);
         }
         if ((i0_ceil >= 0) && (i0_ceil < n0) && (i2_floor >= 0) && (i2_floor < n2))
         {
-          toAdd += img[n1*n2*i0_ceil + n2*i1 + i2_floor] * tmp_0 * (1 - tmp_2);
+          toAdd += img[n1 * n2 * i0_ceil + n2 * i1 + i2_floor] * tmp_0 * (1 - tmp_2);
         }
         if ((i0_floor >= 0) && (i0_floor < n0) && (i2_ceil >= 0) && (i2_ceil < n2))
         {
-          toAdd += img[n1*n2*i0_floor + n2*i1 + i2_ceil] * (1 - tmp_0) * tmp_2;
+          toAdd += img[n1 * n2 * i0_floor + n2 * i1 + i2_ceil] * (1 - tmp_0) * tmp_2;
         }
         if ((i0_ceil >= 0) && (i0_ceil < n0) && (i2_ceil >= 0) && (i2_ceil < n2))
         {
-          toAdd += img[n1*n2*i0_ceil + n2*i1 + i2_ceil] * tmp_0 * tmp_2;
+          toAdd += img[n1 * n2 * i0_ceil + n2 * i1 + i2_ceil] * tmp_0 * tmp_2;
         }
 
-        if(toAdd != 0){p[i] += (cf * toAdd);}
+        if (toAdd != 0)
+        {
+          p[i] += (cf * toAdd);
+        }
       }
     }
 
-    //--------------------------------------------------------------------------------- 
+    //---------------------------------------------------------------------------------
     if (direction == 2)
     {
       cf = voxsize2 / sqrtf(cos2_sq);
@@ -262,76 +297,93 @@ WORKER_QUALIFIER inline void joseph3d_fwd_worker(size_t i,
       // we step through the volume along the 2 direction
 
       //--- check where ray enters / leaves cube
-      istart_f = (xstart2 + t1*d2 - img_origin2) / voxsize2;
-      iend_f   = (xstart2 + t2*d2 - img_origin2) / voxsize2;
+      istart_f = (xstart2 + t1 * d2 - img_origin2) / voxsize2;
+      iend_f = (xstart2 + t2 * d2 - img_origin2) / voxsize2;
 
-      if (istart_f > iend_f){
-        tmp      = iend_f;
-        iend_f   = istart_f;
+      if (istart_f > iend_f)
+      {
+        tmp = iend_f;
+        iend_f = istart_f;
         istart_f = tmp;
       }
-  
+
       istart = (int)floor(istart_f);
-      iend   = (int)ceil(iend_f);
+      iend = (int)ceil(iend_f);
 
-      if (istart < 0){istart = 0;}
-      if (iend >= n2){iend = n2;}
+      if (istart < 0)
+      {
+        istart = 0;
+      }
+      if (iend >= n2)
+      {
+        iend = n2;
+      }
 
       // check in which "plane" the start and end points are
       // we have to do this to avoid that we include voxels
       // that are "outside" the line segment bewteen xstart and xend
-      
+
       // !! for these calculations we overwrite the istart_f and iend_f variables !!
       istart_f = (xstart2 - img_origin2) / voxsize2;
-      iend_f   = (xend2   - img_origin2) / voxsize2;
+      iend_f = (xend2 - img_origin2) / voxsize2;
 
-      if (istart_f > iend_f){
-        tmp      = iend_f;
-        iend_f   = istart_f;
+      if (istart_f > iend_f)
+      {
+        tmp = iend_f;
+        iend_f = istart_f;
         istart_f = tmp;
       }
 
-      if (istart < (int)floor(istart_f)){istart = (int)floor(istart_f);}
-      if (iend >= (int)ceil(iend_f)){iend = (int)ceil(iend_f);}
+      if (istart < (int)floor(istart_f))
+      {
+        istart = (int)floor(istart_f);
+      }
+      if (iend >= (int)ceil(iend_f))
+      {
+        iend = (int)ceil(iend_f);
+      }
       //---
 
-      for(i2 = istart; i2 < iend; i2++)
+      for (i2 = istart; i2 < iend; i2++)
       {
         // get the indices where the ray intersects the image plane
-        x_pr0 = xstart0 + (img_origin2 + i2*voxsize2 - xstart2)*d0 / d2;
-        x_pr1 = xstart1 + (img_origin2 + i2*voxsize2 - xstart2)*d1 / d2;
+        x_pr0 = xstart0 + (img_origin2 + i2 * voxsize2 - xstart2) * d0 / d2;
+        x_pr1 = xstart1 + (img_origin2 + i2 * voxsize2 - xstart2) * d1 / d2;
 
-        i0_floor = (int)floor((x_pr0 - img_origin0)/voxsize0);
-        i0_ceil  = i0_floor + 1;
+        i0_floor = (int)floor((x_pr0 - img_origin0) / voxsize0);
+        i0_ceil = i0_floor + 1;
 
-        i1_floor = (int)floor((x_pr1 - img_origin1)/voxsize1);
-        i1_ceil  = i1_floor + 1; 
+        i1_floor = (int)floor((x_pr1 - img_origin1) / voxsize1);
+        i1_ceil = i1_floor + 1;
 
         // calculate the distances to the floor normalized to [0,1]
         // for the bilinear interpolation
-        tmp_0 = (x_pr0 - (i0_floor*voxsize0 + img_origin0)) / voxsize0;
-        tmp_1 = (x_pr1 - (i1_floor*voxsize1 + img_origin1)) / voxsize1;
+        tmp_0 = (x_pr0 - (i0_floor * voxsize0 + img_origin0)) / voxsize0;
+        tmp_1 = (x_pr1 - (i1_floor * voxsize1 + img_origin1)) / voxsize1;
 
         toAdd = 0;
 
         if ((i0_floor >= 0) && (i0_floor < n0) && (i1_floor >= 0) && (i1_floor < n1))
         {
-          toAdd += img[n1*n2*i0_floor + n2*i1_floor + i2] * (1 - tmp_0) * (1 - tmp_1);
+          toAdd += img[n1 * n2 * i0_floor + n2 * i1_floor + i2] * (1 - tmp_0) * (1 - tmp_1);
         }
         if ((i0_ceil >= 0) && (i0_ceil < n0) && (i1_floor >= 0) && (i1_floor < n1))
         {
-          toAdd += img[n1*n2*i0_ceil + n2*i1_floor + i2] * tmp_0 * (1 - tmp_1);
+          toAdd += img[n1 * n2 * i0_ceil + n2 * i1_floor + i2] * tmp_0 * (1 - tmp_1);
         }
         if ((i0_floor >= 0) && (i0_floor < n0) && (i1_ceil >= 0) & (i1_ceil < n1))
         {
-          toAdd += img[n1*n2*i0_floor + n2*i1_ceil + i2] * (1 - tmp_0) * tmp_1;
+          toAdd += img[n1 * n2 * i0_floor + n2 * i1_ceil + i2] * (1 - tmp_0) * tmp_1;
         }
         if ((i0_ceil >= 0) && (i0_ceil < n0) && (i1_ceil >= 0) && (i1_ceil < n1))
         {
-          toAdd += img[n1*n2*i0_ceil + n2*i1_ceil + i2] * tmp_0 * tmp_1;
+          toAdd += img[n1 * n2 * i0_ceil + n2 * i1_ceil + i2] * tmp_0 * tmp_1;
         }
 
-        if(toAdd != 0){p[i] += (cf * toAdd);}
+        if (toAdd != 0)
+        {
+          p[i] += (cf * toAdd);
+        }
       }
     }
   }
diff --git a/main.cpp b/main.cpp
index 641c0b1..aac1d7f 100644
--- a/main.cpp
+++ b/main.cpp
@@ -3,7 +3,8 @@
 #include <chrono>
 #include <cmath>
 
-int main() {
+int main()
+{
     const size_t repetitions = 5;
     size_t nlors = 10;
 
@@ -18,11 +19,12 @@ int main() {
     float voxsize[3] = {4, 3, 2};
 
     float img_origin[3];
-    for (int i = 0; i < 3; ++i) {
+    for (int i = 0; i < 3; ++i)
+    {
         img_origin[i] = (-(float)img_dim[i] / 2 + 0.5) * voxsize[i];
     }
 
-    float* img = new float[img_dim[0] * img_dim[1] * img_dim[2]];
+    float *img = new float[img_dim[0] * img_dim[1] * img_dim[2]];
 
     // fill the test image
     for (int i0 = 0; i0 < img_dim[0]; i0++)
@@ -44,28 +46,28 @@ int main() {
     float id2 = static_cast<float>(img_dim[2]);
 
     float vstart[] = {
-        0, -1, 0,           // 0
-        0, -1, 0,           // 1
-        0, -1, 1,           // 2
-        0, -1, 0.5,         // 3
-        0, 0, -1,           // 4
-        -1, 0, 0,           // 5
-        id0 - 1, -1, 0,      // 6 - (shifted 1)
+        0, -1, 0,             // 0
+        0, -1, 0,             // 1
+        0, -1, 1,             // 2
+        0, -1, 0.5,           // 3
+        0, 0, -1,             // 4
+        -1, 0, 0,             // 5
+        id0 - 1, -1, 0,       // 6 - (shifted 1)
         id0 - 1, -1, id2 - 1, // 7 - (shifted 6)
-        id0 - 1, 0, -1,      // 8 - (shifted 4)
+        id0 - 1, 0, -1,       // 8 - (shifted 4)
         id0 - 1, id1 - 1, -1, // 9 - (shifted 8)
     };
 
     float vend[] = {
-        0, id1, 0,           // 0
-        0, id1, 0,           // 1
-        0, id1, 1,           // 2
-        0, id1, 0.5,         // 3
-        0, 0, id2,           // 4
-        id0, 0, 0,           // 5
-        id0 - 1, id1, 0,      // 6 - (shifted 1)
+        0, id1, 0,             // 0
+        0, id1, 0,             // 1
+        0, id1, 1,             // 2
+        0, id1, 0.5,           // 3
+        0, 0, id2,             // 4
+        id0, 0, 0,             // 5
+        id0 - 1, id1, 0,       // 6 - (shifted 1)
         id0 - 1, id1, id2 - 1, // 7 - (shifted 6)
-        id0 - 1, 0, id2,      // 8 - (shifted 4)
+        id0 - 1, 0, id2,       // 8 - (shifted 4)
         id0 - 1, id1 - 1, id2, // 9 - (shifted 8)
     };
 
@@ -77,7 +79,7 @@ int main() {
     }
 
     // calculate the start and end coordinates in world coordinates
-    
+
     float *xstart = new float[3 * nlors];
     float *xend = new float[3 * nlors];
 
@@ -103,7 +105,7 @@ int main() {
     int retval = 0;
     float eps = 1e-7;
 
-    float* expected_fwd_vals = new float[nlors];
+    float *expected_fwd_vals = new float[nlors];
     // initialize expected_fwd_vals with 0s
     for (int ir = 0; ir < nlors; ir++)
     {
@@ -185,13 +187,15 @@ int main() {
 
     // test the back projection
 
-    float* bimg = new float[img_dim[0] * img_dim[1] * img_dim[2]];
-    for (size_t i = 0; i < (img_dim[0]*img_dim[1]*img_dim[2]); i++){
+    float *bimg = new float[img_dim[0] * img_dim[1] * img_dim[2]];
+    for (size_t i = 0; i < (img_dim[0] * img_dim[1] * img_dim[2]); i++)
+    {
         bimg[i] = 0;
     }
 
-    float* ones = new float[nlors];
-    for (size_t i = 0; i < nlors; i++){
+    float *ones = new float[nlors];
+    for (size_t i = 0; i < nlors; i++)
+    {
         ones[i] = 1;
     }
 
diff --git a/main.cu b/main.cu
index 5bdf872..7437b4c 100644
--- a/main.cu
+++ b/main.cu
@@ -4,7 +4,8 @@
 #include <chrono>
 #include <cmath>
 
-int main() {
+int main()
+{
     const size_t repetitions = 5;
     size_t nlors = 10;
 
@@ -20,25 +21,26 @@ int main() {
 
     cudaSetDevice(device_count - 1);
 
-    int* img_dim;
+    int *img_dim;
     cudaMallocManaged(&img_dim, 3 * sizeof(int));
     img_dim[0] = 2;
     img_dim[1] = 3;
     img_dim[2] = 4;
 
-    float* voxsize;
+    float *voxsize;
     cudaMallocManaged(&voxsize, 3 * sizeof(float));
     voxsize[0] = 4;
     voxsize[1] = 3;
     voxsize[2] = 2;
 
-    float* img_origin;
+    float *img_origin;
     cudaMallocManaged(&img_origin, 3 * sizeof(float));
-    for (int i = 0; i < 3; ++i) {
+    for (int i = 0; i < 3; ++i)
+    {
         img_origin[i] = (-(float)img_dim[i] / 2 + 0.5) * voxsize[i];
     }
 
-    float* img;
+    float *img;
     cudaMallocManaged(&img, (img_dim[0] * img_dim[1] * img_dim[2]) * sizeof(float));
 
     // fill the test image
@@ -61,28 +63,28 @@ int main() {
     float id2 = static_cast<float>(img_dim[2]);
 
     float vstart[] = {
-        0, -1, 0,           // 0
-        0, -1, 0,           // 1
-        0, -1, 1,           // 2
-        0, -1, 0.5,         // 3
-        0, 0, -1,           // 4
-        -1, 0, 0,           // 5
-        id0 - 1, -1, 0,      // 6 - (shifted 1)
+        0, -1, 0,             // 0
+        0, -1, 0,             // 1
+        0, -1, 1,             // 2
+        0, -1, 0.5,           // 3
+        0, 0, -1,             // 4
+        -1, 0, 0,             // 5
+        id0 - 1, -1, 0,       // 6 - (shifted 1)
         id0 - 1, -1, id2 - 1, // 7 - (shifted 6)
-        id0 - 1, 0, -1,      // 8 - (shifted 4)
+        id0 - 1, 0, -1,       // 8 - (shifted 4)
         id0 - 1, id1 - 1, -1, // 9 - (shifted 8)
     };
 
     float vend[] = {
-        0, id1, 0,           // 0
-        0, id1, 0,           // 1
-        0, id1, 1,           // 2
-        0, id1, 0.5,         // 3
-        0, 0, id2,           // 4
-        id0, 0, 0,           // 5
-        id0 - 1, id1, 0,      // 6 - (shifted 1)
+        0, id1, 0,             // 0
+        0, id1, 0,             // 1
+        0, id1, 1,             // 2
+        0, id1, 0.5,           // 3
+        0, 0, id2,             // 4
+        id0, 0, 0,             // 5
+        id0 - 1, id1, 0,       // 6 - (shifted 1)
         id0 - 1, id1, id2 - 1, // 7 - (shifted 6)
-        id0 - 1, 0, id2,      // 8 - (shifted 4)
+        id0 - 1, 0, id2,       // 8 - (shifted 4)
         id0 - 1, id1 - 1, id2, // 9 - (shifted 8)
     };
 
@@ -94,11 +96,11 @@ int main() {
     }
 
     // calculate the start and end coordinates in world coordinates
-    
+
     float *xstart;
-    cudaMallocManaged(&xstart, (3*nlors) * sizeof(float));
+    cudaMallocManaged(&xstart, (3 * nlors) * sizeof(float));
     float *xend;
-    cudaMallocManaged(&xend, (3*nlors) * sizeof(float));
+    cudaMallocManaged(&xend, (3 * nlors) * sizeof(float));
 
     for (int ir = 0; ir < nlors; ir++)
 
@@ -110,7 +112,6 @@ int main() {
         }
     }
 
-
     float *img_fwd;
     cudaMallocManaged(&img_fwd, nlors * sizeof(float));
 
@@ -118,7 +119,6 @@ int main() {
 
     // calculate the expected values
 
-
     /////////////////////////////////////////////////////////////////////////////
     /////////////////////////////////////////////////////////////////////////////
     /////////////////////////////////////////////////////////////////////////////
@@ -126,7 +126,7 @@ int main() {
     int retval = 0;
     float eps = 1e-7;
 
-    float* expected_fwd_vals = new float[nlors];
+    float *expected_fwd_vals = new float[nlors];
     // initialize expected_fwd_vals with 0s
     for (int ir = 0; ir < nlors; ir++)
     {
@@ -218,4 +218,3 @@ int main() {
 
     return retval;
 }
-
diff --git a/utils.h b/utils.h
index a9e642c..d046b35 100644
--- a/utils.h
+++ b/utils.h
@@ -1,74 +1,100 @@
 #pragma once
 #include "cuda_compat.h"
 
-WORKER_QUALIFIER inline void atomic_sum(float* target, float value) {
+WORKER_QUALIFIER inline void atomic_sum(float *target, float value)
+{
 #ifdef __CUDA_ARCH__
-    atomicAdd(target, value);
+  atomicAdd(target, value);
 #else
 #pragma omp atomic
-    *target += value;
+  *target += value;
 #endif
 }
 
 WORKER_QUALIFIER inline unsigned char ray_cube_intersection(float orig0,
-                                                     float orig1,
-                                                     float orig2,
-                                                     float bounds0_min,
-                                                     float bounds1_min,
-                                                     float bounds2_min,
-                                                     float bounds0_max,
-                                                     float bounds1_max,
-                                                     float bounds2_max,
-                                                     float rdir0,
-                                                     float rdir1,
-                                                     float rdir2,
-                                                     float* t1,
-                                                     float* t2){
+                                                            float orig1,
+                                                            float orig2,
+                                                            float bounds0_min,
+                                                            float bounds1_min,
+                                                            float bounds2_min,
+                                                            float bounds0_max,
+                                                            float bounds1_max,
+                                                            float bounds2_max,
+                                                            float rdir0,
+                                                            float rdir1,
+                                                            float rdir2,
+                                                            float *t1,
+                                                            float *t2)
+{
   // the inverse of the directional vector
   // using the inverse of the directional vector and IEEE floating point arith standard 754
-  // makes sure that 0's in the directional vector are handled correctly 
-  float invdir0 = 1.f/rdir0;
-  float invdir1 = 1.f/rdir1;
-  float invdir2 = 1.f/rdir2;
-  
+  // makes sure that 0's in the directional vector are handled correctly
+  float invdir0 = 1.f / rdir0;
+  float invdir1 = 1.f / rdir1;
+  float invdir2 = 1.f / rdir2;
+
   unsigned char intersec = 1;
-  
-  float t11, t12, t21, t22; 
 
-  if (invdir0 >= 0){
-    *t1  = (bounds0_min - orig0) * invdir0;
-    *t2  = (bounds0_max - orig0) * invdir0; 
+  float t11, t12, t21, t22;
+
+  if (invdir0 >= 0)
+  {
+    *t1 = (bounds0_min - orig0) * invdir0;
+    *t2 = (bounds0_max - orig0) * invdir0;
   }
-  else{
-    *t1  = (bounds0_max - orig0) * invdir0;
-    *t2  = (bounds0_min - orig0) * invdir0;
+  else
+  {
+    *t1 = (bounds0_max - orig0) * invdir0;
+    *t2 = (bounds0_min - orig0) * invdir0;
   }
-  
-  if (invdir1 >= 0){
-    t11 = (bounds1_min - orig1) * invdir1; 
-    t12 = (bounds1_max - orig1) * invdir1; 
+
+  if (invdir1 >= 0)
+  {
+    t11 = (bounds1_min - orig1) * invdir1;
+    t12 = (bounds1_max - orig1) * invdir1;
   }
-  else{
+  else
+  {
     t11 = (bounds1_max - orig1) * invdir1;
-    t12 = (bounds1_min - orig1) * invdir1; 
+    t12 = (bounds1_min - orig1) * invdir1;
+  }
+
+  if ((*t1 > t12) || (t11 > *t2))
+  {
+    intersec = 0;
+  }
+  if (t11 > *t1)
+  {
+    *t1 = t11;
+  }
+  if (t12 < *t2)
+  {
+    *t2 = t12;
   }
-  
-  if ((*t1 > t12) || (t11 > *t2)){intersec = 0;}
-  if (t11 > *t1){*t1 = t11;}
-  if (t12 < *t2){*t2 = t12;}
-  
-  if (invdir2 >= 0){
-    t21 = (bounds2_min - orig2) * invdir2; 
+
+  if (invdir2 >= 0)
+  {
+    t21 = (bounds2_min - orig2) * invdir2;
     t22 = (bounds2_max - orig2) * invdir2;
-  } 
-  else{
-    t21 = (bounds2_max - orig2) * invdir2; 
+  }
+  else
+  {
+    t21 = (bounds2_max - orig2) * invdir2;
     t22 = (bounds2_min - orig2) * invdir2;
-  } 
-  
-  if ((*t1 > t22) || (t21 > *t2)){intersec = 0;}
-  if (t21 > *t1){*t1 = t21;}
-  if (t22 < *t2){*t2 = t22;} 
+  }
+
+  if ((*t1 > t22) || (t21 > *t2))
+  {
+    intersec = 0;
+  }
+  if (t21 > *t1)
+  {
+    *t1 = t21;
+  }
+  if (t22 < *t2)
+  {
+    *t2 = t22;
+  }
 
-  return(intersec);
+  return (intersec);
 }

From d349b1fc697c7e4b7cb149f1ab2ddca33b521a97 Mon Sep 17 00:00:00 2001
From: Georg Schramm <georg.schramm@kuleuven.be>
Date: Thu, 10 Apr 2025 17:24:07 +0200
Subject: [PATCH 09/35] use __CUDACC__ and fix bug with variable name

---
 joseph3d_back_worker.h | 2 +-
 joseph3d_fwd.cu        | 2 +-
 joseph3d_fwd_worker.h  | 2 +-
 main.cu                | 1 -
 utils.h                | 2 +-
 5 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/joseph3d_back_worker.h b/joseph3d_back_worker.h
index b7fdb23..75e5751 100644
--- a/joseph3d_back_worker.h
+++ b/joseph3d_back_worker.h
@@ -3,7 +3,7 @@
 #include "utils.h"
 
 // we need to import math.h for the floor, ceil, sqrtf if not compiling for CUDA
-#ifndef __CUDA_ARCH__
+#ifndef __CUDACC__
 #include <math.h>
 #endif
 
diff --git a/joseph3d_fwd.cu b/joseph3d_fwd.cu
index 10e3f75..8c67f37 100644
--- a/joseph3d_fwd.cu
+++ b/joseph3d_fwd.cu
@@ -15,7 +15,7 @@ __global__ void joseph3d_fwd_kernel(const float *xstart,
                                     const int *img_dim)
 {
     size_t i = blockIdx.x * blockDim.x + threadIdx.x;
-    if (i < n)
+    if (i < nlors)
     {
         joseph3d_fwd_worker(i, xstart, xend, img, img_origin, voxsize, p, img_dim);
     }
diff --git a/joseph3d_fwd_worker.h b/joseph3d_fwd_worker.h
index c8e402d..2e9106f 100644
--- a/joseph3d_fwd_worker.h
+++ b/joseph3d_fwd_worker.h
@@ -3,7 +3,7 @@
 #include "utils.h"
 
 // we need to import math.h for the floor, ceil, sqrtf if not compiling for CUDA
-#ifndef __CUDA_ARCH__
+#ifndef __CUDACC__
 #include <math.h>
 #endif
 
diff --git a/main.cu b/main.cu
index 7437b4c..5bf0d46 100644
--- a/main.cu
+++ b/main.cu
@@ -6,7 +6,6 @@
 
 int main()
 {
-    const size_t repetitions = 5;
     size_t nlors = 10;
 
     // get the number of cuda devices - because we want to run on the last device
diff --git a/utils.h b/utils.h
index d046b35..f5281ad 100644
--- a/utils.h
+++ b/utils.h
@@ -3,7 +3,7 @@
 
 WORKER_QUALIFIER inline void atomic_sum(float *target, float value)
 {
-#ifdef __CUDA_ARCH__
+#ifdef __CUDACC__
   atomicAdd(target, value);
 #else
 #pragma omp atomic

From ff616218ef7f2a05b54d44903cf8db0ec8674f0a Mon Sep 17 00:00:00 2001
From: Georg Schramm <georg.schramm@kuleuven.be>
Date: Thu, 10 Apr 2025 17:37:31 +0200
Subject: [PATCH 10/35] add joseph3d_back.cu

---
 CMakeLists.txt   |   2 +-
 TODO.txt         |   5 +++
 joseph3d_back.cu | 103 +++++++++++++++++++++++++++++++++++++++++++++++
 main.cu          |  68 +++++++++++++++++++++++++++++++
 4 files changed, 177 insertions(+), 1 deletion(-)
 create mode 100644 TODO.txt
 create mode 100644 joseph3d_back.cu

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 934f362..d17d3b8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -37,7 +37,7 @@ if(CMAKE_CUDA_COMPILER)
     enable_language(CUDA)
     find_package(CUDAToolkit REQUIRED)
     
-    add_library(parallelproj SHARED joseph3d_fwd.cu)
+    add_library(parallelproj SHARED joseph3d_fwd.cu joseph3d_back.cu)
     target_link_libraries(parallelproj PRIVATE CUDA::cudart)
 
     add_executable(main main.cu)
diff --git a/TODO.txt b/TODO.txt
new file mode 100644
index 0000000..53bf847
--- /dev/null
+++ b/TODO.txt
@@ -0,0 +1,5 @@
+- managed cases: cudaprefetchasync to device, read / write hints
+- host pointer cases
+- device array cases
+
+- test all cases in main.cu
\ No newline at end of file
diff --git a/joseph3d_back.cu b/joseph3d_back.cu
new file mode 100644
index 0000000..18ea918
--- /dev/null
+++ b/joseph3d_back.cu
@@ -0,0 +1,103 @@
+#include "parallelproj.h"
+#include "joseph3d_back_worker.h"
+#include "debug.h"
+#include <cuda_runtime.h>
+#include <iostream>
+#include <stdexcept>
+
+// Thread i (flattened 1D launch index) runs joseph3d_back_worker for element i; threads with i >= nlors do nothing.
+__global__ void joseph3d_back_kernel(const float *xstart,
+                                     const float *xend,
+                                     float *img,
+                                     const float *img_origin,
+                                     const float *voxsize,
+                                     const float *p,
+                                     size_t nlors,
+                                     const int *img_dim)
+{
+    // widen to size_t before multiplying: the 32-bit product wraps beyond 2^32 - 1
+    size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
+    if (i < nlors)
+        joseph3d_back_worker(i, xstart, xend, img, img_origin, voxsize, p, img_dim);
+}
+
+// Host entry point: launches joseph3d_back_kernel with one thread per index < nlors.
+// Only CUDA managed arrays are supported so far and only xstart is actually checked.
+// Throws std::invalid_argument if threadsperblock is not positive and
+// std::runtime_error for an unsupported xstart pointer or a failed launch / kernel.
+void joseph3d_back(const float *xstart,
+                   const float *xend,
+                   float *img,
+                   const float *img_origin,
+                   const float *voxsize,
+                   const float *p,
+                   size_t nlors,
+                   const int *img_dim,
+                   int device_id,
+                   int threadsperblock)
+{
+
+    const float *d_xstart = nullptr;
+    const float *d_xend = nullptr;
+    float *d_img = nullptr;
+    const float *d_img_origin = nullptr;
+    const float *d_voxsize = nullptr;
+    const float *d_p = nullptr;
+    const int *d_img_dim = nullptr;
+
+    if (threadsperblock <= 0)
+    {
+        throw std::invalid_argument("threadsperblock must be positive");
+    }
+
+    // TODO get attributes of all other arrays
+    cudaPointerAttributes xstart_attr;
+    cudaError_t err = cudaPointerGetAttributes(&xstart_attr, xstart);
+
+    if (err != cudaSuccess || xstart_attr.type != cudaMemoryTypeManaged)
+    {
+        // discard the error a failed attribute query may have recorded
+        cudaGetLastError();
+        throw std::runtime_error("Unsupported pointer type");
+    }
+    DEBUG_PRINT("Managed array is on device : %d\n", xstart_attr.device);
+
+    // all arrays are cuda malloc managed, so no need to copy to the device
+    d_xstart = xstart;
+    d_xend = xend;
+    d_img = img;
+    d_img_origin = img_origin;
+    d_voxsize = voxsize;
+    d_p = p;
+    d_img_dim = img_dim;
+
+#ifdef DEBUG
+    // get and print the current cuda device ID
+    int current_device_id;
+    cudaGetDevice(&current_device_id);
+    DEBUG_PRINT("Using CUDA device: %d\n", current_device_id);
+#endif
+
+    // nothing to do for nlors == 0; a launch with zero blocks would be rejected
+    if (nlors == 0)
+    {
+        return;
+    }
+
+    int num_blocks = (int)((nlors + threadsperblock - 1) / threadsperblock);
+    joseph3d_back_kernel<<<num_blocks, threadsperblock>>>(d_xstart, d_xend, d_img,
+                                                          d_img_origin, d_voxsize,
+                                                          d_p, nlors, d_img_dim);
+
+    // launch configuration errors are only reported by cudaGetLastError()
+    err = cudaGetLastError();
+    if (err == cudaSuccess)
+    {
+        // errors raised while the kernel runs surface at the synchronization
+        err = cudaDeviceSynchronize();
+    }
+    if (err != cudaSuccess)
+    {
+        throw std::runtime_error(cudaGetErrorString(err));
+    }
+}
diff --git a/main.cu b/main.cu
index 5bf0d46..b6fa349 100644
--- a/main.cu
+++ b/main.cu
@@ -205,6 +205,71 @@ int main()
     ////////////////////////////////////////////////////////////////////////////
     ////////////////////////////////////////////////////////////////////////////
 
+    // test the back projection
+
+    float *bimg;
+    cudaMallocManaged(&bimg, (img_dim[0] * img_dim[1] * img_dim[2]) * sizeof(float));
+
+    for (size_t i = 0; i < (img_dim[0] * img_dim[1] * img_dim[2]); i++)
+    {
+        bimg[i] = 0;
+    }
+
+    float *ones;
+    cudaMallocManaged(&ones, nlors * sizeof(float));
+    for (size_t i = 0; i < nlors; i++)
+    {
+        ones[i] = 1;
+    }
+
+    joseph3d_back(xstart, xend, bimg, img_origin, voxsize, ones, nlors, img_dim, 0, 64);
+
+    printf("\nback projection of ones along all rays:\n");
+    for (size_t i0 = 0; i0 < img_dim[0]; i0++)
+    {
+        for (size_t i1 = 0; i1 < img_dim[1]; i1++)
+        {
+            for (size_t i2 = 0; i2 < img_dim[2]; i2++)
+            {
+                printf("%.1f ", bimg[img_dim[1] * img_dim[2] * i0 + img_dim[2] * i1 + i2]);
+            }
+            printf("\n");
+        }
+        printf("\n");
+    }
+
+    // To test whether the back projection is correct, we test if the back projector is the adjoint
+    // of the forward projector. This is more practical than checking a lot of single voxels in the
+    // back projected image.
+
+    float inner_product1 = 0;
+    float inner_product2 = 0;
+
+    for (size_t i = 0; i < (img_dim[0] * img_dim[1] * img_dim[2]); i++)
+    {
+        inner_product1 += (img[i] * bimg[i]);
+    }
+
+    for (size_t ir = 0; ir < nlors; ir++)
+    {
+        inner_product2 += (img_fwd[ir] * ones[ir]);
+    }
+
+    float ip_diff = fabs(inner_product1 - inner_product2);
+
+    if (ip_diff > eps)
+    {
+        printf("\n#########################################################################");
+        printf("\nback projection test failed. back projection seems not to be the adjoint.");
+        printf("\n %.7e", ip_diff);
+        printf("\n#########################################################################\n");
+        retval = 1;
+    }
+
+    ////////////////////////////////////////////////////////////////////////////
+    ////////////////////////////////////////////////////////////////////////////
+    ////////////////////////////////////////////////////////////////////////////
+
     cudaFree(img_dim);
     cudaFree(voxsize);
     cudaFree(img_origin);
@@ -213,6 +278,9 @@ int main()
     cudaFree(xend);
     cudaFree(img_fwd);
 
+    cudaFree(bimg);
+    cudaFree(ones);
+
     free(expected_fwd_vals);
 
     return retval;

From 9332113adbbdf5038de081d2bfb5ba43d3af784d Mon Sep 17 00:00:00 2001
From: Georg Schramm <georg.schramm@kuleuven.be>
Date: Fri, 11 Apr 2025 15:32:21 +0200
Subject: [PATCH 11/35] start refactorization

---
 joseph3d_back.cu |   6 ++
 joseph3d_fwd.cu  | 224 ++++++++++++++++++++++++++++++++++++++++-------
 2 files changed, 196 insertions(+), 34 deletions(-)

diff --git a/joseph3d_back.cu b/joseph3d_back.cu
index 18ea918..d959808 100644
--- a/joseph3d_back.cu
+++ b/joseph3d_back.cu
@@ -37,6 +37,12 @@ void joseph3d_back(const float *xstart,
                    int threadsperblock)
 {
 
+    // Set the CUDA device
+    if (device_id >= 0)
+    {
+        cudaSetDevice(device_id);
+    }
+
     const float *d_xstart = nullptr;
     const float *d_xend = nullptr;
     float *d_img = nullptr;
diff --git a/joseph3d_fwd.cu b/joseph3d_fwd.cu
index 8c67f37..9e7b950 100644
--- a/joseph3d_fwd.cu
+++ b/joseph3d_fwd.cu
@@ -36,59 +36,198 @@ void joseph3d_fwd(const float *xstart,
                   int device_id,
                   int threadsperblock)
 {
+    // Set the CUDA device
+    if (device_id >= 0)
+    {
+        cudaSetDevice(device_id);
+    }
 
-    const float *d_xstart = nullptr;
-    const float *d_xend = nullptr;
-    const float *d_img = nullptr;
-    const float *d_img_origin = nullptr;
-    const float *d_voxsize = nullptr;
-    float *d_p = nullptr;
-    const int *d_img_dim = nullptr;
-
-    // get pointer attributes of all input and output arrays
-    cudaPointerAttributes xstart_attr;
-    cudaError_t err = cudaPointerGetAttributes(&xstart_attr, xstart);
     /////////////////////////////////////////////////////////////////
     /////////////////////////////////////////////////////////////////
-    // TODO get attributes of all other arrays
+    // copy arrays to device if needed
     /////////////////////////////////////////////////////////////////
     /////////////////////////////////////////////////////////////////
 
-    bool needs_copy_back = false;
-    bool is_cuda_managed_ptr = false;
+    // handle xstart (read only)
+    float *d_xstart = nullptr;
+    cudaPointerAttributes xstart_attr;
+    cudaError_t err_xstart = cudaPointerGetAttributes(&xstart_attr, xstart);
+    bool free_xstart = false;
+    if (err_xstart == cudaSuccess && (xstart_attr.type == cudaMemoryTypeManaged))
+    {
+        cudaMemPrefetchAsync(xstart, sizeof(float) * nlors * 3, device_id);
+        cudaMemAdvise(xstart, sizeof(float) * nlors * 3, cudaMemAdviseSetReadMostly, device_id);
+    }
+
+    if (err_xstart == cudaSuccess && (xstart_attr.type == cudaMemoryTypeManaged ||
+                                      xstart_attr.type == cudaMemoryTypeDevice))
+    {
+        d_xstart = const_cast<float *>(xstart);
+    }
+    else
+    {
+        // host pointer case, transfer to device
+        cudaMalloc(&d_xstart, sizeof(float) * nlors * 3);
+        cudaMemcpy(d_xstart, xstart, sizeof(float) * nlors * 3, cudaMemcpyHostToDevice);
+        free_xstart = true;
+    }
+
+    // handle xend (read only)
+    float *d_xend = nullptr;
+    cudaPointerAttributes xend_attr;
+    cudaError_t err_xend = cudaPointerGetAttributes(&xend_attr, xend);
+    bool free_xend = false;
+    if (err_xend == cudaSuccess && xend_attr.type == cudaMemoryTypeManaged)
+    {
+        cudaMemPrefetchAsync(xend, sizeof(float) * nlors * 3, device_id);
+        cudaMemAdvise(xend, sizeof(float) * nlors * 3, cudaMemAdviseSetReadMostly, device_id);
+    }
 
-    if (err == cudaSuccess && (xstart_attr.type == cudaMemoryTypeManaged))
+    if (err_xend == cudaSuccess && (xend_attr.type == cudaMemoryTypeManaged ||
+                                    xend_attr.type == cudaMemoryTypeDevice))
     {
-        is_cuda_managed_ptr = true;
-        DEBUG_PRINT("Managed array is on device : %d\n", xstart_attr.device);
+        d_xend = const_cast<float *>(xend);
     }
-    // else throw error
     else
     {
-        needs_copy_back = true;
-        throw std::runtime_error("Unsupported pointer type");
+        // host pointer case, transfer to device
+        cudaMalloc(&d_xend, sizeof(float) * nlors * 3);
+        cudaMemcpy(d_xend, xend, sizeof(float) * nlors * 3, cudaMemcpyHostToDevice);
+        free_xend = true;
+    }
+
+    // handle img (read only)
+    float *d_img = nullptr;
+    cudaPointerAttributes img_attr;
+    cudaError_t err_img = cudaPointerGetAttributes(&img_attr, img);
+    bool free_img = false;
+    if (err_img == cudaSuccess && img_attr.type == cudaMemoryTypeManaged)
+    {
+        cudaMemPrefetchAsync(img, sizeof(float) * img_dim[0] * img_dim[1] * img_dim[2], device_id);
+        cudaMemAdvise(img, sizeof(float) * img_dim[0] * img_dim[1] * img_dim[2], cudaMemAdviseSetReadMostly, device_id);
     }
 
-    if (is_cuda_managed_ptr)
+    if (err_img == cudaSuccess && (img_attr.type == cudaMemoryTypeManaged ||
+                                   img_attr.type == cudaMemoryTypeDevice))
     {
-        // all arrays are cuda malloc managed, so no need to copy to the device
-        d_xstart = xstart;
-        d_xend = xend;
-        d_img = img;
-        d_img_origin = img_origin;
-        d_voxsize = voxsize;
-        d_p = p;
-        d_img_dim = img_dim;
+        d_img = const_cast<float *>(img);
     }
     else
     {
-        DEBUG_PRINT("COPYING HOST TO DEVICE");
+        // host pointer case, transfer to device
+        cudaMalloc(&d_img, sizeof(float) * img_dim[0] * img_dim[1] * img_dim[2]);
+        cudaMemcpy(d_img, img, sizeof(float) * img_dim[0] * img_dim[1] * img_dim[2], cudaMemcpyHostToDevice);
+        free_img = true;
+    }
+
+    // handle img_origin (read only)
+    float *d_img_origin = nullptr;
+    cudaPointerAttributes img_origin_attr;
+    cudaError_t err_img_origin = cudaPointerGetAttributes(&img_origin_attr, img_origin);
+    bool free_img_origin = false;
+    if (err_img_origin == cudaSuccess && img_origin_attr.type == cudaMemoryTypeManaged)
+    {
+        cudaMemPrefetchAsync(img_origin, sizeof(float) * 3, device_id);
+        cudaMemAdvise(img_origin, sizeof(float) * 3, cudaMemAdviseSetReadMostly, device_id);
     }
 
+    if (err_img_origin == cudaSuccess && (img_origin_attr.type == cudaMemoryTypeManaged ||
+                                          img_origin_attr.type == cudaMemoryTypeDevice))
+    {
+        d_img_origin = const_cast<float *>(img_origin);
+    }
+    else
+    {
+        // host pointer case, transfer to device
+        cudaMalloc(&d_img_origin, sizeof(float) * 3);
+        cudaMemcpy(d_img_origin, img_origin, sizeof(float) * 3, cudaMemcpyHostToDevice);
+        free_img_origin = true;
+    }
+
+    // handle voxsize (read only)
+    float *d_voxsize = nullptr;
+    cudaPointerAttributes voxsize_attr;
+    cudaError_t err_voxsize = cudaPointerGetAttributes(&voxsize_attr, voxsize);
+    bool free_voxsize = false;
+    if (err_voxsize == cudaSuccess && voxsize_attr.type == cudaMemoryTypeManaged)
+    {
+        cudaMemPrefetchAsync(voxsize, sizeof(float) * 3, device_id);
+        cudaMemAdvise(voxsize, sizeof(float) * 3, cudaMemAdviseSetReadMostly, device_id);
+    }
+
+    if (err_voxsize == cudaSuccess && (voxsize_attr.type == cudaMemoryTypeManaged ||
+                                       voxsize_attr.type == cudaMemoryTypeDevice))
+    {
+        d_voxsize = const_cast<float *>(voxsize);
+    }
+    else
+    {
+        // host pointer case, transfer to device
+        cudaMalloc(&d_voxsize, sizeof(float) * 3);
+        cudaMemcpy(d_voxsize, voxsize, sizeof(float) * 3, cudaMemcpyHostToDevice);
+        free_voxsize = true;
+    }
+
+    // handle p (write)
+    float *d_p = nullptr;
+    cudaPointerAttributes p_attr;
+    cudaError_t err_p = cudaPointerGetAttributes(&p_attr, p);
+    bool free_p = false;
+    if (err_p == cudaSuccess && p_attr.type == cudaMemoryTypeManaged)
+    {
+        cudaMemPrefetchAsync(p, sizeof(float) * nlors, device_id);
+        cudaMemAdvise(p, sizeof(float) * nlors, cudaMemAdviseSetAccessedBy, device_id);
+    }
+
+    if (err_p == cudaSuccess && (p_attr.type == cudaMemoryTypeManaged ||
+                                 p_attr.type == cudaMemoryTypeDevice))
+    {
+        d_p = const_cast<float *>(p);
+    }
+    else
+    {
+        // host pointer case, transfer to device
+        cudaMalloc(&d_p, sizeof(float) * nlors);
+        cudaMemcpy(d_p, p, sizeof(float) * nlors, cudaMemcpyHostToDevice);
+        free_p = true;
+    }
+
+    // handle img_dim (read only)
+    int *d_img_dim = nullptr;
+    cudaPointerAttributes img_dim_attr;
+    cudaError_t err_img_dim = cudaPointerGetAttributes(&img_dim_attr, img_dim);
+    bool free_img_dim = false;
+    if (err_img_dim == cudaSuccess && img_dim_attr.type == cudaMemoryTypeManaged)
+    {
+        cudaMemPrefetchAsync(img_dim, sizeof(int) * 3, device_id);
+        cudaMemAdvise(img_dim, sizeof(int) * 3, cudaMemAdviseSetReadMostly, device_id);
+    }
+
+    if (err_img_dim == cudaSuccess && (img_dim_attr.type == cudaMemoryTypeManaged ||
+                                       img_dim_attr.type == cudaMemoryTypeDevice))
+    {
+        d_img_dim = const_cast<int *>(img_dim);
+    }
+    else
+    {
+        // host pointer case, transfer to device
+        cudaMalloc(&d_img_dim, sizeof(int) * 3);
+        cudaMemcpy(d_img_dim, img_dim, sizeof(int) * 3, cudaMemcpyHostToDevice);
+        free_img_dim = true;
+    }
+
+    ////////////////////////////////////////////////////////////////////////////
+    ////////////////////////////////////////////////////////////////////////////
+    // launch the kernel
+    ////////////////////////////////////////////////////////////////////////////
+    ////////////////////////////////////////////////////////////////////////////
+
+#ifdef DEBUG
     // get and print the current cuda device ID
     int current_device_id;
     cudaGetDevice(&current_device_id);
     DEBUG_PRINT("Using CUDA device: %d\n", current_device_id);
+#endif
 
     int num_blocks = (int)((nlors + threadsperblock - 1) / threadsperblock);
     joseph3d_fwd_kernel<<<num_blocks, threadsperblock>>>(d_xstart, d_xend, d_img,
@@ -96,8 +235,25 @@ void joseph3d_fwd(const float *xstart,
                                                          d_p, nlors, d_img_dim);
     cudaDeviceSynchronize();
 
-    // if (needs_copy_back) {
-    //     cudaMemcpy(array, device_array, size * sizeof(float), cudaMemcpyDeviceToHost);
-    //     cudaFree(device_array);
-    // }
+    ////////////////////////////////////////////////////////////////////////////
+    ////////////////////////////////////////////////////////////////////////////
+    // free device memory if needed
+    ////////////////////////////////////////////////////////////////////////////
+    ////////////////////////////////////////////////////////////////////////////
+
+    // Free device memory if it was allocated
+    if (free_xstart)
+        cudaFree(d_xstart);
+    if (free_xend)
+        cudaFree(d_xend);
+    if (free_img)
+        cudaFree(d_img);
+    if (free_img_origin)
+        cudaFree(d_img_origin);
+    if (free_voxsize)
+        cudaFree(d_voxsize);
+    if (free_p)
+        cudaFree(d_p);
+    if (free_img_dim)
+        cudaFree(d_img_dim);
 }

From 921e126d8b2aaa348766c5de71c287ba15490b3e Mon Sep 17 00:00:00 2001
From: Georg Schramm <georg.schramm@kuleuven.be>
Date: Fri, 11 Apr 2025 16:00:54 +0200
Subject: [PATCH 12/35] wip

---
 cuda_utils.cu   |  31 +++++++++
 cuda_utils.h    |   6 ++
 joseph3d_fwd.cu | 162 +++++-------------------------------------------
 3 files changed, 52 insertions(+), 147 deletions(-)
 create mode 100644 cuda_utils.cu
 create mode 100644 cuda_utils.h

diff --git a/cuda_utils.cu b/cuda_utils.cu
new file mode 100644
index 0000000..4be521a
--- /dev/null
+++ b/cuda_utils.cu
@@ -0,0 +1,72 @@
+#include "cuda_utils.h"
+
+#include <stdexcept>
+
+// Makes the array behind host_ptr (size bytes) usable on the GPU and stores the
+// pointer that kernels should use in *device_ptr:
+//  - managed memory is prefetched to the target device, tagged with memory_hint
+//    and used in place
+//  - device memory is used in place
+//  - any other pointer is treated as host memory and copied into a new cudaMalloc
+//    buffer; free_flag is then true and the caller has to cudaFree(*device_ptr)
+// A negative device_id selects the currently active device.
+// Throws std::runtime_error if the device copy cannot be allocated or filled.
+template <typename T>
+void handle_cuda_input_array(const T *host_ptr, T **device_ptr, size_t size, bool &free_flag, int device_id, cudaMemoryAdvise memory_hint)
+{
+    cudaPointerAttributes attr;
+    cudaError_t err = cudaPointerGetAttributes(&attr, host_ptr);
+    free_flag = false;
+
+    if (err != cudaSuccess)
+    {
+        // a failed query is handled below as "plain host pointer"; reset the last
+        // error so that later cudaGetLastError() checks do not pick it up
+        cudaGetLastError();
+    }
+
+    if (err == cudaSuccess && attr.type == cudaMemoryTypeManaged)
+    {
+        // -1 is cudaCpuDeviceId, so a negative device_id must not be passed on:
+        // it would move the pages to the host instead of the GPU
+        int target_device = device_id;
+        const bool have_target = (device_id >= 0) || (cudaGetDevice(&target_device) == cudaSuccess);
+
+        if (have_target)
+        {
+            cudaMemPrefetchAsync(const_cast<void *>(static_cast<const void *>(host_ptr)), size, target_device);
+            cudaMemAdvise(const_cast<void *>(static_cast<const void *>(host_ptr)), size, memory_hint, target_device);
+        }
+
+        // prefetch and advice are performance hints that some systems reject, so
+        // a failure is tolerated and only cleared from the last-error state
+        cudaGetLastError();
+    }
+
+    if (err == cudaSuccess && (attr.type == cudaMemoryTypeManaged || attr.type == cudaMemoryTypeDevice))
+    {
+        *device_ptr = const_cast<T *>(host_ptr);
+        return;
+    }
+
+    // Host pointer case, transfer to device
+    T *buffer = nullptr;
+    err = cudaMalloc(&buffer, size);
+    if (err == cudaSuccess)
+    {
+        err = cudaMemcpy(buffer, host_ptr, size, cudaMemcpyHostToDevice);
+    }
+    if (err != cudaSuccess)
+    {
+        // never hand out a pointer to a missing or partially filled buffer
+        cudaFree(buffer);
+        throw std::runtime_error(cudaGetErrorString(err));
+    }
+
+    *device_ptr = buffer;
+    free_flag = true;
+}
+
+// Explicit template instantiations
+template void handle_cuda_input_array<float>(const float *, float **, size_t, bool &, int, cudaMemoryAdvise);
+template void handle_cuda_input_array<int>(const int *, int **, size_t, bool &, int, cudaMemoryAdvise);
diff --git a/cuda_utils.h b/cuda_utils.h
new file mode 100644
index 0000000..2accedb
--- /dev/null
+++ b/cuda_utils.h
@@ -0,0 +1,6 @@
+#pragma once
+
+#include <cuda_runtime.h>
+// Resolves host_ptr (size bytes) to a pointer usable on the GPU, returned in *device_ptr; free_flag reports whether a device copy was allocated that the caller must cudaFree.
+template <typename T>
+void handle_cuda_input_array(const T *host_ptr, T **device_ptr, size_t size, bool &free_flag, int device_id, cudaMemoryAdvise memory_hint);
diff --git a/joseph3d_fwd.cu b/joseph3d_fwd.cu
index 9e7b950..1093983 100644
--- a/joseph3d_fwd.cu
+++ b/joseph3d_fwd.cu
@@ -1,6 +1,7 @@
 #include "parallelproj.h"
 #include "joseph3d_fwd_worker.h"
 #include "debug.h"
+#include "cuda_utils.h"
 #include <cuda_runtime.h>
 #include <iostream>
 #include <stdexcept>
@@ -48,173 +49,40 @@ void joseph3d_fwd(const float *xstart,
     /////////////////////////////////////////////////////////////////
     /////////////////////////////////////////////////////////////////
 
-    // handle xstart (read only)
+    // Handle xstart (read mostly)
     float *d_xstart = nullptr;
-    cudaPointerAttributes xstart_attr;
-    cudaError_t err_xstart = cudaPointerGetAttributes(&xstart_attr, xstart);
     bool free_xstart = false;
-    if (err_xstart == cudaSuccess && (xstart_attr.type == cudaMemoryTypeManaged))
-    {
-        cudaMemPrefetchAsync(xstart, sizeof(float) * nlors * 3, device_id);
-        cudaMemAdvise(xstart, sizeof(float) * nlors * 3, cudaMemAdviseSetReadMostly, device_id);
-    }
-
-    if (err_xstart == cudaSuccess && (xstart_attr.type == cudaMemoryTypeManaged ||
-                                      xstart_attr.type == cudaMemoryTypeDevice))
-    {
-        d_xstart = const_cast<float *>(xstart);
-    }
-    else
-    {
-        // host pointer case, transfer to device
-        cudaMalloc(&d_xstart, sizeof(float) * nlors * 3);
-        cudaMemcpy(d_xstart, xstart, sizeof(float) * nlors * 3, cudaMemcpyHostToDevice);
-        free_xstart = true;
-    }
+    handle_cuda_input_array(xstart, &d_xstart, sizeof(float) * nlors * 3, free_xstart, device_id, cudaMemAdviseSetReadMostly);
 
-    // handle xend (read only)
+    // Handle xend (read mostly)
     float *d_xend = nullptr;
-    cudaPointerAttributes xend_attr;
-    cudaError_t err_xend = cudaPointerGetAttributes(&xend_attr, xend);
     bool free_xend = false;
-    if (err_xend == cudaSuccess && xend_attr.type == cudaMemoryTypeManaged)
-    {
-        cudaMemPrefetchAsync(xend, sizeof(float) * nlors * 3, device_id);
-        cudaMemAdvise(xend, sizeof(float) * nlors * 3, cudaMemAdviseSetReadMostly, device_id);
-    }
+    handle_cuda_input_array(xend, &d_xend, sizeof(float) * nlors * 3, free_xend, device_id, cudaMemAdviseSetReadMostly);
 
-    if (err_xend == cudaSuccess && (xend_attr.type == cudaMemoryTypeManaged ||
-                                    xend_attr.type == cudaMemoryTypeDevice))
-    {
-        d_xend = const_cast<float *>(xend);
-    }
-    else
-    {
-        // host pointer case, transfer to device
-        cudaMalloc(&d_xend, sizeof(float) * nlors * 3);
-        cudaMemcpy(d_xend, xend, sizeof(float) * nlors * 3, cudaMemcpyHostToDevice);
-        free_xend = true;
-    }
-
-    // handle img (read only)
+    // Handle img (read mostly)
     float *d_img = nullptr;
-    cudaPointerAttributes img_attr;
-    cudaError_t err_img = cudaPointerGetAttributes(&img_attr, img);
     bool free_img = false;
-    if (err_img == cudaSuccess && img_attr.type == cudaMemoryTypeManaged)
-    {
-        cudaMemPrefetchAsync(img, sizeof(float) * img_dim[0] * img_dim[1] * img_dim[2], device_id);
-        cudaMemAdvise(img, sizeof(float) * img_dim[0] * img_dim[1] * img_dim[2], cudaMemAdviseSetReadMostly, device_id);
-    }
-
-    if (err_img == cudaSuccess && (img_attr.type == cudaMemoryTypeManaged ||
-                                   img_attr.type == cudaMemoryTypeDevice))
-    {
-        d_img = const_cast<float *>(img);
-    }
-    else
-    {
-        // host pointer case, transfer to device
-        cudaMalloc(&d_img, sizeof(float) * img_dim[0] * img_dim[1] * img_dim[2]);
-        cudaMemcpy(d_img, img, sizeof(float) * img_dim[0] * img_dim[1] * img_dim[2], cudaMemcpyHostToDevice);
-        free_img = true;
-    }
+    handle_cuda_input_array(img, &d_img, sizeof(float) * img_dim[0] * img_dim[1] * img_dim[2], free_img, device_id, cudaMemAdviseSetReadMostly);
 
-    // handle img_origin (read only)
+    // Handle img_origin (read mostly)
     float *d_img_origin = nullptr;
-    cudaPointerAttributes img_origin_attr;
-    cudaError_t err_img_origin = cudaPointerGetAttributes(&img_origin_attr, img_origin);
     bool free_img_origin = false;
-    if (err_img_origin == cudaSuccess && img_origin_attr.type == cudaMemoryTypeManaged)
-    {
-        cudaMemPrefetchAsync(img_origin, sizeof(float) * 3, device_id);
-        cudaMemAdvise(img_origin, sizeof(float) * 3, cudaMemAdviseSetReadMostly, device_id);
-    }
+    handle_cuda_input_array(img_origin, &d_img_origin, sizeof(float) * 3, free_img_origin, device_id, cudaMemAdviseSetReadMostly);
 
-    if (err_img_origin == cudaSuccess && (img_origin_attr.type == cudaMemoryTypeManaged ||
-                                          img_origin_attr.type == cudaMemoryTypeDevice))
-    {
-        d_img_origin = const_cast<float *>(img_origin);
-    }
-    else
-    {
-        // host pointer case, transfer to device
-        cudaMalloc(&d_img_origin, sizeof(float) * 3);
-        cudaMemcpy(d_img_origin, img_origin, sizeof(float) * 3, cudaMemcpyHostToDevice);
-        free_img_origin = true;
-    }
-
-    // handle voxsize (read only)
+    // Handle voxsize (read mostly)
     float *d_voxsize = nullptr;
-    cudaPointerAttributes voxsize_attr;
-    cudaError_t err_voxsize = cudaPointerGetAttributes(&voxsize_attr, voxsize);
     bool free_voxsize = false;
-    if (err_voxsize == cudaSuccess && voxsize_attr.type == cudaMemoryTypeManaged)
-    {
-        cudaMemPrefetchAsync(voxsize, sizeof(float) * 3, device_id);
-        cudaMemAdvise(voxsize, sizeof(float) * 3, cudaMemAdviseSetReadMostly, device_id);
-    }
-
-    if (err_voxsize == cudaSuccess && (voxsize_attr.type == cudaMemoryTypeManaged ||
-                                       voxsize_attr.type == cudaMemoryTypeDevice))
-    {
-        d_voxsize = const_cast<float *>(voxsize);
-    }
-    else
-    {
-        // host pointer case, transfer to device
-        cudaMalloc(&d_voxsize, sizeof(float) * 3);
-        cudaMemcpy(d_voxsize, voxsize, sizeof(float) * 3, cudaMemcpyHostToDevice);
-        free_voxsize = true;
-    }
+    handle_cuda_input_array(voxsize, &d_voxsize, sizeof(float) * 3, free_voxsize, device_id, cudaMemAdviseSetReadMostly);
 
-    // handle p (write)
+    // Handle p (write access)
     float *d_p = nullptr;
-    cudaPointerAttributes p_attr;
-    cudaError_t err_p = cudaPointerGetAttributes(&p_attr, p);
     bool free_p = false;
-    if (err_p == cudaSuccess && p_attr.type == cudaMemoryTypeManaged)
-    {
-        cudaMemPrefetchAsync(p, sizeof(float) * nlors, device_id);
-        cudaMemAdvise(p, sizeof(float) * nlors, cudaMemAdviseSetAccessedBy, device_id);
-    }
+    handle_cuda_input_array(p, &d_p, sizeof(float) * nlors, free_p, device_id, cudaMemAdviseSetAccessedBy);
 
-    if (err_p == cudaSuccess && (p_attr.type == cudaMemoryTypeManaged ||
-                                 p_attr.type == cudaMemoryTypeDevice))
-    {
-        d_p = const_cast<float *>(p);
-    }
-    else
-    {
-        // host pointer case, transfer to device
-        cudaMalloc(&d_p, sizeof(float) * nlors);
-        cudaMemcpy(d_p, p, sizeof(float) * nlors, cudaMemcpyHostToDevice);
-        free_p = true;
-    }
-
-    // handle img_dim (read only)
+    // Handle img_dim (read mostly)
     int *d_img_dim = nullptr;
-    cudaPointerAttributes img_dim_attr;
-    cudaError_t err_img_dim = cudaPointerGetAttributes(&img_dim_attr, img_dim);
     bool free_img_dim = false;
-    if (err_img_dim == cudaSuccess && img_dim_attr.type == cudaMemoryTypeManaged)
-    {
-        cudaMemPrefetchAsync(img_dim, sizeof(int) * 3, device_id);
-        cudaMemAdvise(img_dim, sizeof(int) * 3, cudaMemAdviseSetReadMostly, device_id);
-    }
-
-    if (err_img_dim == cudaSuccess && (img_dim_attr.type == cudaMemoryTypeManaged ||
-                                       img_dim_attr.type == cudaMemoryTypeDevice))
-    {
-        d_img_dim = const_cast<int *>(img_dim);
-    }
-    else
-    {
-        // host pointer case, transfer to device
-        cudaMalloc(&d_img_dim, sizeof(int) * 3);
-        cudaMemcpy(d_img_dim, img_dim, sizeof(int) * 3, cudaMemcpyHostToDevice);
-        free_img_dim = true;
-    }
+    handle_cuda_input_array(img_dim, &d_img_dim, sizeof(int) * 3, free_img_dim, device_id, cudaMemAdviseSetReadMostly);
 
     ////////////////////////////////////////////////////////////////////////////
     ////////////////////////////////////////////////////////////////////////////

From 01eaed0bfa1f5b53f83745f8ce5aa205d79241e6 Mon Sep 17 00:00:00 2001
From: Georg Schramm <georg.schramm@kuleuven.be>
Date: Fri, 11 Apr 2025 16:17:44 +0200
Subject: [PATCH 13/35] re-organize project

---
 CMakeLists.txt                                | 27 +++++++++++++++----
 cuda_compat.h => include/cuda_compat.h        |  0
 cuda_utils.h => include/cuda_utils.h          |  0
 debug.h => include/debug.h                    |  0
 parallelproj.h => include/parallelproj.h      |  0
 square_array.h => include/square_array.h      |  0
 square_op.h => include/square_op.h            |  0
 utils.h => include/utils.h                    |  0
 cuda_utils.cu => src/cuda_utils.cu            |  0
 joseph3d_back.cpp => src/joseph3d_back.cpp    |  0
 joseph3d_back.cu => src/joseph3d_back.cu      |  0
 .../joseph3d_back_worker.h                    |  0
 joseph3d_fwd.cpp => src/joseph3d_fwd.cpp      |  0
 joseph3d_fwd.cu => src/joseph3d_fwd.cu        |  0
 .../joseph3d_fwd_worker.h                     |  0
 main.cpp => src/main.cpp                      |  0
 main.cu => src/main.cu                        |  0
 17 files changed, 22 insertions(+), 5 deletions(-)
 rename cuda_compat.h => include/cuda_compat.h (100%)
 rename cuda_utils.h => include/cuda_utils.h (100%)
 rename debug.h => include/debug.h (100%)
 rename parallelproj.h => include/parallelproj.h (100%)
 rename square_array.h => include/square_array.h (100%)
 rename square_op.h => include/square_op.h (100%)
 rename utils.h => include/utils.h (100%)
 rename cuda_utils.cu => src/cuda_utils.cu (100%)
 rename joseph3d_back.cpp => src/joseph3d_back.cpp (100%)
 rename joseph3d_back.cu => src/joseph3d_back.cu (100%)
 rename joseph3d_back_worker.h => src/joseph3d_back_worker.h (100%)
 rename joseph3d_fwd.cpp => src/joseph3d_fwd.cpp (100%)
 rename joseph3d_fwd.cu => src/joseph3d_fwd.cu (100%)
 rename joseph3d_fwd_worker.h => src/joseph3d_fwd_worker.h (100%)
 rename main.cpp => src/main.cpp (100%)
 rename main.cu => src/main.cu (100%)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index d17d3b8..f27d71c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -33,20 +33,37 @@ if(DEBUG)
     add_compile_definitions(DEBUG)
 endif()
 
+
+
 if(CMAKE_CUDA_COMPILER)
+    set(LIB_SOURCES
+        src/joseph3d_fwd.cu
+        src/joseph3d_back.cu
+        src/cuda_utils.cu
+    )
+
     enable_language(CUDA)
     find_package(CUDAToolkit REQUIRED)
     
-    add_library(parallelproj SHARED joseph3d_fwd.cu joseph3d_back.cu)
+    add_library(parallelproj SHARED ${LIB_SOURCES})
+    target_include_directories(parallelproj PRIVATE ${CMAKE_SOURCE_DIR}/include)
     target_link_libraries(parallelproj PRIVATE CUDA::cudart)
 
-    add_executable(main main.cu)
-    target_include_directories(main PRIVATE ${CUDAToolkit_INCLUDE_DIRS})
+    add_executable(main src/main.cu)
+    target_include_directories(main PRIVATE ${CMAKE_SOURCE_DIR}/include ${CUDAToolkit_INCLUDE_DIRS})
     target_link_libraries(main PRIVATE parallelproj CUDA::cudart)
 else()
+    set(LIB_SOURCES
+        src/joseph3d_fwd.cpp
+        src/joseph3d_back.cpp
+    )
+
     find_package(OpenMP REQUIRED)
-    add_library(parallelproj SHARED joseph3d_fwd.cpp joseph3d_back.cpp)
+    add_library(parallelproj SHARED ${LIB_SOURCES})
+    target_include_directories(parallelproj PRIVATE ${CMAKE_SOURCE_DIR}/include)
     target_link_libraries(parallelproj PRIVATE OpenMP::OpenMP_CXX)
-    add_executable(main main.cpp)
+
+    add_executable(main src/main.cpp)
+    target_include_directories(main PRIVATE ${CMAKE_SOURCE_DIR}/include)
     target_link_libraries(main PRIVATE parallelproj)
 endif()
diff --git a/cuda_compat.h b/include/cuda_compat.h
similarity index 100%
rename from cuda_compat.h
rename to include/cuda_compat.h
diff --git a/cuda_utils.h b/include/cuda_utils.h
similarity index 100%
rename from cuda_utils.h
rename to include/cuda_utils.h
diff --git a/debug.h b/include/debug.h
similarity index 100%
rename from debug.h
rename to include/debug.h
diff --git a/parallelproj.h b/include/parallelproj.h
similarity index 100%
rename from parallelproj.h
rename to include/parallelproj.h
diff --git a/square_array.h b/include/square_array.h
similarity index 100%
rename from square_array.h
rename to include/square_array.h
diff --git a/square_op.h b/include/square_op.h
similarity index 100%
rename from square_op.h
rename to include/square_op.h
diff --git a/utils.h b/include/utils.h
similarity index 100%
rename from utils.h
rename to include/utils.h
diff --git a/cuda_utils.cu b/src/cuda_utils.cu
similarity index 100%
rename from cuda_utils.cu
rename to src/cuda_utils.cu
diff --git a/joseph3d_back.cpp b/src/joseph3d_back.cpp
similarity index 100%
rename from joseph3d_back.cpp
rename to src/joseph3d_back.cpp
diff --git a/joseph3d_back.cu b/src/joseph3d_back.cu
similarity index 100%
rename from joseph3d_back.cu
rename to src/joseph3d_back.cu
diff --git a/joseph3d_back_worker.h b/src/joseph3d_back_worker.h
similarity index 100%
rename from joseph3d_back_worker.h
rename to src/joseph3d_back_worker.h
diff --git a/joseph3d_fwd.cpp b/src/joseph3d_fwd.cpp
similarity index 100%
rename from joseph3d_fwd.cpp
rename to src/joseph3d_fwd.cpp
diff --git a/joseph3d_fwd.cu b/src/joseph3d_fwd.cu
similarity index 100%
rename from joseph3d_fwd.cu
rename to src/joseph3d_fwd.cu
diff --git a/joseph3d_fwd_worker.h b/src/joseph3d_fwd_worker.h
similarity index 100%
rename from joseph3d_fwd_worker.h
rename to src/joseph3d_fwd_worker.h
diff --git a/main.cpp b/src/main.cpp
similarity index 100%
rename from main.cpp
rename to src/main.cpp
diff --git a/main.cu b/src/main.cu
similarity index 100%
rename from main.cu
rename to src/main.cu

From 6f7c0815b8cd3578c29c45f0c59d9bb57adf3368 Mon Sep 17 00:00:00 2001
From: Georg Schramm <georg.schramm@kuleuven.be>
Date: Fri, 11 Apr 2025 16:24:04 +0200
Subject: [PATCH 14/35] refactor joseph3d_back.cu

---
 src/joseph3d_back.cu | 110 +++++++++++++++++++++++++------------------
 1 file changed, 64 insertions(+), 46 deletions(-)

diff --git a/src/joseph3d_back.cu b/src/joseph3d_back.cu
index d959808..c090d5d 100644
--- a/src/joseph3d_back.cu
+++ b/src/joseph3d_back.cu
@@ -1,6 +1,7 @@
 #include "parallelproj.h"
 #include "joseph3d_back_worker.h"
 #include "debug.h"
+#include "cuda_utils.h"
 #include <cuda_runtime.h>
 #include <iostream>
 #include <stdexcept>
@@ -36,65 +37,65 @@ void joseph3d_back(const float *xstart,
                    int device_id,
                    int threadsperblock)
 {
-
     // Set the CUDA device
     if (device_id >= 0)
     {
         cudaSetDevice(device_id);
     }
 
-    const float *d_xstart = nullptr;
-    const float *d_xend = nullptr;
-    float *d_img = nullptr;
-    const float *d_img_origin = nullptr;
-    const float *d_voxsize = nullptr;
-    const float *d_p = nullptr;
-    const int *d_img_dim = nullptr;
-
-    // get pointer attributes of all input and output arrays
-    cudaPointerAttributes xstart_attr;
-    cudaError_t err = cudaPointerGetAttributes(&xstart_attr, xstart);
     /////////////////////////////////////////////////////////////////
     /////////////////////////////////////////////////////////////////
-    // TODO get attributes of all other arrays
+    // copy arrays to device if needed
     /////////////////////////////////////////////////////////////////
     /////////////////////////////////////////////////////////////////
 
-    bool needs_copy_back = false;
-    bool is_cuda_managed_ptr = false;
+    // Handle xstart (read mostly)
+    float *d_xstart = nullptr;
+    bool free_xstart = false;
+    handle_cuda_input_array(xstart, &d_xstart, sizeof(float) * nlors * 3, free_xstart, device_id, cudaMemAdviseSetReadMostly);
 
-    if (err == cudaSuccess && (xstart_attr.type == cudaMemoryTypeManaged))
-    {
-        is_cuda_managed_ptr = true;
-        DEBUG_PRINT("Managed array is on device : %d\n", xstart_attr.device);
-    }
-    // else throw error
-    else
-    {
-        needs_copy_back = true;
-        throw std::runtime_error("Unsupported pointer type");
-    }
+    // Handle xend (read mostly)
+    float *d_xend = nullptr;
+    bool free_xend = false;
+    handle_cuda_input_array(xend, &d_xend, sizeof(float) * nlors * 3, free_xend, device_id, cudaMemAdviseSetReadMostly);
 
-    if (is_cuda_managed_ptr)
-    {
-        // all arrays are cuda malloc managed, so no need to copy to the device
-        d_xstart = xstart;
-        d_xend = xend;
-        d_img = img;
-        d_img_origin = img_origin;
-        d_voxsize = voxsize;
-        d_p = p;
-        d_img_dim = img_dim;
-    }
-    else
-    {
-        DEBUG_PRINT("COPYING HOST TO DEVICE");
-    }
+    // Handle img (write access)
+    float *d_img = nullptr;
+    bool free_img = false;
+    handle_cuda_input_array(img, &d_img, sizeof(float) * img_dim[0] * img_dim[1] * img_dim[2], free_img, device_id, cudaMemAdviseSetAccessedBy);
+
+    // Handle img_origin (read mostly)
+    float *d_img_origin = nullptr;
+    bool free_img_origin = false;
+    handle_cuda_input_array(img_origin, &d_img_origin, sizeof(float) * 3, free_img_origin, device_id, cudaMemAdviseSetReadMostly);
+
+    // Handle voxsize (read mostly)
+    float *d_voxsize = nullptr;
+    bool free_voxsize = false;
+    handle_cuda_input_array(voxsize, &d_voxsize, sizeof(float) * 3, free_voxsize, device_id, cudaMemAdviseSetReadMostly);
+
+    // Handle p (read mostly)
+    float *d_p = nullptr;
+    bool free_p = false;
+    handle_cuda_input_array(p, &d_p, sizeof(float) * nlors, free_p, device_id, cudaMemAdviseSetReadMostly);
 
+    // Handle img_dim (read mostly)
+    int *d_img_dim = nullptr;
+    bool free_img_dim = false;
+    handle_cuda_input_array(img_dim, &d_img_dim, sizeof(int) * 3, free_img_dim, device_id, cudaMemAdviseSetReadMostly);
+
+    ////////////////////////////////////////////////////////////////////////////
+    ////////////////////////////////////////////////////////////////////////////
+    // launch the kernel
+    ////////////////////////////////////////////////////////////////////////////
+    ////////////////////////////////////////////////////////////////////////////
+
+#ifdef DEBUG
     // get and print the current cuda device ID
     int current_device_id;
     cudaGetDevice(&current_device_id);
     DEBUG_PRINT("Using CUDA device: %d\n", current_device_id);
+#endif
 
     int num_blocks = (int)((nlors + threadsperblock - 1) / threadsperblock);
     joseph3d_back_kernel<<<num_blocks, threadsperblock>>>(d_xstart, d_xend, d_img,
@@ -102,8 +103,25 @@ void joseph3d_back(const float *xstart,
                                                           d_p, nlors, d_img_dim);
     cudaDeviceSynchronize();
 
-    // if (needs_copy_back) {
-    //     cudaMemcpy(array, device_array, size * sizeof(float), cudaMemcpyDeviceToHost);
-    //     cudaFree(device_array);
-    // }
-}
+    ////////////////////////////////////////////////////////////////////////////
+    ////////////////////////////////////////////////////////////////////////////
+    // free device memory if needed
+    ////////////////////////////////////////////////////////////////////////////
+    ////////////////////////////////////////////////////////////////////////////
+
+    // Free device memory if it was allocated
+    if (free_xstart)
+        cudaFree(d_xstart);
+    if (free_xend)
+        cudaFree(d_xend);
+    if (free_img)
+        cudaFree(d_img);
+    if (free_img_origin)
+        cudaFree(d_img_origin);
+    if (free_voxsize)
+        cudaFree(d_voxsize);
+    if (free_p)
+        cudaFree(d_p);
+    if (free_img_dim)
+        cudaFree(d_img_dim);
+}
\ No newline at end of file

From fd9816c360d7d9567db78594526a07233d277cb6 Mon Sep 17 00:00:00 2001
From: Georg Schramm <georg.schramm@kuleuven.be>
Date: Fri, 11 Apr 2025 16:26:44 +0200
Subject: [PATCH 15/35] mv tests

---
 src/main.cpp => tests/test_nontof_fwd_back.cpp | 0
 src/main.cu => tests/test_nontof_fwd_back.cu   | 0
 2 files changed, 0 insertions(+), 0 deletions(-)
 rename src/main.cpp => tests/test_nontof_fwd_back.cpp (100%)
 rename src/main.cu => tests/test_nontof_fwd_back.cu (100%)

diff --git a/src/main.cpp b/tests/test_nontof_fwd_back.cpp
similarity index 100%
rename from src/main.cpp
rename to tests/test_nontof_fwd_back.cpp
diff --git a/src/main.cu b/tests/test_nontof_fwd_back.cu
similarity index 100%
rename from src/main.cu
rename to tests/test_nontof_fwd_back.cu

From 5e92e0f90d0f16a9d6e5ef651b1606a24be9bf05 Mon Sep 17 00:00:00 2001
From: Georg Schramm <georg.schramm@kuleuven.be>
Date: Fri, 11 Apr 2025 16:42:30 +0200
Subject: [PATCH 16/35] build tests

---
 CMakeLists.txt | 31 +++++++++++++++++++++++++------
 1 file changed, 25 insertions(+), 6 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index f27d71c..3e6f048 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -49,9 +49,9 @@ if(CMAKE_CUDA_COMPILER)
     target_include_directories(parallelproj PRIVATE ${CMAKE_SOURCE_DIR}/include)
     target_link_libraries(parallelproj PRIVATE CUDA::cudart)
 
-    add_executable(main src/main.cu)
-    target_include_directories(main PRIVATE ${CMAKE_SOURCE_DIR}/include ${CUDAToolkit_INCLUDE_DIRS})
-    target_link_libraries(main PRIVATE parallelproj CUDA::cudart)
+    set(TEST_SOURCES
+        tests/test_nontof_fwd_back.cu
+    )
 else()
     set(LIB_SOURCES
         src/joseph3d_fwd.cpp
@@ -63,7 +63,26 @@ else()
     target_include_directories(parallelproj PRIVATE ${CMAKE_SOURCE_DIR}/include)
     target_link_libraries(parallelproj PRIVATE OpenMP::OpenMP_CXX)
 
-    add_executable(main src/main.cpp)
-    target_include_directories(main PRIVATE ${CMAKE_SOURCE_DIR}/include)
-    target_link_libraries(main PRIVATE parallelproj)
+    set(TEST_SOURCES
+        tests/test_nontof_fwd_back.cpp
+    )
 endif()
+
+
+################################################################################
+### BUILD TESTS ################################################################
+################################################################################
+
+enable_testing()
+
+foreach(TEST_SOURCE ${TEST_SOURCES})
+    get_filename_component(TEST_NAME ${TEST_SOURCE} NAME_WE)
+    add_executable(${TEST_NAME} ${TEST_SOURCE})
+    target_include_directories(${TEST_NAME} PRIVATE ${CMAKE_SOURCE_DIR}/include)
+    if(CMAKE_CUDA_COMPILER)
+        target_link_libraries(${TEST_NAME} PRIVATE parallelproj CUDA::cudart)
+    else()
+        target_link_libraries(${TEST_NAME} PRIVATE parallelproj OpenMP::OpenMP_CXX)
+    endif()
+    add_test(NAME ${TEST_NAME} COMMAND ${TEST_NAME})
+endforeach()

From b299358d063ec56502faf0b83dbc5a6076e979b7 Mon Sep 17 00:00:00 2001
From: Georg Schramm <georg.schramm@kuleuven.be>
Date: Fri, 11 Apr 2025 16:54:41 +0200
Subject: [PATCH 17/35] move obsolete files

---
 CMakeLists_square.txt => obsolete/CMakeLists_square.txt | 0
 main.py => obsolete/main.py                             | 0
 main_cuda.cpp => obsolete/main_cuda.cpp                 | 0
 main_no_cuda.cpp => obsolete/main_no_cuda.cpp           | 0
 square_array.cpp => obsolete/square_array.cpp           | 0
 square_array.cu => obsolete/square_array.cu             | 0
 utils.py => obsolete/utils.py                           | 0
 7 files changed, 0 insertions(+), 0 deletions(-)
 rename CMakeLists_square.txt => obsolete/CMakeLists_square.txt (100%)
 rename main.py => obsolete/main.py (100%)
 rename main_cuda.cpp => obsolete/main_cuda.cpp (100%)
 rename main_no_cuda.cpp => obsolete/main_no_cuda.cpp (100%)
 rename square_array.cpp => obsolete/square_array.cpp (100%)
 rename square_array.cu => obsolete/square_array.cu (100%)
 rename utils.py => obsolete/utils.py (100%)

diff --git a/CMakeLists_square.txt b/obsolete/CMakeLists_square.txt
similarity index 100%
rename from CMakeLists_square.txt
rename to obsolete/CMakeLists_square.txt
diff --git a/main.py b/obsolete/main.py
similarity index 100%
rename from main.py
rename to obsolete/main.py
diff --git a/main_cuda.cpp b/obsolete/main_cuda.cpp
similarity index 100%
rename from main_cuda.cpp
rename to obsolete/main_cuda.cpp
diff --git a/main_no_cuda.cpp b/obsolete/main_no_cuda.cpp
similarity index 100%
rename from main_no_cuda.cpp
rename to obsolete/main_no_cuda.cpp
diff --git a/square_array.cpp b/obsolete/square_array.cpp
similarity index 100%
rename from square_array.cpp
rename to obsolete/square_array.cpp
diff --git a/square_array.cu b/obsolete/square_array.cu
similarity index 100%
rename from square_array.cu
rename to obsolete/square_array.cu
diff --git a/utils.py b/obsolete/utils.py
similarity index 100%
rename from utils.py
rename to obsolete/utils.py

From 30589e01b9ab755a0274b1517a413be6e4c992b7 Mon Sep 17 00:00:00 2001
From: Georg Schramm <georg.schramm@kuleuven.be>
Date: Fri, 11 Apr 2025 17:37:34 +0200
Subject: [PATCH 18/35] add cmake config

---
 CMakeLists.txt                    | 64 ++++++++++++++++++++++++++++++-
 cmake/parallelprojConfig.cmake.in |  9 +++++
 2 files changed, 72 insertions(+), 1 deletion(-)
 create mode 100644 cmake/parallelprojConfig.cmake.in

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3e6f048..0763c78 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -2,9 +2,15 @@ cmake_minimum_required(VERSION 3.18)
 project(PARALLELPROJ LANGUAGES C CXX)
 
 include(CheckLanguage)
+include(GNUInstallDirs)
+include(CMakePackageConfigHelpers)
+
 check_language(CUDA)
 
-# Derive version from Git or folder name
+################################################################################
+### DERIVE VERSION FROM GIT ####################################################
+################################################################################
+
 if(EXISTS "${CMAKE_SOURCE_DIR}/.git")
     execute_process(
         COMMAND git describe --tags --long --dirty
@@ -33,6 +39,9 @@ if(DEBUG)
     add_compile_definitions(DEBUG)
 endif()
 
+################################################################################
+### DEFINE TARGETS #############################################################
+################################################################################
 
 
 if(CMAKE_CUDA_COMPILER)
@@ -44,10 +53,12 @@ if(CMAKE_CUDA_COMPILER)
 
     enable_language(CUDA)
     find_package(CUDAToolkit REQUIRED)
+    set(PARALLELPROJ_CUDA 1)
     
     add_library(parallelproj SHARED ${LIB_SOURCES})
     target_include_directories(parallelproj PRIVATE ${CMAKE_SOURCE_DIR}/include)
     target_link_libraries(parallelproj PRIVATE CUDA::cudart)
+    target_compile_definitions(parallelproj PUBLIC PARALLELPROJ_CUDA=1) 
 
     set(TEST_SOURCES
         tests/test_nontof_fwd_back.cu
@@ -59,15 +70,66 @@ else()
     )
 
     find_package(OpenMP REQUIRED)
+    set(PARALLELPROJ_CUDA 0)
+
     add_library(parallelproj SHARED ${LIB_SOURCES})
     target_include_directories(parallelproj PRIVATE ${CMAKE_SOURCE_DIR}/include)
     target_link_libraries(parallelproj PRIVATE OpenMP::OpenMP_CXX)
+    target_compile_definitions(parallelproj PUBLIC PARALLELPROJ_CUDA=0)
 
     set(TEST_SOURCES
         tests/test_nontof_fwd_back.cpp
     )
 endif()
 
+################################################################################
+### INSTALL AND CONFIGS ########################################################
+################################################################################
+
+# Install the library and headers
+install(TARGETS parallelproj
+    EXPORT parallelprojTargets
+    LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+    ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
+    RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
+    INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
+)
+
+install(DIRECTORY ${CMAKE_SOURCE_DIR}/include/
+    DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
+)
+
+# Export the build tree for other projects
+export(EXPORT parallelprojTargets
+    FILE ${CMAKE_BINARY_DIR}/parallelprojTargets.cmake
+    NAMESPACE parallelproj::
+)
+
+# Install the export configuration
+install(EXPORT parallelprojTargets
+    FILE parallelprojTargets.cmake
+    NAMESPACE parallelproj::
+    DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/parallelproj
+)
+
+# Create and install a package configuration file
+write_basic_package_version_file(
+    ${CMAKE_BINARY_DIR}/parallelprojConfigVersion.cmake
+    VERSION ${PROJECT_VERSION}
+    COMPATIBILITY AnyNewerVersion
+)
+
+configure_file(${CMAKE_SOURCE_DIR}/cmake/parallelprojConfig.cmake.in
+    ${CMAKE_BINARY_DIR}/parallelprojConfig.cmake
+    @ONLY
+)
+
+install(FILES
+    ${CMAKE_BINARY_DIR}/parallelprojConfig.cmake
+    ${CMAKE_BINARY_DIR}/parallelprojConfigVersion.cmake
+    DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/parallelproj
+)
+
 
 ################################################################################
 ### BUILD TESTS ################################################################
diff --git a/cmake/parallelprojConfig.cmake.in b/cmake/parallelprojConfig.cmake.in
new file mode 100644
index 0000000..5e263d9
--- /dev/null
+++ b/cmake/parallelprojConfig.cmake.in
@@ -0,0 +1,9 @@
+include("${CMAKE_CURRENT_LIST_DIR}/parallelprojTargets.cmake")
+
+# Include the version file for compatibility checks
+include("${CMAKE_CURRENT_LIST_DIR}/parallelprojConfigVersion.cmake")
+# This template is processed with configure_file(... @ONLY): only @VAR@ references are substituted, ${VAR} is left for the consumer to evaluate
+set(PARALLELPROJ_CUDA @PARALLELPROJ_CUDA@) # 1 when built with CUDA, 0 for the OpenMP build
+set(PARALLELPROJ_INCLUDE_DIRS "@CMAKE_INSTALL_FULL_INCLUDEDIR@") # absolute install path, fixed at configure time
+set(PARALLELPROJ_LIBRARY_DIRS "@CMAKE_INSTALL_FULL_LIBDIR@") # absolute install path, fixed at configure time
+set(PARALLELPROJ_VERSION "@PROJECT_VERSION@")
\ No newline at end of file

From 38ec122bb63563738f20ea454f675a6ce81d7371 Mon Sep 17 00:00:00 2001
From: Georg Schramm <georg.schramm@kuleuven.be>
Date: Fri, 11 Apr 2025 17:40:22 +0200
Subject: [PATCH 19/35] add windows_export symbols

---
 CMakeLists.txt | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0763c78..f3180c3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -7,6 +7,11 @@ include(CMakePackageConfigHelpers)
 
 check_language(CUDA)
 
+# Export all symbols on Windows; parallelproj is declared SHARED explicitly, so this must not depend on BUILD_SHARED_LIBS
+if(WIN32)
+    set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS TRUE)
+endif()
+
 ################################################################################
 ### DERIVE VERSION FROM GIT ####################################################
 ################################################################################

From 81f8285ea485bf1496e22856e00e569655718b95 Mon Sep 17 00:00:00 2001
From: Georg Schramm <georg.schramm@kuleuven.be>
Date: Sat, 12 Apr 2025 15:56:24 +0200
Subject: [PATCH 20/35] read test input data from text files

---
 CMakeLists.txt                 |  29 ++++++-
 tests/expected_fwd_vals.txt    |  10 +++
 tests/file_utils.h             |  14 ++++
 tests/file_utils.tpp           |  40 +++++++++
 tests/img.txt                  |  24 ++++++
 tests/test_nontof_fwd_back.cpp |   1 +
 tests/test_nontof_fwd_back.cu  | 145 +++++++--------------------------
 tests/vend.txt                 |  30 +++++++
 tests/vstart.txt               |  30 +++++++
 9 files changed, 208 insertions(+), 115 deletions(-)
 create mode 100644 tests/expected_fwd_vals.txt
 create mode 100644 tests/file_utils.h
 create mode 100644 tests/file_utils.tpp
 create mode 100644 tests/img.txt
 create mode 100644 tests/vend.txt
 create mode 100644 tests/vstart.txt

diff --git a/CMakeLists.txt b/CMakeLists.txt
index f3180c3..7650395 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -142,10 +142,37 @@ install(FILES
 
 enable_testing()
 
+set(TEST_DATA_DIR ${CMAKE_SOURCE_DIR}/tests)
+
+# List all the .txt files that need to be copied
+set(TEST_DATA_FILES
+    img.txt
+    vstart.txt
+    vend.txt
+    expected_fwd_vals.txt
+)
+
+# Copy each file to the build directory
+foreach(TEST_FILE ${TEST_DATA_FILES})
+    add_custom_command(
+        OUTPUT ${CMAKE_BINARY_DIR}/${TEST_FILE}
+        COMMAND ${CMAKE_COMMAND} -E copy ${TEST_DATA_DIR}/${TEST_FILE} ${CMAKE_BINARY_DIR}/${TEST_FILE}
+        DEPENDS ${TEST_DATA_DIR}/${TEST_FILE}
+    )
+    list(APPEND TEST_DATA_OUTPUTS ${CMAKE_BINARY_DIR}/${TEST_FILE})
+endforeach()
+
+# Add a custom target to ensure the files are copied before running tests
+add_custom_target(copy_test_data ALL DEPENDS ${TEST_DATA_OUTPUTS})
+
+# Add the directory containing file_utils.h to the include paths
+include_directories(${CMAKE_SOURCE_DIR}/tests)
+
 foreach(TEST_SOURCE ${TEST_SOURCES})
     get_filename_component(TEST_NAME ${TEST_SOURCE} NAME_WE)
     add_executable(${TEST_NAME} ${TEST_SOURCE})
-    target_include_directories(${TEST_NAME} PRIVATE ${CMAKE_SOURCE_DIR}/include)
+    target_include_directories(${TEST_NAME} PRIVATE ${CMAKE_SOURCE_DIR}/include ${CMAKE_SOURCE_DIR}/tests)
+    add_dependencies(${TEST_NAME} copy_test_data) # Ensure test data is copied before running tests
     if(CMAKE_CUDA_COMPILER)
         target_link_libraries(${TEST_NAME} PRIVATE parallelproj CUDA::cudart)
     else()
diff --git a/tests/expected_fwd_vals.txt b/tests/expected_fwd_vals.txt
new file mode 100644
index 0000000..29456bf
--- /dev/null
+++ b/tests/expected_fwd_vals.txt
@@ -0,0 +1,10 @@
+45
+45
+54
+49.5
+20
+56
+153
+180
+116
+180
diff --git a/tests/file_utils.h b/tests/file_utils.h
new file mode 100644
index 0000000..e258e6f
--- /dev/null
+++ b/tests/file_utils.h
@@ -0,0 +1,14 @@
+#pragma once
+
+#include <string>
+#include <vector>
+
+// Write `size` elements of `array` to the text file `filename`, one value per line (prints to stderr and returns if the file cannot be opened)
+template <typename T>
+void saveArrayToFile(const std::string &filename, const T *array, size_t size);
+
+// Read whitespace-separated values of type T from the text file `filename`; returns an empty vector (after printing to stderr) if it cannot be opened
+template <typename T>
+std::vector<T> readArrayFromFile(const std::string &filename);
+
+#include "file_utils.tpp" // Include the template implementation
diff --git a/tests/file_utils.tpp b/tests/file_utils.tpp
new file mode 100644
index 0000000..26f9cee
--- /dev/null
+++ b/tests/file_utils.tpp
@@ -0,0 +1,40 @@
+#include <fstream>
+#include <iostream>
+#include <vector>
+
+// Write `size` elements of `array` to the text file `filename`, one value per line, using the stream's default formatting
+template <typename T>
+void saveArrayToFile(const std::string &filename, const T *array, size_t size)
+{
+    std::ofstream file(filename);
+    if (!file.is_open())
+    {
+        std::cerr << "Error: Could not open file " << filename << " for writing.\n";
+        return; // failure is reported on stderr only; the caller gets no error indication
+    }
+    for (size_t i = 0; i < size; ++i)
+    {
+        file << array[i] << "\n";
+    }
+    file.close();
+}
+
+// Read values of type T from the text file `filename` with repeated `file >> value`, stopping at the first failed extraction
+template <typename T>
+std::vector<T> readArrayFromFile(const std::string &filename)
+{
+    std::ifstream file(filename);
+    if (!file.is_open())
+    {
+        std::cerr << "Error: Could not open file " << filename << " for reading.\n";
+        return {}; // open failure is reported on stderr only; the caller sees an empty vector
+    }
+    std::vector<T> array;
+    T value;
+    while (file >> value) // formatted extraction skips leading whitespace, so any whitespace separates values
+    {
+        array.push_back(value);
+    }
+    file.close();
+    return array;
+}
diff --git a/tests/img.txt b/tests/img.txt
new file mode 100644
index 0000000..7ed2844
--- /dev/null
+++ b/tests/img.txt
@@ -0,0 +1,24 @@
+1
+2
+3
+4
+5
+6
+7
+8
+9
+10
+11
+12
+13
+14
+15
+16
+17
+18
+19
+20
+21
+22
+23
+24
diff --git a/tests/test_nontof_fwd_back.cpp b/tests/test_nontof_fwd_back.cpp
index aac1d7f..9e07908 100644
--- a/tests/test_nontof_fwd_back.cpp
+++ b/tests/test_nontof_fwd_back.cpp
@@ -1,4 +1,5 @@
 #include "parallelproj.h"
+#include "file_utils.h"
 #include <iostream>
 #include <chrono>
 #include <cmath>
diff --git a/tests/test_nontof_fwd_back.cu b/tests/test_nontof_fwd_back.cu
index b6fa349..ec8bf2e 100644
--- a/tests/test_nontof_fwd_back.cu
+++ b/tests/test_nontof_fwd_back.cu
@@ -1,8 +1,10 @@
 #include "parallelproj.h"
+#include "file_utils.h"
 #include <iostream>
 #include <cuda_runtime.h>
 #include <chrono>
 #include <cmath>
+#include <cstring>
 
 int main()
 {
@@ -39,53 +41,25 @@ int main()
         img_origin[i] = (-(float)img_dim[i] / 2 + 0.5) * voxsize[i];
     }
 
+    // read the image from file and convert into a cuda managed array
+    std::vector<float> img_from_file = readArrayFromFile<float>("img.txt");
     float *img;
     cudaMallocManaged(&img, (img_dim[0] * img_dim[1] * img_dim[2]) * sizeof(float));
-
-    // fill the test image
-    for (int i0 = 0; i0 < img_dim[0]; i0++)
-    {
-        for (int i1 = 0; i1 < img_dim[1]; i1++)
-        {
-            for (int i2 = 0; i2 < img_dim[2]; i2++)
-            {
-                img[img_dim[1] * img_dim[2] * i0 + img_dim[2] * i1 + i2] = float(img_dim[1] * img_dim[2] * i0 + img_dim[2] * i1 + i2 + 1);
-                printf("%.1f ", img[img_dim[1] * img_dim[2] * i0 + img_dim[2] * i1 + i2]);
-            }
-            printf("\n");
-        }
-        printf("\n");
-    }
-
-    float id0 = static_cast<float>(img_dim[0]);
-    float id1 = static_cast<float>(img_dim[1]);
-    float id2 = static_cast<float>(img_dim[2]);
-
-    float vstart[] = {
-        0, -1, 0,             // 0
-        0, -1, 0,             // 1
-        0, -1, 1,             // 2
-        0, -1, 0.5,           // 3
-        0, 0, -1,             // 4
-        -1, 0, 0,             // 5
-        id0 - 1, -1, 0,       // 6 - (shifted 1)
-        id0 - 1, -1, id2 - 1, // 7 - (shifted 6)
-        id0 - 1, 0, -1,       // 8 - (shifted 4)
-        id0 - 1, id1 - 1, -1, // 9 - (shifted 8)
-    };
-
-    float vend[] = {
-        0, id1, 0,             // 0
-        0, id1, 0,             // 1
-        0, id1, 1,             // 2
-        0, id1, 0.5,           // 3
-        0, 0, id2,             // 4
-        id0, 0, 0,             // 5
-        id0 - 1, id1, 0,       // 6 - (shifted 1)
-        id0 - 1, id1, id2 - 1, // 7 - (shifted 6)
-        id0 - 1, 0, id2,       // 8 - (shifted 4)
-        id0 - 1, id1 - 1, id2, // 9 - (shifted 8)
-    };
+    memcpy(img, img_from_file.data(), img_from_file.size() * sizeof(float));
+
+    // read the ray start coordinates in voxel coordinates
+    // read vstart from vstart.txt and convert into a cuda managed array
+    std::vector<float> vstart_from_file = readArrayFromFile<float>("vstart.txt");
+    float *vstart;
+    cudaMallocManaged(&vstart, (3 * nlors) * sizeof(float));
+    memcpy(vstart, vstart_from_file.data(), vstart_from_file.size() * sizeof(float));
+
+    // read the ray end coordinates in voxel coordinates
+    // read vend from vend.txt and convert into a cuda managed array
+    std::vector<float> vend_from_file = readArrayFromFile<float>("vend.txt");
+    float *vend;
+    cudaMallocManaged(&vend, (3 * nlors) * sizeof(float));
+    memcpy(vend, vend_from_file.data(), vend_from_file.size() * sizeof(float));
 
     for (int ir = 0; ir < nlors; ir++)
     {
@@ -95,20 +69,20 @@ int main()
     }
 
     // calculate the start and end coordinates in world coordinates
-
     float *xstart;
     cudaMallocManaged(&xstart, (3 * nlors) * sizeof(float));
     float *xend;
     cudaMallocManaged(&xend, (3 * nlors) * sizeof(float));
 
     for (int ir = 0; ir < nlors; ir++)
-
     {
-        for (int j = 0; j < 3; j++)
-        {
-            xstart[ir * 3 + j] = img_origin[j] + vstart[ir * 3 + j] * voxsize[j];
-            xend[ir * 3 + j] = img_origin[j] + vend[ir * 3 + j] * voxsize[j];
-        }
+        xstart[ir * 3 + 0] = img_origin[0] + vstart[ir * 3 + 0] * voxsize[0];
+        xstart[ir * 3 + 1] = img_origin[1] + vstart[ir * 3 + 1] * voxsize[1];
+        xstart[ir * 3 + 2] = img_origin[2] + vstart[ir * 3 + 2] * voxsize[2];
+
+        xend[ir * 3 + 0] = img_origin[0] + vend[ir * 3 + 0] * voxsize[0];
+        xend[ir * 3 + 1] = img_origin[1] + vend[ir * 3 + 1] * voxsize[1];
+        xend[ir * 3 + 2] = img_origin[2] + vend[ir * 3 + 2] * voxsize[2];
     }
 
     float *img_fwd;
@@ -116,8 +90,6 @@ int main()
 
     joseph3d_fwd(xstart, xend, img, img_origin, voxsize, img_fwd, nlors, img_dim, 0, 64);
 
-    // calculate the expected values
-
     /////////////////////////////////////////////////////////////////////////////
     /////////////////////////////////////////////////////////////////////////////
     /////////////////////////////////////////////////////////////////////////////
@@ -125,64 +97,11 @@ int main()
     int retval = 0;
     float eps = 1e-7;
 
-    float *expected_fwd_vals = new float[nlors];
-    // initialize expected_fwd_vals with 0s
-    for (int ir = 0; ir < nlors; ir++)
-    {
-        expected_fwd_vals[ir] = 0;
-    }
-
-    for (int i1 = 0; i1 < img_dim[1]; i1++)
-    {
-        expected_fwd_vals[0] += img[0 * img_dim[1] * img_dim[2] + i1 * img_dim[2] + 0] * voxsize[1];
-    }
-
-    expected_fwd_vals[1] = expected_fwd_vals[0];
-
-    // calculate the expected value of ray2 from [0,-1,1] to [0,last+1,1]
-    for (int i1 = 0; i1 < img_dim[1]; i1++)
-    {
-        expected_fwd_vals[2] += img[0 * img_dim[1] * img_dim[2] + i1 * img_dim[2] + 1] * voxsize[1];
-    }
-
-    // calculate the expected value of ray3 from [0,-1,0.5] to [0,last+1,0.5]
-    expected_fwd_vals[3] = 0.5 * (expected_fwd_vals[0] + expected_fwd_vals[2]);
-
-    // calculate the expected value of ray4 from [0,0,-1] to [0,0,last+1]
-    for (int i2 = 0; i2 < img_dim[2]; i2++)
-    {
-        expected_fwd_vals[4] += img[0 * img_dim[1] * img_dim[2] + 0 * img_dim[2] + i2] * voxsize[2];
-    }
-
-    // calculate the expected value of ray5 from [-1,0,0] to [last+1,0,0]
-    for (int i0 = 0; i0 < img_dim[0]; i0++)
-    {
-        expected_fwd_vals[5] += img[i0 * img_dim[1] * img_dim[2] + 0 * img_dim[2] + 0] * voxsize[0];
-    }
-
-    // calculate the expected value of rays6 from [img_dim[0]-1,-1,0] to [img_dim[0]-1,last+1,0]
-    for (int i1 = 0; i1 < img_dim[1]; i1++)
-    {
-        expected_fwd_vals[6] += img[(img_dim[0] - 1) * img_dim[1] * img_dim[2] + i1 * img_dim[2] + 0] * voxsize[1];
-    }
-
-    // calculate the expected value of rays7 from [img_dim[0]-1,-1,img_dim[2]-1] to [img_dim[0]-1,last+1,img_dim[2]-1]
-    for (int i1 = 0; i1 < img_dim[1]; i1++)
-    {
-        expected_fwd_vals[7] += img[(img_dim[0] - 1) * img_dim[1] * img_dim[2] + i1 * img_dim[2] + (img_dim[2] - 1)] * voxsize[1];
-    }
-
-    // calculate the expected value of ray4 from [img_dim[0]-1,0,-1] to [img_dim[0]-1,0,last+1]
-    for (int i2 = 0; i2 < img_dim[2]; i2++)
-    {
-        expected_fwd_vals[8] += img[(img_dim[0] - 1) * img_dim[1] * img_dim[2] + 0 * img_dim[2] + i2] * voxsize[2];
-    }
-
-    // calculate the expected value of ray4 from [img_dim[0]-1,0,-1] to [img_dim[0]-1,0,last+1]
-    for (int i2 = 0; i2 < img_dim[2]; i2++)
-    {
-        expected_fwd_vals[9] += img[(img_dim[0] - 1) * img_dim[1] * img_dim[2] + (img_dim[1] - 1) * img_dim[2] + i2] * voxsize[2];
-    }
+    // read the expected_fwd_vals from expected_fwd_vals.txt and convert into a cuda managed array
+    std::vector<float> expected_fwd_vals_from_file = readArrayFromFile<float>("expected_fwd_vals.txt");
+    float *expected_fwd_vals;
+    cudaMallocManaged(&expected_fwd_vals, (nlors) * sizeof(float));
+    memcpy(expected_fwd_vals, expected_fwd_vals_from_file.data(), expected_fwd_vals_from_file.size() * sizeof(float));
 
     // check if we got the expected results
     float fwd_diff = 0;
@@ -281,7 +200,5 @@ int main()
     cudaFree(bimg);
     cudaFree(ones);
 
-    free(expected_fwd_vals);
-
     return retval;
 }
diff --git a/tests/vend.txt b/tests/vend.txt
new file mode 100644
index 0000000..0421680
--- /dev/null
+++ b/tests/vend.txt
@@ -0,0 +1,30 @@
+0
+3
+0
+0
+3
+0
+0
+3
+1
+0
+3
+0.5
+0
+0
+4
+2
+0
+0
+1
+3
+0
+1
+3
+3
+1
+0
+4
+1
+2
+4
diff --git a/tests/vstart.txt b/tests/vstart.txt
new file mode 100644
index 0000000..5965472
--- /dev/null
+++ b/tests/vstart.txt
@@ -0,0 +1,30 @@
+0
+-1
+0
+0
+-1
+0
+0
+-1
+1
+0
+-1
+0.5
+0
+0
+-1
+-1
+0
+0
+1
+-1
+0
+1
+-1
+3
+1
+0
+-1
+1
+2
+-1

From b27d29b05427f7aee54ba355ef82b512e61f9a5d Mon Sep 17 00:00:00 2001
From: Georg Schramm <georg.schramm@kuleuven.be>
Date: Sat, 12 Apr 2025 16:08:34 +0200
Subject: [PATCH 21/35] wip tests

---
 tests/test_nontof_fwd_back.cpp | 193 +++++++--------------------------
 tests/test_nontof_fwd_back.cu  |   4 +-
 2 files changed, 40 insertions(+), 157 deletions(-)

diff --git a/tests/test_nontof_fwd_back.cpp b/tests/test_nontof_fwd_back.cpp
index 9e07908..e88ac5b 100644
--- a/tests/test_nontof_fwd_back.cpp
+++ b/tests/test_nontof_fwd_back.cpp
@@ -3,12 +3,10 @@
 #include <iostream>
 #include <chrono>
 #include <cmath>
+#include <vector>
 
 int main()
 {
-    const size_t repetitions = 5;
-    size_t nlors = 10;
-
     ////////////////////////////////////////////////////////
     // OpenMP managed memory use case
     ////////////////////////////////////////////////////////
@@ -16,7 +14,6 @@ int main()
     std::cout << "OpenMP use case\n";
 
     int img_dim[3] = {2, 3, 4};
-
     float voxsize[3] = {4, 3, 2};
 
     float img_origin[3];
@@ -25,148 +22,53 @@ int main()
         img_origin[i] = (-(float)img_dim[i] / 2 + 0.5) * voxsize[i];
     }
 
-    float *img = new float[img_dim[0] * img_dim[1] * img_dim[2]];
+    // Read the image from file
+    std::vector<float> img_from_file = readArrayFromFile<float>("img.txt");
 
-    // fill the test image
-    for (int i0 = 0; i0 < img_dim[0]; i0++)
-    {
-        for (int i1 = 0; i1 < img_dim[1]; i1++)
-        {
-            for (int i2 = 0; i2 < img_dim[2]; i2++)
-            {
-                img[img_dim[1] * img_dim[2] * i0 + img_dim[2] * i1 + i2] = float(img_dim[1] * img_dim[2] * i0 + img_dim[2] * i1 + i2 + 1);
-                printf("%.1f ", img[img_dim[1] * img_dim[2] * i0 + img_dim[2] * i1 + i2]);
-            }
-            printf("\n");
-        }
-        printf("\n");
-    }
+    // Read the ray start coordinates from file
+    std::vector<float> vstart_from_file = readArrayFromFile<float>("vstart.txt");
 
-    float id0 = static_cast<float>(img_dim[0]);
-    float id1 = static_cast<float>(img_dim[1]);
-    float id2 = static_cast<float>(img_dim[2]);
+    // Read the ray end coordinates from file
+    std::vector<float> vend_from_file = readArrayFromFile<float>("vend.txt");
 
-    float vstart[] = {
-        0, -1, 0,             // 0
-        0, -1, 0,             // 1
-        0, -1, 1,             // 2
-        0, -1, 0.5,           // 3
-        0, 0, -1,             // 4
-        -1, 0, 0,             // 5
-        id0 - 1, -1, 0,       // 6 - (shifted 1)
-        id0 - 1, -1, id2 - 1, // 7 - (shifted 6)
-        id0 - 1, 0, -1,       // 8 - (shifted 4)
-        id0 - 1, id1 - 1, -1, // 9 - (shifted 8)
-    };
+    size_t nlors = vstart_from_file.size() / 3;
 
-    float vend[] = {
-        0, id1, 0,             // 0
-        0, id1, 0,             // 1
-        0, id1, 1,             // 2
-        0, id1, 0.5,           // 3
-        0, 0, id2,             // 4
-        id0, 0, 0,             // 5
-        id0 - 1, id1, 0,       // 6 - (shifted 1)
-        id0 - 1, id1, id2 - 1, // 7 - (shifted 6)
-        id0 - 1, 0, id2,       // 8 - (shifted 4)
-        id0 - 1, id1 - 1, id2, // 9 - (shifted 8)
-    };
+    // Calculate the start and end coordinates in world coordinates
+    std::vector<float> xstart(3 * nlors);
+    std::vector<float> xend(3 * nlors);
 
     for (int ir = 0; ir < nlors; ir++)
     {
-        printf("test ray %d\n", ir);
-        printf("start voxel num .: %.1f %.1f %.1f\n", vstart[ir * 3 + 0], vstart[ir * 3 + 1], vstart[ir * 3 + 2]);
-        printf("end   voxel num .: %.1f %.1f %.1f\n", vend[ir * 3 + 0], vend[ir * 3 + 1], vend[ir * 3 + 2]);
-    }
-
-    // calculate the start and end coordinates in world coordinates
-
-    float *xstart = new float[3 * nlors];
-    float *xend = new float[3 * nlors];
+        xstart[ir * 3 + 0] = img_origin[0] + vstart_from_file[ir * 3 + 0] * voxsize[0];
+        xstart[ir * 3 + 1] = img_origin[1] + vstart_from_file[ir * 3 + 1] * voxsize[1];
+        xstart[ir * 3 + 2] = img_origin[2] + vstart_from_file[ir * 3 + 2] * voxsize[2];
 
-    for (int ir = 0; ir < nlors; ir++)
-
-    {
-        for (int j = 0; j < 3; j++)
-        {
-            xstart[ir * 3 + j] = img_origin[j] + vstart[ir * 3 + j] * voxsize[j];
-            xend[ir * 3 + j] = img_origin[j] + vend[ir * 3 + j] * voxsize[j];
-        }
+        xend[ir * 3 + 0] = img_origin[0] + vend_from_file[ir * 3 + 0] * voxsize[0];
+        xend[ir * 3 + 1] = img_origin[1] + vend_from_file[ir * 3 + 1] * voxsize[1];
+        xend[ir * 3 + 2] = img_origin[2] + vend_from_file[ir * 3 + 2] * voxsize[2];
     }
 
-    float *img_fwd = new float[nlors];
-    joseph3d_fwd(xstart, xend, img, img_origin, voxsize, img_fwd, nlors, img_dim, 0, 64);
+    // Allocate memory for forward projection results
+    std::vector<float> img_fwd(nlors);
+
+    // Perform forward projection
+    joseph3d_fwd(
+        xstart.data(), xend.data(), img_from_file.data(),
+        img_origin, voxsize, img_fwd.data(),
+        nlors, img_dim, 0, 64);
 
     /////////////////////////////////////////////////////////////////////////////
     /////////////////////////////////////////////////////////////////////////////
     /////////////////////////////////////////////////////////////////////////////
 
-    // calculate the expected values
+    // Read the expected forward values from file
+    std::vector<float> expected_fwd_vals = readArrayFromFile<float>("expected_fwd_vals.txt");
 
-    int retval = 0;
+    // Check if we got the expected results
+    float fwd_diff = 0;
     float eps = 1e-7;
+    int retval = 0;
 
-    float *expected_fwd_vals = new float[nlors];
-    // initialize expected_fwd_vals with 0s
-    for (int ir = 0; ir < nlors; ir++)
-    {
-        expected_fwd_vals[ir] = 0;
-    }
-
-    for (int i1 = 0; i1 < img_dim[1]; i1++)
-    {
-        expected_fwd_vals[0] += img[0 * img_dim[1] * img_dim[2] + i1 * img_dim[2] + 0] * voxsize[1];
-    }
-
-    expected_fwd_vals[1] = expected_fwd_vals[0];
-
-    // calculate the expected value of ray2 from [0,-1,1] to [0,last+1,1]
-    for (int i1 = 0; i1 < img_dim[1]; i1++)
-    {
-        expected_fwd_vals[2] += img[0 * img_dim[1] * img_dim[2] + i1 * img_dim[2] + 1] * voxsize[1];
-    }
-
-    // calculate the expected value of ray3 from [0,-1,0.5] to [0,last+1,0.5]
-    expected_fwd_vals[3] = 0.5 * (expected_fwd_vals[0] + expected_fwd_vals[2]);
-
-    // calculate the expected value of ray4 from [0,0,-1] to [0,0,last+1]
-    for (int i2 = 0; i2 < img_dim[2]; i2++)
-    {
-        expected_fwd_vals[4] += img[0 * img_dim[1] * img_dim[2] + 0 * img_dim[2] + i2] * voxsize[2];
-    }
-
-    // calculate the expected value of ray5 from [-1,0,0] to [last+1,0,0]
-    for (int i0 = 0; i0 < img_dim[0]; i0++)
-    {
-        expected_fwd_vals[5] += img[i0 * img_dim[1] * img_dim[2] + 0 * img_dim[2] + 0] * voxsize[0];
-    }
-
-    // calculate the expected value of rays6 from [img_dim[0]-1,-1,0] to [img_dim[0]-1,last+1,0]
-    for (int i1 = 0; i1 < img_dim[1]; i1++)
-    {
-        expected_fwd_vals[6] += img[(img_dim[0] - 1) * img_dim[1] * img_dim[2] + i1 * img_dim[2] + 0] * voxsize[1];
-    }
-
-    // calculate the expected value of rays7 from [img_dim[0]-1,-1,img_dim[2]-1] to [img_dim[0]-1,last+1,img_dim[2]-1]
-    for (int i1 = 0; i1 < img_dim[1]; i1++)
-    {
-        expected_fwd_vals[7] += img[(img_dim[0] - 1) * img_dim[1] * img_dim[2] + i1 * img_dim[2] + (img_dim[2] - 1)] * voxsize[1];
-    }
-
-    // calculate the expected value of ray4 from [img_dim[0]-1,0,-1] to [img_dim[0]-1,0,last+1]
-    for (int i2 = 0; i2 < img_dim[2]; i2++)
-    {
-        expected_fwd_vals[8] += img[(img_dim[0] - 1) * img_dim[1] * img_dim[2] + 0 * img_dim[2] + i2] * voxsize[2];
-    }
-
-    // calculate the expected value of ray4 from [img_dim[0]-1,0,-1] to [img_dim[0]-1,0,last+1]
-    for (int i2 = 0; i2 < img_dim[2]; i2++)
-    {
-        expected_fwd_vals[9] += img[(img_dim[0] - 1) * img_dim[1] * img_dim[2] + (img_dim[1] - 1) * img_dim[2] + i2] * voxsize[2];
-    }
-
-    // check if we got the expected results
-    float fwd_diff = 0;
     printf("\nforward projection test\n");
     for (int ir = 0; ir < nlors; ir++)
     {
@@ -186,21 +88,14 @@ int main()
     ////////////////////////////////////////////////////////////////////////////
     ////////////////////////////////////////////////////////////////////////////
 
-    // test the back projection
-
-    float *bimg = new float[img_dim[0] * img_dim[1] * img_dim[2]];
-    for (size_t i = 0; i < (img_dim[0] * img_dim[1] * img_dim[2]); i++)
-    {
-        bimg[i] = 0;
-    }
-
-    float *ones = new float[nlors];
-    for (size_t i = 0; i < nlors; i++)
-    {
-        ones[i] = 1;
-    }
+    // Test the back projection
+    std::vector<float> bimg(img_dim[0] * img_dim[1] * img_dim[2], 0.0f);
+    std::vector<float> ones(nlors, 1.0f);
 
-    joseph3d_back(xstart, xend, bimg, img_origin, voxsize, ones, nlors, img_dim);
+    joseph3d_back(
+        xstart.data(), xend.data(), bimg.data(),
+        img_origin, voxsize, ones.data(),
+        nlors, img_dim);
 
     printf("\nback projection of ones along all rays:\n");
     for (size_t i0 = 0; i0 < img_dim[0]; i0++)
@@ -225,7 +120,7 @@ int main()
 
     for (size_t i = 0; i < (img_dim[0] * img_dim[1] * img_dim[2]); i++)
     {
-        inner_product1 += (img[i] * bimg[i]);
+        inner_product1 += (img_from_file[i] * bimg[i]);
     }
 
     for (size_t ir = 0; ir < nlors; ir++)
@@ -244,17 +139,5 @@ int main()
         retval = 1;
     }
 
-    ////////////////////////////////////////////////////////////////////////////
-    ////////////////////////////////////////////////////////////////////////////
-    ////////////////////////////////////////////////////////////////////////////
-    free(img);
-    free(xstart);
-    free(xend);
-    free(img_fwd);
-    free(expected_fwd_vals);
-
-    free(bimg);
-    free(ones);
-
     return retval;
 }
diff --git a/tests/test_nontof_fwd_back.cu b/tests/test_nontof_fwd_back.cu
index ec8bf2e..f944204 100644
--- a/tests/test_nontof_fwd_back.cu
+++ b/tests/test_nontof_fwd_back.cu
@@ -8,8 +8,6 @@
 
 int main()
 {
-    size_t nlors = 10;
-
     // get the number of cuda devices - because we want to run on the last device
     int device_count;
     cudaGetDeviceCount(&device_count);
@@ -50,6 +48,8 @@ int main()
     // read the ray start coordinates in voxel coordinates
     // reast vstart from vstart.txt and convert into a cuda managed array
     std::vector<float> vstart_from_file = readArrayFromFile<float>("vstart.txt");
+    size_t nlors = vstart_from_file.size() / 3;
+
     float *vstart;
     cudaMallocManaged(&vstart, (3 * nlors) * sizeof(float));
     memcpy(vstart, vstart_from_file.data(), vstart_from_file.size() * sizeof(float));

From facf7be219f2bcbbfa9425dbcfe8f897ec14e41d Mon Sep 17 00:00:00 2001
From: Georg Schramm <georg.schramm@kuleuven.be>
Date: Sat, 12 Apr 2025 16:11:03 +0200
Subject: [PATCH 22/35] free all cuda managed arrays

---
 tests/test_nontof_fwd_back.cu | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tests/test_nontof_fwd_back.cu b/tests/test_nontof_fwd_back.cu
index f944204..bec2107 100644
--- a/tests/test_nontof_fwd_back.cu
+++ b/tests/test_nontof_fwd_back.cu
@@ -193,10 +193,12 @@ int main()
     cudaFree(voxsize);
     cudaFree(img_origin);
     cudaFree(img);
+    cudaFree(vstart);
+    cudaFree(vend);
     cudaFree(xstart);
     cudaFree(xend);
     cudaFree(img_fwd);
-
+    cudaFree(expected_fwd_vals);
     cudaFree(bimg);
     cudaFree(ones);
 

From 99f0d5ac246ffd31211ec2a4d0d24cfb54e5779b Mon Sep 17 00:00:00 2001
From: Georg Schramm <georg.schramm@kuleuven.be>
Date: Sat, 12 Apr 2025 17:09:07 +0200
Subject: [PATCH 23/35] simplify example

---
 tests/test_nontof_fwd_back.cpp | 53 ++++++++++++++--------------------
 1 file changed, 22 insertions(+), 31 deletions(-)

diff --git a/tests/test_nontof_fwd_back.cpp b/tests/test_nontof_fwd_back.cpp
index e88ac5b..df6d5df 100644
--- a/tests/test_nontof_fwd_back.cpp
+++ b/tests/test_nontof_fwd_back.cpp
@@ -4,6 +4,7 @@
 #include <chrono>
 #include <cmath>
 #include <vector>
+#include <numeric>
 
 int main()
 {
@@ -13,25 +14,25 @@ int main()
 
     std::cout << "OpenMP use case\n";
 
-    int img_dim[3] = {2, 3, 4};
-    float voxsize[3] = {4, 3, 2};
+    std::vector<int> img_dim = {2, 3, 4};
+    std::vector<float> voxsize = {4.0f, 3.0f, 2.0f};
 
-    float img_origin[3];
+    std::vector<float> img_origin(3);
     for (int i = 0; i < 3; ++i)
     {
-        img_origin[i] = (-(float)img_dim[i] / 2 + 0.5) * voxsize[i];
+        img_origin[i] = (-(float)img_dim[i] / 2 + 0.5f) * voxsize[i];
     }
 
     // Read the image from file
-    std::vector<float> img_from_file = readArrayFromFile<float>("img.txt");
+    std::vector<float> img = readArrayFromFile<float>("img.txt");
 
     // Read the ray start coordinates from file
-    std::vector<float> vstart_from_file = readArrayFromFile<float>("vstart.txt");
+    std::vector<float> vstart = readArrayFromFile<float>("vstart.txt");
 
     // Read the ray end coordinates from file
-    std::vector<float> vend_from_file = readArrayFromFile<float>("vend.txt");
+    std::vector<float> vend = readArrayFromFile<float>("vend.txt");
 
-    size_t nlors = vstart_from_file.size() / 3;
+    size_t nlors = vstart.size() / 3;
 
     // Calculate the start and end coordinates in world coordinates
     std::vector<float> xstart(3 * nlors);
@@ -39,13 +40,13 @@ int main()
 
     for (int ir = 0; ir < nlors; ir++)
     {
-        xstart[ir * 3 + 0] = img_origin[0] + vstart_from_file[ir * 3 + 0] * voxsize[0];
-        xstart[ir * 3 + 1] = img_origin[1] + vstart_from_file[ir * 3 + 1] * voxsize[1];
-        xstart[ir * 3 + 2] = img_origin[2] + vstart_from_file[ir * 3 + 2] * voxsize[2];
+        xstart[ir * 3 + 0] = img_origin[0] + vstart[ir * 3 + 0] * voxsize[0];
+        xstart[ir * 3 + 1] = img_origin[1] + vstart[ir * 3 + 1] * voxsize[1];
+        xstart[ir * 3 + 2] = img_origin[2] + vstart[ir * 3 + 2] * voxsize[2];
 
-        xend[ir * 3 + 0] = img_origin[0] + vend_from_file[ir * 3 + 0] * voxsize[0];
-        xend[ir * 3 + 1] = img_origin[1] + vend_from_file[ir * 3 + 1] * voxsize[1];
-        xend[ir * 3 + 2] = img_origin[2] + vend_from_file[ir * 3 + 2] * voxsize[2];
+        xend[ir * 3 + 0] = img_origin[0] + vend[ir * 3 + 0] * voxsize[0];
+        xend[ir * 3 + 1] = img_origin[1] + vend[ir * 3 + 1] * voxsize[1];
+        xend[ir * 3 + 2] = img_origin[2] + vend[ir * 3 + 2] * voxsize[2];
     }
 
     // Allocate memory for forward projection results
@@ -53,9 +54,9 @@ int main()
 
     // Perform forward projection
     joseph3d_fwd(
-        xstart.data(), xend.data(), img_from_file.data(),
-        img_origin, voxsize, img_fwd.data(),
-        nlors, img_dim, 0, 64);
+        xstart.data(), xend.data(), img.data(),
+        img_origin.data(), voxsize.data(), img_fwd.data(),
+        nlors, img_dim.data(), 0, 64);
 
     /////////////////////////////////////////////////////////////////////////////
     /////////////////////////////////////////////////////////////////////////////
@@ -94,8 +95,8 @@ int main()
 
     joseph3d_back(
         xstart.data(), xend.data(), bimg.data(),
-        img_origin, voxsize, ones.data(),
-        nlors, img_dim);
+        img_origin.data(), voxsize.data(), ones.data(),
+        nlors, img_dim.data());
 
     printf("\nback projection of ones along all rays:\n");
     for (size_t i0 = 0; i0 < img_dim[0]; i0++)
@@ -115,18 +116,8 @@ int main()
     // of the forward projector. This is more practical than checking a lot of single voxels in the
     // back projected image.
 
-    float inner_product1 = 0;
-    float inner_product2 = 0;
-
-    for (size_t i = 0; i < (img_dim[0] * img_dim[1] * img_dim[2]); i++)
-    {
-        inner_product1 += (img_from_file[i] * bimg[i]);
-    }
-
-    for (size_t ir = 0; ir < nlors; ir++)
-    {
-        inner_product2 += (img_fwd[ir] * ones[ir]);
-    }
+    float inner_product1 = std::inner_product(img.begin(), img.end(), bimg.begin(), 0.0f);
+    float inner_product2 = std::inner_product(img_fwd.begin(), img_fwd.end(), ones.begin(), 0.0f);
 
     float ip_diff = fabs(inner_product1 - inner_product2);
 

From 7bb90bd83f0178f0b25f5a44132fcf13d750b9cd Mon Sep 17 00:00:00 2001
From: Georg Schramm <georg.schramm@kuleuven.be>
Date: Sat, 12 Apr 2025 17:20:31 +0200
Subject: [PATCH 24/35] copy results back to host and simplify cuda test (host
 case so far)

---
 src/joseph3d_back.cu          |   4 +
 src/joseph3d_fwd.cu           |   3 +
 tests/test_nontof_fwd_back.cu | 152 +++++++++-------------------------
 3 files changed, 47 insertions(+), 112 deletions(-)

diff --git a/src/joseph3d_back.cu b/src/joseph3d_back.cu
index c090d5d..378d387 100644
--- a/src/joseph3d_back.cu
+++ b/src/joseph3d_back.cu
@@ -115,7 +115,11 @@ void joseph3d_back(const float *xstart,
     if (free_xend)
         cudaFree(d_xend);
     if (free_img)
+    {
+        // Copy the result back to the host
+        cudaMemcpy(const_cast<float *>(img), d_img, sizeof(float) * img_dim[0] * img_dim[1] * img_dim[2], cudaMemcpyDeviceToHost);
         cudaFree(d_img);
+    }
     if (free_img_origin)
         cudaFree(d_img_origin);
     if (free_voxsize)
diff --git a/src/joseph3d_fwd.cu b/src/joseph3d_fwd.cu
index 1093983..a5444c4 100644
--- a/src/joseph3d_fwd.cu
+++ b/src/joseph3d_fwd.cu
@@ -121,7 +121,10 @@ void joseph3d_fwd(const float *xstart,
     if (free_voxsize)
         cudaFree(d_voxsize);
     if (free_p)
+    {
+        cudaMemcpy(const_cast<float *>(p), d_p, sizeof(float) * nlors, cudaMemcpyDeviceToHost);
         cudaFree(d_p);
+    }
     if (free_img_dim)
         cudaFree(d_img_dim);
 }
diff --git a/tests/test_nontof_fwd_back.cu b/tests/test_nontof_fwd_back.cu
index bec2107..2c309e7 100644
--- a/tests/test_nontof_fwd_back.cu
+++ b/tests/test_nontof_fwd_back.cu
@@ -1,78 +1,42 @@
 #include "parallelproj.h"
 #include "file_utils.h"
 #include <iostream>
-#include <cuda_runtime.h>
 #include <chrono>
 #include <cmath>
-#include <cstring>
+#include <vector>
+#include <numeric>
 
 int main()
 {
-    // get the number of cuda devices - because we want to run on the last device
-    int device_count;
-    cudaGetDeviceCount(&device_count);
-
     ////////////////////////////////////////////////////////
-    // CUDA memory managed use case
+    // Host array use case
     ////////////////////////////////////////////////////////
 
-    std::cout << "CUDA managed memory use case\n";
-
-    cudaSetDevice(device_count - 1);
+    std::cout << "Host array use case\n";
 
-    int *img_dim;
-    cudaMallocManaged(&img_dim, 3 * sizeof(int));
-    img_dim[0] = 2;
-    img_dim[1] = 3;
-    img_dim[2] = 4;
+    std::vector<int> img_dim = {2, 3, 4};
+    std::vector<float> voxsize = {4.0f, 3.0f, 2.0f};
 
-    float *voxsize;
-    cudaMallocManaged(&voxsize, 3 * sizeof(float));
-    voxsize[0] = 4;
-    voxsize[1] = 3;
-    voxsize[2] = 2;
-
-    float *img_origin;
-    cudaMallocManaged(&img_origin, 3 * sizeof(float));
+    std::vector<float> img_origin(3);
     for (int i = 0; i < 3; ++i)
     {
-        img_origin[i] = (-(float)img_dim[i] / 2 + 0.5) * voxsize[i];
+        img_origin[i] = (-(float)img_dim[i] / 2 + 0.5f) * voxsize[i];
     }
 
-    // read the image from file and convert into a cuda managed array
-    std::vector<float> img_from_file = readArrayFromFile<float>("img.txt");
-    float *img;
-    cudaMallocManaged(&img, (img_dim[0] * img_dim[1] * img_dim[2]) * sizeof(float));
-    memcpy(img, img_from_file.data(), img_from_file.size() * sizeof(float));
-
-    // read the ray start coordinates in voxel coordinates
-    // reast vstart from vstart.txt and convert into a cuda managed array
-    std::vector<float> vstart_from_file = readArrayFromFile<float>("vstart.txt");
-    size_t nlors = vstart_from_file.size() / 3;
+    // Read the image from file
+    std::vector<float> img = readArrayFromFile<float>("img.txt");
 
-    float *vstart;
-    cudaMallocManaged(&vstart, (3 * nlors) * sizeof(float));
-    memcpy(vstart, vstart_from_file.data(), vstart_from_file.size() * sizeof(float));
+    // Read the ray start coordinates from file
+    std::vector<float> vstart = readArrayFromFile<float>("vstart.txt");
 
-    // read the ray end coordinates in voxel coordinates
-    // read vend from vend.txt and convert into a cuda managed array
-    std::vector<float> vend_from_file = readArrayFromFile<float>("vend.txt");
-    float *vend;
-    cudaMallocManaged(&vend, (3 * nlors) * sizeof(float));
-    memcpy(vend, vend_from_file.data(), vend_from_file.size() * sizeof(float));
+    // Read the ray end coordinates from file
+    std::vector<float> vend = readArrayFromFile<float>("vend.txt");
 
-    for (int ir = 0; ir < nlors; ir++)
-    {
-        printf("test ray %d\n", ir);
-        printf("start voxel num .: %.1f %.1f %.1f\n", vstart[ir * 3 + 0], vstart[ir * 3 + 1], vstart[ir * 3 + 2]);
-        printf("end   voxel num .: %.1f %.1f %.1f\n", vend[ir * 3 + 0], vend[ir * 3 + 1], vend[ir * 3 + 2]);
-    }
+    size_t nlors = vstart.size() / 3;
 
-    // calculate the start and end coordinates in world coordinates
-    float *xstart;
-    cudaMallocManaged(&xstart, (3 * nlors) * sizeof(float));
-    float *xend;
-    cudaMallocManaged(&xend, (3 * nlors) * sizeof(float));
+    // Calculate the start and end coordinates in world coordinates
+    std::vector<float> xstart(3 * nlors);
+    std::vector<float> xend(3 * nlors);
 
     for (int ir = 0; ir < nlors; ir++)
     {
@@ -85,26 +49,27 @@ int main()
         xend[ir * 3 + 2] = img_origin[2] + vend[ir * 3 + 2] * voxsize[2];
     }
 
-    float *img_fwd;
-    cudaMallocManaged(&img_fwd, nlors * sizeof(float));
+    // Allocate memory for forward projection results
+    std::vector<float> img_fwd(nlors);
 
-    joseph3d_fwd(xstart, xend, img, img_origin, voxsize, img_fwd, nlors, img_dim, 0, 64);
+    // Perform forward projection
+    joseph3d_fwd(
+        xstart.data(), xend.data(), img.data(),
+        img_origin.data(), voxsize.data(), img_fwd.data(),
+        nlors, img_dim.data(), 0, 64);
 
     /////////////////////////////////////////////////////////////////////////////
     /////////////////////////////////////////////////////////////////////////////
     /////////////////////////////////////////////////////////////////////////////
 
-    int retval = 0;
-    float eps = 1e-7;
-
-    // read the expected_fwd_vals from expected_fwd_vals.txt and convert into a cuda managed array
-    std::vector<float> expected_fwd_vals_from_file = readArrayFromFile<float>("expected_fwd_vals.txt");
-    float *expected_fwd_vals;
-    cudaMallocManaged(&expected_fwd_vals, (nlors) * sizeof(float));
-    memcpy(expected_fwd_vals, expected_fwd_vals_from_file.data(), expected_fwd_vals_from_file.size() * sizeof(float));
+    // Read the expected forward values from file
+    std::vector<float> expected_fwd_vals = readArrayFromFile<float>("expected_fwd_vals.txt");
 
-    // check if we got the expected results
+    // Check if we got the expected results
     float fwd_diff = 0;
+    float eps = 1e-7;
+    int retval = 0;
+
     printf("\nforward projection test\n");
     for (int ir = 0; ir < nlors; ir++)
     {
@@ -124,24 +89,14 @@ int main()
     ////////////////////////////////////////////////////////////////////////////
     ////////////////////////////////////////////////////////////////////////////
 
-    // test the back projection
-
-    float *bimg;
-    cudaMallocManaged(&bimg, (img_dim[0] * img_dim[1] * img_dim[2]) * sizeof(float));
+    // Test the back projection
+    std::vector<float> bimg(img_dim[0] * img_dim[1] * img_dim[2], 0.0f);
+    std::vector<float> ones(nlors, 1.0f);
 
-    for (size_t i = 0; i < (img_dim[0] * img_dim[1] * img_dim[2]); i++)
-    {
-        bimg[i] = 0;
-    }
-
-    float *ones;
-    cudaMallocManaged(&ones, nlors * sizeof(float));
-    for (size_t i = 0; i < nlors; i++)
-    {
-        ones[i] = 1;
-    }
-
-    joseph3d_back(xstart, xend, bimg, img_origin, voxsize, ones, nlors, img_dim, 0, 64);
+    joseph3d_back(
+        xstart.data(), xend.data(), bimg.data(),
+        img_origin.data(), voxsize.data(), ones.data(),
+        nlors, img_dim.data());
 
     printf("\nback projection of ones along all rays:\n");
     for (size_t i0 = 0; i0 < img_dim[0]; i0++)
@@ -161,18 +116,8 @@ int main()
     // of the forward projector. This is more practical than checking a lot of single voxels in the
     // back projected image.
 
-    float inner_product1 = 0;
-    float inner_product2 = 0;
-
-    for (size_t i = 0; i < (img_dim[0] * img_dim[1] * img_dim[2]); i++)
-    {
-        inner_product1 += (img[i] * bimg[i]);
-    }
-
-    for (size_t ir = 0; ir < nlors; ir++)
-    {
-        inner_product2 += (img_fwd[ir] * ones[ir]);
-    }
+    float inner_product1 = std::inner_product(img.begin(), img.end(), bimg.begin(), 0.0f);
+    float inner_product2 = std::inner_product(img_fwd.begin(), img_fwd.end(), ones.begin(), 0.0f);
 
     float ip_diff = fabs(inner_product1 - inner_product2);
 
@@ -185,22 +130,5 @@ int main()
         retval = 1;
     }
 
-    ////////////////////////////////////////////////////////////////////////////
-    ////////////////////////////////////////////////////////////////////////////
-    ////////////////////////////////////////////////////////////////////////////
-
-    cudaFree(img_dim);
-    cudaFree(voxsize);
-    cudaFree(img_origin);
-    cudaFree(img);
-    cudaFree(vstart);
-    cudaFree(vend);
-    cudaFree(xstart);
-    cudaFree(xend);
-    cudaFree(img_fwd);
-    cudaFree(expected_fwd_vals);
-    cudaFree(bimg);
-    cudaFree(ones);
-
     return retval;
 }

From 23add1621786ccca26f8e4d501d523c4e0c38def Mon Sep 17 00:00:00 2001
From: Georg Schramm <georg.schramm@kuleuven.be>
Date: Sat, 12 Apr 2025 17:30:15 +0200
Subject: [PATCH 25/35] add forward test in 3 modes

---
 tests/test_nontof_fwd_back.cu | 211 ++++++++++++++++++++++++----------
 1 file changed, 151 insertions(+), 60 deletions(-)

diff --git a/tests/test_nontof_fwd_back.cu b/tests/test_nontof_fwd_back.cu
index 2c309e7..3ba2636 100644
--- a/tests/test_nontof_fwd_back.cu
+++ b/tests/test_nontof_fwd_back.cu
@@ -5,15 +5,31 @@
 #include <cmath>
 #include <vector>
 #include <numeric>
+#include <cuda_runtime.h>
+
+void test_host_arrays();
+void test_cuda_managed_arrays();
+void test_cuda_device_arrays();
 
 int main()
 {
-    ////////////////////////////////////////////////////////
-    // Host array use case
-    ////////////////////////////////////////////////////////
+    std::cout << "Testing joseph3d_fwd and joseph3d_back with different memory modes:\n";
+
+    std::cout << "\n--- Testing with Host Arrays ---\n";
+    test_host_arrays();
+
+    std::cout << "\n--- Testing with CUDA-Managed Arrays ---\n";
+    test_cuda_managed_arrays();
+
+    std::cout << "\n--- Testing with CUDA Device Arrays ---\n";
+    test_cuda_device_arrays();
 
-    std::cout << "Host array use case\n";
+    return 0;
+}
 
+void test_host_arrays()
+{
+    // Host array test (same as your current implementation)
     std::vector<int> img_dim = {2, 3, 4};
     std::vector<float> voxsize = {4.0f, 3.0f, 2.0f};
 
@@ -23,18 +39,11 @@ int main()
         img_origin[i] = (-(float)img_dim[i] / 2 + 0.5f) * voxsize[i];
     }
 
-    // Read the image from file
     std::vector<float> img = readArrayFromFile<float>("img.txt");
-
-    // Read the ray start coordinates from file
     std::vector<float> vstart = readArrayFromFile<float>("vstart.txt");
-
-    // Read the ray end coordinates from file
     std::vector<float> vend = readArrayFromFile<float>("vend.txt");
-
     size_t nlors = vstart.size() / 3;
 
-    // Calculate the start and end coordinates in world coordinates
     std::vector<float> xstart(3 * nlors);
     std::vector<float> xend(3 * nlors);
 
@@ -49,86 +58,168 @@ int main()
         xend[ir * 3 + 2] = img_origin[2] + vend[ir * 3 + 2] * voxsize[2];
     }
 
-    // Allocate memory for forward projection results
     std::vector<float> img_fwd(nlors);
-
-    // Perform forward projection
     joseph3d_fwd(
         xstart.data(), xend.data(), img.data(),
         img_origin.data(), voxsize.data(), img_fwd.data(),
         nlors, img_dim.data(), 0, 64);
 
-    /////////////////////////////////////////////////////////////////////////////
-    /////////////////////////////////////////////////////////////////////////////
-    /////////////////////////////////////////////////////////////////////////////
-
-    // Read the expected forward values from file
     std::vector<float> expected_fwd_vals = readArrayFromFile<float>("expected_fwd_vals.txt");
-
-    // Check if we got the expected results
     float fwd_diff = 0;
     float eps = 1e-7;
-    int retval = 0;
 
-    printf("\nforward projection test\n");
     for (int ir = 0; ir < nlors; ir++)
     {
-        printf("test ray %d: fwd projected: %.7e expected: %.7e\n", ir, img_fwd[ir], expected_fwd_vals[ir]);
+        fwd_diff = std::abs(img_fwd[ir] - expected_fwd_vals[ir]);
+        if (fwd_diff > eps)
+        {
+            std::cerr << "Host array test failed for ray " << ir << "\n";
+            return;
+        }
+    }
+}
+
+void test_cuda_managed_arrays()
+{
+    // CUDA-managed array test
+    int img_dim[3] = {2, 3, 4};
+    float voxsize[3] = {4.0f, 3.0f, 2.0f};
+
+    float *img_origin;
+    cudaMallocManaged(&img_origin, 3 * sizeof(float));
+    for (int i = 0; i < 3; ++i)
+    {
+        img_origin[i] = (-(float)img_dim[i] / 2 + 0.5f) * voxsize[i];
+    }
+
+    std::vector<float> img_from_file = readArrayFromFile<float>("img.txt");
+    float *img;
+    cudaMallocManaged(&img, img_from_file.size() * sizeof(float));
+    std::copy(img_from_file.begin(), img_from_file.end(), img);
+
+    std::vector<float> vstart_from_file = readArrayFromFile<float>("vstart.txt");
+    float *vstart;
+    cudaMallocManaged(&vstart, vstart_from_file.size() * sizeof(float));
+    std::copy(vstart_from_file.begin(), vstart_from_file.end(), vstart);
+
+    std::vector<float> vend_from_file = readArrayFromFile<float>("vend.txt");
+    float *vend;
+    cudaMallocManaged(&vend, vend_from_file.size() * sizeof(float));
+    std::copy(vend_from_file.begin(), vend_from_file.end(), vend);
 
+    size_t nlors = vstart_from_file.size() / 3;
+
+    float *xstart, *xend;
+    cudaMallocManaged(&xstart, 3 * nlors * sizeof(float));
+    cudaMallocManaged(&xend, 3 * nlors * sizeof(float));
+
+    for (int ir = 0; ir < nlors; ir++)
+    {
+        for (int j = 0; j < 3; j++)
+        {
+            xstart[ir * 3 + j] = img_origin[j] + vstart[ir * 3 + j] * voxsize[j];
+            xend[ir * 3 + j] = img_origin[j] + vend[ir * 3 + j] * voxsize[j];
+        }
+    }
+
+    float *img_fwd;
+    cudaMallocManaged(&img_fwd, nlors * sizeof(float));
+    joseph3d_fwd(xstart, xend, img, img_origin, voxsize, img_fwd, nlors, img_dim, 0, 64);
+
+    cudaDeviceSynchronize();
+
+    std::vector<float> expected_fwd_vals = readArrayFromFile<float>("expected_fwd_vals.txt");
+    float fwd_diff = 0;
+    float eps = 1e-7;
+
+    for (int ir = 0; ir < nlors; ir++)
+    {
         fwd_diff = std::abs(img_fwd[ir] - expected_fwd_vals[ir]);
         if (fwd_diff > eps)
         {
-            printf("\n################################################################################");
-            printf("\nabs(fwd projected - expected value) = %.2e for ray%d above tolerance %.2e", fwd_diff, ir, eps);
-            printf("\n################################################################################\n");
-            retval = 1;
+            std::cerr << "CUDA-managed array test failed for ray " << ir << "\n";
+            return;
         }
     }
 
-    ////////////////////////////////////////////////////////////////////////////
-    ////////////////////////////////////////////////////////////////////////////
-    ////////////////////////////////////////////////////////////////////////////
+    cudaFree(img_origin);
+    cudaFree(img);
+    cudaFree(vstart);
+    cudaFree(vend);
+    cudaFree(xstart);
+    cudaFree(xend);
+    cudaFree(img_fwd);
+}
 
-    // Test the back projection
-    std::vector<float> bimg(img_dim[0] * img_dim[1] * img_dim[2], 0.0f);
-    std::vector<float> ones(nlors, 1.0f);
+void test_cuda_device_arrays()
+{
+    // CUDA device array test
+    int img_dim[3] = {2, 3, 4};
+    float voxsize[3] = {4.0f, 3.0f, 2.0f};
 
-    joseph3d_back(
-        xstart.data(), xend.data(), bimg.data(),
-        img_origin.data(), voxsize.data(), ones.data(),
-        nlors, img_dim.data());
+    float img_origin[3];
+    for (int i = 0; i < 3; ++i)
+    {
+        img_origin[i] = (-(float)img_dim[i] / 2 + 0.5f) * voxsize[i];
+    }
+
+    std::vector<float> img_from_file = readArrayFromFile<float>("img.txt");
+    float *img;
+    cudaMalloc(&img, img_from_file.size() * sizeof(float));
+    cudaMemcpy(img, img_from_file.data(), img_from_file.size() * sizeof(float), cudaMemcpyHostToDevice);
+
+    std::vector<float> vstart_from_file = readArrayFromFile<float>("vstart.txt");
+    float *vstart;
+    cudaMalloc(&vstart, vstart_from_file.size() * sizeof(float));
+    cudaMemcpy(vstart, vstart_from_file.data(), vstart_from_file.size() * sizeof(float), cudaMemcpyHostToDevice);
 
-    printf("\nback projection of ones along all rays:\n");
-    for (size_t i0 = 0; i0 < img_dim[0]; i0++)
+    std::vector<float> vend_from_file = readArrayFromFile<float>("vend.txt");
+    float *vend;
+    cudaMalloc(&vend, vend_from_file.size() * sizeof(float));
+    cudaMemcpy(vend, vend_from_file.data(), vend_from_file.size() * sizeof(float), cudaMemcpyHostToDevice);
+
+    size_t nlors = vstart_from_file.size() / 3;
+
+    float *xstart, *xend;
+    cudaMalloc(&xstart, 3 * nlors * sizeof(float));
+    cudaMalloc(&xend, 3 * nlors * sizeof(float));
+
+    for (int ir = 0; ir < nlors; ir++)
     {
-        for (size_t i1 = 0; i1 < img_dim[1]; i1++)
+        for (int j = 0; j < 3; j++)
         {
-            for (size_t i2 = 0; i2 < img_dim[2]; i2++)
-            {
-                printf("%.1f ", bimg[img_dim[1] * img_dim[2] * i0 + img_dim[2] * i1 + i2]);
-            }
-            printf("\n");
+            float xstart_val = img_origin[j] + vstart_from_file[ir * 3 + j] * voxsize[j];
+            float xend_val = img_origin[j] + vend_from_file[ir * 3 + j] * voxsize[j];
+            cudaMemcpy(&xstart[ir * 3 + j], &xstart_val, sizeof(float), cudaMemcpyHostToDevice);
+            cudaMemcpy(&xend[ir * 3 + j], &xend_val, sizeof(float), cudaMemcpyHostToDevice);
         }
-        printf("\n");
     }
 
-    // To test whether the back projection is correct, we test if the back projector is the adjoint
-    // of the forward projector. This is more practical than checking a lot of single voxels in the
-    // back projected image.
+    float *img_fwd;
+    cudaMalloc(&img_fwd, nlors * sizeof(float));
+    joseph3d_fwd(xstart, xend, img, img_origin, voxsize, img_fwd, nlors, img_dim, 0, 64);
 
-    float inner_product1 = std::inner_product(img.begin(), img.end(), bimg.begin(), 0.0f);
-    float inner_product2 = std::inner_product(img_fwd.begin(), img_fwd.end(), ones.begin(), 0.0f);
+    std::vector<float> img_fwd_host(nlors);
+    cudaMemcpy(img_fwd_host.data(), img_fwd, nlors * sizeof(float), cudaMemcpyDeviceToHost);
 
-    float ip_diff = fabs(inner_product1 - inner_product2);
+    std::vector<float> expected_fwd_vals = readArrayFromFile<float>("expected_fwd_vals.txt");
+    float fwd_diff = 0;
+    float eps = 1e-7;
 
-    if (ip_diff > eps)
+    for (int ir = 0; ir < nlors; ir++)
     {
-        printf("\n#########################################################################");
-        printf("\nback projection test failed. back projection seems not to be the adjoint.");
-        printf("\n %.7e", ip_diff);
-        printf("\n#########################################################################\n");
-        retval = 1;
+        fwd_diff = std::abs(img_fwd_host[ir] - expected_fwd_vals[ir]);
+        if (fwd_diff > eps)
+        {
+            std::cerr << "CUDA device array test failed for ray " << ir << "\n";
+            return;
+        }
     }
 
-    return retval;
+    cudaFree(img);
+    cudaFree(vstart);
+    cudaFree(vend);
+    cudaFree(xstart);
+    cudaFree(xend);
+    cudaFree(img_fwd);
 }

From b252684f06d654275ede3317bda67a5ccdfda7a7 Mon Sep 17 00:00:00 2001
From: Georg Schramm <georg.schramm@kuleuven.be>
Date: Sat, 12 Apr 2025 17:44:11 +0200
Subject: [PATCH 26/35] add back projection test for all 3 modes

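---
Review notes (placed after the three-dash line, so not part of the commit):

* The cudaDeviceSynchronize() between joseph3d_fwd() and the host-side read
  of img_fwd in test_cuda_managed_arrays() is removed. That is only safe if
  joseph3d_fwd() synchronizes before returning, which is not visible in this
  patch; please confirm.
* A failed adjointness check only prints a message; the test functions still
  return void and the process exit status is unaffected.
* An early `return` from the forward check also skips the new back
  projection test and the cleanup at the end of the function.
* In test_cuda_device_arrays() the cudaMemset(ones, 0, ...) is immediately
  overwritten by the cudaMemcpy from ones_host and can be dropped.
* Typo in the new comment in test_host_arrays(): "defintion" should read
  "definition".
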
 tests/test_nontof_fwd_back.cu | 156 +++++++++++++++++++++++++++++++++-
 1 file changed, 154 insertions(+), 2 deletions(-)

diff --git a/tests/test_nontof_fwd_back.cu b/tests/test_nontof_fwd_back.cu
index 3ba2636..cf9bec3 100644
--- a/tests/test_nontof_fwd_back.cu
+++ b/tests/test_nontof_fwd_back.cu
@@ -77,6 +77,51 @@ void test_host_arrays()
             return;
         }
     }
+
+    ////////////////////////////////////////////////////////////////////////////
+    ////////////////////////////////////////////////////////////////////////////
+    ////////////////////////////////////////////////////////////////////////////
+
+    // Test the back projection using the defintion of the adjoint operator
+    std::vector<float> bimg(img_dim[0] * img_dim[1] * img_dim[2], 0.0f);
+    std::vector<float> ones(nlors, 1.0f);
+
+    joseph3d_back(
+        xstart.data(), xend.data(), bimg.data(),
+        img_origin.data(), voxsize.data(), ones.data(),
+        nlors, img_dim.data());
+
+    printf("\nback projection of ones along all rays:\n");
+    for (size_t i0 = 0; i0 < img_dim[0]; i0++)
+    {
+        for (size_t i1 = 0; i1 < img_dim[1]; i1++)
+        {
+            for (size_t i2 = 0; i2 < img_dim[2]; i2++)
+            {
+                printf("%.1f ", bimg[img_dim[1] * img_dim[2] * i0 + img_dim[2] * i1 + i2]);
+            }
+            printf("\n");
+        }
+        printf("\n");
+    }
+
+    // To test whether the back projection is correct, we test if the back projector is the adjoint
+    // of the forward projector. This is more practical than checking a lot of single voxels in the
+    // back projected image.
+
+    float inner_product1 = std::inner_product(img.begin(), img.end(), bimg.begin(), 0.0f);
+    float inner_product2 = std::inner_product(img_fwd.begin(), img_fwd.end(), ones.begin(), 0.0f);
+
+    float ip_diff = fabs(inner_product1 - inner_product2);
+
+    if (ip_diff > eps)
+    {
+        printf("\n#########################################################################");
+        printf("\nback projection test failed. back projection seems not to be the adjoint.");
+        printf("\n %.7e", ip_diff);
+        printf("\n#########################################################################\n");
+        std::cerr << "Back projection via adjointness test failed.\n";
+    }
 }
 
 void test_cuda_managed_arrays()
@@ -126,8 +171,6 @@ void test_cuda_managed_arrays()
     cudaMallocManaged(&img_fwd, nlors * sizeof(float));
     joseph3d_fwd(xstart, xend, img, img_origin, voxsize, img_fwd, nlors, img_dim, 0, 64);
 
-    cudaDeviceSynchronize();
-
     std::vector<float> expected_fwd_vals = readArrayFromFile<float>("expected_fwd_vals.txt");
     float fwd_diff = 0;
     float eps = 1e-7;
@@ -142,6 +185,56 @@ void test_cuda_managed_arrays()
         }
     }
 
+    // Test the back projection
+    float *bimg;
+    cudaMallocManaged(&bimg, img_dim[0] * img_dim[1] * img_dim[2] * sizeof(float));
+    std::fill(bimg, bimg + (img_dim[0] * img_dim[1] * img_dim[2]), 0.0f);
+
+    float *ones;
+    cudaMallocManaged(&ones, nlors * sizeof(float));
+    std::fill(ones, ones + nlors, 1.0f);
+
+    joseph3d_back(xstart, xend, bimg, img_origin, voxsize, ones, nlors, img_dim);
+
+    printf("\nCUDA-managed back projection of ones along all rays:\n");
+    for (size_t i0 = 0; i0 < img_dim[0]; i0++)
+    {
+        for (size_t i1 = 0; i1 < img_dim[1]; i1++)
+        {
+            for (size_t i2 = 0; i2 < img_dim[2]; i2++)
+            {
+                printf("%.1f ", bimg[img_dim[1] * img_dim[2] * i0 + img_dim[2] * i1 + i2]);
+            }
+            printf("\n");
+        }
+        printf("\n");
+    }
+
+    // Validate the back projection using adjointness
+    float inner_product1 = 0.0f;
+    float inner_product2 = 0.0f;
+
+    for (size_t i = 0; i < img_from_file.size(); i++)
+    {
+        inner_product1 += img[i] * bimg[i];
+    }
+
+    for (size_t ir = 0; ir < nlors; ir++)
+    {
+        inner_product2 += img_fwd[ir] * ones[ir];
+    }
+
+    float ip_diff = fabs(inner_product1 - inner_product2);
+
+    if (ip_diff > eps)
+    {
+        std::cerr << "CUDA-managed array back projection test failed: adjointness property violated.\n";
+    }
+    else
+    {
+        std::cout << "CUDA-managed array back projection test passed.\n";
+    }
+
     cudaFree(img_origin);
     cudaFree(img);
     cudaFree(vstart);
@@ -149,6 +242,8 @@ void test_cuda_managed_arrays()
     cudaFree(xstart);
     cudaFree(xend);
     cudaFree(img_fwd);
+    cudaFree(bimg);
+    cudaFree(ones);
 }
 
 void test_cuda_device_arrays()
@@ -216,10 +311,67 @@ void test_cuda_device_arrays()
         }
     }
 
+    // Test the back projection
+    float *bimg;
+    cudaMalloc(&bimg, img_dim[0] * img_dim[1] * img_dim[2] * sizeof(float));
+    cudaMemset(bimg, 0, img_dim[0] * img_dim[1] * img_dim[2] * sizeof(float));
+
+    float *ones;
+    cudaMalloc(&ones, nlors * sizeof(float));
+    cudaMemset(ones, 0, nlors * sizeof(float));
+    std::vector<float> ones_host(nlors, 1.0f);
+    cudaMemcpy(ones, ones_host.data(), nlors * sizeof(float), cudaMemcpyHostToDevice);
+
+    joseph3d_back(xstart, xend, bimg, img_origin, voxsize, ones, nlors, img_dim);
+
+    std::vector<float> bimg_host(img_dim[0] * img_dim[1] * img_dim[2]);
+    cudaMemcpy(bimg_host.data(), bimg, bimg_host.size() * sizeof(float), cudaMemcpyDeviceToHost);
+
+    printf("\nCUDA device back projection of ones along all rays:\n");
+    for (size_t i0 = 0; i0 < img_dim[0]; i0++)
+    {
+        for (size_t i1 = 0; i1 < img_dim[1]; i1++)
+        {
+            for (size_t i2 = 0; i2 < img_dim[2]; i2++)
+            {
+                printf("%.1f ", bimg_host[img_dim[1] * img_dim[2] * i0 + img_dim[2] * i1 + i2]);
+            }
+            printf("\n");
+        }
+        printf("\n");
+    }
+
+    // Validate the back projection using adjointness
+    float inner_product1 = 0.0f;
+    float inner_product2 = 0.0f;
+
+    for (size_t i = 0; i < img_from_file.size(); i++)
+    {
+        inner_product1 += img_from_file[i] * bimg_host[i];
+    }
+
+    for (size_t ir = 0; ir < nlors; ir++)
+    {
+        inner_product2 += img_fwd_host[ir] * ones_host[ir];
+    }
+
+    float ip_diff = fabs(inner_product1 - inner_product2);
+
+    if (ip_diff > eps)
+    {
+        std::cerr << "CUDA device array back projection test failed: adjointness property violated.\n";
+    }
+    else
+    {
+        std::cout << "CUDA device array back projection test passed.\n";
+    }
+
     cudaFree(img);
     cudaFree(vstart);
     cudaFree(vend);
     cudaFree(xstart);
     cudaFree(xend);
     cudaFree(img_fwd);
+    cudaFree(bimg);
+    cudaFree(ones);
 }

From 6616d9553360c5e47d9ea6d25d84c8d38e48f6ed Mon Sep 17 00:00:00 2001
From: Georg Schramm <georg.schramm@kuleuven.be>
Date: Sat, 12 Apr 2025 19:42:03 +0200
Subject: [PATCH 27/35] use single source for host array tests

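---
Review notes (placed after the three-dash line, so not part of the commit):

* tests/test_nontof_fwd_back.cpp: `retval` is removed and main() now ends
  with `return 0;`, so neither a forward nor a back projection failure
  changes the exit status any more.
* tests/test_nontof_fwd_back.cpp: `__CUDACC__` is defined by nvcc when it
  compiles CUDA sources. If this .cpp file is built by the host C++
  compiler, the `#else` branch ("OpenMP test") is taken in CUDA builds too.
  CMakeLists.txt already sets PARALLELPROJ_CUDA=1 as a PUBLIC definition on
  parallelproj, which may be the more reliable switch; confirm that the
  test target links parallelproj.
* TEST_SOURCES now lists two files that each define main(). How
  TEST_SOURCES is consumed is not visible in this patch; confirm that each
  entry becomes its own executable.
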
 CMakeLists.txt                                |   3 +-
 tests/test_nontof_fwd_back.cpp                |  31 +++---
 ...d_back.cu => test_nontof_fwd_back_cuda.cu} | 103 ------------------
 3 files changed, 20 insertions(+), 117 deletions(-)
 rename tests/{test_nontof_fwd_back.cu => test_nontof_fwd_back_cuda.cu} (68%)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7650395..93b7347 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -66,7 +66,8 @@ if(CMAKE_CUDA_COMPILER)
     target_compile_definitions(parallelproj PUBLIC PARALLELPROJ_CUDA=1) 
 
     set(TEST_SOURCES
-        tests/test_nontof_fwd_back.cu
+        tests/test_nontof_fwd_back.cpp
+        tests/test_nontof_fwd_back_cuda.cu
     )
 else()
     set(LIB_SOURCES
diff --git a/tests/test_nontof_fwd_back.cpp b/tests/test_nontof_fwd_back.cpp
index df6d5df..dad7072 100644
--- a/tests/test_nontof_fwd_back.cpp
+++ b/tests/test_nontof_fwd_back.cpp
@@ -9,10 +9,14 @@
 int main()
 {
     ////////////////////////////////////////////////////////
-    // OpenMP managed memory use case
+    // host array test cases
     ////////////////////////////////////////////////////////
 
-    std::cout << "OpenMP use case\n";
+#ifdef __CUDACC__
+    std::cout << "CUDA host array test\n";
+#else
+    std::cout << "OpenMP test\n";
+#endif
 
     std::vector<int> img_dim = {2, 3, 4};
     std::vector<float> voxsize = {4.0f, 3.0f, 2.0f};
@@ -68,7 +72,6 @@ int main()
     // Check if we got the expected results
     float fwd_diff = 0;
     float eps = 1e-7;
-    int retval = 0;
 
     printf("\nforward projection test\n");
     for (int ir = 0; ir < nlors; ir++)
@@ -78,10 +81,12 @@ int main()
         fwd_diff = std::abs(img_fwd[ir] - expected_fwd_vals[ir]);
         if (fwd_diff > eps)
         {
-            printf("\n################################################################################");
-            printf("\nabs(fwd projected - expected value) = %.2e for ray%d above tolerance %.2e", fwd_diff, ir, eps);
-            printf("\n################################################################################\n");
-            retval = 1;
+            std::cerr << "Forward projection test failed.\n";
+            std::cerr << "Difference: " << fwd_diff << "\n";
+            std::cerr << "Expected: " << expected_fwd_vals[ir] << "\n";
+            std::cerr << "Actual: " << img_fwd[ir] << "\n";
+            std::cerr << "Tolerance: " << eps << "\n";
+            std::cerr << "Ray index: " << ir << "\n";
         }
     }
 
@@ -123,12 +128,12 @@ int main()
 
     if (ip_diff > eps)
     {
-        printf("\n#########################################################################");
-        printf("\nback projection test failed. back projection seems not to be the adjoint.");
-        printf("\n %.7e", ip_diff);
-        printf("\n#########################################################################\n");
-        retval = 1;
+        std::cerr << "Back projection test failed.\n";
+        std::cerr << "Inner product 1: " << inner_product1 << "\n";
+        std::cerr << "Inner product 2: " << inner_product2 << "\n";
+        std::cerr << "Difference: " << ip_diff << "\n";
+        std::cerr << "Tolerance: " << eps << "\n";
     }
 
-    return retval;
+    return 0;
 }
diff --git a/tests/test_nontof_fwd_back.cu b/tests/test_nontof_fwd_back_cuda.cu
similarity index 68%
rename from tests/test_nontof_fwd_back.cu
rename to tests/test_nontof_fwd_back_cuda.cu
index cf9bec3..82c1592 100644
--- a/tests/test_nontof_fwd_back.cu
+++ b/tests/test_nontof_fwd_back_cuda.cu
@@ -7,17 +7,11 @@
 #include <numeric>
 #include <cuda_runtime.h>
 
-void test_host_arrays();
 void test_cuda_managed_arrays();
 void test_cuda_device_arrays();
 
 int main()
 {
-    std::cout << "Testing joseph3d_fwd and joseph3d_back with different memory modes:\n";
-
-    std::cout << "\n--- Testing with Host Arrays ---\n";
-    test_host_arrays();
-
     std::cout << "\n--- Testing with CUDA-Managed Arrays ---\n";
     test_cuda_managed_arrays();
 
@@ -27,103 +21,6 @@ int main()
     return 0;
 }
 
-void test_host_arrays()
-{
-    // Host array test (same as your current implementation)
-    std::vector<int> img_dim = {2, 3, 4};
-    std::vector<float> voxsize = {4.0f, 3.0f, 2.0f};
-
-    std::vector<float> img_origin(3);
-    for (int i = 0; i < 3; ++i)
-    {
-        img_origin[i] = (-(float)img_dim[i] / 2 + 0.5f) * voxsize[i];
-    }
-
-    std::vector<float> img = readArrayFromFile<float>("img.txt");
-    std::vector<float> vstart = readArrayFromFile<float>("vstart.txt");
-    std::vector<float> vend = readArrayFromFile<float>("vend.txt");
-    size_t nlors = vstart.size() / 3;
-
-    std::vector<float> xstart(3 * nlors);
-    std::vector<float> xend(3 * nlors);
-
-    for (int ir = 0; ir < nlors; ir++)
-    {
-        xstart[ir * 3 + 0] = img_origin[0] + vstart[ir * 3 + 0] * voxsize[0];
-        xstart[ir * 3 + 1] = img_origin[1] + vstart[ir * 3 + 1] * voxsize[1];
-        xstart[ir * 3 + 2] = img_origin[2] + vstart[ir * 3 + 2] * voxsize[2];
-
-        xend[ir * 3 + 0] = img_origin[0] + vend[ir * 3 + 0] * voxsize[0];
-        xend[ir * 3 + 1] = img_origin[1] + vend[ir * 3 + 1] * voxsize[1];
-        xend[ir * 3 + 2] = img_origin[2] + vend[ir * 3 + 2] * voxsize[2];
-    }
-
-    std::vector<float> img_fwd(nlors);
-    joseph3d_fwd(
-        xstart.data(), xend.data(), img.data(),
-        img_origin.data(), voxsize.data(), img_fwd.data(),
-        nlors, img_dim.data(), 0, 64);
-
-    std::vector<float> expected_fwd_vals = readArrayFromFile<float>("expected_fwd_vals.txt");
-    float fwd_diff = 0;
-    float eps = 1e-7;
-
-    for (int ir = 0; ir < nlors; ir++)
-    {
-        fwd_diff = std::abs(img_fwd[ir] - expected_fwd_vals[ir]);
-        if (fwd_diff > eps)
-        {
-            std::cerr << "Host array test failed for ray " << ir << "\n";
-            return;
-        }
-    }
-
-    ////////////////////////////////////////////////////////////////////////////
-    ////////////////////////////////////////////////////////////////////////////
-    ////////////////////////////////////////////////////////////////////////////
-
-    // Test the back projection using the defintion of the adjoint operator
-    std::vector<float> bimg(img_dim[0] * img_dim[1] * img_dim[2], 0.0f);
-    std::vector<float> ones(nlors, 1.0f);
-
-    joseph3d_back(
-        xstart.data(), xend.data(), bimg.data(),
-        img_origin.data(), voxsize.data(), ones.data(),
-        nlors, img_dim.data());
-
-    printf("\nback projection of ones along all rays:\n");
-    for (size_t i0 = 0; i0 < img_dim[0]; i0++)
-    {
-        for (size_t i1 = 0; i1 < img_dim[1]; i1++)
-        {
-            for (size_t i2 = 0; i2 < img_dim[2]; i2++)
-            {
-                printf("%.1f ", bimg[img_dim[1] * img_dim[2] * i0 + img_dim[2] * i1 + i2]);
-            }
-            printf("\n");
-        }
-        printf("\n");
-    }
-
-    // To test whether the back projection is correct, we test if the back projector is the adjoint
-    // of the forward projector. This is more practical than checking a lot of single voxels in the
-    // back projected image.
-
-    float inner_product1 = std::inner_product(img.begin(), img.end(), bimg.begin(), 0.0f);
-    float inner_product2 = std::inner_product(img_fwd.begin(), img_fwd.end(), ones.begin(), 0.0f);
-
-    float ip_diff = fabs(inner_product1 - inner_product2);
-
-    if (ip_diff > eps)
-    {
-        printf("\n#########################################################################");
-        printf("\nback projection test failed. back projection seems not to be the adjoint.");
-        printf("\n %.7e", ip_diff);
-        printf("\n#########################################################################\n");
-        std::cerr << "Back projection via adjointness test failed.\n";
-    }
-}
-
 void test_cuda_managed_arrays()
 {
     // CUDA-managed array test

From f04f74fd52d199239faf23c36fa8618f71ac731e Mon Sep 17 00:00:00 2001
From: Georg Schramm <georg.schramm@kuleuven.be>
Date: Sat, 12 Apr 2025 19:48:38 +0200
Subject: [PATCH 28/35] use h_, d_, cm_ prefixes

---
 tests/test_nontof_fwd_back_cuda.cu | 216 ++++++++++++++---------------
 1 file changed, 108 insertions(+), 108 deletions(-)

diff --git a/tests/test_nontof_fwd_back_cuda.cu b/tests/test_nontof_fwd_back_cuda.cu
index 82c1592..cc9aabc 100644
--- a/tests/test_nontof_fwd_back_cuda.cu
+++ b/tests/test_nontof_fwd_back_cuda.cu
@@ -24,57 +24,57 @@ int main()
 void test_cuda_managed_arrays()
 {
     // CUDA-managed array test
-    int img_dim[3] = {2, 3, 4};
-    float voxsize[3] = {4.0f, 3.0f, 2.0f};
+    int h_img_dim[3] = {2, 3, 4};
+    float h_voxsize[3] = {4.0f, 3.0f, 2.0f};
 
-    float *img_origin;
-    cudaMallocManaged(&img_origin, 3 * sizeof(float));
+    float *cm_img_origin;
+    cudaMallocManaged(&cm_img_origin, 3 * sizeof(float));
     for (int i = 0; i < 3; ++i)
     {
-        img_origin[i] = (-(float)img_dim[i] / 2 + 0.5f) * voxsize[i];
+        cm_img_origin[i] = (-(float)h_img_dim[i] / 2 + 0.5f) * h_voxsize[i];
     }
 
-    std::vector<float> img_from_file = readArrayFromFile<float>("img.txt");
-    float *img;
-    cudaMallocManaged(&img, img_from_file.size() * sizeof(float));
-    std::copy(img_from_file.begin(), img_from_file.end(), img);
+    std::vector<float> h_img_from_file = readArrayFromFile<float>("img.txt");
+    float *cm_img;
+    cudaMallocManaged(&cm_img, h_img_from_file.size() * sizeof(float));
+    std::copy(h_img_from_file.begin(), h_img_from_file.end(), cm_img);
 
-    std::vector<float> vstart_from_file = readArrayFromFile<float>("vstart.txt");
-    float *vstart;
-    cudaMallocManaged(&vstart, vstart_from_file.size() * sizeof(float));
-    std::copy(vstart_from_file.begin(), vstart_from_file.end(), vstart);
+    std::vector<float> h_vstart_from_file = readArrayFromFile<float>("vstart.txt");
+    float *cm_vstart;
+    cudaMallocManaged(&cm_vstart, h_vstart_from_file.size() * sizeof(float));
+    std::copy(h_vstart_from_file.begin(), h_vstart_from_file.end(), cm_vstart);
 
-    std::vector<float> vend_from_file = readArrayFromFile<float>("vend.txt");
-    float *vend;
-    cudaMallocManaged(&vend, vend_from_file.size() * sizeof(float));
-    std::copy(vend_from_file.begin(), vend_from_file.end(), vend);
+    std::vector<float> h_vend_from_file = readArrayFromFile<float>("vend.txt");
+    float *cm_vend;
+    cudaMallocManaged(&cm_vend, h_vend_from_file.size() * sizeof(float));
+    std::copy(h_vend_from_file.begin(), h_vend_from_file.end(), cm_vend);
 
-    size_t nlors = vstart_from_file.size() / 3;
+    size_t nlors = h_vstart_from_file.size() / 3;
 
-    float *xstart, *xend;
-    cudaMallocManaged(&xstart, 3 * nlors * sizeof(float));
-    cudaMallocManaged(&xend, 3 * nlors * sizeof(float));
+    float *cm_xstart, *cm_xend;
+    cudaMallocManaged(&cm_xstart, 3 * nlors * sizeof(float));
+    cudaMallocManaged(&cm_xend, 3 * nlors * sizeof(float));
 
     for (int ir = 0; ir < nlors; ir++)
     {
         for (int j = 0; j < 3; j++)
         {
-            xstart[ir * 3 + j] = img_origin[j] + vstart[ir * 3 + j] * voxsize[j];
-            xend[ir * 3 + j] = img_origin[j] + vend[ir * 3 + j] * voxsize[j];
+            cm_xstart[ir * 3 + j] = cm_img_origin[j] + cm_vstart[ir * 3 + j] * h_voxsize[j];
+            cm_xend[ir * 3 + j] = cm_img_origin[j] + cm_vend[ir * 3 + j] * h_voxsize[j];
         }
     }
 
-    float *img_fwd;
-    cudaMallocManaged(&img_fwd, nlors * sizeof(float));
-    joseph3d_fwd(xstart, xend, img, img_origin, voxsize, img_fwd, nlors, img_dim, 0, 64);
+    float *cm_img_fwd;
+    cudaMallocManaged(&cm_img_fwd, nlors * sizeof(float));
+    joseph3d_fwd(cm_xstart, cm_xend, cm_img, cm_img_origin, h_voxsize, cm_img_fwd, nlors, h_img_dim, 0, 64);
 
-    std::vector<float> expected_fwd_vals = readArrayFromFile<float>("expected_fwd_vals.txt");
+    std::vector<float> h_expected_fwd_vals = readArrayFromFile<float>("expected_fwd_vals.txt");
     float fwd_diff = 0;
     float eps = 1e-7;
 
     for (int ir = 0; ir < nlors; ir++)
     {
-        fwd_diff = std::abs(img_fwd[ir] - expected_fwd_vals[ir]);
+        fwd_diff = std::abs(cm_img_fwd[ir] - h_expected_fwd_vals[ir]);
         if (fwd_diff > eps)
         {
             std::cerr << "CUDA-managed array test failed for ray " << ir << "\n";
@@ -83,24 +83,24 @@ void test_cuda_managed_arrays()
     }
 
     // Test the back projection
-    float *bimg;
-    cudaMallocManaged(&bimg, img_dim[0] * img_dim[1] * img_dim[2] * sizeof(float));
-    std::fill(bimg, bimg + (img_dim[0] * img_dim[1] * img_dim[2]), 0.0f);
+    float *cm_bimg;
+    cudaMallocManaged(&cm_bimg, h_img_dim[0] * h_img_dim[1] * h_img_dim[2] * sizeof(float));
+    std::fill(cm_bimg, cm_bimg + (h_img_dim[0] * h_img_dim[1] * h_img_dim[2]), 0.0f);
 
-    float *ones;
-    cudaMallocManaged(&ones, nlors * sizeof(float));
-    std::fill(ones, ones + nlors, 1.0f);
+    float *cm_ones;
+    cudaMallocManaged(&cm_ones, nlors * sizeof(float));
+    std::fill(cm_ones, cm_ones + nlors, 1.0f);
 
-    joseph3d_back(xstart, xend, bimg, img_origin, voxsize, ones, nlors, img_dim);
+    joseph3d_back(cm_xstart, cm_xend, cm_bimg, cm_img_origin, h_voxsize, cm_ones, nlors, h_img_dim);
 
     printf("\nCUDA-managed back projection of ones along all rays:\n");
-    for (size_t i0 = 0; i0 < img_dim[0]; i0++)
+    for (size_t i0 = 0; i0 < h_img_dim[0]; i0++)
     {
-        for (size_t i1 = 0; i1 < img_dim[1]; i1++)
+        for (size_t i1 = 0; i1 < h_img_dim[1]; i1++)
         {
-            for (size_t i2 = 0; i2 < img_dim[2]; i2++)
+            for (size_t i2 = 0; i2 < h_img_dim[2]; i2++)
             {
-                printf("%.1f ", bimg[img_dim[1] * img_dim[2] * i0 + img_dim[2] * i1 + i2]);
+                printf("%.1f ", cm_bimg[h_img_dim[1] * h_img_dim[2] * i0 + h_img_dim[2] * i1 + i2]);
             }
             printf("\n");
         }
@@ -111,14 +111,14 @@ void test_cuda_managed_arrays()
     float inner_product1 = 0.0f;
     float inner_product2 = 0.0f;
 
-    for (size_t i = 0; i < img_from_file.size(); i++)
+    for (size_t i = 0; i < h_img_from_file.size(); i++)
     {
-        inner_product1 += img[i] * bimg[i];
+        inner_product1 += cm_img[i] * cm_bimg[i];
     }
 
     for (size_t ir = 0; ir < nlors; ir++)
     {
-        inner_product2 += img_fwd[ir] * ones[ir];
+        inner_product2 += cm_img_fwd[ir] * cm_ones[ir];
     }
 
     float ip_diff = fabs(inner_product1 - inner_product2);
@@ -132,75 +132,75 @@ void test_cuda_managed_arrays()
         std::cout << "CUDA-managed array back projection test passed.\n";
     }
 
-    cudaFree(img_origin);
-    cudaFree(img);
-    cudaFree(vstart);
-    cudaFree(vend);
-    cudaFree(xstart);
-    cudaFree(xend);
-    cudaFree(img_fwd);
-    cudaFree(bimg);
-    cudaFree(ones);
+    cudaFree(cm_img_origin);
+    cudaFree(cm_img);
+    cudaFree(cm_vstart);
+    cudaFree(cm_vend);
+    cudaFree(cm_xstart);
+    cudaFree(cm_xend);
+    cudaFree(cm_img_fwd);
+    cudaFree(cm_bimg);
+    cudaFree(cm_ones);
 }
 
 void test_cuda_device_arrays()
 {
     // CUDA device array test
-    int img_dim[3] = {2, 3, 4};
-    float voxsize[3] = {4.0f, 3.0f, 2.0f};
+    int h_img_dim[3] = {2, 3, 4};
+    float h_voxsize[3] = {4.0f, 3.0f, 2.0f};
 
-    float img_origin[3];
+    float h_img_origin[3];
     for (int i = 0; i < 3; ++i)
     {
-        img_origin[i] = (-(float)img_dim[i] / 2 + 0.5f) * voxsize[i];
+        h_img_origin[i] = (-(float)h_img_dim[i] / 2 + 0.5f) * h_voxsize[i];
     }
 
-    std::vector<float> img_from_file = readArrayFromFile<float>("img.txt");
-    float *img;
-    cudaMalloc(&img, img_from_file.size() * sizeof(float));
-    cudaMemcpy(img, img_from_file.data(), img_from_file.size() * sizeof(float), cudaMemcpyHostToDevice);
+    std::vector<float> h_img_from_file = readArrayFromFile<float>("img.txt");
+    float *d_img;
+    cudaMalloc(&d_img, h_img_from_file.size() * sizeof(float));
+    cudaMemcpy(d_img, h_img_from_file.data(), h_img_from_file.size() * sizeof(float), cudaMemcpyHostToDevice);
 
-    std::vector<float> vstart_from_file = readArrayFromFile<float>("vstart.txt");
-    float *vstart;
-    cudaMalloc(&vstart, vstart_from_file.size() * sizeof(float));
-    cudaMemcpy(vstart, vstart_from_file.data(), vstart_from_file.size() * sizeof(float), cudaMemcpyHostToDevice);
+    std::vector<float> h_vstart_from_file = readArrayFromFile<float>("vstart.txt");
+    float *d_vstart;
+    cudaMalloc(&d_vstart, h_vstart_from_file.size() * sizeof(float));
+    cudaMemcpy(d_vstart, h_vstart_from_file.data(), h_vstart_from_file.size() * sizeof(float), cudaMemcpyHostToDevice);
 
-    std::vector<float> vend_from_file = readArrayFromFile<float>("vend.txt");
-    float *vend;
-    cudaMalloc(&vend, vend_from_file.size() * sizeof(float));
-    cudaMemcpy(vend, vend_from_file.data(), vend_from_file.size() * sizeof(float), cudaMemcpyHostToDevice);
+    std::vector<float> h_vend_from_file = readArrayFromFile<float>("vend.txt");
+    float *d_vend;
+    cudaMalloc(&d_vend, h_vend_from_file.size() * sizeof(float));
+    cudaMemcpy(d_vend, h_vend_from_file.data(), h_vend_from_file.size() * sizeof(float), cudaMemcpyHostToDevice);
 
-    size_t nlors = vstart_from_file.size() / 3;
+    size_t nlors = h_vstart_from_file.size() / 3;
 
-    float *xstart, *xend;
-    cudaMalloc(&xstart, 3 * nlors * sizeof(float));
-    cudaMalloc(&xend, 3 * nlors * sizeof(float));
+    float *d_xstart, *d_xend;
+    cudaMalloc(&d_xstart, 3 * nlors * sizeof(float));
+    cudaMalloc(&d_xend, 3 * nlors * sizeof(float));
 
     for (int ir = 0; ir < nlors; ir++)
     {
         for (int j = 0; j < 3; j++)
         {
-            float xstart_val = img_origin[j] + vstart_from_file[ir * 3 + j] * voxsize[j];
-            float xend_val = img_origin[j] + vend_from_file[ir * 3 + j] * voxsize[j];
-            cudaMemcpy(&xstart[ir * 3 + j], &xstart_val, sizeof(float), cudaMemcpyHostToDevice);
-            cudaMemcpy(&xend[ir * 3 + j], &xend_val, sizeof(float), cudaMemcpyHostToDevice);
+            float xstart_val = h_img_origin[j] + h_vstart_from_file[ir * 3 + j] * h_voxsize[j];
+            float xend_val = h_img_origin[j] + h_vend_from_file[ir * 3 + j] * h_voxsize[j];
+            cudaMemcpy(&d_xstart[ir * 3 + j], &xstart_val, sizeof(float), cudaMemcpyHostToDevice);
+            cudaMemcpy(&d_xend[ir * 3 + j], &xend_val, sizeof(float), cudaMemcpyHostToDevice);
         }
     }
 
-    float *img_fwd;
-    cudaMalloc(&img_fwd, nlors * sizeof(float));
-    joseph3d_fwd(xstart, xend, img, img_origin, voxsize, img_fwd, nlors, img_dim, 0, 64);
+    float *d_img_fwd;
+    cudaMalloc(&d_img_fwd, nlors * sizeof(float));
+    joseph3d_fwd(d_xstart, d_xend, d_img, h_img_origin, h_voxsize, d_img_fwd, nlors, h_img_dim, 0, 64);
 
-    std::vector<float> img_fwd_host(nlors);
-    cudaMemcpy(img_fwd_host.data(), img_fwd, nlors * sizeof(float), cudaMemcpyDeviceToHost);
+    std::vector<float> h_img_fwd_host(nlors);
+    cudaMemcpy(h_img_fwd_host.data(), d_img_fwd, nlors * sizeof(float), cudaMemcpyDeviceToHost);
 
-    std::vector<float> expected_fwd_vals = readArrayFromFile<float>("expected_fwd_vals.txt");
+    std::vector<float> h_expected_fwd_vals = readArrayFromFile<float>("expected_fwd_vals.txt");
     float fwd_diff = 0;
     float eps = 1e-7;
 
     for (int ir = 0; ir < nlors; ir++)
     {
-        fwd_diff = std::abs(img_fwd_host[ir] - expected_fwd_vals[ir]);
+        fwd_diff = std::abs(h_img_fwd_host[ir] - h_expected_fwd_vals[ir]);
         if (fwd_diff > eps)
         {
             std::cerr << "CUDA device array test failed for ray " << ir << "\n";
@@ -209,29 +209,29 @@ void test_cuda_device_arrays()
     }
 
     // Test the back projection
-    float *bimg;
-    cudaMalloc(&bimg, img_dim[0] * img_dim[1] * img_dim[2] * sizeof(float));
-    cudaMemset(bimg, 0, img_dim[0] * img_dim[1] * img_dim[2] * sizeof(float));
+    float *d_bimg;
+    cudaMalloc(&d_bimg, h_img_dim[0] * h_img_dim[1] * h_img_dim[2] * sizeof(float));
+    cudaMemset(d_bimg, 0, h_img_dim[0] * h_img_dim[1] * h_img_dim[2] * sizeof(float));
 
-    float *ones;
-    cudaMalloc(&ones, nlors * sizeof(float));
-    cudaMemset(ones, 0, nlors * sizeof(float));
-    std::vector<float> ones_host(nlors, 1.0f);
-    cudaMemcpy(ones, ones_host.data(), nlors * sizeof(float), cudaMemcpyHostToDevice);
+    float *d_ones;
+    cudaMalloc(&d_ones, nlors * sizeof(float));
+    cudaMemset(d_ones, 0, nlors * sizeof(float));
+    std::vector<float> h_ones_host(nlors, 1.0f);
+    cudaMemcpy(d_ones, h_ones_host.data(), nlors * sizeof(float), cudaMemcpyHostToDevice);
 
-    joseph3d_back(xstart, xend, bimg, img_origin, voxsize, ones, nlors, img_dim);
+    joseph3d_back(d_xstart, d_xend, d_bimg, h_img_origin, h_voxsize, d_ones, nlors, h_img_dim);
 
-    std::vector<float> bimg_host(img_dim[0] * img_dim[1] * img_dim[2]);
-    cudaMemcpy(bimg_host.data(), bimg, bimg_host.size() * sizeof(float), cudaMemcpyDeviceToHost);
+    std::vector<float> h_bimg_host(h_img_dim[0] * h_img_dim[1] * h_img_dim[2]);
+    cudaMemcpy(h_bimg_host.data(), d_bimg, h_bimg_host.size() * sizeof(float), cudaMemcpyDeviceToHost);
 
     printf("\nCUDA device back projection of ones along all rays:\n");
-    for (size_t i0 = 0; i0 < img_dim[0]; i0++)
+    for (size_t i0 = 0; i0 < h_img_dim[0]; i0++)
     {
-        for (size_t i1 = 0; i1 < img_dim[1]; i1++)
+        for (size_t i1 = 0; i1 < h_img_dim[1]; i1++)
         {
-            for (size_t i2 = 0; i2 < img_dim[2]; i2++)
+            for (size_t i2 = 0; i2 < h_img_dim[2]; i2++)
             {
-                printf("%.1f ", bimg_host[img_dim[1] * img_dim[2] * i0 + img_dim[2] * i1 + i2]);
+                printf("%.1f ", h_bimg_host[h_img_dim[1] * h_img_dim[2] * i0 + h_img_dim[2] * i1 + i2]);
             }
             printf("\n");
         }
@@ -242,14 +242,14 @@ void test_cuda_device_arrays()
     float inner_product1 = 0.0f;
     float inner_product2 = 0.0f;
 
-    for (size_t i = 0; i < img_from_file.size(); i++)
+    for (size_t i = 0; i < h_img_from_file.size(); i++)
     {
-        inner_product1 += img_from_file[i] * bimg_host[i];
+        inner_product1 += h_img_from_file[i] * h_bimg_host[i];
     }
 
     for (size_t ir = 0; ir < nlors; ir++)
     {
-        inner_product2 += img_fwd_host[ir] * ones_host[ir];
+        inner_product2 += h_img_fwd_host[ir] * h_ones_host[ir];
     }
 
     float ip_diff = fabs(inner_product1 - inner_product2);
@@ -263,12 +263,12 @@ void test_cuda_device_arrays()
         std::cout << "CUDA device array back projection test passed.\n";
     }
 
-    cudaFree(img);
-    cudaFree(vstart);
-    cudaFree(vend);
-    cudaFree(xstart);
-    cudaFree(xend);
-    cudaFree(img_fwd);
-    cudaFree(bimg);
-    cudaFree(ones);
+    cudaFree(d_img);
+    cudaFree(d_vstart);
+    cudaFree(d_vend);
+    cudaFree(d_xstart);
+    cudaFree(d_xend);
+    cudaFree(d_img_fwd);
+    cudaFree(d_bimg);
+    cudaFree(d_ones);
 }

From 19e7bcffa707771ebca736e900af7a8286ad711e Mon Sep 17 00:00:00 2001
From: Georg Schramm <georg.schramm@kuleuven.be>
Date: Sat, 12 Apr 2025 19:53:40 +0200
Subject: [PATCH 29/35] remove _from_file and _host suffixes from test variable names

---
 tests/test_nontof_fwd_back_cuda.cu | 69 +++++++++++++++---------------
 1 file changed, 34 insertions(+), 35 deletions(-)

diff --git a/tests/test_nontof_fwd_back_cuda.cu b/tests/test_nontof_fwd_back_cuda.cu
index cc9aabc..c65a5e5 100644
--- a/tests/test_nontof_fwd_back_cuda.cu
+++ b/tests/test_nontof_fwd_back_cuda.cu
@@ -34,22 +34,22 @@ void test_cuda_managed_arrays()
         cm_img_origin[i] = (-(float)h_img_dim[i] / 2 + 0.5f) * h_voxsize[i];
     }
 
-    std::vector<float> h_img_from_file = readArrayFromFile<float>("img.txt");
+    std::vector<float> h_img = readArrayFromFile<float>("img.txt");
     float *cm_img;
-    cudaMallocManaged(&cm_img, h_img_from_file.size() * sizeof(float));
-    std::copy(h_img_from_file.begin(), h_img_from_file.end(), cm_img);
+    cudaMallocManaged(&cm_img, h_img.size() * sizeof(float));
+    std::copy(h_img.begin(), h_img.end(), cm_img);
 
-    std::vector<float> h_vstart_from_file = readArrayFromFile<float>("vstart.txt");
+    std::vector<float> h_vstart = readArrayFromFile<float>("vstart.txt");
     float *cm_vstart;
-    cudaMallocManaged(&cm_vstart, h_vstart_from_file.size() * sizeof(float));
-    std::copy(h_vstart_from_file.begin(), h_vstart_from_file.end(), cm_vstart);
+    cudaMallocManaged(&cm_vstart, h_vstart.size() * sizeof(float));
+    std::copy(h_vstart.begin(), h_vstart.end(), cm_vstart);
 
-    std::vector<float> h_vend_from_file = readArrayFromFile<float>("vend.txt");
+    std::vector<float> h_vend = readArrayFromFile<float>("vend.txt");
     float *cm_vend;
-    cudaMallocManaged(&cm_vend, h_vend_from_file.size() * sizeof(float));
-    std::copy(h_vend_from_file.begin(), h_vend_from_file.end(), cm_vend);
+    cudaMallocManaged(&cm_vend, h_vend.size() * sizeof(float));
+    std::copy(h_vend.begin(), h_vend.end(), cm_vend);
 
-    size_t nlors = h_vstart_from_file.size() / 3;
+    size_t nlors = h_vstart.size() / 3;
 
     float *cm_xstart, *cm_xend;
     cudaMallocManaged(&cm_xstart, 3 * nlors * sizeof(float));
@@ -111,7 +111,7 @@ void test_cuda_managed_arrays()
     float inner_product1 = 0.0f;
     float inner_product2 = 0.0f;
 
-    for (size_t i = 0; i < h_img_from_file.size(); i++)
+    for (size_t i = 0; i < h_img.size(); i++)
     {
         inner_product1 += cm_img[i] * cm_bimg[i];
     }
@@ -155,22 +155,22 @@ void test_cuda_device_arrays()
         h_img_origin[i] = (-(float)h_img_dim[i] / 2 + 0.5f) * h_voxsize[i];
     }
 
-    std::vector<float> h_img_from_file = readArrayFromFile<float>("img.txt");
+    std::vector<float> h_img = readArrayFromFile<float>("img.txt");
     float *d_img;
-    cudaMalloc(&d_img, h_img_from_file.size() * sizeof(float));
-    cudaMemcpy(d_img, h_img_from_file.data(), h_img_from_file.size() * sizeof(float), cudaMemcpyHostToDevice);
+    cudaMalloc(&d_img, h_img.size() * sizeof(float));
+    cudaMemcpy(d_img, h_img.data(), h_img.size() * sizeof(float), cudaMemcpyHostToDevice);
 
-    std::vector<float> h_vstart_from_file = readArrayFromFile<float>("vstart.txt");
+    std::vector<float> h_vstart = readArrayFromFile<float>("vstart.txt");
     float *d_vstart;
-    cudaMalloc(&d_vstart, h_vstart_from_file.size() * sizeof(float));
-    cudaMemcpy(d_vstart, h_vstart_from_file.data(), h_vstart_from_file.size() * sizeof(float), cudaMemcpyHostToDevice);
+    cudaMalloc(&d_vstart, h_vstart.size() * sizeof(float));
+    cudaMemcpy(d_vstart, h_vstart.data(), h_vstart.size() * sizeof(float), cudaMemcpyHostToDevice);
 
-    std::vector<float> h_vend_from_file = readArrayFromFile<float>("vend.txt");
+    std::vector<float> h_vend = readArrayFromFile<float>("vend.txt");
     float *d_vend;
-    cudaMalloc(&d_vend, h_vend_from_file.size() * sizeof(float));
-    cudaMemcpy(d_vend, h_vend_from_file.data(), h_vend_from_file.size() * sizeof(float), cudaMemcpyHostToDevice);
+    cudaMalloc(&d_vend, h_vend.size() * sizeof(float));
+    cudaMemcpy(d_vend, h_vend.data(), h_vend.size() * sizeof(float), cudaMemcpyHostToDevice);
 
-    size_t nlors = h_vstart_from_file.size() / 3;
+    size_t nlors = h_vstart.size() / 3;
 
     float *d_xstart, *d_xend;
     cudaMalloc(&d_xstart, 3 * nlors * sizeof(float));
@@ -180,8 +180,8 @@ void test_cuda_device_arrays()
     {
         for (int j = 0; j < 3; j++)
         {
-            float xstart_val = h_img_origin[j] + h_vstart_from_file[ir * 3 + j] * h_voxsize[j];
-            float xend_val = h_img_origin[j] + h_vend_from_file[ir * 3 + j] * h_voxsize[j];
+            float xstart_val = h_img_origin[j] + h_vstart[ir * 3 + j] * h_voxsize[j];
+            float xend_val = h_img_origin[j] + h_vend[ir * 3 + j] * h_voxsize[j];
             cudaMemcpy(&d_xstart[ir * 3 + j], &xstart_val, sizeof(float), cudaMemcpyHostToDevice);
             cudaMemcpy(&d_xend[ir * 3 + j], &xend_val, sizeof(float), cudaMemcpyHostToDevice);
         }
@@ -191,8 +191,8 @@ void test_cuda_device_arrays()
     cudaMalloc(&d_img_fwd, nlors * sizeof(float));
     joseph3d_fwd(d_xstart, d_xend, d_img, h_img_origin, h_voxsize, d_img_fwd, nlors, h_img_dim, 0, 64);
 
-    std::vector<float> h_img_fwd_host(nlors);
-    cudaMemcpy(h_img_fwd_host.data(), d_img_fwd, nlors * sizeof(float), cudaMemcpyDeviceToHost);
+    std::vector<float> h_img_fwd(nlors);
+    cudaMemcpy(h_img_fwd.data(), d_img_fwd, nlors * sizeof(float), cudaMemcpyDeviceToHost);
 
     std::vector<float> h_expected_fwd_vals = readArrayFromFile<float>("expected_fwd_vals.txt");
     float fwd_diff = 0;
@@ -200,7 +200,7 @@ void test_cuda_device_arrays()
 
     for (int ir = 0; ir < nlors; ir++)
     {
-        fwd_diff = std::abs(h_img_fwd_host[ir] - h_expected_fwd_vals[ir]);
+        fwd_diff = std::abs(h_img_fwd[ir] - h_expected_fwd_vals[ir]);
         if (fwd_diff > eps)
         {
             std::cerr << "CUDA device array test failed for ray " << ir << "\n";
@@ -215,14 +215,13 @@ void test_cuda_device_arrays()
 
     float *d_ones;
     cudaMalloc(&d_ones, nlors * sizeof(float));
-    cudaMemset(d_ones, 0, nlors * sizeof(float));
-    std::vector<float> h_ones_host(nlors, 1.0f);
-    cudaMemcpy(d_ones, h_ones_host.data(), nlors * sizeof(float), cudaMemcpyHostToDevice);
+    std::vector<float> h_ones(nlors, 1.0f);
+    cudaMemcpy(d_ones, h_ones.data(), nlors * sizeof(float), cudaMemcpyHostToDevice);
 
     joseph3d_back(d_xstart, d_xend, d_bimg, h_img_origin, h_voxsize, d_ones, nlors, h_img_dim);
 
-    std::vector<float> h_bimg_host(h_img_dim[0] * h_img_dim[1] * h_img_dim[2]);
-    cudaMemcpy(h_bimg_host.data(), d_bimg, h_bimg_host.size() * sizeof(float), cudaMemcpyDeviceToHost);
+    std::vector<float> h_bimg(h_img_dim[0] * h_img_dim[1] * h_img_dim[2]);
+    cudaMemcpy(h_bimg.data(), d_bimg, h_bimg.size() * sizeof(float), cudaMemcpyDeviceToHost);
 
     printf("\nCUDA device back projection of ones along all rays:\n");
     for (size_t i0 = 0; i0 < h_img_dim[0]; i0++)
@@ -231,7 +230,7 @@ void test_cuda_device_arrays()
         {
             for (size_t i2 = 0; i2 < h_img_dim[2]; i2++)
             {
-                printf("%.1f ", h_bimg_host[h_img_dim[1] * h_img_dim[2] * i0 + h_img_dim[2] * i1 + i2]);
+                printf("%.1f ", h_bimg[h_img_dim[1] * h_img_dim[2] * i0 + h_img_dim[2] * i1 + i2]);
             }
             printf("\n");
         }
@@ -242,14 +241,14 @@ void test_cuda_device_arrays()
     float inner_product1 = 0.0f;
     float inner_product2 = 0.0f;
 
-    for (size_t i = 0; i < h_img_from_file.size(); i++)
+    for (size_t i = 0; i < h_img.size(); i++)
     {
-        inner_product1 += h_img_from_file[i] * h_bimg_host[i];
+        inner_product1 += h_img[i] * h_bimg[i];
     }
 
     for (size_t ir = 0; ir < nlors; ir++)
     {
-        inner_product2 += h_img_fwd_host[ir] * h_ones_host[ir];
+        inner_product2 += h_img_fwd[ir] * h_ones[ir];
     }
 
     float ip_diff = fabs(inner_product1 - inner_product2);

From 7d0a41de64bfb2570fb2ccd7dbc548c05fa63deb Mon Sep 17 00:00:00 2001
From: Georg Schramm <georg.schramm@kuleuven.be>
Date: Sun, 13 Apr 2025 09:53:19 +0200
Subject: [PATCH 30/35] correct return values of tests

---
 src/joseph3d_back.cu               |   7 +-
 src/joseph3d_fwd.cu                |   6 +-
 tests/test_nontof_fwd_back.cpp     |  14 ++--
 tests/test_nontof_fwd_back_cuda.cu | 111 ++++++++++++++++++++---------
 4 files changed, 96 insertions(+), 42 deletions(-)

diff --git a/src/joseph3d_back.cu b/src/joseph3d_back.cu
index 378d387..e20db17 100644
--- a/src/joseph3d_back.cu
+++ b/src/joseph3d_back.cu
@@ -60,9 +60,12 @@ void joseph3d_back(const float *xstart,
     handle_cuda_input_array(xend, &d_xend, sizeof(float) * nlors * 3, free_xend, device_id, cudaMemAdviseSetReadMostly);
 
     // Handle img (write access)
+    int h_img_dim[3];
+    cudaMemcpy(h_img_dim, img_dim, sizeof(int) * 3, cudaMemcpyDeviceToHost);
+    size_t img_size = sizeof(float) * h_img_dim[0] * h_img_dim[1] * h_img_dim[2];
     float *d_img = nullptr;
     bool free_img = false;
-    handle_cuda_input_array(img, &d_img, sizeof(float) * img_dim[0] * img_dim[1] * img_dim[2], free_img, device_id, cudaMemAdviseSetAccessedBy);
+    handle_cuda_input_array(img, &d_img, img_size, free_img, device_id, cudaMemAdviseSetAccessedBy);
 
     // Handle img_origin (read mostly)
     float *d_img_origin = nullptr;
@@ -117,7 +120,7 @@ void joseph3d_back(const float *xstart,
     if (free_img)
     {
         // Copy the result back to the host
-        cudaMemcpy(const_cast<float *>(img), d_img, sizeof(float) * img_dim[0] * img_dim[1] * img_dim[2], cudaMemcpyDeviceToHost);
+        cudaMemcpy(const_cast<float *>(img), d_img, img_size, cudaMemcpyDeviceToHost);
         cudaFree(d_img);
     }
     if (free_img_origin)
diff --git a/src/joseph3d_fwd.cu b/src/joseph3d_fwd.cu
index a5444c4..7585cb1 100644
--- a/src/joseph3d_fwd.cu
+++ b/src/joseph3d_fwd.cu
@@ -60,9 +60,13 @@ void joseph3d_fwd(const float *xstart,
     handle_cuda_input_array(xend, &d_xend, sizeof(float) * nlors * 3, free_xend, device_id, cudaMemAdviseSetReadMostly);
 
     // Handle img (read mostly)
+    // Copy img_dim to the host if needed to calculate the size
+    int h_img_dim[3];
+    cudaMemcpy(h_img_dim, img_dim, sizeof(int) * 3, cudaMemcpyDeviceToHost);
+    size_t img_size = sizeof(float) * h_img_dim[0] * h_img_dim[1] * h_img_dim[2];
     float *d_img = nullptr;
     bool free_img = false;
-    handle_cuda_input_array(img, &d_img, sizeof(float) * img_dim[0] * img_dim[1] * img_dim[2], free_img, device_id, cudaMemAdviseSetReadMostly);
+    handle_cuda_input_array(img, &d_img, img_size, free_img, device_id, cudaMemAdviseSetReadMostly);
 
     // Handle img_origin (read mostly)
     float *d_img_origin = nullptr;
diff --git a/tests/test_nontof_fwd_back.cpp b/tests/test_nontof_fwd_back.cpp
index dad7072..32e0856 100644
--- a/tests/test_nontof_fwd_back.cpp
+++ b/tests/test_nontof_fwd_back.cpp
@@ -12,11 +12,9 @@ int main()
     // host array test cases
     ////////////////////////////////////////////////////////
 
-#ifdef __CUDACC__
-    std::cout << "CUDA host array test\n";
-#else
-    std::cout << "OpenMP test\n";
-#endif
+    bool all_tests_passed = true;
+
+    std::cout << "Host array test\n";
 
     std::vector<int> img_dim = {2, 3, 4};
     std::vector<float> voxsize = {4.0f, 3.0f, 2.0f};
@@ -87,6 +85,8 @@ int main()
             std::cerr << "Actual: " << img_fwd[ir] << "\n";
             std::cerr << "Tolerance: " << eps << "\n";
             std::cerr << "Ray index: " << ir << "\n";
+
+            all_tests_passed = false;
         }
     }
 
@@ -133,7 +133,9 @@ int main()
         std::cerr << "Inner product 2: " << inner_product2 << "\n";
         std::cerr << "Difference: " << ip_diff << "\n";
         std::cerr << "Tolerance: " << eps << "\n";
+
+        all_tests_passed = false;
     }
 
-    return 0;
+    return all_tests_passed ? 0 : 1;
 }
diff --git a/tests/test_nontof_fwd_back_cuda.cu b/tests/test_nontof_fwd_back_cuda.cu
index c65a5e5..2041cbe 100644
--- a/tests/test_nontof_fwd_back_cuda.cu
+++ b/tests/test_nontof_fwd_back_cuda.cu
@@ -7,31 +7,69 @@
 #include <numeric>
 #include <cuda_runtime.h>
 
-void test_cuda_managed_arrays();
-void test_cuda_device_arrays();
+void test_cuda_managed_arrays(int device_id, int threadsperblock);
+void test_cuda_device_arrays(int device_id, int threadsperblock);
 
 int main()
 {
-    std::cout << "\n--- Testing with CUDA-Managed Arrays ---\n";
-    test_cuda_managed_arrays();
+    bool all_tests_passed = true;
 
-    std::cout << "\n--- Testing with CUDA Device Arrays ---\n";
-    test_cuda_device_arrays();
+    int device_count;
+    cudaGetDeviceCount(&device_count);
 
-    return 0;
+    for (int i = 0; i < device_count; i++)
+    {
+        cudaDeviceProp prop;
+        cudaGetDeviceProperties(&prop, i);
+        std::cout << "Device " << i << ": " << prop.name << "\n";
+
+        try
+        {
+            test_cuda_managed_arrays(i, 64);
+        }
+        catch (const std::exception &e)
+        {
+            std::cerr << "CUDA-managed array test failed on device " << i << ": " << e.what() << "\n";
+            all_tests_passed = false;
+        }
+
+        std::cout << "\n--- Testing with CUDA Device Arrays ---\n";
+        try
+        {
+            test_cuda_device_arrays(i, 64);
+        }
+        catch (const std::exception &e)
+        {
+            std::cerr << "CUDA device array test failed on device " << i << ": " << e.what() << "\n";
+            all_tests_passed = false;
+        }
+    }
+
+    return all_tests_passed ? 0 : 1;
 }
 
-void test_cuda_managed_arrays()
+void test_cuda_managed_arrays(int device_id, int threadsperblock)
 {
+    cudaSetDevice(device_id);
+
     // CUDA-managed array test
-    int h_img_dim[3] = {2, 3, 4};
-    float h_voxsize[3] = {4.0f, 3.0f, 2.0f};
+    int *cm_img_dim;
+    cudaMallocManaged(&cm_img_dim, 3 * sizeof(int));
+    cm_img_dim[0] = 2;
+    cm_img_dim[1] = 3;
+    cm_img_dim[2] = 4;
+
+    float *cm_voxsize;
+    cudaMallocManaged(&cm_voxsize, 3 * sizeof(float));
+    cm_voxsize[0] = 4.0f;
+    cm_voxsize[1] = 3.0f;
+    cm_voxsize[2] = 2.0f;
 
     float *cm_img_origin;
     cudaMallocManaged(&cm_img_origin, 3 * sizeof(float));
     for (int i = 0; i < 3; ++i)
     {
-        cm_img_origin[i] = (-(float)h_img_dim[i] / 2 + 0.5f) * h_voxsize[i];
+        cm_img_origin[i] = (-(float)cm_img_dim[i] / 2 + 0.5f) * cm_voxsize[i];
     }
 
     std::vector<float> h_img = readArrayFromFile<float>("img.txt");
@@ -59,14 +97,14 @@ void test_cuda_managed_arrays()
     {
         for (int j = 0; j < 3; j++)
         {
-            cm_xstart[ir * 3 + j] = cm_img_origin[j] + cm_vstart[ir * 3 + j] * h_voxsize[j];
-            cm_xend[ir * 3 + j] = cm_img_origin[j] + cm_vend[ir * 3 + j] * h_voxsize[j];
+            cm_xstart[ir * 3 + j] = cm_img_origin[j] + cm_vstart[ir * 3 + j] * cm_voxsize[j];
+            cm_xend[ir * 3 + j] = cm_img_origin[j] + cm_vend[ir * 3 + j] * cm_voxsize[j];
         }
     }
 
     float *cm_img_fwd;
     cudaMallocManaged(&cm_img_fwd, nlors * sizeof(float));
-    joseph3d_fwd(cm_xstart, cm_xend, cm_img, cm_img_origin, h_voxsize, cm_img_fwd, nlors, h_img_dim, 0, 64);
+    joseph3d_fwd(cm_xstart, cm_xend, cm_img, cm_img_origin, cm_voxsize, cm_img_fwd, nlors, cm_img_dim, device_id, threadsperblock);
 
     std::vector<float> h_expected_fwd_vals = readArrayFromFile<float>("expected_fwd_vals.txt");
     float fwd_diff = 0;
@@ -84,23 +122,23 @@ void test_cuda_managed_arrays()
 
     // Test the back projection
     float *cm_bimg;
-    cudaMallocManaged(&cm_bimg, h_img_dim[0] * h_img_dim[1] * h_img_dim[2] * sizeof(float));
-    std::fill(cm_bimg, cm_bimg + (h_img_dim[0] * h_img_dim[1] * h_img_dim[2]), 0.0f);
+    cudaMallocManaged(&cm_bimg, cm_img_dim[0] * cm_img_dim[1] * cm_img_dim[2] * sizeof(float));
+    std::fill(cm_bimg, cm_bimg + (cm_img_dim[0] * cm_img_dim[1] * cm_img_dim[2]), 0.0f);
 
     float *cm_ones;
     cudaMallocManaged(&cm_ones, nlors * sizeof(float));
     std::fill(cm_ones, cm_ones + nlors, 1.0f);
 
-    joseph3d_back(cm_xstart, cm_xend, cm_bimg, cm_img_origin, h_voxsize, cm_ones, nlors, h_img_dim);
+    joseph3d_back(cm_xstart, cm_xend, cm_bimg, cm_img_origin, cm_voxsize, cm_ones, nlors, cm_img_dim, device_id, threadsperblock);
 
     printf("\nCUDA-managed back projection of ones along all rays:\n");
-    for (size_t i0 = 0; i0 < h_img_dim[0]; i0++)
+    for (size_t i0 = 0; i0 < cm_img_dim[0]; i0++)
     {
-        for (size_t i1 = 0; i1 < h_img_dim[1]; i1++)
+        for (size_t i1 = 0; i1 < cm_img_dim[1]; i1++)
         {
-            for (size_t i2 = 0; i2 < h_img_dim[2]; i2++)
+            for (size_t i2 = 0; i2 < cm_img_dim[2]; i2++)
             {
-                printf("%.1f ", cm_bimg[h_img_dim[1] * h_img_dim[2] * i0 + h_img_dim[2] * i1 + i2]);
+                printf("%.1f ", cm_bimg[cm_img_dim[1] * cm_img_dim[2] * i0 + cm_img_dim[2] * i1 + i2]);
             }
             printf("\n");
         }
@@ -127,11 +165,9 @@ void test_cuda_managed_arrays()
     {
         std::cerr << "CUDA-managed array back projection test failed: adjointness property violated.\n";
     }
-    else
-    {
-        std::cout << "CUDA-managed array back projection test passed.\n";
-    }
 
+    cudaFree(cm_img_dim);
+    cudaFree(cm_voxsize);
     cudaFree(cm_img_origin);
     cudaFree(cm_img);
     cudaFree(cm_vstart);
@@ -143,17 +179,29 @@ void test_cuda_managed_arrays()
     cudaFree(cm_ones);
 }
 
-void test_cuda_device_arrays()
+void test_cuda_device_arrays(int device_id, int threadsperblock)
 {
+    cudaSetDevice(device_id);
+
     // CUDA device array test
+    int *d_img_dim;
+    cudaMalloc(&d_img_dim, 3 * sizeof(int));
     int h_img_dim[3] = {2, 3, 4};
+    cudaMemcpy(d_img_dim, h_img_dim, 3 * sizeof(int), cudaMemcpyHostToDevice);
+
+    float *d_voxsize;
+    cudaMalloc(&d_voxsize, 3 * sizeof(float));
     float h_voxsize[3] = {4.0f, 3.0f, 2.0f};
+    cudaMemcpy(d_voxsize, h_voxsize, 3 * sizeof(float), cudaMemcpyHostToDevice);
 
     float h_img_origin[3];
     for (int i = 0; i < 3; ++i)
     {
         h_img_origin[i] = (-(float)h_img_dim[i] / 2 + 0.5f) * h_voxsize[i];
     }
+    float *d_img_origin;
+    cudaMalloc(&d_img_origin, 3 * sizeof(float));
+    cudaMemcpy(d_img_origin, h_img_origin, 3 * sizeof(float), cudaMemcpyHostToDevice);
 
     std::vector<float> h_img = readArrayFromFile<float>("img.txt");
     float *d_img;
@@ -186,10 +234,9 @@ void test_cuda_device_arrays()
             cudaMemcpy(&d_xend[ir * 3 + j], &xend_val, sizeof(float), cudaMemcpyHostToDevice);
         }
     }
-
     float *d_img_fwd;
     cudaMalloc(&d_img_fwd, nlors * sizeof(float));
-    joseph3d_fwd(d_xstart, d_xend, d_img, h_img_origin, h_voxsize, d_img_fwd, nlors, h_img_dim, 0, 64);
+    joseph3d_fwd(d_xstart, d_xend, d_img, d_img_origin, d_voxsize, d_img_fwd, nlors, d_img_dim, device_id, threadsperblock);
 
     std::vector<float> h_img_fwd(nlors);
     cudaMemcpy(h_img_fwd.data(), d_img_fwd, nlors * sizeof(float), cudaMemcpyDeviceToHost);
@@ -218,7 +265,7 @@ void test_cuda_device_arrays()
     std::vector<float> h_ones(nlors, 1.0f);
     cudaMemcpy(d_ones, h_ones.data(), nlors * sizeof(float), cudaMemcpyHostToDevice);
 
-    joseph3d_back(d_xstart, d_xend, d_bimg, h_img_origin, h_voxsize, d_ones, nlors, h_img_dim);
+    joseph3d_back(d_xstart, d_xend, d_bimg, d_img_origin, d_voxsize, d_ones, nlors, d_img_dim, device_id, threadsperblock);
 
     std::vector<float> h_bimg(h_img_dim[0] * h_img_dim[1] * h_img_dim[2]);
     cudaMemcpy(h_bimg.data(), d_bimg, h_bimg.size() * sizeof(float), cudaMemcpyDeviceToHost);
@@ -257,11 +304,9 @@ void test_cuda_device_arrays()
     {
         std::cerr << "CUDA device array back projection test failed: adjointness property violated.\n";
     }
-    else
-    {
-        std::cout << "CUDA device array back projection test passed.\n";
-    }
 
+    cudaFree(d_img_dim);
+    cudaFree(d_voxsize);
     cudaFree(d_img);
     cudaFree(d_vstart);
     cudaFree(d_vend);

From 502a4a60bdd2ba4e0514aed5eaf426a280ad01ff Mon Sep 17 00:00:00 2001
From: Georg Schramm <georg.schramm@kuleuven.be>
Date: Mon, 14 Apr 2025 10:26:41 +0200
Subject: [PATCH 31/35] add size_t nvoxels to facilitate img transfer

---
 include/parallelproj.h             | 48 ++++++++++++++++--------------
 src/joseph3d_back.cpp              |  1 +
 src/joseph3d_back.cu               |  8 ++---
 src/joseph3d_fwd.cpp               |  1 +
 src/joseph3d_fwd.cu                |  9 ++----
 tests/test_nontof_fwd_back.cpp     |  6 ++--
 tests/test_nontof_fwd_back_cuda.cu | 13 +++++---
 7 files changed, 47 insertions(+), 39 deletions(-)

diff --git a/include/parallelproj.h b/include/parallelproj.h
index e554942..2ec6a5f 100644
--- a/include/parallelproj.h
+++ b/include/parallelproj.h
@@ -2,38 +2,42 @@
 #include <cstddef>
 
 #ifdef __cplusplus
-extern "C" {
+extern "C"
+{
 #endif
 
-void joseph3d_fwd(const float *xstart,
-                  const float *xend,
-                  const float *img,
-                  const float *img_origin,
-                  const float *voxsize,
-                  float *p,
-                  size_t nlors,
-                  const int *img_dim,
-                  int device_id = 0,
-                  int threadsperblock = 64);
+    void joseph3d_fwd(const float *xstart,
+                      const float *xend,
+                      const float *img,
+                      const float *img_origin,
+                      const float *voxsize,
+                      float *p,
+                      size_t nvoxels,
+                      size_t nlors,
+                      const int *img_dim,
+                      int device_id = 0,
+                      int threadsperblock = 64);
 
 #ifdef __cplusplus
 }
 #endif
 
 #ifdef __cplusplus
-extern "C" {
+extern "C"
+{
 #endif
 
-void joseph3d_back(const float *xstart,
-                   const float *xend,
-                   float *img,
-                   const float *img_origin, 
-                   const float *voxsize, 
-                   const float *p,
-                   size_t nlors, 
-                   const int *img_dim,
-                   int device_id = 0,
-                   int threadsperblock = 64);
+    void joseph3d_back(const float *xstart,
+                       const float *xend,
+                       float *img,
+                       const float *img_origin,
+                       const float *voxsize,
+                       const float *p,
+                       size_t nvoxels,
+                       size_t nlors,
+                       const int *img_dim,
+                       int device_id = 0,
+                       int threadsperblock = 64);
 
 #ifdef __cplusplus
 }
diff --git a/src/joseph3d_back.cpp b/src/joseph3d_back.cpp
index f63d8da..bf5d037 100644
--- a/src/joseph3d_back.cpp
+++ b/src/joseph3d_back.cpp
@@ -8,6 +8,7 @@ void joseph3d_back(const float *xstart,
                    const float *img_origin,
                    const float *voxsize,
                    const float *p,
+                   size_t nvoxels,
                    size_t nlors,
                    const int *img_dim,
                    int device_id,
diff --git a/src/joseph3d_back.cu b/src/joseph3d_back.cu
index e20db17..e8bae57 100644
--- a/src/joseph3d_back.cu
+++ b/src/joseph3d_back.cu
@@ -32,6 +32,7 @@ void joseph3d_back(const float *xstart,
                    const float *img_origin,
                    const float *voxsize,
                    const float *p,
+                   size_t nvoxels,
                    size_t nlors,
                    const int *img_dim,
                    int device_id,
@@ -60,12 +61,9 @@ void joseph3d_back(const float *xstart,
     handle_cuda_input_array(xend, &d_xend, sizeof(float) * nlors * 3, free_xend, device_id, cudaMemAdviseSetReadMostly);
 
     // Handle img (write access)
-    int h_img_dim[3];
-    cudaMemcpy(h_img_dim, img_dim, sizeof(int) * 3, cudaMemcpyDeviceToHost);
-    size_t img_size = sizeof(float) * h_img_dim[0] * h_img_dim[1] * h_img_dim[2];
     float *d_img = nullptr;
     bool free_img = false;
-    handle_cuda_input_array(img, &d_img, img_size, free_img, device_id, cudaMemAdviseSetAccessedBy);
+    handle_cuda_input_array(img, &d_img, sizeof(float) * nvoxels, free_img, device_id, cudaMemAdviseSetAccessedBy);
 
     // Handle img_origin (read mostly)
     float *d_img_origin = nullptr;
@@ -120,7 +118,7 @@ void joseph3d_back(const float *xstart,
     if (free_img)
     {
         // Copy the result back to the host
-        cudaMemcpy(const_cast<float *>(img), d_img, img_size, cudaMemcpyDeviceToHost);
+        cudaMemcpy(img, d_img, sizeof(float) * nvoxels, cudaMemcpyDeviceToHost);
         cudaFree(d_img);
     }
     if (free_img_origin)
diff --git a/src/joseph3d_fwd.cpp b/src/joseph3d_fwd.cpp
index 553cd8a..e7bbd04 100644
--- a/src/joseph3d_fwd.cpp
+++ b/src/joseph3d_fwd.cpp
@@ -8,6 +8,7 @@ void joseph3d_fwd(const float *xstart,
                   const float *img_origin,
                   const float *voxsize,
                   float *p,
+                  size_t nvoxels,
                   size_t nlors,
                   const int *img_dim,
                   int device_id,
diff --git a/src/joseph3d_fwd.cu b/src/joseph3d_fwd.cu
index 7585cb1..7305c5e 100644
--- a/src/joseph3d_fwd.cu
+++ b/src/joseph3d_fwd.cu
@@ -32,6 +32,7 @@ void joseph3d_fwd(const float *xstart,
                   const float *img_origin,
                   const float *voxsize,
                   float *p,
+                  size_t nvoxels,
                   size_t nlors,
                   const int *img_dim,
                   int device_id,
@@ -60,13 +61,9 @@ void joseph3d_fwd(const float *xstart,
     handle_cuda_input_array(xend, &d_xend, sizeof(float) * nlors * 3, free_xend, device_id, cudaMemAdviseSetReadMostly);
 
     // Handle img (read mostly)
-    // Copy img_dim to the host if needed to calculate the size
-    int h_img_dim[3];
-    cudaMemcpy(h_img_dim, img_dim, sizeof(int) * 3, cudaMemcpyDeviceToHost);
-    size_t img_size = sizeof(float) * h_img_dim[0] * h_img_dim[1] * h_img_dim[2];
     float *d_img = nullptr;
     bool free_img = false;
-    handle_cuda_input_array(img, &d_img, img_size, free_img, device_id, cudaMemAdviseSetReadMostly);
+    handle_cuda_input_array(img, &d_img, sizeof(float) * nvoxels, free_img, device_id, cudaMemAdviseSetReadMostly);
 
     // Handle img_origin (read mostly)
     float *d_img_origin = nullptr;
@@ -126,7 +123,7 @@ void joseph3d_fwd(const float *xstart,
         cudaFree(d_voxsize);
     if (free_p)
     {
-        cudaMemcpy(const_cast<float *>(p), d_p, sizeof(float) * nlors, cudaMemcpyDeviceToHost);
+        cudaMemcpy(p, d_p, sizeof(float) * nlors, cudaMemcpyDeviceToHost);
         cudaFree(d_p);
     }
     if (free_img_dim)
diff --git a/tests/test_nontof_fwd_back.cpp b/tests/test_nontof_fwd_back.cpp
index 32e0856..781184a 100644
--- a/tests/test_nontof_fwd_back.cpp
+++ b/tests/test_nontof_fwd_back.cpp
@@ -54,10 +54,12 @@ int main()
     // Allocate memory for forward projection results
     std::vector<float> img_fwd(nlors);
 
+    size_t nvoxels = img_dim[0] * img_dim[1] * img_dim[2];
+
     // Perform forward projection
     joseph3d_fwd(
         xstart.data(), xend.data(), img.data(),
-        img_origin.data(), voxsize.data(), img_fwd.data(),
+        img_origin.data(), voxsize.data(), img_fwd.data(), nvoxels,
         nlors, img_dim.data(), 0, 64);
 
     /////////////////////////////////////////////////////////////////////////////
@@ -100,7 +102,7 @@ int main()
 
     joseph3d_back(
         xstart.data(), xend.data(), bimg.data(),
-        img_origin.data(), voxsize.data(), ones.data(),
+        img_origin.data(), voxsize.data(), ones.data(), nvoxels,
         nlors, img_dim.data());
 
     printf("\nback projection of ones along all rays:\n");
diff --git a/tests/test_nontof_fwd_back_cuda.cu b/tests/test_nontof_fwd_back_cuda.cu
index 2041cbe..2371f6e 100644
--- a/tests/test_nontof_fwd_back_cuda.cu
+++ b/tests/test_nontof_fwd_back_cuda.cu
@@ -25,6 +25,7 @@ int main()
 
         try
         {
+            std::cout << "\n--- Testing with CUDA Managed Arrays ---\n";
             test_cuda_managed_arrays(i, 64);
         }
         catch (const std::exception &e)
@@ -36,6 +37,7 @@ int main()
         std::cout << "\n--- Testing with CUDA Device Arrays ---\n";
         try
         {
+            std::cout << "\n--- Testing with CUDA Device Arrays ---\n";
             test_cuda_device_arrays(i, 64);
         }
         catch (const std::exception &e)
@@ -72,6 +74,8 @@ void test_cuda_managed_arrays(int device_id, int threadsperblock)
         cm_img_origin[i] = (-(float)cm_img_dim[i] / 2 + 0.5f) * cm_voxsize[i];
     }
 
+    size_t nvoxels = cm_img_dim[0] * cm_img_dim[1] * cm_img_dim[2];
+
     std::vector<float> h_img = readArrayFromFile<float>("img.txt");
     float *cm_img;
     cudaMallocManaged(&cm_img, h_img.size() * sizeof(float));
@@ -104,7 +108,7 @@ void test_cuda_managed_arrays(int device_id, int threadsperblock)
 
     float *cm_img_fwd;
     cudaMallocManaged(&cm_img_fwd, nlors * sizeof(float));
-    joseph3d_fwd(cm_xstart, cm_xend, cm_img, cm_img_origin, cm_voxsize, cm_img_fwd, nlors, cm_img_dim, device_id, threadsperblock);
+    joseph3d_fwd(cm_xstart, cm_xend, cm_img, cm_img_origin, cm_voxsize, cm_img_fwd, nvoxels, nlors, cm_img_dim, device_id, threadsperblock);
 
     std::vector<float> h_expected_fwd_vals = readArrayFromFile<float>("expected_fwd_vals.txt");
     float fwd_diff = 0;
@@ -129,7 +133,7 @@ void test_cuda_managed_arrays(int device_id, int threadsperblock)
     cudaMallocManaged(&cm_ones, nlors * sizeof(float));
     std::fill(cm_ones, cm_ones + nlors, 1.0f);
 
-    joseph3d_back(cm_xstart, cm_xend, cm_bimg, cm_img_origin, cm_voxsize, cm_ones, nlors, cm_img_dim, device_id, threadsperblock);
+    joseph3d_back(cm_xstart, cm_xend, cm_bimg, cm_img_origin, cm_voxsize, cm_ones, nvoxels, nlors, cm_img_dim, device_id, threadsperblock);
 
     printf("\nCUDA-managed back projection of ones along all rays:\n");
     for (size_t i0 = 0; i0 < cm_img_dim[0]; i0++)
@@ -187,6 +191,7 @@ void test_cuda_device_arrays(int device_id, int threadsperblock)
     int *d_img_dim;
     cudaMalloc(&d_img_dim, 3 * sizeof(int));
     int h_img_dim[3] = {2, 3, 4};
+    size_t nvoxels = h_img_dim[0] * h_img_dim[1] * h_img_dim[2];
     cudaMemcpy(d_img_dim, h_img_dim, 3 * sizeof(int), cudaMemcpyHostToDevice);
 
     float *d_voxsize;
@@ -236,7 +241,7 @@ void test_cuda_device_arrays(int device_id, int threadsperblock)
     }
     float *d_img_fwd;
     cudaMalloc(&d_img_fwd, nlors * sizeof(float));
-    joseph3d_fwd(d_xstart, d_xend, d_img, d_img_origin, d_voxsize, d_img_fwd, nlors, d_img_dim, device_id, threadsperblock);
+    joseph3d_fwd(d_xstart, d_xend, d_img, d_img_origin, d_voxsize, d_img_fwd, nvoxels, nlors, d_img_dim, device_id, threadsperblock);
 
     std::vector<float> h_img_fwd(nlors);
     cudaMemcpy(h_img_fwd.data(), d_img_fwd, nlors * sizeof(float), cudaMemcpyDeviceToHost);
@@ -265,7 +270,7 @@ void test_cuda_device_arrays(int device_id, int threadsperblock)
     std::vector<float> h_ones(nlors, 1.0f);
     cudaMemcpy(d_ones, h_ones.data(), nlors * sizeof(float), cudaMemcpyHostToDevice);
 
-    joseph3d_back(d_xstart, d_xend, d_bimg, d_img_origin, d_voxsize, d_ones, nlors, d_img_dim, device_id, threadsperblock);
+    joseph3d_back(d_xstart, d_xend, d_bimg, d_img_origin, d_voxsize, d_ones, nvoxels, nlors, d_img_dim, device_id, threadsperblock);
 
     std::vector<float> h_bimg(h_img_dim[0] * h_img_dim[1] * h_img_dim[2]);
     cudaMemcpy(h_bimg.data(), d_bimg, h_bimg.size() * sizeof(float), cudaMemcpyDeviceToHost);

From aa8a8859dfc1e0c359d0330eed228520301477b8 Mon Sep 17 00:00:00 2001
From: Georg Schramm <georg.schramm@kuleuven.be>
Date: Mon, 14 Apr 2025 11:31:33 +0200
Subject: [PATCH 32/35] add more cases for handle_cuda_input_array

---
 include/cuda_utils.h |  7 ++++-
 src/cuda_utils.cu    | 62 +++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 62 insertions(+), 7 deletions(-)

diff --git a/include/cuda_utils.h b/include/cuda_utils.h
index 2accedb..7a51609 100644
--- a/include/cuda_utils.h
+++ b/include/cuda_utils.h
@@ -2,5 +2,10 @@
 
 #include <cuda_runtime.h>
 
+// Overload for constant input_ptr (const T*)
 template <typename T>
-void handle_cuda_input_array(const T *host_ptr, T **device_ptr, size_t size, bool &free_flag, int device_id, cudaMemoryAdvise memory_hint);
+void handle_cuda_input_array(const T *input_ptr, T **device_ptr, size_t size, bool &free_flag, int device_id, cudaMemoryAdvise memory_hint);
+
+// Overload for non-constant input_ptr (T*)
+template <typename T>
+void handle_cuda_input_array(T *input_ptr, T **device_ptr, size_t size, bool &free_flag, int device_id, cudaMemoryAdvise memory_hint);
diff --git a/src/cuda_utils.cu b/src/cuda_utils.cu
index 4be521a..becc197 100644
--- a/src/cuda_utils.cu
+++ b/src/cuda_utils.cu
@@ -1,31 +1,81 @@
 #include "cuda_utils.h"
+#include <iostream>
 
+// Overload for constant input_ptr (const T*)
 template <typename T>
-void handle_cuda_input_array(const T *host_ptr, T **device_ptr, size_t size, bool &free_flag, int device_id, cudaMemoryAdvise memory_hint)
+void handle_cuda_input_array(const T *input_ptr, T **device_ptr, size_t size, bool &free_flag, int device_id, cudaMemoryAdvise memory_hint)
 {
     cudaPointerAttributes attr;
-    cudaError_t err = cudaPointerGetAttributes(&attr, host_ptr);
+    cudaError_t err = cudaPointerGetAttributes(&attr, input_ptr);
     free_flag = false;
 
     if (err == cudaSuccess && attr.type == cudaMemoryTypeManaged)
     {
-        cudaMemPrefetchAsync(const_cast<void *>(static_cast<const void *>(host_ptr)), size, device_id);
-        cudaMemAdvise(const_cast<void *>(static_cast<const void *>(host_ptr)), size, memory_hint, device_id);
+        // Prefetch and advise for managed memory
+        cudaMemPrefetchAsync(const_cast<void *>(static_cast<const void *>(input_ptr)), size, device_id);
+        cudaMemAdvise(const_cast<void *>(static_cast<const void *>(input_ptr)), size, memory_hint, device_id);
     }
 
     if (err == cudaSuccess && (attr.type == cudaMemoryTypeManaged || attr.type == cudaMemoryTypeDevice))
     {
-        *device_ptr = const_cast<T *>(host_ptr);
+        // Assign managed or device pointer
+        *device_ptr = const_cast<T *>(input_ptr);
     }
     else
     {
         // Host pointer case, transfer to device
         cudaMalloc(device_ptr, size);
-        cudaMemcpy(*device_ptr, host_ptr, size, cudaMemcpyHostToDevice);
+        cudaMemcpy(*device_ptr, input_ptr, size, cudaMemcpyHostToDevice);
+        free_flag = true;
+    }
+}
+
+// Overload for non-constant input_ptr (T*) (not const)
+template <typename T>
+void handle_cuda_input_array(T *input_ptr, T **device_ptr, size_t size, bool &free_flag, int device_id, cudaMemoryAdvise memory_hint)
+{
+    cudaPointerAttributes attr;
+    cudaError_t err = cudaPointerGetAttributes(&attr, input_ptr);
+    free_flag = false;
+
+    if (err == cudaSuccess && attr.type == cudaMemoryTypeManaged)
+    {
+        // Prefetch and advise for managed memory
+        cudaMemPrefetchAsync(input_ptr, size, device_id);
+        cudaMemAdvise(input_ptr, size, memory_hint, device_id);
+    }
+
+    if (err == cudaSuccess && (attr.type == cudaMemoryTypeManaged || attr.type == cudaMemoryTypeDevice))
+    {
+        // Assign managed or device pointer
+        *device_ptr = input_ptr;
+    }
+    else
+    {
+        // Host pointer case, transfer to device
+        cudaMalloc(device_ptr, size);
+        cudaMemcpy(*device_ptr, input_ptr, size, cudaMemcpyHostToDevice);
         free_flag = true;
     }
 }
 
 // Explicit template instantiations
+template void handle_cuda_input_array<double>(const double *, double **, size_t, bool &, int, cudaMemoryAdvise);
+template void handle_cuda_input_array<double>(double *, double **, size_t, bool &, int, cudaMemoryAdvise);
 template void handle_cuda_input_array<float>(const float *, float **, size_t, bool &, int, cudaMemoryAdvise);
+template void handle_cuda_input_array<float>(float *, float **, size_t, bool &, int, cudaMemoryAdvise);
+
 template void handle_cuda_input_array<int>(const int *, int **, size_t, bool &, int, cudaMemoryAdvise);
+template void handle_cuda_input_array<int>(int *, int **, size_t, bool &, int, cudaMemoryAdvise);
+template void handle_cuda_input_array<unsigned int>(const unsigned int *, unsigned int **, size_t, bool &, int, cudaMemoryAdvise);
+template void handle_cuda_input_array<unsigned int>(unsigned int *, unsigned int **, size_t, bool &, int, cudaMemoryAdvise);
+template void handle_cuda_input_array<size_t>(const size_t *, size_t **, size_t, bool &, int, cudaMemoryAdvise);
+template void handle_cuda_input_array<size_t>(size_t *, size_t **, size_t, bool &, int, cudaMemoryAdvise);
+
+template void handle_cuda_input_array<char>(const char *, char **, size_t, bool &, int, cudaMemoryAdvise);
+template void handle_cuda_input_array<char>(char *, char **, size_t, bool &, int, cudaMemoryAdvise);
+template void handle_cuda_input_array<unsigned char>(const unsigned char *, unsigned char **, size_t, bool &, int, cudaMemoryAdvise);
+template void handle_cuda_input_array<unsigned char>(unsigned char *, unsigned char **, size_t, bool &, int, cudaMemoryAdvise);
+
+template void handle_cuda_input_array<bool>(const bool *, bool **, size_t, bool &, int, cudaMemoryAdvise);
+template void handle_cuda_input_array<bool>(bool *, bool **, size_t, bool &, int, cudaMemoryAdvise);

From 0fd48deec5d85e15a5e2f919031a1e1508703df4 Mon Sep 17 00:00:00 2001
From: Georg Schramm <georg.schramm@kuleuven.be>
Date: Mon, 14 Apr 2025 17:43:24 +0200
Subject: [PATCH 33/35] wip python interface using pybind11

---
 CMakeLists.txt              |  42 ++++++++
 src/parallelproj_pybind.cpp | 190 ++++++++++++++++++++++++++++++++++++
 tests/test.py               |  53 ++++++++++
 3 files changed, 285 insertions(+)
 create mode 100644 src/parallelproj_pybind.cpp
 create mode 100644 tests/test.py

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 93b7347..adeaf04 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,6 +1,14 @@
 cmake_minimum_required(VERSION 3.18)
 project(PARALLELPROJ LANGUAGES C CXX)
 
+# Set the C++ standard
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+# Add an option to build the Python interface
+option(BUILD_PYTHON_INTERFACE "Build the Python interface using pybind11" OFF)
+
+
 include(CheckLanguage)
 include(GNUInstallDirs)
 include(CMakePackageConfigHelpers)
@@ -88,6 +96,40 @@ else()
     )
 endif()
 
+################################################################################
+### BUILD THE PYTHON INTERFACE #################################################
+################################################################################
+
+# Build the Python interface if enabled
+if(BUILD_PYTHON_INTERFACE)
+    # Enable the use of FindPython in pybind11
+    set(PYBIND11_FINDPYTHON ON)
+    # Find pybind11
+    find_package(pybind11 REQUIRED)
+
+    find_path(DLPACK_INCLUDE_DIR
+    NAMES dlpack/dlpack.h
+    DOC "Path to the DLPack include directory")
+
+    # Check if DLPack was found
+    if (NOT DLPACK_INCLUDE_DIR)
+        message(FATAL_ERROR "DLPack not found. Please install DLPack via conda or provide the path to dlpack.h.")
+    else()
+        message(STATUS "DLPack found at: ${DLPACK_INCLUDE_DIR}")
+    endif()
+
+
+    # Add the Python module
+    pybind11_add_module(parallelproj_pybind
+        src/parallelproj_pybind.cpp
+    )
+
+    target_include_directories(parallelproj_pybind PRIVATE ${CMAKE_SOURCE_DIR}/include)
+    # Link the core library to the Python module
+    target_link_libraries(parallelproj_pybind PRIVATE parallelproj)
+endif()
+
+
 ################################################################################
 ### INSTALL AND CONFIGS ########################################################
 ################################################################################
diff --git a/src/parallelproj_pybind.cpp b/src/parallelproj_pybind.cpp
new file mode 100644
index 0000000..f505566
--- /dev/null
+++ b/src/parallelproj_pybind.cpp
@@ -0,0 +1,190 @@
+#include <pybind11/pybind11.h>
+#include <pybind11/numpy.h>
+#include <pybind11/stl.h>
+#include <dlpack/dlpack.h>
+#include "parallelproj.h"
+
+namespace py = pybind11;
+
+// Extract the raw data pointer and shape from a __dlpack__, NumPy or __cuda_array_interface__ array (throws otherwise)
+template <typename T>
+std::pair<T *, std::vector<size_t>> extract_pointer_and_shape(py::object array)
+{
+    T *raw_ptr = nullptr;
+    std::vector<size_t> shape;
+
+    // Handle arrays using the __dlpack__ protocol (default)
+    if (py::hasattr(array, "__dlpack__"))
+    {
+        // Call the __dlpack__ method to get a DLPack tensor
+        py::capsule dlpack_capsule = array.attr("__dlpack__")();
+
+        // Extract the DLManagedTensor from the capsule
+        auto *managed_tensor = static_cast<DLManagedTensor *>(
+            PyCapsule_GetPointer(dlpack_capsule.ptr(), "dltensor"));
+
+        if (!managed_tensor)
+        {
+            throw std::runtime_error("Failed to extract DLManagedTensor from PyCapsule.");
+        }
+
+        // Access the DLTensor from the DLManagedTensor
+        DLTensor dltensor = managed_tensor->dl_tensor;
+
+        // Ensure the data type matches
+        if (dltensor.dtype.code != kDLFloat || dltensor.dtype.bits != sizeof(T) * 8)
+        {
+            throw std::invalid_argument("DLPack tensor has an incompatible data type.");
+        }
+
+        // Data starts byte_offset bytes past dltensor.data; NOTE(review): strides are not inspected here
+        raw_ptr = reinterpret_cast<T *>(static_cast<char *>(dltensor.data) + dltensor.byte_offset);
+        shape = std::vector<size_t>(dltensor.shape, dltensor.shape + dltensor.ndim);
+    }
+    // Handle NumPy arrays
+    else if (py::isinstance<py::array_t<T>>(array))
+    {
+        auto numpy_array = array.cast<py::array_t<T>>();
+        raw_ptr = numpy_array.mutable_data();
+        shape = std::vector<size_t>(numpy_array.shape(), numpy_array.shape() + numpy_array.ndim());
+    }
+    // Handle arrays using the __cuda_array_interface__ (e.g. cupy or pytorch gpu tensors)
+    else if (py::hasattr(array, "__cuda_array_interface__"))
+    {
+        auto cuda_interface = array.attr("__cuda_array_interface__");
+        raw_ptr = reinterpret_cast<T *>(cuda_interface["data"].cast<std::pair<size_t, bool>>().first);
+        shape = cuda_interface["shape"].cast<std::vector<size_t>>();
+    }
+    // Anything else is rejected (objects exposing only __array_interface__ are not handled)
+    else
+    {
+        throw std::invalid_argument("Unsupported array type. Must have __dlpack__, __cuda_array_interface__ or be numpy.");
+    }
+
+    return {raw_ptr, shape};
+}
+
+// Python wrapper for joseph3d_fwd: validates array shapes, derives nlors/nvoxels/img_dim and forwards raw pointers
+void joseph3d_fwd_py(py::object xstart,
+                     py::object xend,
+                     py::object img,
+                     py::object img_origin,
+                     py::object voxsize,
+                     py::object p,
+                     int device_id = 0,
+                     int threadsperblock = 64)
+{
+    // Extract raw pointers and shapes
+    auto [xstart_ptr, xstart_shape] = extract_pointer_and_shape<float>(xstart);
+    auto [xend_ptr, xend_shape] = extract_pointer_and_shape<float>(xend);
+    auto [img_ptr, img_shape] = extract_pointer_and_shape<float>(img);
+    auto [img_origin_ptr, img_origin_shape] = extract_pointer_and_shape<float>(img_origin);
+    auto [voxsize_ptr, voxsize_shape] = extract_pointer_and_shape<float>(voxsize);
+    auto [p_ptr, p_shape] = extract_pointer_and_shape<float>(p);
+
+    // Validate shapes
+    if (xstart_shape.size() < 2 || xstart_shape[1] != 3)
+    {
+        throw std::invalid_argument("xstart must have at least 2 dims and shape (..., 3)");
+    }
+    if (xend_shape.size() < 2 || xend_shape[1] != 3)
+    {
+        throw std::invalid_argument("xend must have at least 2 dims and shape (..., 3)");
+    }
+    if (img_shape.size() != 3)
+    {
+        throw std::invalid_argument("img must be a 3D array");
+    }
+    // Validate that p.shape == xstart.shape[:-1]
+    if (p_shape.size() != xstart_shape.size() - 1 ||
+        !std::equal(p_shape.begin(), p_shape.end(), xstart_shape.begin()))
+    {
+        throw std::invalid_argument("p must have a shape equal to xstart.shape[:-1]");
+    }
+    if (img_origin_shape.size() != 1 || img_origin_shape[0] != 3)
+    {
+        throw std::invalid_argument("img_origin must be a 1D array with 3 elements");
+    }
+    if (voxsize_shape.size() != 1 || voxsize_shape[0] != 3)
+    {
+        throw std::invalid_argument("voxsize must be a 1D array with 3 elements");
+    }
+
+    // nlors = product of all xstart dims except the last; the size_t seed keeps std::accumulate from folding in int
+    size_t nlors = std::accumulate(xstart_shape.begin(), xstart_shape.end() - 1, size_t{1}, std::multiplies<size_t>());
+    int img_dim[3] = {static_cast<int>(img_shape[0]), static_cast<int>(img_shape[1]), static_cast<int>(img_shape[2])};
+    size_t nvoxels = img_shape[0] * img_shape[1] * img_shape[2];
+
+    // Call the C++ function
+    joseph3d_fwd(xstart_ptr, xend_ptr, img_ptr, img_origin_ptr, voxsize_ptr, p_ptr, nvoxels, nlors, img_dim, device_id, threadsperblock);
+}
+
+// Python wrapper for joseph3d_back: validates array shapes, derives nlors/nvoxels/img_dim and forwards raw pointers
+void joseph3d_back_py(py::object xstart,
+                      py::object xend,
+                      py::object img,
+                      py::object img_origin,
+                      py::object voxsize,
+                      py::object p,
+                      int device_id = 0,
+                      int threadsperblock = 64)
+{
+    // Extract raw pointers and shapes
+    auto [xstart_ptr, xstart_shape] = extract_pointer_and_shape<float>(xstart);
+    auto [xend_ptr, xend_shape] = extract_pointer_and_shape<float>(xend);
+    auto [img_ptr, img_shape] = extract_pointer_and_shape<float>(img);
+    auto [img_origin_ptr, img_origin_shape] = extract_pointer_and_shape<float>(img_origin);
+    auto [voxsize_ptr, voxsize_shape] = extract_pointer_and_shape<float>(voxsize);
+    auto [p_ptr, p_shape] = extract_pointer_and_shape<float>(p);
+
+    // Validate shapes
+    if (xstart_shape.size() < 2 || xstart_shape[1] != 3)
+    {
+        throw std::invalid_argument("xstart must have at least 2 dims and shape (..., 3)");
+    }
+    if (xend_shape.size() < 2 || xend_shape[1] != 3)
+    {
+        throw std::invalid_argument("xend must have at least 2 dims and shape (..., 3)");
+    }
+    if (img_shape.size() != 3)
+    {
+        throw std::invalid_argument("img must be a 3D array");
+    }
+    // Validate that p.shape == xstart.shape[:-1]
+    if (p_shape.size() != xstart_shape.size() - 1 ||
+        !std::equal(p_shape.begin(), p_shape.end(), xstart_shape.begin()))
+    {
+        throw std::invalid_argument("p must have a shape equal to xstart.shape[:-1]");
+    }
+    if (img_origin_shape.size() != 1 || img_origin_shape[0] != 3)
+    {
+        throw std::invalid_argument("img_origin must be a 1D array with 3 elements");
+    }
+    if (voxsize_shape.size() != 1 || voxsize_shape[0] != 3)
+    {
+        throw std::invalid_argument("voxsize must be a 1D array with 3 elements");
+    }
+
+    // nlors = product of all xstart dims except the last; the size_t seed keeps std::accumulate from folding in int
+    size_t nlors = std::accumulate(xstart_shape.begin(), xstart_shape.end() - 1, size_t{1}, std::multiplies<size_t>());
+
+    int img_dim[3] = {static_cast<int>(img_shape[0]), static_cast<int>(img_shape[1]), static_cast<int>(img_shape[2])};
+    size_t nvoxels = img_shape[0] * img_shape[1] * img_shape[2];
+
+    // Call the C++ function
+    joseph3d_back(xstart_ptr, xend_ptr, img_ptr, img_origin_ptr, voxsize_ptr, p_ptr, nvoxels, nlors, img_dim, device_id, threadsperblock);
+}
+
+// Pybind11 module definition
+PYBIND11_MODULE(parallelproj_pybind, m)
+{
+    m.doc() = "Python bindings for parallelproj";
+
+    m.def("joseph3d_fwd", &joseph3d_fwd_py, "Forward projection",
+          py::arg("xstart"), py::arg("xend"), py::arg("img"), py::arg("img_origin"),
+          py::arg("voxsize"), py::arg("p"), py::arg("device_id") = 0, py::arg("threadsperblock") = 64);
+
+    m.def("joseph3d_back", &joseph3d_back_py, "Back projection",
+          py::arg("xstart"), py::arg("xend"), py::arg("img"), py::arg("img_origin"),
+          py::arg("voxsize"), py::arg("p"), py::arg("device_id") = 0, py::arg("threadsperblock") = 64);
+}
diff --git a/tests/test.py b/tests/test.py
new file mode 100644
index 0000000..b5585e8
--- /dev/null
+++ b/tests/test.py
@@ -0,0 +1,53 @@
+import numpy as np
+import array_api_strict as xp
+import parallelproj_pybind as pp
+
+img_dim = (2, 3, 4)
+
+# img_dim = xp.asarray([2, 3, 4], dtype=xp.int32)
+voxsize = xp.asarray([4.0, 3.0, 2.0], dtype=xp.float32)
+
+img_origin = (-0.5 * xp.asarray(img_dim, dtype=xp.float32) + 0.5) * voxsize
+
+# Read the image from file
+img = xp.reshape(xp.asarray(np.loadtxt("img.txt", dtype=np.float32)), img_dim)
+
+# Read the ray start and coordinates from file
+vstart = xp.reshape(xp.asarray(np.loadtxt("vstart.txt", dtype=np.float32)), (2, 5, 3))
+vend = xp.reshape(xp.asarray(np.loadtxt("vend.txt", dtype=np.float32)), (2, 5, 3))
+
+# Calculate the start and end coordinates in world coordinates
+xstart = vstart * voxsize + img_origin
+xend = vend * voxsize + img_origin
+
+# Allocate memory for forward projection results
+img_fwd = xp.zeros(xstart.shape[0], dtype=xp.float32)
+
+# Perform forward projection
+pp.joseph3d_fwd(xstart, xend, img, img_origin, voxsize, img_fwd)
+
+# Read the expected forward values from file
+expected_fwd_vals = xp.reshape(
+    xp.asarray(np.loadtxt("expected_fwd_vals.txt", dtype=np.float32)), img_fwd.shape
+)
+
+# Check if we got the expected results
+eps = 1e-7
+
+assert (
+    xp.max(xp.abs(img_fwd - expected_fwd_vals)) < eps
+), "Forward projection test failed."
+
+# Test the back projection
+bimg = xp.zeros(img_dim, dtype=xp.float32)
+ones = xp.ones(img_fwd.shape, dtype=xp.float32)
+pp.joseph3d_back(xstart, xend, bimg, img_origin, voxsize, ones)
+
+print(bimg)
+
+ip1 = float(xp.sum(img * bimg))
+ip2 = float(xp.sum(img_fwd * ones))
+
+print(ip1, ip2)
+
+assert abs(ip1 - ip2) / abs(ip1) < eps, "Back projection test failed."

From b12b9d45f8bc5790034778df89fa070e0661766d Mon Sep 17 00:00:00 2001
From: Georg Schramm <georg.schramm@kuleuven.be>
Date: Mon, 14 Apr 2025 17:56:53 +0200
Subject: [PATCH 34/35] add test for python interface

---
 src/parallelproj_pybind.cpp   |  8 ++---
 tests/test.py                 | 53 -----------------------------
 tests/test_nontof_fwd_back.py | 63 +++++++++++++++++++++++++++++++++++
 3 files changed, 67 insertions(+), 57 deletions(-)
 delete mode 100644 tests/test.py
 create mode 100644 tests/test_nontof_fwd_back.py

diff --git a/src/parallelproj_pybind.cpp b/src/parallelproj_pybind.cpp
index f505566..7aeb7d0 100644
--- a/src/parallelproj_pybind.cpp
+++ b/src/parallelproj_pybind.cpp
@@ -83,11 +83,11 @@ void joseph3d_fwd_py(py::object xstart,
     auto [p_ptr, p_shape] = extract_pointer_and_shape<float>(p);
 
     // Validate shapes
-    if (xstart_shape.size() < 2 || xstart_shape[1] != 3)
+    if (xstart_shape.size() < 2 || xstart_shape.back() != 3)
     {
         throw std::invalid_argument("xstart must have at least 2 dims and shape (..., 3)");
     }
-    if (xend_shape.size() < 2 || xend_shape[1] != 3)
+    if (xend_shape.size() < 2 || xend_shape.back() != 3)
     {
         throw std::invalid_argument("xend must have at least 2 dims and shape (..., 3)");
     }
@@ -138,11 +138,11 @@ void joseph3d_back_py(py::object xstart,
     auto [p_ptr, p_shape] = extract_pointer_and_shape<float>(p);
 
     // Validate shapes
-    if (xstart_shape.size() < 2 || xstart_shape[1] != 3)
+    if (xstart_shape.size() < 2 || xstart_shape.back() != 3)
     {
         throw std::invalid_argument("xstart must have at least 2 dims and shape (..., 3)");
     }
-    if (xend_shape.size() < 2 || xend_shape[1] != 3)
+    if (xend_shape.size() < 2 || xend_shape.back() != 3)
     {
         throw std::invalid_argument("xend must have at least 2 dims and shape (..., 3)");
     }
diff --git a/tests/test.py b/tests/test.py
deleted file mode 100644
index b5585e8..0000000
--- a/tests/test.py
+++ /dev/null
@@ -1,53 +0,0 @@
-import numpy as np
-import array_api_strict as xp
-import parallelproj_pybind as pp
-
-img_dim = (2, 3, 4)
-
-# img_dim = xp.asarray([2, 3, 4], dtype=xp.int32)
-voxsize = xp.asarray([4.0, 3.0, 2.0], dtype=xp.float32)
-
-img_origin = (-0.5 * xp.asarray(img_dim, dtype=xp.float32) + 0.5) * voxsize
-
-# Read the image from file
-img = xp.reshape(xp.asarray(np.loadtxt("img.txt", dtype=np.float32)), img_dim)
-
-# Read the ray start and coordinates from file
-vstart = xp.reshape(xp.asarray(np.loadtxt("vstart.txt", dtype=np.float32)), (2, 5, 3))
-vend = xp.reshape(xp.asarray(np.loadtxt("vend.txt", dtype=np.float32)), (2, 5, 3))
-
-# Calculate the start and end coordinates in world coordinates
-xstart = vstart * voxsize + img_origin
-xend = vend * voxsize + img_origin
-
-# Allocate memory for forward projection results
-img_fwd = xp.zeros(xstart.shape[0], dtype=xp.float32)
-
-# Perform forward projection
-pp.joseph3d_fwd(xstart, xend, img, img_origin, voxsize, img_fwd)
-
-# Read the expected forward values from file
-expected_fwd_vals = xp.reshape(
-    xp.asarray(np.loadtxt("expected_fwd_vals.txt", dtype=np.float32)), img_fwd.shape
-)
-
-# Check if we got the expected results
-eps = 1e-7
-
-assert (
-    xp.max(xp.abs(img_fwd - expected_fwd_vals)) < eps
-), "Forward projection test failed."
-
-# Test the back projection
-bimg = xp.zeros(img_dim, dtype=xp.float32)
-ones = xp.ones(img_fwd.shape, dtype=xp.float32)
-pp.joseph3d_back(xstart, xend, bimg, img_origin, voxsize, ones)
-
-print(bimg)
-
-ip1 = float(xp.sum(img * bimg))
-ip2 = float(xp.sum(img_fwd * ones))
-
-print(ip1, ip2)
-
-assert abs(ip1 - ip2) / abs(ip1) < eps, "Back projection test failed."
diff --git a/tests/test_nontof_fwd_back.py b/tests/test_nontof_fwd_back.py
new file mode 100644
index 0000000..0b1bcb5
--- /dev/null
+++ b/tests/test_nontof_fwd_back.py
@@ -0,0 +1,70 @@
+from pathlib import Path
+
+import numpy as np
+import array_api_strict as xp
+import parallelproj_pybind as pp
+import pytest
+
+# The reference .txt files are stored next to this module. Resolving them
+# relative to __file__ makes the test independent of the working directory.
+DATA_DIR = Path(__file__).resolve().parent
+
+
+def _load_txt(name, shape):
+    """Read DATA_DIR/name as float32 and return it reshaped to ``shape``."""
+    values = np.loadtxt(DATA_DIR / name, dtype=np.float32)
+    return xp.reshape(xp.asarray(values), shape)
+
+
+@pytest.fixture
+def setup_data():
+    """Fixture providing (img_dim, voxsize, img_origin, img, xstart, xend)."""
+    img_dim = (2, 3, 4)
+    voxsize = xp.asarray([4.0, 3.0, 2.0], dtype=xp.float32)
+    img_origin = (-0.5 * xp.asarray(img_dim, dtype=xp.float32) + 0.5) * voxsize
+
+    # Read the image from file
+    img = _load_txt("img.txt", img_dim)
+
+    # Read the ray start and end coordinates from file
+    vstart = _load_txt("vstart.txt", (2, 5, 3))
+    vend = _load_txt("vend.txt", (2, 5, 3))
+
+    # Calculate the start and end coordinates in world coordinates
+    xstart = vstart * voxsize + img_origin
+    xend = vend * voxsize + img_origin
+
+    return img_dim, voxsize, img_origin, img, xstart, xend
+
+
+def test_forward_and_back_projection(setup_data):
+    """Test the forward projection and that the back projection is its adjoint."""
+    img_dim, voxsize, img_origin, img, xstart, xend = setup_data
+    eps = 1e-7  # tolerance shared by both checks
+
+    # Allocate the forward projection output: shape of xstart without its last axis
+    img_fwd = xp.zeros(xstart.shape[:-1], dtype=xp.float32)
+
+    # Perform forward projection
+    pp.joseph3d_fwd(xstart, xend, img, img_origin, voxsize, img_fwd)
+
+    # Read the expected forward values from file
+    expected_fwd_vals = _load_txt("expected_fwd_vals.txt", img_fwd.shape)
+
+    # Check if we got the expected results
+    assert (
+        xp.max(xp.abs(img_fwd - expected_fwd_vals)) < eps
+    ), "Forward projection test failed."
+
+    # Allocate memory for back projection results
+    bimg = xp.zeros(img_dim, dtype=xp.float32)
+    ones = xp.ones(xstart.shape[:-1], dtype=xp.float32)
+
+    # Perform back projection
+    pp.joseph3d_back(xstart, xend, bimg, img_origin, voxsize, ones)
+
+    # Adjointness check: sum(img * back(ones)) must match sum(fwd(img) * ones)
+    ip1 = float(xp.sum(img * bimg))
+    ip2 = float(xp.sum(img_fwd * ones))
+
+    assert abs(ip1 - ip2) / abs(ip1) < eps, "Back projection test failed."

From a5f3a4dcc9a1933a2641cd23c146f30b8e60da15 Mon Sep 17 00:00:00 2001
From: Georg Schramm <georg.schramm@kuleuven.be>
Date: Mon, 14 Apr 2025 18:28:02 +0200
Subject: [PATCH 35/35] add python tests to CMakeLists.txt

---
 CMakeLists.txt | 105 +++++++++++++++++++++++++++----------------------
 1 file changed, 59 insertions(+), 46 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index adeaf04..fb4e470 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -100,6 +100,8 @@ endif()
 ### BUILD THE PYTHON INTERFACE #################################################
 ################################################################################
 
+enable_testing()
+
 # Build the Python interface if enabled
 if(BUILD_PYTHON_INTERFACE)
     # Enable the use of FindPython in pybind11
@@ -118,7 +120,6 @@ if(BUILD_PYTHON_INTERFACE)
         message(STATUS "DLPack found at: ${DLPACK_INCLUDE_DIR}")
     endif()
 
-
     # Add the Python module
     pybind11_add_module(parallelproj_pybind
         src/parallelproj_pybind.cpp
@@ -130,6 +131,63 @@ if(BUILD_PYTHON_INTERFACE)
 endif()
 
 
+
+
+################################################################################
+### BUILD TESTS ################################################################
+################################################################################
+
+set(TEST_DATA_DIR ${CMAKE_SOURCE_DIR}/tests)
+
+# Data files (.txt) inside TEST_DATA_DIR that are copied into the build directory
+set(TEST_DATA_FILES
+    img.txt
+    vstart.txt
+    vend.txt
+    expected_fwd_vals.txt
+)
+
+# One copy rule per data file; DEPENDS re-runs the copy when the source file changes
+foreach(TEST_FILE ${TEST_DATA_FILES})
+    add_custom_command(
+        OUTPUT ${CMAKE_BINARY_DIR}/${TEST_FILE}
+        COMMAND ${CMAKE_COMMAND} -E copy ${TEST_DATA_DIR}/${TEST_FILE} ${CMAKE_BINARY_DIR}/${TEST_FILE}
+        DEPENDS ${TEST_DATA_DIR}/${TEST_FILE}
+    )
+    list(APPEND TEST_DATA_OUTPUTS ${CMAKE_BINARY_DIR}/${TEST_FILE})
+endforeach()
+
+# Custom target (part of ALL) that drives the copy rules above; the test executables depend on it
+add_custom_target(copy_test_data ALL DEPENDS ${TEST_DATA_OUTPUTS})
+
+# Add the directory containing file_utils.h to the include paths
+include_directories(${CMAKE_SOURCE_DIR}/tests)
+# For every entry of TEST_SOURCES (set outside this section): build, link and register one CTest test
+foreach(TEST_SOURCE ${TEST_SOURCES})
+    get_filename_component(TEST_NAME ${TEST_SOURCE} NAME_WE)
+    add_executable(${TEST_NAME} ${TEST_SOURCE})
+    target_include_directories(${TEST_NAME} PRIVATE ${CMAKE_SOURCE_DIR}/include ${CMAKE_SOURCE_DIR}/tests)
+    add_dependencies(${TEST_NAME} copy_test_data) # Ensure test data is copied before running tests
+    if(CMAKE_CUDA_COMPILER)
+        target_link_libraries(${TEST_NAME} PRIVATE parallelproj CUDA::cudart)
+    else()
+        target_link_libraries(${TEST_NAME} PRIVATE parallelproj OpenMP::OpenMP_CXX)
+    endif()
+    add_test(NAME ${TEST_NAME} COMMAND ${TEST_NAME})
+endforeach()
+
+if(BUILD_PYTHON_INTERFACE)
+    ### ADD THE PYTHON INTERFACE TESTS ###
+    # Locate pytest at configure time and run exactly that executable on the tests directory
+    find_program(PYTEST_EXECUTABLE pytest REQUIRED)
+    add_test(NAME python_interface_test COMMAND ${PYTEST_EXECUTABLE} ${CMAKE_SOURCE_DIR}/tests)
+    # Prepend the build directory to the PYTHONPATH captured at configure time
+    set_tests_properties(python_interface_test PROPERTIES
+        ENVIRONMENT "PYTHONPATH=${CMAKE_BINARY_DIR}:$ENV{PYTHONPATH}"
+    )
+endif()
+
+
 ################################################################################
 ### INSTALL AND CONFIGS ########################################################
 ################################################################################
@@ -178,48 +236,3 @@ install(FILES
     DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/parallelproj
 )
 
-
-################################################################################
-### BUILD TESTS ################################################################
-################################################################################
-
-enable_testing()
-
-set(TEST_DATA_DIR ${CMAKE_SOURCE_DIR}/tests)
-
-# List all the .txt files that need to be copied
-set(TEST_DATA_FILES
-    img.txt
-    vstart.txt
-    vend.txt
-    expected_fwd_vals.txt
-)
-
-# Copy each file to the build directory
-foreach(TEST_FILE ${TEST_DATA_FILES})
-    add_custom_command(
-        OUTPUT ${CMAKE_BINARY_DIR}/${TEST_FILE}
-        COMMAND ${CMAKE_COMMAND} -E copy ${TEST_DATA_DIR}/${TEST_FILE} ${CMAKE_BINARY_DIR}/${TEST_FILE}
-        DEPENDS ${TEST_DATA_DIR}/${TEST_FILE}
-    )
-    list(APPEND TEST_DATA_OUTPUTS ${CMAKE_BINARY_DIR}/${TEST_FILE})
-endforeach()
-
-# Add a custom target to ensure the files are copied before running tests
-add_custom_target(copy_test_data ALL DEPENDS ${TEST_DATA_OUTPUTS})
-
-# Add the directory containing file_utils.h to the include paths
-include_directories(${CMAKE_SOURCE_DIR}/tests)
-
-foreach(TEST_SOURCE ${TEST_SOURCES})
-    get_filename_component(TEST_NAME ${TEST_SOURCE} NAME_WE)
-    add_executable(${TEST_NAME} ${TEST_SOURCE})
-    target_include_directories(${TEST_NAME} PRIVATE ${CMAKE_SOURCE_DIR}/include ${CMAKE_SOURCE_DIR}/tests)
-    add_dependencies(${TEST_NAME} copy_test_data) # Ensure test data is copied before running tests
-    if(CMAKE_CUDA_COMPILER)
-        target_link_libraries(${TEST_NAME} PRIVATE parallelproj CUDA::cudart)
-    else()
-        target_link_libraries(${TEST_NAME} PRIVATE parallelproj OpenMP::OpenMP_CXX)
-    endif()
-    add_test(NAME ${TEST_NAME} COMMAND ${TEST_NAME})
-endforeach()