GPU Support via OpenCL by Pencilcaseman · Pull Request #1377 · rust-ndarray/ndarray · GitHub
[go: up one dir, main page]

Skip to content

GPU Support via OpenCL #1377

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 23 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
First OpenCL kernels!
  • Loading branch information
Pencilcaseman committed Mar 30, 2024
commit c4541f58c90f04952715063b020cace93d7ec869
3 changes: 2 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,8 @@ num-complex = { version = "0.4", default-features = false }

# Use via the `opencl` crate feature!
hasty_ = { version = "0.2", optional = true, package = "hasty", default-features = false }
# hasty_ = { path = "../../hasty_dev/hasty", optional = true, package = "hasty", default-features = false }
#hasty_ = { path = "../../hasty_dev/hasty", optional = true, package = "hasty", default-features = false }
#once_cell_ = { version = "1.19", optional = true, package = "once_cell" }

# Use via the `rayon` crate feature!
rayon_ = { version = "1.0.3", optional = true, package = "rayon" }
Expand Down
3 changes: 2 additions & 1 deletion src/arrayformat.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
use super::{ArrayBase, ArrayView, Axis, Data, Dimension, NdProducer};
use super::{ArrayBase, ArrayView, Axis, Data, Device, Dimension, NdProducer};
use crate::aliases::{Ix1, IxDyn};
use alloc::format;
use std::fmt;
Expand Down Expand Up @@ -116,6 +116,7 @@ where
{
// Cast into a dynamically dimensioned view
// This is required to be able to use `index_axis` for the recursive case
assert_eq!(array.device(), Device::Host, "Cannot print an array that is not on the Host.");
format_array_inner(array.view().into_dyn(), f, format, fmt_opt, 0, array.ndim())
}

Expand Down
9 changes: 9 additions & 0 deletions src/data_repr.rs
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,15 @@ impl<A> OwnedRepr<A> {
self.device
}

/// Assemble an `OwnedRepr` directly from its raw parts, without copying.
///
/// # Safety
///
/// `ptr` must point to an allocation of at least `capacity` elements of `A`
/// holding `len` initialized elements, allocated compatibly with this
/// representation, and `device` must describe where that allocation lives
/// (assumption based on the field names — confirm against `move_to_device`).
pub(crate) const unsafe fn from_components(ptr: NonNull<A>, len: usize, capacity: usize, device: Device) -> Self {
    Self { ptr, len, capacity, device }
}

/// Move this storage object to a specified device.
#[allow(clippy::unnecessary_wraps)]
pub(crate) fn move_to_device(self, device: Device) -> Option<Self> {
Expand Down
20 changes: 13 additions & 7 deletions src/data_traits.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,15 @@

//! The data (inner representation) traits for ndarray

use rawpointer::PointerExt;

use alloc::sync::Arc;
#[cfg(not(feature = "std"))]
use alloc::vec::Vec;
use std::mem::MaybeUninit;
use std::mem::{self, size_of};
use std::mem::MaybeUninit;
use std::ptr::NonNull;

use rawpointer::PointerExt;

use crate::{ArcArray, Array, ArrayBase, CowRepr, Device, Dimension, OwnedArcRepr, OwnedRepr, RawViewRepr, ViewRepr};

/// Array representation trait.
Expand Down Expand Up @@ -329,10 +329,16 @@ unsafe impl<A> RawData for OwnedRepr<A> {
}

fn _is_pointer_inbounds(&self, self_ptr: *const Self::Elem) -> bool {
let slc = self.as_slice();
let ptr = slc.as_ptr() as *mut A;
let end = unsafe { ptr.add(slc.len()) };
self_ptr >= ptr && self_ptr <= end
// let slc = self.as_slice();
// let ptr = slc.as_ptr() as *mut A;
// let end = unsafe { ptr.add(slc.len()) };
// self_ptr >= ptr && self_ptr <= end

// Instead of using a slice, we just get the raw pointer. This assumes that `self.len()`
// is correct, but since this is internally managed, it's safe to assume it is
let ptr = self.as_ptr();
let end = unsafe { ptr.add(self.len()) };
ptr <= self_ptr && self_ptr <= end
}

fn _device(&self) -> Option<Device> {
Expand Down
9 changes: 9 additions & 0 deletions src/impl_constructors.rs
Original file line number Diff line number Diff line change
Expand Up @@ -614,6 +614,15 @@ where
array
}

/// Build an `ArrayBase` directly from its four raw parts.
///
/// # Safety
///
/// `ptr` must point into the allocation owned by `data`, and `dim` /
/// `strides` must describe an element layout that stays within that
/// allocation; no validation is performed here.
pub(crate) const unsafe fn from_parts(data: S, ptr: std::ptr::NonNull<S::Elem>, dim: D, strides: D) -> Self {
    Self { data, ptr, dim, strides }
}

#[deprecated(
note = "This method is hard to use correctly. Use `uninit` instead.",
since = "0.15.0"
Expand Down
6 changes: 5 additions & 1 deletion src/impl_methods.rs
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,10 @@ w 236B here
self.dim.clone()
}

pub fn raw_strides(&self) -> D {
self.strides.clone()
}

/// Return the shape of the array as a slice.
///
/// Note that you probably don't want to use this to create an array of the
Expand Down Expand Up @@ -2990,7 +2994,7 @@ unsafe fn unlimited_transmute<A, B>(data: A) -> B {
type DimMaxOf<A, B> = <A as DimMax<B>>::Output;

impl<A, D> ArrayBase<OwnedRepr<A>, D>
where A: std::fmt::Debug
// where A: std::fmt::Debug
{
// pub fn device(&self) -> Device {
// self.data.device()
Expand Down
126 changes: 111 additions & 15 deletions src/impl_ops.rs
10000
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,12 @@
// option. This file may not be copied, modified, or distributed
// except according to those terms.

use std::mem::ManuallyDrop;
use crate::dimension::DimMax;
use crate::Device;
use crate::Zip;
use crate::Layout;
use crate::OwnedRepr;
use num_complex::Complex;

/// Elements that can be used as direct operands in arithmetic with arrays.
Expand Down Expand Up @@ -58,6 +62,16 @@ macro_rules! device_check_assert(
}
);

// Expand to the first expression when the binop commutes and to the second
// when it is ordered; used so `scalar <op> array` can reuse the mirrored
// `array <op> scalar` implementation where that is valid.
macro_rules! if_commutative {
    (Ordered { $commute_case:expr } or { $ordered_case:expr }) => {
        $ordered_case
    };
    (Commute { $commute_case:expr } or { $ordered_case:expr }) => {
        $commute_case
    };
}

macro_rules! impl_binary_op(
($rs_trait:ident, $operator:tt, $math_op:ident, $inplace_op:tt, $docstring:expr) => (
/// Perform elementwise
Expand Down Expand Up @@ -186,7 +200,7 @@ where
/// **Panics** if broadcasting isn’t possible.
impl<'a, A, B, S, S2, D, E> $rs_trait<&'a ArrayBase<S2, E>> for &'a ArrayBase<S, D>
where
A: Clone + $rs_trait<B, Output=A>,
A: Clone + $rs_trait<B, Output=A> + std::fmt::Debug,
B: Clone,
S: Data<Elem=A>,
S2: Data<Elem=B>,
Expand All @@ -205,7 +219,97 @@ where
} else {
self.broadcast_with(rhs).unwrap()
};
Zip::from(lhs).and(rhs).map_collect(clone_opf(A::$math_op))

match self.device() {
Device::Host => {
Zip::from(lhs).and(rhs).map_collect(clone_opf(A::$math_op))
}

#[cfg(feature = "opencl")]
Device::OpenCL => {
if lhs.raw_dim().ndim() == 0 && rhs.raw_dim().ndim() == 0 {
// println!("Scalar");
todo!();
} else if lhs.layout_impl().is(Layout::CORDER | Layout::FORDER) &&
rhs.layout_impl().is(Layout::CORDER | Layout::FORDER) &&
lhs.layout_impl().matches(rhs.layout_impl()) {
// println!("Contiguous");

static mut KERNEL_BUILT: bool = false; // todo: fix monomorphization issue

let typename = match crate::opencl::rust_type_to_c_name::<A>() {
Some(x) => x,
None => panic!("The Rust type {} is not supported by the \
OpenCL backend", std::any::type_name::<A>())
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Enforce at type level?

};

let kernel_name = format!("binary_op_{}_{}", stringify!($math_op), typename);

#[cold]
if unsafe { !KERNEL_BUILT } {
let kernel = crate::opencl::gen_contiguous_linear_kernel_3(
&kernel_name,
typename,
stringify!($operator));

unsafe {
hasty_::opencl::opencl_add_kernel(&kernel);
KERNEL_BUILT = true;
}
}

unsafe {
let elements = self.len();
let self_ptr = self.as_ptr() as *mut std::ffi::c_void;
let other_ptr = rhs.as_ptr() as *mut std::ffi::c_void;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm assuming this operation can be invalid here because they are only known to be on the same device by a debug assertion, not a hard assertion?

let res_ptr = match hasty_::opencl::opencl_allocate(
elements * std::mem::size_of::<A>(),
hasty_::opencl::OpenCLMemoryType::ReadWrite
) {
Ok(buf) => buf,
Err(e) => panic!("Failed to allocate OpenCL buffer. Exited with: {:?}", e)
};

match hasty_::opencl::opencl_run_contiguous_linear_kernel_3(
&kernel_name,
elements,
self_ptr,
other_ptr,
res_ptr,
) {
Ok(()) => {
use std::ptr::NonNull;

let ptr = NonNull::new(res_ptr as *mut A).unwrap();
let data = OwnedRepr::<A>::from_components(
ptr,
self.len(),
self.len(),
self.device(),
);

Self::Output::from_parts(
data,
ptr,
<D as DimMax<E>>::Output::from_dimension(&self.raw_dim()).unwrap(),
<D as DimMax<E>>::Output::from_dimension(&self.raw_strides()).unwrap(),
)
}
Err(e) => panic!("Failed to run OpenCL kernel '{}'. \
Exited with code: {:?}", kernel_name, e),
}
}
} else {
println!("Strided");
todo!();
}
}

#[cfg(feature = "cuda")]
Device::CUDA => {
todo!();
}
}
}
}

Expand Down Expand Up @@ -248,16 +352,6 @@ impl<'a, A, S, D, B> $rs_trait<B> for &'a ArrayBase<S, D>
);
);

// Pick the expression $a for commutative and $b for ordered binop
macro_rules! if_commutative {
(Commute { $a:expr } or { $b:expr }) => {
$a
};
(Ordered { $a:expr } or { $b:expr }) => {
$b
};
}

macro_rules! impl_scalar_lhs_op {
// $commutative flag. Reuse the self + scalar impl if we can.
// We can do this safely since these are the primitive numeric types
Expand Down Expand Up @@ -304,10 +398,11 @@ impl<'a, S, D> $trt<&'a ArrayBase<S, D>> for $scalar
}

mod arithmetic_ops {
use super::*;
use std::ops::*;

use crate::imp_prelude::*;

use std::ops::*;
use super::*;

fn clone_opf<A: Clone, B: Clone, C>(f: impl Fn(A, B) -> C) -> impl FnMut(&A, &B) -> C {
move |x, y| f(x.clone(), y.clone())
Expand Down Expand Up @@ -447,9 +542,10 @@ mod arithmetic_ops {
}

mod assign_ops {
use super::*;
use crate::imp_prelude::*;

use super::*;

macro_rules! impl_assign_op {
($trt:ident, $method:ident, $doc:expr) => {
use std::ops::$trt;
Expand Down
14 changes: 14 additions & 0 deletions src/layout/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,20 @@ impl Layout {
(self.is(Layout::CORDER) as i32 - self.is(Layout::FORDER) as i32)
+ (self.is(Layout::CPREFER) as i32 - self.is(Layout::FPREFER) as i32)
}

/// Return true if the layout order of `self` matches the layout order of `other`
///
/// **Note**: We ignore the preference bits
#[inline(always)]
pub(crate) fn matches(self, other: Self) -> bool {
    // Orders agree iff the low two (order) bits are bitwise equal.
    (self.0 ^ other.0) & 0b11 == 0
}

/// Return true if this layout exactly matches the other layout
#[inline(always)]
pub(crate) fn matches_exact(self, other: Self) -> bool {
    // All bits — order and preference alike — must agree.
    (self.0 ^ other.0) == 0
}
}

#[cfg(test)]
Expand Down
3 changes: 3 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1602,6 +1602,9 @@ pub(crate) fn is_aligned<T>(ptr: *const T) -> bool {
(ptr as usize) % ::std::mem::align_of::<T>() == 0
}

#[cfg(feature = "opencl")]
mod opencl;

pub fn configure() {
#[cfg(feature = "opencl")]
unsafe {
Expand Down
44 changes: 44 additions & 0 deletions src/opencl.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
/// Map a Rust scalar type to the C/OpenCL type name used in generated
/// kernel sources, or `None` if the backend does not support the type.
///
/// NOTE(review): this keys off `std::any::type_name`, whose output is
/// explicitly not guaranteed to be stable across compiler versions; a
/// trait-based mapping would be more robust long-term.
pub(crate) fn rust_type_to_c_name<T>() -> Option<&'static str> {
    match std::any::type_name::<T>() {
        "f32" => Some("float"),
        "f64" => Some("double"),
        "i8" => Some("int8_t"),
        "i16" => Some("int16_t"),
        "i32" => Some("int32_t"),
        "i64" => Some("int64_t"),
        "u8" => Some("uint8_t"),
        "u16" => Some("uint16_t"),
        "u32" => Some("uint32_t"),
        "u64" => Some("uint64_t"),
        // `usize`/`isize` are platform-sized: derive the C name from the
        // actual byte width so 32-bit targets do not get a 64-bit type.
        "usize" => match std::mem::size_of::<usize>() {
            4 => Some("uint32_t"),
            8 => Some("uint64_t"),
            _ => None,
        },
        "isize" => match std::mem::size_of::<isize>() {
            4 => Some("int32_t"),
            8 => Some("int64_t"),
            _ => None,
        },
        _ => None,
    }
}

/// Generate OpenCL C source for an element-wise binary kernel over three
/// contiguous buffers: `c[i] = a[i] <op> b[i]`.
///
/// * `kernel_name` — name given to the generated `__kernel` function (must
///   be a valid OpenCL C identifier; not validated here).
/// * `typename` — C element type name, as produced by `rust_type_to_c_name`.
/// * `op` — infix C operator spliced between the two loads, e.g. `"+"`.
///
/// The emitted source prepends fixed-width integer typedefs guarded by
/// `NDARRAY_INCLUDE_STDINT`, since the generated code includes no headers;
/// the typedef widths assume the OpenCL C scalar sizes (char=8, short=16,
/// int=32, long=64 bits) — see the note inside the generated source.
pub(crate) fn gen_contiguous_linear_kernel_3(kernel_name: &str, typename: &str, op: &str) -> String {
    format!(
        r#"
#ifndef NDARRAY_INCLUDE_STDINT
#define NDARRAY_INCLUDE_STDINT

// We should probably verify that these are, in fact, correct
typedef char int8_t;
typedef short int16_t;
typedef int int32_t;
typedef long int64_t;
typedef unsigned char uint8_t;
typedef unsigned short uint16_t;
typedef unsigned int uint32_t;
typedef unsigned long uint64_t;
#endif // NDARRAY_INCLUDE_STDINT

__kernel void {kernel_name}(__global const {typename} *a, __global const {typename} *b, __global {typename} *c) {{
    // Get id as 64-bit integer to avoid overflow
    uint64_t i = get_global_id(0);
    c[i] = a[i] {op} b[i];
}}
"#,
        kernel_name = kernel_name,
        typename = typename,
        op = op,
    )
}
0