Add `torch.cuda.streams.ExternalStream` (#57781) · pytorch/pytorch@d7ef9b7 · GitHub

Commit d7ef9b7

Emilio Castillo authored and facebook-github-bot committed
Add torch.cuda.streams.ExternalStream (#57781)
Summary:
This is required in #57110 (comment). We need to provide a means to synchronize on externally allocated streams for DLPack support in the Python array data API.

cc mruberry rgommers leofang asi1024 kmaehashi

Pull Request resolved: #57781

Reviewed By: mrshenli

Differential Revision: D28326365

Pulled By: ezyang

fbshipit-source-id: b67858c8033949951b49a3d319f649884dfd0a91
1 parent c769300 commit d7ef9b7
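
An illustrative usage sketch of the new API (not part of the diff), following the ctypes approach used by the test added below; it assumes Linux, a visible GPU, and that the CUDA runtime symbols are already loaded into the process:

import ctypes
import torch

torch.cuda.init()                 # make sure the CUDA runtime is loaded
lib = ctypes.CDLL(None)           # look up symbols already present in the process
raw = ctypes.c_void_p()
assert lib.cudaStreamCreate(ctypes.byref(raw)) == 0

ext = torch.cuda.streams.ExternalStream(raw.value)
assert ext.cuda_stream == raw.value   # PyTorch stores the pointer but does not own it

with torch.cuda.stream(ext):          # work submitted here runs on the external stream
    torch.ones(4, device="cuda").sum()

ext.synchronize()
lib.cudaStreamDestroy(ctypes.c_void_p(raw.value))  # the caller keeps ownership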

File tree

13 files changed, +175 −34 lines changed

aten/src/ATen/hip/impl/HIPStreamMasqueradingAsCUDA.h

Lines changed: 5 additions & 0 deletions
@@ -90,6 +90,11 @@ inline getStreamFromPoolMasqueradingAsCUDA(const bool isHighPriority = false, De
   return HIPStreamMasqueradingAsCUDA(getStreamFromPool(isHighPriority, device));
 }
 
+inline HIPStreamMasqueradingAsCUDA
+getStreamFromExternalMasqueradingAsCUDA(hipStream_t ext_stream, DeviceIndex device) {
+  return HIPStreamMasqueradingAsCUDA(getStreamFromExternal(ext_stream, device));
+}
+
 inline HIPStreamMasqueradingAsCUDA getDefaultHIPStreamMasqueradingAsCUDA(DeviceIndex device_index = -1) {
   return HIPStreamMasqueradingAsCUDA(getDefaultHIPStream(device_index));
 }

c10/core/Stream.h

Lines changed: 26 additions & 9 deletions
@@ -12,7 +12,7 @@ namespace c10 {
 /// numbering system which is not visible to the user. HOWEVER, we
 /// guarantee that StreamId 0 is always a valid stream, and corresponds
 /// to some sort of "default" stream.
-using StreamId = int32_t;
+using StreamId = int64_t;
 
 // NB: I decided not to call the above StreamIndex to avoid confusion with
 // DeviceIndex. This way, you access device index with index(), and stream id
@@ -119,21 +119,38 @@ class Stream final {
   // that the bitmasking code below is updated accordingly!
   static_assert(sizeof(DeviceType) == 1, "DeviceType is not 8-bit");
   static_assert(sizeof(DeviceIndex) == 1, "DeviceIndex is not 8-bit");
-  static_assert(sizeof(StreamId) == 4, "DeviceIndex is not 32-bit");
+  static_assert(sizeof(StreamId) == 8, "StreamId is not 64-bit");
   // Concat these together into a 64-bit integer
   // See Note [Hazard when concatenating signed integers]
   uint64_t bits = static_cast<uint64_t>(static_cast<uint8_t>(device_type()))
-      << 48 |
-      static_cast<uint64_t>(static_cast<uint8_t>(device_index())) << 32 |
-      static_cast<uint64_t>(static_cast<uint32_t>(id()));
+      << 56 |
+      static_cast<uint64_t>(static_cast<uint8_t>(device_index())) << 48 |
+      // Remove the sign extension part of the 64-bit address because
+      // the id might be used to hold a pointer.
+      (static_cast<uint64_t>(id()) & ((1ull << 48) - 1));
+  TORCH_INTERNAL_ASSERT(
+      static_cast<DeviceIndex>((bits >> 48) & 0xFFull) == device_index(),
+      "DeviceIndex is not correctly packed");
+  TORCH_INTERNAL_ASSERT(
+      static_cast<DeviceType>((bits >> 56)) == device_type(),
+      "DeviceType is not correctly packed");
+  // Re-extend the sign of stream_id for checking
+  uint64_t mask = (1ull << 47);
+  TORCH_INTERNAL_ASSERT(
+      static_cast<StreamId>(((bits & 0xFFFFFFFFFFFFull) ^ mask) - mask) ==
+          id(),
+      "StreamId is not correctly packed");
   return bits;
 }
 
 static Stream unpack(uint64_t bits) {
-  const auto stream_id = static_cast<StreamId>(bits & 0xFFFFFFFFull);
-  bits >>= 32;
-  const auto device_index = static_cast<DeviceIndex>(bits & 0xFFFFull);
-  bits >>= 16;
+  // Re-extend the sign of stream_id
+  uint64_t mask = (1ull << 47);
+  const auto stream_id =
+      (static_cast<StreamId>(bits & 0xFFFFFFFFFFFFull) ^ mask) - mask;
+  bits >>= 48;
+  const auto device_index = static_cast<DeviceIndex>(bits & 0xFFull);
+  bits >>= 8;
   const auto device_type = static_cast<DeviceType>(bits);
   TORCH_CHECK(isValidDeviceType(device_type));
   // Unfortunately, we can't check if the StreamId is valid here; it
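
To make the new packing concrete, a small illustrative Python sketch (not part of the diff) of the same bit layout, including the sign re-extension needed when the id carries a pointer value:

# Python ints stand in for the fixed-width C++ integers in Stream::pack()/unpack().
def pack(device_type: int, device_index: int, stream_id: int) -> int:
    # device_type in bits 56-63, device_index in bits 48-55,
    # low 48 bits hold the (possibly sign-extended, pointer-valued) stream id
    return (device_type << 56) | (device_index << 48) | (stream_id & ((1 << 48) - 1))

def unpack(bits: int):
    mask = 1 << 47
    stream_id = ((bits & ((1 << 48) - 1)) ^ mask) - mask  # re-extend the sign
    device_index = (bits >> 48) & 0xFF
    device_type = bits >> 56
    return device_type, device_index, stream_id

# Round-trips even when the id is a sign-extended pointer-like value:
assert unpack(pack(1, 0, -0x1000)) == (1, 0, -0x1000)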

c10/cuda/CUDAStream.cpp

Lines changed: 53 additions & 16 deletions
@@ -10,6 +10,7 @@
 #include <mutex>
 #include <vector>
 
+#include <iostream>
 namespace c10 {
 namespace cuda {
 
@@ -41,6 +42,7 @@ static DeviceIndex num_gpus = -1;
 static constexpr int kStreamsPerPoolBits = 5;
 static constexpr int kStreamsPerPool = 1 << kStreamsPerPoolBits;
 static constexpr unsigned int kDefaultFlags = cudaStreamNonBlocking;
+static constexpr int kStreamTypeBits = 3;
 
 // Note: lower numbers are higher priorities, zero is default priority
 static int kHighPriority = -1;
@@ -73,13 +75,13 @@ static std::array<LeakyStreamInternals, kStreamsPerPool>
 // ~~~~~~~~~~~~~~~~~~~~~~~~~~
 // How do we assign stream IDs?
 //
-// -- 25 bits --  -- 2 bits --  -- 5 bits -----
-// zeros          StreamIdType  stream id index
+// -- 57 bits --  -- 5 bits -----  -- 3 bits --
+// zeros          stream id index  StreamIdType
 //
 // Where StreamIdType:
-// 00 = default stream
-// 01 = low priority stream
-// 10 = high priority stream
+// 000 = default stream or externally allocated if id[63:3] != 0
+// 001 = low priority stream
+// 010 = high priority stream
 //
 // This is not really for efficiency; it's just easier to write the code
 // to extract the index if we do this with bitmasks :)
@@ -95,11 +97,16 @@ static std::array<LeakyStreamInternals, kStreamsPerPool>
 // could work around this with something like
 // https://stackoverflow.com/questions/13150449/efficient-unsigned-to-signed-cast-avoiding-implementation-defined-behavior
 // but it seems a bit overkill for this.
-
+//
+// Also, externally managed stream pointers (cudaStream_t) can be directly
+// stored in the Id field, so in that case we need to check the stream
+// alignment. The IdType uses an additional bit to match the 64-bit address
+// alignment, making it easy to identify an external stream when its value
+// satisfies (X & 7) > 0.
 enum class StreamIdType : uint8_t {
   DEFAULT = 0x0,
   LOW = 0x1,
   HIGH = 0x2,
+  EXT = 0x3,
 };
 
 std::ostream& operator<<(std::ostream& stream, StreamIdType s) {
@@ -113,28 +120,39 @@ std::ostream& operator<<(std::ostream& stream, StreamIdType s) {
     case StreamIdType::HIGH:
       stream << "HIGH";
       break;
+    case StreamIdType::EXT:
+      stream << "EXT";
+      break;
     default:
       stream << static_cast<uint8_t>(s);
       break;
   }
   return stream;
 }
 
-// StreamId is 32-bit, so we can just rely on regular promotion rules.
+// StreamId is 64-bit, so we can just rely on regular promotion rules.
 // We rely on streamIdIndex and streamIdType being non-negative;
 // see Note [Hazard when concatenating signed integers]
 
 static inline StreamIdType streamIdType(StreamId s) {
-  return static_cast<StreamIdType>(s >> kStreamsPerPoolBits);
+  int mask_for_type = (1 << kStreamTypeBits) - 1;
+  if (s && ((s & mask_for_type) == 0)) {
+    // Externally allocated streams have their id set to the cudaStream_t
+    // pointer, so the bits corresponding to the type will be 0 and would
+    // otherwise collide with the default stream.
+    return StreamIdType::EXT;
+  }
+  return static_cast<StreamIdType>(s & mask_for_type);
 }
 
 static inline size_t streamIdIndex(StreamId s) {
-  return static_cast<size_t>(s & ((1 << kStreamsPerPoolBits) - 1));
+  return static_cast<size_t>(
+      (s >> kStreamTypeBits) & ((1 << kStreamsPerPoolBits) - 1));
 }
 
 StreamId makeStreamId(StreamIdType st, size_t si) {
-  return (static_cast<StreamId>(st) << kStreamsPerPoolBits) |
-      static_cast<StreamId>(si);
+  return (static_cast<StreamId>(si) << kStreamTypeBits) |
+      static_cast<StreamId>(st);
 }
 
 template <typename T, typename A>
@@ -251,7 +269,7 @@ static void initCUDAStreamsOnce() {
 
 // Helper to verify the GPU index is valid
 static inline void check_gpu(DeviceIndex device_index) {
-  AT_ASSERT(device_index >= 0 && device_index < num_gpus);
+  TORCH_INTERNAL_ASSERT(device_index >= 0 && device_index < num_gpus);
 }
 
 // Helper to determine the index of the stream to return
@@ -305,9 +323,16 @@ CUDAStream CUDAStream_fromInternals(const LeakyStreamInternals* ptr) {
 } // anonymous namespace
 
 cudaStream_t CUDAStream::stream() const {
-  auto ptr = CUDAStream_internals(*this);
-  AT_ASSERT(ptr);
-  return ptr->stream;
+  int64_t stream_id = unwrap().id();
+  if (streamIdType(stream_id) == StreamIdType::EXT) {
+    // In this case this is an externally allocated stream;
+    // we don't need to manage its life cycle.
+    return reinterpret_cast<cudaStream_t>(stream_id);
+  } else {
+    auto ptr = CUDAStream_internals(*this);
+    TORCH_INTERNAL_ASSERT(ptr);
+    return ptr->stream;
+  }
 }
 
 // Returns a stream from the requested pool
@@ -334,6 +359,18 @@ CUDAStream getStreamFromPool(
   return CUDAStream_fromInternals(&low_priority_streams[device_index][idx]);
 }
 
+CUDAStream getStreamFromExternal(
+    cudaStream_t ext_stream,
+    DeviceIndex device_index) {
+  return CUDAStream(
+      CUDAStream::UNCHECKED,
+      // The stream pointer will be the actual id
+      Stream(
+          Stream::UNSAFE,
+          c10::Device(DeviceType::CUDA, device_index),
+          reinterpret_cast<int64_t>(ext_stream)));
+}
+
 CUDAStream getDefaultCUDAStream(DeviceIndex device_index) {
   initCUDAStreamsOnce();
   if (device_index == -1) {
@@ -354,7 +391,7 @@ CUDAStream getCurrentCUDAStream(DeviceIndex device_index) {
 void setCurrentCUDAStream(CUDAStream stream) {
   initCUDAStreamsOnce();
   auto ptr = CUDAStream_internals(stream);
-  AT_ASSERT(ptr);
+  TORCH_INTERNAL_ASSERT(ptr);
   current_streams[ptr->device_index] = ptr;
 }
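
The stream-id layout above, as an illustrative Python sketch (not part of the diff): the type occupies the low 3 bits, the pool index the next 5, and a nonzero id whose low 3 bits are all zero is taken to be an 8-byte-aligned external cudaStream_t pointer:

kStreamTypeBits = 3
kStreamsPerPoolBits = 5
DEFAULT, LOW, HIGH, EXT = 0x0, 0x1, 0x2, 0x3

def make_stream_id(stream_type: int, index: int) -> int:
    return (index << kStreamTypeBits) | stream_type

def stream_id_type(s: int) -> int:
    mask = (1 << kStreamTypeBits) - 1
    # A nonzero id with all type bits clear is an aligned external pointer.
    if s and (s & mask) == 0:
        return EXT
    return s & mask

def stream_id_index(s: int) -> int:
    return (s >> kStreamTypeBits) & ((1 << kStreamsPerPoolBits) - 1)

assert stream_id_type(make_stream_id(HIGH, 7)) == HIGH
assert stream_id_index(make_stream_id(HIGH, 7)) == 7
assert stream_id_type(0x7F00_DEAD_B000) == EXT  # an aligned external pointer value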

c10/cuda/CUDAStream.h

Lines changed: 10 additions & 0 deletions
@@ -195,6 +195,16 @@ class C10_CUDA_API CUDAStream {
 TORCH_API CUDAStream
 getStreamFromPool(const bool isHighPriority = false, DeviceIndex device = -1);
 
+/**
+ * Get a CUDAStream from an externally allocated one.
+ *
+ * This is mainly for interoperability with different libraries where we
+ * want to operate on a non-torch allocated stream for data exchange or
+ * similar purposes.
+ */
+TORCH_API CUDAStream
+getStreamFromExternal(cudaStream_t ext_stream, DeviceIndex device_index);
+
 /**
 * Get the default CUDA stream, for the passed CUDA device, or for the
 * current device if no device index is passed. The default stream is
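
A hedged interop sketch of what this enables at the Python level (not part of the diff; assumes CuPy and a CUDA GPU are available): both libraries queue work on the same externally allocated stream.

import cupy
import torch

cupy_stream = cupy.cuda.Stream()                          # CuPy owns and frees this stream
ext = torch.cuda.streams.ExternalStream(cupy_stream.ptr)  # wrap it without taking ownership

with cupy_stream:
    cupy.arange(1 << 20).sum()                            # CuPy kernel on the shared stream
with torch.cuda.stream(ext):
    torch.ones(1 << 20, device="cuda").sum()              # torch kernels ordered after it

ext.synchronize()  # keep `cupy_stream` alive for as long as `ext` is in use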

caffe2/contrib/opencl/context.h

Lines changed: 1 addition & 1 deletion
@@ -64,7 +64,7 @@ class OpenCLContext final {
     CopyBytes<SrcContext, DstContext>(n * meta.itemsize(), src, dst);
   }
 
-  void SwitchToDevice(int a, ...) {
+  void SwitchToDevice(int64_t a, ...) {
     auto& ctx = GetSingleton();
     CAFFE_ENFORCE(a < ctx.devices.size());
     ctx.device = ctx.devices[a];

caffe2/core/context.h

Lines changed: 1 addition & 1 deletion
@@ -63,7 +63,7 @@ class TORCH_API CPUContext final : public BaseContext {
 
   ~CPUContext() noexcept override {}
 
-  inline void SwitchToDevice(int /*stream_id*/) override {}
+  inline void SwitchToDevice(int64_t /*stream_id*/) override {}
 
   using BaseContext::SwitchToDevice;
 

caffe2/core/context_base.h

Lines changed: 1 addition & 1 deletion
@@ -42,7 +42,7 @@ class TORCH_API BaseContext {
   /* Sorry for the naming, will get rid of this in future diff */
   virtual DeviceType device_type() const = 0;
 
-  virtual void SwitchToDevice(int /*stream_id*/) = 0;
+  virtual void SwitchToDevice(int64_t /*stream_id*/) = 0;
 
   inline void SwitchToDevice() {
     SwitchToDevice(0);

caffe2/ideep/utils/ideep_context.h

Lines changed: 1 addition & 1 deletion
@@ -23,7 +23,7 @@ class IDEEPContext final : public BaseContext {
 
   ~IDEEPContext() noexcept override {}
 
-  inline void SwitchToDevice(int /*stream_id*/) {}
+  inline void SwitchToDevice(int64_t /*stream_id*/) {}
   using BaseContext::SwitchToDevice;
 
   inline void WaitEvent(const Event& ev) {

test/test_cuda.py

Lines changed: 33 additions & 0 deletions
@@ -1,6 +1,8 @@
 from itertools import repeat, chain, product
 from typing import NamedTuple
 import collections
+import contextlib
+import ctypes
 import gc
 import io
 import os
@@ -1314,6 +1316,37 @@ def test_record_stream_on_shifted_view(self):
 
         self.assertNotEqual(try_realloc.data_ptr(), data_ptr)
 
+    @contextlib.contextmanager
+    def _get_external_stream(self, device):
+        lib = ctypes.cdll.LoadLibrary(None)
+        p = ctypes.c_void_p()
+        with device:
+            try:
+                out = lib.cudaStreamCreate(ctypes.byref(p))
+                yield p.value
+            finally:
+                out = lib.cudaStreamDestroy(ctypes.c_ulonglong(p.value))
+
+    @skipIfRocm
+    @unittest.skipIf(IS_SANDCASTLE or IS_REMOTE_GPU, "Does not work on Sandcastle")
+    def test_external_streams(self):
+        device = torch.cuda.device(0)
+        with self._get_external_stream(device) as stream_v:
+            ext_stream = torch.cuda.streams.ExternalStream(stream_v)
+            self.assertEqual(stream_v, ext_stream.cuda_stream)
+            self.assertEqual(ext_stream.device.index, device.idx)
+
+    @skipIfRocm
+    @unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU")
+    @unittest.skipIf(IS_SANDCASTLE or IS_REMOTE_GPU, "Does not work on Sandcastle")
+    def test_external_streams_multi_device(self):
+        device = torch.cuda.device(1)
+        with self._get_external_stream(device) as stream_v:
+            ext_stream = torch.cuda.streams.ExternalStream(
+                stream_v, device=device)
+            self.assertEqual(stream_v, ext_stream.cuda_stream)
+            self.assertEqual(ext_stream.device.index, device.idx)
+
     def test_noncontiguous_pinned_memory(self):
         # See issue #3266
         x = torch.arange(0, 10).view((2, 5))

torch/_C/__init__.pyi.in

Lines changed: 1 addition & 1 deletion
@@ -840,7 +840,7 @@ class _CudaStreamBase:
     cuda_stream: _int
     priority: _int
 
-    def __new__(self, priority: _int = 0, _cdata: _int = 0) -> _CudaStreamBase: ...
+    def __new__(self, priority: _int = 0, _cdata: _int = 0, stream_ptr: _int = 0) -> _CudaStreamBase: ...
     def query(self) -> _bool: ...
     def synchronize(self) -> None: ...
     def priority_range(self) -> Tuple[_int, _int]: ...

torch/csrc/cuda/Stream.cpp

Lines changed: 11 additions & 4 deletions
@@ -22,11 +22,12 @@ static PyObject * THCPStream_pynew(
 
   int priority = 0;
   uint64_t cdata = 0;
+  uint64_t stream_ptr = 0;
 
   // NOLINTNEXTLINE(modernize-avoid-c-arrays,cppcoreguidelines-avoid-c-arrays)
-  static char *kwlist[] = {"priority", "_cdata", nullptr};
+  static char *kwlist[] = {"priority", "_cdata", "stream_ptr", nullptr};
   if (!PyArg_ParseTupleAndKeywords(
-      args, kwargs, "|iK", kwlist, &priority, &cdata)) {
+      args, kwargs, "|iKK", kwlist, &priority, &cdata, &stream_ptr)) {
     return nullptr;
   }
 
@@ -35,11 +36,17 @@ static PyObject * THCPStream_pynew(
     return nullptr;
   }
 
+  if (stream_ptr) {
+    TORCH_CHECK(priority == 0, "Priority was explicitly set for a external stream")
+  }
+
   at::cuda::CUDAStream stream =
     cdata ?
     at::cuda::CUDAStream::unpack(cdata) :
-    at::cuda::getStreamFromPool(
-      /* isHighPriority */ priority < 0 ? true : false);
+    stream_ptr ?
+    at::cuda::getStreamFromExternal(reinterpret_cast<cudaStream_t>(stream_ptr), current_device) :
+    at::cuda::getStreamFromPool(
+      /* isHighPriority */ priority < 0 ? true : false);
 
   THCPStream* self = (THCPStream *)ptr.get();
   self->cdata = stream.pack();
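
A short sketch of the argument handling added here (not part of the diff): stream_ptr and a non-default priority are mutually exclusive, so the TORCH_CHECK above fires if both are passed.

import torch

src = torch.cuda.Stream()                                  # any pool stream to point at
ext = torch.cuda.streams.ExternalStream(src.cuda_stream)   # fine: priority defaults to 0
try:
    torch.cuda.streams.ExternalStream(src.cuda_stream, priority=-1)
except RuntimeError as err:
    print(err)  # priority cannot be set for an externally allocated stream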

torch/cuda/streams.py

Lines changed: 23 additions & 0 deletions
@@ -112,6 +112,29 @@ def __repr__(self):
                 .format(self.device, self.cuda_stream))
 
 
+class ExternalStream(Stream):
+    r"""Wrapper around an externally allocated CUDA stream.
+
+    This class is used to wrap streams allocated in other libraries in order
+    to facilitate data exchange and multi-library interactions.
+
+    .. note:: This class doesn't manage the stream life-cycle; it is the user's
+        responsibility to keep the referenced stream alive while this class is
+        being used.
+
+    Args:
+        stream_ptr(int): Integer representation of the `cudaStream_t` value
+            allocated externally.
+        device(torch.device or int, optional): the device where the stream
+            was originally allocated. If the device is specified incorrectly,
+            subsequent launches using this stream may fail.
+    """
+
+    def __new__(cls, stream_ptr, device=None, **kwargs):
+        with torch.cuda.device(device):
+            return super(Stream, cls).__new__(cls, stream_ptr=stream_ptr, **kwargs)
+
+
 class Event(torch._C._CudaEventBase):
     r"""Wrapper around a CUDA event.
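
One more hedged sketch of the device argument (not part of the diff; assumes a second GPU is visible), mirroring the multi-device test above:

import torch

with torch.cuda.device(1):
    src = torch.cuda.Stream()        # the underlying cudaStream_t lives on device 1

# Tell ExternalStream which device the pointer belongs to; per the docstring,
# a wrong device can make later launches on this stream fail.
ext = torch.cuda.streams.ExternalStream(src.cuda_stream, device=torch.device("cuda", 1))
assert ext.device.index == 1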

Comments (0)