Support narrow() on batch dim for NJT · pytorch/pytorch@e9f5d06 · GitHub

Commit e9f5d06

Support narrow() on batch dim for NJT
ghstack-source-id: 0f65635 Pull Request resolved: #142063
1 parent 46390e9 commit e9f5d06
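
For context, a minimal usage sketch based on the tests added below (the shapes and values here are illustrative, not taken from the commit): narrow() can now slice a jagged-layout nested tensor (NJT) along the batch dim, returning an NJT that views the selected components.

import torch

nt = torch.nested.nested_tensor(
    [torch.randn(2, 5), torch.randn(3, 5), torch.randn(4, 5), torch.randn(6, 5)],
    layout=torch.jagged,
    requires_grad=True,
)

# keep batch components 1 and 2 (start=1, length=2) along dim 0
out = nt.narrow(0, 1, 2)
assert out.shape[0] == 2
for out_comp, nt_comp in zip(out.unbind(), nt.unbind()[1:3]):
    torch.testing.assert_close(out_comp, nt_comp)

As in the new tests, compiling such a function with torch.compile additionally requires torch._dynamo.config.capture_scalar_outputs = True, since the batch-dim path calls .item() on the offsets.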

File tree: 7 files changed (+244, -20 lines)

test/test_nestedtensor.py

Lines changed: 116 additions & 15 deletions
@@ -6018,6 +6018,71 @@ def test_narrow(self, device):
                 nt.values()[nt.offsets()[i] : (nt.offsets()[i] + nt.lengths()[i])],
             )
 
+    # TODO: Test this case with narrow()'s error_inputs when that is supported
+    @skipIfTorchDynamo("Test compiles internally")
+    @skipCUDAIf(not SM70OrLater, "GPU capability is < SM70")
+    @torch._dynamo.utils.disable_cache_limit()
+    @dtypes(torch.float32)
+    @parametrize("env", ["eager", "compile", "compile_dynamic"])
+    def test_narrow_on_batch_dim_input_validation(self, device, dtype, env):
+        nt = torch.nested.nested_tensor(
+            [
+                torch.randn(2, 5, device=device, dtype=dtype),
+                torch.randn(3, 5, device=device, dtype=dtype),
+                torch.randn(4, 5, device=device, dtype=dtype),
+                torch.randn(6, 5, device=device, dtype=dtype),
+                torch.randn(7, 5, device=device, dtype=dtype),
+            ],
+            layout=torch.jagged,
+            requires_grad=True,
+        )
+
+        def f(nt, start, length):
+            return nt.narrow(0, start, length)
+
+        if "compile" in env:
+            # required to avoid data-dependent guard errors
+            torch._dynamo.config.capture_scalar_outputs = True
+            f = torch.compile(f, dynamic=(env == "compile_dynamic"), fullgraph=True)
+
+        with self.assertRaisesRegex(RuntimeError, "exceeds dimension size"):
+            out = f(nt, 3, 3)
+
+    @skipIfTorchDynamo("Test compiles internally")
+    @skipCUDAIf(not SM70OrLater, "GPU capability is < SM70")
+    @torch._dynamo.utils.disable_cache_limit()
+    @dtypes(torch.float32)
+    @parametrize("env", ["eager", "compile", "compile_dynamic"])
+    def test_narrow_on_batch_dim_narrow_of_narrow(self, device, dtype, env):
+        nt = torch.nested.nested_tensor(
+            [
+                torch.randn(2, 5, device=device, dtype=dtype),
+                torch.randn(3, 5, device=device, dtype=dtype),
+                torch.randn(4, 5, device=device, dtype=dtype),
+                torch.randn(6, 5, device=device, dtype=dtype),
+                torch.randn(7, 5, device=device, dtype=dtype),
+            ],
+            layout=torch.jagged,
+            requires_grad=True,
+        )
+
+        def f(nt, start, length):
+            intermediate = nt.narrow(0, start, length)
+            return intermediate.narrow(0, 1, length - 2)
+
+        if "compile" in env:
+            # required to avoid data-dependent guard errors
+            torch._dynamo.config.capture_scalar_outputs = True
+            f = torch.compile(f, dynamic=(env == "compile_dynamic"), fullgraph=True)
+
+        # narrow() of narrow()ed NJT
+        # first narrow(): 1:5
+        # second narrow() 1+1:4-2 == 2:4
+        out = f(nt, 1, 4)
+        self.assertEqual(out.shape[0], 2)
+        for out_comp, nt_comp in zip(out.unbind(), nt.unbind()[2:4]):
+            self.assertEqual(out_comp, nt_comp)
+
     def test_njt_cat(self, device):
         offsets = torch.tensor([0, 2, 3], device=device, dtype=torch.int64)
         values_1 = torch.randn(
@@ -8108,7 +8173,6 @@ def __torch_dispatch__(self, func, types, args=..., kwargs=None):
             in {
                 "chunk",
                 "masked_select",
-                "narrow",
                 "split",
                 "split_with_sizes",
                 "squeeze",
@@ -8135,6 +8199,17 @@ def __torch_dispatch__(self, func, types, args=..., kwargs=None):
         sample_match_fn=lambda device, sample: "ragged_dim" in sample.name,
         name="ragged_dim_unsupported",
     ),
+    # narrow(): not supported with non-contig on dims other than the batch dim
+    XFailRule(
+        error_type=RuntimeError,
+        error_msg="not yet supported on dim != 0 for non-contiguous nested tensors",
+        op_match_fn=lambda device, op: (op.full_name == "narrow"),
+        sample_match_fn=lambda device, sample: (
+            sample.kwargs["dim"] != 0
+            and (sample.input._lengths is not None or sample.input._ragged_idx != 1)
+        ),
+        name="narrow_missing_noncontig_support_on_batch_dim",
+    ),
     XFailRule(
         error_type=RuntimeError,
         # error comes from usage of view() in the decomp
@@ -8150,7 +8225,6 @@ def __torch_dispatch__(self, func, types, args=..., kwargs=None):
         op_match_fn=lambda device, op: (
             op.full_name
             in {
-                "narrow",
                 "split",
                 "split_with_sizes",
                 "unsqueeze",
@@ -8342,13 +8416,6 @@ def __torch_dispatch__(self, func, types, args=..., kwargs=None):
         sample_match_fn=lambda device, sample: ("with bias" in sample.name),
         name="broken_linear_backward",
     ),
-    # narrow(): unimplemented backward
-    XFailRule(
-        error_type=RuntimeError,
-        error_msg="derivative for aten::narrow is not implemented",
-        op_match_fn=lambda device, op: (op.full_name == "narrow"),
-        name="broken_narrow_backward",
-    ),
     # min / max: need factory function support for ragged dim reductions
     # where the output is dense but sizes still contain a nested int
     XFailRule(
@@ -8430,6 +8497,14 @@ def __torch_dispatch__(self, func, types, args=..., kwargs=None):
 
 COMPILE_FORWARD_SKIPS_AND_XFAILS = [
     *FORWARD_SKIPS_AND_XFAILS,
+    # select(): pending unbacked symints not in returned output (needs fix)
+    XFailRule(
+        error_type=torch._dynamo.exc.InternalTorchDynamoError,
+        error_msg="Pending unbacked symbols",
+        op_match_fn=lambda device, op: (op.full_name == "select"),
+        sample_match_fn=lambda device, sample: ("batch_dim" in sample.name),
+        name="broken_select_backward_unbacked",
+    ),
     # Needs investigation in AOTAutograd: len(unwrapped_args) == num_args_tallied assertion fails
     # e.g. Expected 5 == 4
     XFailRule(
@@ -8459,12 +8534,16 @@ def __torch_dispatch__(self, func, types, args=..., kwargs=None):
         ),
         name="clone_unbind_data_dependency",
     ),
-    # chunk(): broken in several ways on the batch dim; revisit after similar
-    # data-dependency issues are handled for narrow()
-    SkipRule(
+    # chunk() on the batch dim with chunks=1 causes an unbacked SymInt problem; this
+    # needs to be investigated
+    XFailRule(
+        error_type=AssertionError,
+        error_msg="s1",
         op_match_fn=lambda device, op: (op.full_name == "chunk"),
-        sample_match_fn=lambda device, sample: ("batch_dim" in sample.name),
-        name="broken_chunk_compile_backward_on_batch_dim",
+        sample_match_fn=lambda device, sample: (
+            "batch_dim" in sample.name and sample.kwargs["chunks"] == 1
+        ),
+        name="chunk_batch_dim_data_dependency",
     ),
     # select on batch dim currently uses unbind(), leading to data-dependent error in
     # torch.compile that needs to be addressed via torch._check()
@@ -8497,6 +8576,26 @@ def __torch_dispatch__(self, func, types, args=..., kwargs=None):
         sample_match_fn=lambda device, sample: ("noncontig_holes" in sample.name),
         name="noncontig_holes_data_dependency",
     ),
+    # narrow(): non-contig on the batch dim has some problems when not spanning
+    # the entire batch dim (nearly all the time). This needs some investigation.
+    XFailRule(
+        error_type=torch._dynamo.exc.BackendCompilerFailed,
+        # GuardOnDataDependentSymNode: Could not guard on data-dependent expression
+        # Eq(IsNonOverlappingAndDenseIndicator(5, 3, u9, 81, 27, 1), 1)
+        # (unhinted: Eq(IsNonOverlappingAndDenseIndicator(5, 3, u9, 3*s1, s1, 1), 1)).
+        # (Size-like symbols: u9)
+        error_msg="Could not guard on data-dependent expression",
+        op_match_fn=lambda device, op: (op.full_name == "narrow"),
+        sample_match_fn=lambda device, sample: (
+            (sample.input._lengths is not None or sample.input._ragged_idx != 1)
+            and sample.kwargs["dim"] == 0
+            and (
+                sample.kwargs["start"] != 0
+                or sample.kwargs["length"] != sample.input.shape[0]
+            )
+        ),
+        name="narrow_noncontig_on_batch_dim_broken",
+    ),
     # mean(): weird bug
     XFailRule(
         error_type=torch._dynamo.exc.BackendCompilerFailed,
@@ -8545,8 +8644,10 @@ def __torch_dispatch__(self, func, types, args=..., kwargs=None):
 ]
 
 COMPARE_TENSOR_COMPONENT_EQUALITY = {
-    # masked_select is expected to output a different shape
+    # these ops are expected to output a different shape
+    "chunk",
     "masked_select",
+    "narrow",
 }
 

tools/autograd/derivatives.yaml

Lines changed: 8 additions & 0 deletions
@@ -1698,6 +1698,14 @@
   # TODO: replace this function once semantics for nested tensor expand have been settled on
   self: _nested_sum_backward(grad, self, dim, keepdim)
 
+- name: narrow(Tensor(a) self, int dim, SymInt start, SymInt length) -> Tensor(a)
+  dispatch:
+    Default:
+      # CompositeImplicit for dense tensors
+      self: not_implemented("narrow()")
+    AutogradNestedTensor:
+      self: _nested_narrow_backward(grad, self, dim, start, length)
+
 - name: nansum(Tensor self, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
   self: nansum_backward(grad.to(self.scalar_type()), self, dim, keepdim)
   result: at::where(self_p.isnan(), 0, self_t).sum(dim, keepdim, dtype)
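
In effect, the new AutogradNestedTensor formula routes the incoming gradient back into only the narrowed batch window, leaving the rest of the input's gradient at zero. A hedged sketch of the expected behavior (shapes are illustrative):

import torch

nt = torch.nested.nested_tensor(
    [torch.randn(2, 5), torch.randn(3, 5), torch.randn(4, 5)],
    layout=torch.jagged,
    requires_grad=True,
)
out = nt.narrow(0, 1, 2)
out.values().sum().backward()
# expectation: nt.grad has nt's nested structure, with component 0 all zeros
# and components 1 and 2 all ones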

tools/autograd/gen_variable_type.py

Lines changed: 2 additions & 0 deletions
@@ -199,6 +199,8 @@
     "rot90",
     "nanmean",
     "nansum",
+    "narrow",
+    "narrow_copy",
     "transpose",
     "transpose_copy",
     "permute",

torch/csrc/autograd/FunctionsManual.cpp

Lines changed: 24 additions & 0 deletions
@@ -2162,6 +2162,30 @@ Tensor split_backward(
   return split_with_sizes_backward(grads, split_sizes, dim, sym_sizes, options);
 }
 
+Tensor _nested_narrow_backward(
+    const Tensor& grad,
+    const Tensor& self,
+    int64_t dim,
+    const c10::SymInt& start,
+    const c10::SymInt& length) {
+  Tensor grad_input = at::zeros_like(self);
+  Tensor narrowed_grad = grad_input.narrow_symint(dim, start, length);
+  Tensor grad_values = at::_nested_get_values(grad);
+  Tensor narrowed_grad_values = at::_nested_get_values(narrowed_grad);
+  TORCH_INTERNAL_ASSERT(
+      grad_values.dim() == narrowed_grad_values.dim(),
+      "Bug encountered in _nested_narrow_backward()");
+  for (int i = 0; i < grad_values.dim(); ++i) {
+    auto narrowed_grad_size = narrowed_grad_values.sym_size(i);
+    auto grad_size = grad_values.sym_size(i);
+    TORCH_SYM_CHECK(
+        narrowed_grad_size.sym_eq(grad_size),
+        "Bug encountered in _nested_narrow_backward()");
+  }
+  narrowed_grad_values.copy_(grad_values);
+  return grad_input;
+}
+
 Tensor max_pool_double_backward(
     const Tensor& grad,
     const Tensor& indices,
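
A rough Python rendering of the helper above (a sketch of the same logic, not the actual binding): build a zero gradient with the input's nested structure, narrow it over the same batch window so that it views the right region of the values buffer, then copy the incoming gradient's values into that view.

import torch

def nested_narrow_backward_sketch(grad, inp, dim, start, length):
    # grad and inp are jagged-layout nested tensors (NJTs)
    grad_input = torch.zeros_like(inp)
    # the narrowed result views grad_input's values buffer, so writing into
    # its values writes into grad_input
    narrowed_grad = grad_input.narrow(dim, start, length)
    narrowed_grad.values().copy_(grad.values())
    return grad_input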

torch/csrc/autograd/FunctionsManual.h

Lines changed: 6 additions & 0 deletions
@@ -447,6 +447,12 @@ at::Tensor split_backward(
     int64_t dim,
     c10::SymIntArrayRef sizes,
     const at::TensorOptions& options);
+at::Tensor _nested_narrow_backward(
+    const at::Tensor& grad,
+    const at::Tensor& self,
+    int64_t dim,
+    const c10::SymInt& start,
+    const c10::SymInt& length);
 at::Tensor max_pool_double_backward(
     const at::Tensor& grad,
     const at::Tensor& indices,

torch/nested/_internal/ops.py

Lines changed: 84 additions & 3 deletions
@@ -947,16 +947,97 @@ def split_with_sizes_default(func, *args, **kwargs):
     ]
 
 
+# TODO: Implement slice() instead and narrow() in terms of slice()
 @register_jagged_func(
-    torch.ops.aten.narrow.default, "self: jt, dim: any, start: any, length: any"
+    torch.ops.aten.narrow.default, "self: jt_all, dim: any, start: any, length: any"
 )
 def narrow(func, *args, **kwargs):
     _, new_kwargs = normalize_function(  # type: ignore[misc]
         func, args=args, kwargs=kwargs, normalize_to_only_use_kwargs=True
     )
     inp = new_kwargs.pop("input")
 
-    dim = _wrap_jagged_dim(inp.dim(), new_kwargs["dim"], inp._ragged_idx, "narrow")
+    dim, operating_on_batch = _wrap_jagged_dim(
+        inp.dim(), new_kwargs["dim"], inp._ragged_idx, "narrow", allow_batch_dim=True
+    )
+    if operating_on_batch:
+        # batch dim narrowing requires custom logic involving offsets
+        out_kwargs = extract_kwargs(inp)
+        start_val, length_val = new_kwargs["start"], new_kwargs["length"]
+        end_val = start_val + length_val
+        batch = inp.size(0)
+        if end_val > batch:
+            raise RuntimeError(
+                f"narrow(): start ({start_val}) + length ({length_val}) "
+                f"exceeds dimension size ({batch})"
+            )
+
+        # clamp start, end values to batch dim boundaries
+        # NB: all of these are in outer batch dim space
+        if start_val < 0:
+            start_val += batch
+        if end_val < 0:
+            end_val += batch
+        start_val = max(min(start_val, batch), 0)
+        end_val = max(min(end_val, batch), 0)
+        length_val = max(min(length_val, end_val - start_val), 0)
+
+        # shortcut if no actual narrowing is happening; this helps us ensure
+        # that length < batch size if we don't take this path
+        if length_val == batch:
+            return inp.detach()
+
+        # +1 to include last offset. Also normalize offsets to start at 0.
+        out_kwargs["offsets"] = (
+            inp._offsets[start_val : start_val + length_val + 1]
+            - inp._offsets[start_val]
+        )
+        # metadata cache may no longer be accurate since offsets have changed
+        if "_metadata_cache" in out_kwargs:
+            del out_kwargs["_metadata_cache"]
+
+        if inp._lengths is not None:
+            out_kwargs["lengths"] = inp._lengths[start_val : start_val + length_val]
+
+        # NB: Unbacked SymInts must be directly accessible from the returned tensor's sizes,
+        # strides, and storage offset. To ensure this property, we compute the storage offset
+        # manually as an unbacked SymInt and utilize as_strided() to get the view. If narrow()
+        # was used instead with unbacked SymInt args, the storage offset would be an expression
+        # involving unbacked SymInts, making it not directly accessible from the returned tensor's
+        # metadata and triggering a "pending unbacked symbols" error.
+        new_storage_offset = (
+            inp._values.storage_offset()
+            + (inp._offsets[start_val] * inp._values.stride(dim))
+        ).item()
+        torch._check_is_size(new_storage_offset)
+
+        # compute symbolic start involving unbacked SymInt
+        start = (
+            new_storage_offset - inp._values.storage_offset()
+        ) // inp._values.stride(dim)
+        torch._check_is_size(start)
+        torch._check(start <= inp._values.size(dim))
+
+        # unbacked SymInt for length
+        length = (inp._offsets[start_val + length_val] - inp._offsets[start_val]).item()
+        torch._check_is_size(length)
+        # we can say this because we short-circuit earlier if length == inp._values.size(dim)
+        torch._check(length < inp._values.size(dim))
+        torch._check(start + length <= inp._values.size(dim))
+
+        # compute new sizes / strides from symbolic values
+        new_sizes = list(inp._values.size())
+        new_sizes[dim] = length
+        new_strides = list(inp._values.stride())
+
+        # apply view with new sizes / strides / storage offset
+        new_values = inp._values.as_strided(new_sizes, new_strides, new_storage_offset)
+        return NestedTensor(new_values, **out_kwargs)
+
+    if inp._lengths is not None or inp._ragged_idx != 1:
+        raise RuntimeError(
+            "narrow(): not yet supported on dim != 0 for non-contiguous nested tensors"
+        )
     values = func(
         inp._values,
         dim=dim,
@@ -1632,7 +1713,7 @@ def view_default(func, *args, **kwargs):
     )
 
     # Ensure specified size still includes batch and ragged dims
-    if len(size) < 3 or not raggedness_matches(inp, size):
+    if len(size) < 2 or not raggedness_matches(inp, size):
        raise RuntimeError(f"view(): cannot view shape {inp._size} as {size}")
 
     # outer size: the size of the NT, e.g. [3, j0, 10]
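
To make the offsets handling in the batch-dim narrow() path above concrete, a small worked example (the component lengths are illustrative): for components of lengths [2, 3, 4, 6, 7], the offsets are [0, 2, 5, 9, 15, 22]; narrow(0, start=1, length=3) keeps offsets[1:5] re-based to zero, and the values view spans offsets[4] - offsets[1] = 13 rows starting at row offsets[1].

import torch

offsets = torch.tensor([0, 2, 5, 9, 15, 22])
start_val, length_val = 1, 3

new_offsets = offsets[start_val : start_val + length_val + 1] - offsets[start_val]
print(new_offsets)  # tensor([ 0,  3,  7, 13])

# number of rows of the underlying values buffer covered by the narrowed view
num_rows = int(offsets[start_val + length_val] - offsets[start_val])
print(num_rows)  # 13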

torch/testing/_internal/opinfo/definitions/nested.py

Lines changed: 4 additions & 2 deletions
@@ -835,8 +835,10 @@ def batchwise_reference_chunk(op, sample):
 
 
 def batchwise_reference_narrow(op, sample):
-    # TODO: write this!
-    raise NotImplementedError
+    start, length = sample.kwargs["start"], sample.kwargs["length"]
+    components = list(sample.input.unbind())
+    narrowed = components[start : start + length]
+    return torch.nested.as_nested_tensor(narrowed, layout=torch.jagged)
 
 
 def batchwise_reference_select(op, sample):
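
The reference above computes the expected result component-wise: unbind the input NJT, slice the list of components, and re-nest them. Because the re-nested reference and the real op's output are not expected to share metadata exactly (see the COMPARE_TENSOR_COMPONENT_EQUALITY addition in the test diff), a component-wise comparison is the natural check; a hedged sketch:

import torch

nt = torch.nested.nested_tensor(
    [torch.randn(2, 5), torch.randn(3, 5), torch.randn(4, 5)], layout=torch.jagged
)
out = nt.narrow(0, 1, 2)
ref = torch.nested.as_nested_tensor(list(nt.unbind())[1:3], layout=torch.jagged)
for out_comp, ref_comp in zip(out.unbind(), ref.unbind()):
    torch.testing.assert_close(out_comp, ref_comp)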

0 commit comments
