IBMZ-Linux-OSS-Python
diff --git a/‎third_party/xla/xla/service/latency_hiding_scheduler.cc
Lines changed: 16 additions & 2 deletions b/‎third_party/xla/xla/service/latency_hiding_scheduler.cc
Lines changed: 16 additions & 2 deletions
diff --git a/‎third_party/xla/xla/service/latency_hiding_scheduler.h
Lines changed: 8 additions & 3 deletions b/‎third_party/xla/xla/service/latency_hiding_scheduler.h
Lines changed: 8 additions & 3 deletions
diff --git a/‎third_party/xla/xla/service/latency_hiding_scheduler_test.cc
Lines changed: 56 additions & 0 deletions b/‎third_party/xla/xla/service/latency_hiding_scheduler_test.cc
Lines changed: 56 additions & 0 deletions
@@ -1843,6 +1843,7 @@ absl::Status DefaultSchedulerCore::ScheduleAnnotation(
     }
   }
   int64_t num_scheduled = 0;
+  int64_t non_ready_instr = 0;
   int64_t annotation_size =
       annotation_tracker_->GetNumInstructions(computation, annotation);
   while (!sched_state->annotation_ready.empty()) {
@@ -1866,6 +1867,17 @@ absl::Status DefaultSchedulerCore::ScheduleAnnotation(
 
     TF_RET_CHECK(node != nullptr)
         << "Couldn't find an annotated node to schedule.";
+    // When flexible scheduling is enabled, defer the last instruction of the
+    // annotation if it is a supported async start.
+    if (config_.flexible_scheduling_annotation_scheduling &&
+        num_scheduled == annotation_size - 1 &&
+        async_tracker_->IsSupportedAsyncStart(node->GetInstr())) {
+      // Give instruction back to the scheduler to schedule.
+      VLOG(2) << "Non ready instr: " << node->GetInstr().name();
+      ++non_ready_instr;
+      node->ClearAnnotation();
+      sched_state->nodes_holding_annotations.insert(node);
+      continue;
+    }
     // Delete the node from the ready set.
     auto node_it = std::find(sched_state->ready_set.begin(),
                              sched_state->ready_set.end(), node);
@@ -1883,7 +1895,7 @@ absl::Status DefaultSchedulerCore::ScheduleAnnotation(
             << annotation_size << "): " << node->GetInstr().name();
   }
   // Check that we scheduled all the nodes in the annotation.
-  TF_RET_CHECK(num_scheduled == annotation_size)
+  TF_RET_CHECK(num_scheduled == annotation_size - non_ready_instr)
       << "Couldn't schedule all annotated nodes in one go.";
   return absl::OkStatus();
 }
@@ -1910,6 +1922,7 @@ absl::StatusOr<HloGraphNode::TimeCost> DefaultSchedulerCore::ScheduleNode(
   sched_state->new_sequence_reversed.push_back(
       const_cast<HloInstruction*>(&n->GetInstr()));
   n->SetScheduled();
+  sched_state->nodes_holding_annotations.erase(n);
 
   // If this node was a successor to one or more scheduling groups, update the
   // number of scheduled successors for each of those groups and add the group
@@ -2691,7 +2704,8 @@ DefaultSchedulerCore::ScheduleComputation(const HloComputation* computation) {
       };
       return absl::StrJoin(sched_state.ready_set, "\n", LogFormatter());
     }());
-    if (!sched_state.ready_annotations.empty()) {
+    if (!sched_state.ready_annotations.empty() &&
+        sched_state.nodes_holding_annotations.empty()) {
       // Pick the first ready annotation whose scheduling will not cross the
       // overlap limit. If there is no such annotation, continue with scheduling
       // non-annotated ops.
 
@@ -137,6 +137,9 @@ struct SchedulerConfig {
   int64_t send_recv_host_overlap_limit = 1;
   int64_t copy_overlap_limit = 1;
   uint64_t memory_limit = UINT64_MAX;
+  int64_t max_hops_to_closest_selective_overlap = 0;
+  int64_t rerun = 0;
+  int64_t parallel_collective_overlap_limit = 1;
   bool schedule_send_recvs = false;
   // Consider send recv as the same resource. Some platforms do not take well
   // overlapping the send/recv ops between themselves.
@@ -149,9 +152,7 @@ struct SchedulerConfig {
   bool resource_serializing = false;
   bool depth_based_memory_pressure_reduction = false;
   bool enable_selective_resources = false;
-  int64_t max_hops_to_closest_selective_overlap = 0;
-  int64_t rerun = 0;
-  int64_t parallel_collective_overlap_limit = 1;
+  bool flexible_scheduling_annotation_scheduling = false;
 };
 
 // Class used to estimate latency between instructions and cost of HLOs.
@@ -682,6 +683,7 @@ class HloGraphNode {
     annotation_ = annotation;
     return absl::OkStatus();
   }
+  void ClearAnnotation() { annotation_ = -1; }  // -1: no annotation (assumed).
   std::string ToString(const AsyncTracker* async_tracker = nullptr) const {
     std::string result;
     absl::StrAppend(&result, "Instr: ", instr_->ToShortString(), "\n");
@@ -1244,6 +1246,9 @@ class DefaultSchedulerCore : public SchedulerCore {
     ReadyQueueSet annotation_ready;
     // Annotation that is currently being scheduled.
     int64_t ongoing_annotation = kInvalidAnnotation;
+    // While this set is non-empty, no further annotations are picked for
+    // scheduling; a node is removed from the set once it has been scheduled.
+    absl::flat_hash_set<HloGraphNode*> nodes_holding_annotations;
     // Reference to this scheduler run configuration.
     const SchedulerConfig& config;
     SchedulingState(const HloInstructionSequence* instr_sequence,
 
@@ -4444,4 +4444,60 @@ TEST_F(LatencyHidingSchedulerTest, ValidScheduleWithRandomPreferences) {
   // schedule.
   TF_EXPECT_OK(hlo_module->schedule().Verify());
 }
+// Check that with flexible_scheduling_annotation_scheduling enabled, the
+// non-annotated c1 can be scheduled between cp1s and the rest of group 0.
+TEST_F(LatencyHidingSchedulerTest, FlexibleSchedulingAnnotationScheduling) {
+  absl::string_view hlo_string = R"(
+HloModule module, is_scheduled=true
+
+ENTRY entry {
+  p0 = f32[16,64,256]{2,1,0} parameter(0)
+  p1 = f32[128,2048,2048]{2,1,0} parameter(1)
+  p2 = f32[512,2048,2048]{2,1,0} parameter(2)
+  p3 = f32[16,256,256]{2,1,0} parameter(3)
+  cp1s = (f32[512,2048,2048]{2,1,0}, f32[512,2048,2048]{2,1,0}, u32[], u32[]) collective-permute-start(p2), source_target_pairs={{1,0},{0,3},{3,2}}, frontend_attributes={_scheduling_group_id="0", keep_original_sequence_order_in_group="true"}
+  cp1d = f32[512,2048,2048]{2,1,0} collective-permute-done(cp1s), frontend_attributes={_scheduling_group_id="0", keep_original_sequence_order_in_group="true"}
+  cp2s = (f32[128,2048,2048]{2,1,0}, f32[128,2048,2048]{2,1,0}, u32[], u32[]) collective-permute-start(p1), source_target_pairs={{1,0},{0,3},{3,2}}, frontend_attributes={_scheduling_group_id="0", keep_original_sequence_order_in_group="true"}
+  c0 = f32[16,256,256]{2,1,0} convolution(p0, p0),
+    window={size=16 stride=15 lhs_dilate=16}, dim_labels=0fb_0io->0fb, frontend_attributes={_scheduling_group_id="0", keep_original_sequence_order_in_group="true"}
+  c1 = f32[16,256,256]{2,1,0} convolution(p3, p3),
+    window={size=16 stride=15 lhs_dilate=16}, dim_labels=0fb_0io->0fb
+  cp2d = f32[128,2048,2048]{2,1,0} collective-permute-done(cp2s), frontend_attributes={_scheduling_group_id="0", keep_original_sequence_order_in_group="true"}
+  ROOT tuple.2 = (f32[16,256,256]{2,1,0}, f32[512,2048,2048]{2,1,0}, f32[16,256,256]{2,1,0}) tuple(c0, cp1d, c1)
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(auto hlo_module, ParseHloText(hlo_string));
+  HloSchedule& module_schedule = hlo_module->schedule();
+  EXPECT_TRUE(hlo_module->has_entry_computation());
+  auto sched_config = GetDefaultSchedConfig();
+  sched_config.flexible_scheduling_annotation_scheduling = true;
+  sched_config.aggressive_scheduling_policies = true;
+  TF_EXPECT_OK(RunScheduler(hlo_module.get(), sched_config,
+                            std::make_unique<TestLatencyEstimator>()));
+  EXPECT_TRUE(hlo_module->has_entry_computation());
+
+  std::vector<HloInstruction*> new_instruction_sequence =
+      module_schedule.sequence(hlo_module->entry_computation()).instructions();
+  if (VLOG_IS_ON(1)) {
+    for (auto* new_i : new_instruction_sequence) {
+      VLOG(1) << new_i->ToString();
+    }
+  }
+
+  // c1 (unannotated) lands between cp1s and c0; group 0 keeps its order.
+  EXPECT_LT(GetIndex(new_instruction_sequence, "cp1s"),
+            GetIndex(new_instruction_sequence, "c1"));
+  EXPECT_LT(GetIndex(new_instruction_sequence, "c1"),
+            GetIndex(new_instruction_sequence, "c0"));
+  EXPECT_LT(GetIndex(new_instruction_sequence, "cp1s"),
+            GetIndex(new_instruction_sequence, "cp1d"));
+  EXPECT_LT(GetIndex(new_instruction_sequence, "cp1d"),
+            GetIndex(new_instruction_sequence, "cp2s"));
+  EXPECT_LT(GetIndex(new_instruction_sequence, "cp2s"),
+            GetIndex(new_instruction_sequence, "c0"));
+  EXPECT_LT(GetIndex(new_instruction_sequence, "c0"),
+            GetIndex(new_instruction_sequence, "cp2d"));
+}
+
 }  // namespace xla