
Commit 376ba06 · discourse/discourse

FIX: Reduce false positives in AI LLM status problem check (#35304)
- Skip seeded LLMs from health checks
- Detect and ignore rate limit errors (429, 503, quota exceeded)
- Use framework retry mechanism (max_retries, retry_after, max_blips)
- Remove blocking sleep calls that could tie up Sidekiq workers
- Add transient error detection for network timeouts
1 parent aa2c407 commit 376ba06
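
The retry knobs are plain class-level settings on the `ProblemCheck` framework. As a rough sketch of the pattern this commit relies on (the check name below is hypothetical, and the comments describe semantics inferred from the setting names rather than confirmed framework behavior):

```ruby
# Hypothetical ProblemCheck subclass using the same framework settings
# this commit adds to ProblemCheck::AiLlmStatus.
class ProblemCheck::ExampleServiceStatus < ProblemCheck
  self.priority = "high"
  self.perform_every = 6.hours # how often the scheduled check runs
  self.max_retries = 2         # assumed: re-runs before a failure counts as real
  self.retry_after = 1.minute  # assumed: delay between those re-runs
  self.max_blips = 2           # assumed: failures tolerated before alerting admins

  def call
    # Return an empty array when healthy; problem records otherwise.
    # The framework schedules retries and suppresses one-off blips.
    []
  end
end
```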

File tree

2 files changed: 117 additions & 0 deletions

plugins/discourse-ai/app/services/problem_check/ai_llm_status.rb

Lines changed: 57 additions & 0 deletions

```diff
@@ -3,6 +3,9 @@
 class ProblemCheck::AiLlmStatus < ProblemCheck
   self.priority = "high"
   self.perform_every = 6.hours
+  self.max_retries = 2
+  self.retry_after = 1.minute
+  self.max_blips = 2
 
   def call
     llm_errors
@@ -13,6 +16,7 @@ def call
   def llm_errors
     return [] if !SiteSetting.discourse_ai_enabled
     LlmModel.in_use.find_each.filter_map do |model|
+      next if model.seeded?
       try_validate(model) { validator.run_test(model) }
     end
   end
@@ -22,6 +26,23 @@ def try_validate(model, &blk)
     blk.call
     nil
   rescue => e
+    # Skip problem reporting for rate limiting and temporary service issues
+    # These are expected to resolve on their own
+    if rate_limit_error?(e)
+      Rails.logger.info(
+        "AI LLM Status Check: Rate limit detected for model #{model.display_name} (#{model.id}), skipping problem report",
+      )
+      return nil
+    end
+
+    # Log transient errors but still return a problem
+    # The framework's max_retries and max_blips will handle retries and alert suppression
+    if transient_error?(e)
+      Rails.logger.info(
+        "AI LLM Status Check: Transient error for model #{model.display_name} (#{model.id}): #{e.message}",
+      )
+    end
+
     details = {
       model_id: model.id,
       model_name: model.display_name,
@@ -52,4 +73,40 @@ def parse_error_message(message)
       message.to_s
     end
   end
+
+  def rate_limit_error?(error)
+    error_message = error.message.to_s.downcase
+
+    # Check for rate limit indicators in the error message
+    rate_limit_indicators = [
+      "rate limit",
+      "rate_limit",
+      "ratelimit",
+      "too many requests",
+      "quota exceeded",
+      "retry after",
+      "throttled",
+      "429",
+      "503",
+      "temporarily unavailable",
+      "service unavailable",
+      "overloaded",
+    ]
+
+    rate_limit_indicators.any? { |indicator| error_message.include?(indicator) }
+  end
+
+  def transient_error?(error)
+    # Network errors and timeouts are transient - may succeed on retry
+    transient_errors = [
+      Errno::ECONNREFUSED,
+      Errno::ECONNRESET,
+      Errno::ETIMEDOUT,
+      Net::OpenTimeout,
+      Net::ReadTimeout,
+      IOError,
+    ]
+
+    transient_errors.any? { |error_class| error.is_a?(error_class) }
+  end
 end
```
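
Both new predicates are easy to exercise in isolation. Below is a minimal standalone mirror of the two classifiers; the wrapper names `rate_limited?` and `transient?` are illustrative, while the indicator list and exception classes are copied from the diff above:

```ruby
require "net/http" # defines Net::OpenTimeout and Net::ReadTimeout

# Substrings (from rate_limit_error?) that mark an error message as
# rate limiting or temporary capacity trouble.
RATE_LIMIT_INDICATORS = [
  "rate limit", "rate_limit", "ratelimit", "too many requests",
  "quota exceeded", "retry after", "throttled", "429", "503",
  "temporarily unavailable", "service unavailable", "overloaded",
].freeze

# Exception classes (from transient_error?) for network failures that
# may succeed on retry.
TRANSIENT_ERRORS = [
  Errno::ECONNREFUSED, Errno::ECONNRESET, Errno::ETIMEDOUT,
  Net::OpenTimeout, Net::ReadTimeout, IOError,
].freeze

def rate_limited?(error)
  message = error.message.to_s.downcase
  RATE_LIMIT_INDICATORS.any? { |indicator| message.include?(indicator) }
end

def transient?(error)
  TRANSIENT_ERRORS.any? { |klass| error.is_a?(klass) }
end

rate_limited?(StandardError.new("429 Too Many Requests")) # => true  (check stays silent)
rate_limited?(StandardError.new("401 Unauthorized"))      # => false (problem is reported)
transient?(Net::ReadTimeout.new)                          # => true  (logged, still reported)
```

Matching bare "429" or "503" anywhere in the message is deliberately broad: it trades the occasional swallowed genuine failure for fewer noisy alerts, in line with the commit's goal of reducing false positives.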

plugins/discourse-ai/spec/services/problem_check/ai_llm_status_spec.rb

Lines changed: 60 additions & 0 deletions

```diff
@@ -70,6 +70,66 @@
         stub_request(:post, post_url).to_return(status: 200, body: success_response, headers: {})
         expect(check).to be_chill_about_it
       end
+
+      it "skips seeded LLMs" do
+        SiteSetting.ai_summarization_enabled = false
+
+        seeded_llm = Fabricate(:seeded_model)
+        ai_persona_seeded = Fabricate(:ai_persona, default_llm_id: seeded_llm.id)
+        SiteSetting.ai_summarization_persona = ai_persona_seeded.id
+        SiteSetting.ai_summarization_enabled = true
+
+        stub_request(:post, "https://cdck.test/").to_return(
+          status: 403,
+          body: error_response,
+          headers: {
+          },
+        )
+        expect(check).to be_chill_about_it
+      end
+
+      it "does not report problems for rate limit errors" do
+        rate_limit_response = { message: "Rate limit exceeded. Please retry after 60s." }.to_json
+
+        stub_request(:post, post_url).to_return(status: 429, body: rate_limit_response, headers: {})
+        expect(check).to be_chill_about_it
+      end
+
+      it "does not report problems for 503 errors (service unavailable)" do
+        service_unavailable_response = { message: "Service temporarily unavailable" }.to_json
+
+        stub_request(:post, post_url).to_return(
+          status: 503,
+          body: service_unavailable_response,
+          headers: {
+          },
+        )
+        expect(check).to be_chill_about_it
+      end
+
+      it "reports problem for network timeout errors" do
+        stub_request(:post, post_url).to_timeout
+
+        problems = described_class.new.call
+        expect(problems.length).to eq(1)
+        expect(problems.first).to have_attributes(
+          identifier: "ai_llm_status",
+          target: llm_model.id,
+          priority: "high",
+        )
+      end
+
+      it "reports problem for authentication errors" do
+        stub_request(:post, post_url).to_return(status: 401, body: error_response, headers: {})
+
+        problems = described_class.new.call
+        expect(problems.length).to eq(1)
+        expect(problems.first).to have_attributes(
+          identifier: "ai_llm_status",
+          target: llm_model.id,
+          priority: "high",
+        )
+      end
     end
   end
 end
```
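
Assuming a standard Discourse development checkout, the new examples should run with the usual RSpec wrapper, e.g. `LOAD_PLUGINS=1 bin/rspec plugins/discourse-ai/spec/services/problem_check/ai_llm_status_spec.rb` (the `LOAD_PLUGINS=1` variable is the conventional way to pull in plugin specs; the exact invocation may vary by setup). The `be_chill_about_it` matcher asserts that the check reports no problems.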
