22
22
PROGRESS_UPDATE = 30.0 # seconds
23
23
assert PROGRESS_UPDATE >= PROGRESS_MIN_TIME
24
24
25
+ # Kill the main process after 5 minutes. It is supposed to write an update
26
+ # every PROGRESS_UPDATE seconds. Tolerate 5 minutes for Python slowest
27
+ # buildbot workers.
28
+ MAIN_PROCESS_TIMEOUT = 5 * 60.0
29
+ assert MAIN_PROCESS_TIMEOUT >= PROGRESS_UPDATE
30
+
25
31
# Time to wait until a worker completes: should be immediate
26
32
JOIN_TIMEOUT = 30.0 # seconds
27
33
@@ -159,20 +165,6 @@ def mp_result_error(self, test_name, error_type, stdout='', stderr='',
159
165
result = TestResult (test_name , error_type , test_time , None )
160
166
return MultiprocessResult (result , stdout , stderr , err_msg )
161
167
162
- def _timedout (self , test_name ):
163
- self ._kill ()
164
-
165
- stdout = stderr = ''
166
- popen = self ._popen
167
- try :
168
- stdout , stderr = popen .communicate (timeout = JOIN_TIMEOUT )
169
- except (subprocess .TimeoutExpired , OSError ) as exc :
170
- print_warning (f"Failed to read { self } output "
171
- f"(timeout={ format_duration (JOIN_TIMEOUT )} ): "
172
- f"{ exc !r} " )
173
-
174
- return self .mp_result_error (test_name , TIMEOUT , stdout , stderr )
175
-
176
168
def _run_process (self , test_name ):
177
169
self .start_time = time .monotonic ()
178
170
@@ -195,23 +187,32 @@ def _run_process(self, test_name):
195
187
196
188
try :
197
189
stdout , stderr = popen .communicate (timeout = self .timeout )
190
+ retcode = popen .returncode
191
+ assert retcode is not None
198
192
except subprocess .TimeoutExpired :
199
193
if self ._stopped :
200
194
# kill() has been called: communicate() fails
201
195
# on reading closed stdout/stderr
202
196
raise ExitThread
203
197
204
- return self ._timedout (test_name )
198
+ # On timeout, kill the process
199
+ self ._kill ()
200
+
201
+ # None means TIMEOUT for the caller
202
+ retcode = None
203
+ # bpo-38207: Don't attempt to call communicate() again: on it
204
+ # can hang until all child processes using stdout and stderr
205
+ # pipes completes.
206
+ stdout = stderr = ''
205
207
except OSError :
206
208
if self ._stopped :
207
209
# kill() has been called: communicate() fails
208
210
# on reading closed stdout/stderr
209
211
raise ExitThread
210
212
raise
211
-
212
- retcode = popen .returncode
213
- stdout = stdout .strip ()
214
- stderr = stderr .rstrip ()
213
+ else :
214
+ stdout = stdout .strip ()
215
+ stderr = stderr .rstrip ()
215
216
216
217
return (retcode , stdout , stderr )
217
218
except :
@@ -223,13 +224,10 @@ def _run_process(self, test_name):
223
224
self .current_test_name = None
224
225
225
226
def _runtest (self , test_name ):
226
- result = self ._run_process (test_name )
227
-
228
- if isinstance (result , MultiprocessResult ):
229
- # _timedout() case
230
- return result
227
+ retcode , stdout , stderr = self ._run_process (test_name )
231
228
232
- retcode , stdout , stderr = result
229
+ if retcode is None :
230
+ return self .mp_result_error (test_name , TIMEOUT , stdout , stderr )
233
231
234
232
err_msg = None
235
233
if retcode != 0 :
@@ -248,7 +246,8 @@ def _runtest(self, test_name):
248
246
err_msg = "Failed to parse worker JSON: %s" % exc
249
247
250
248
if err_msg is not None :
251
- return self .mp_result_error (test_name , CHILD_ERROR , stdout , stderr , err_msg )
249
+ return self .mp_result_error (test_name , CHILD_ERROR ,
250
+ stdout , stderr , err_msg )
252
251
253
252
return MultiprocessResult (result , stdout , stderr , err_msg )
254
253
@@ -287,6 +286,16 @@ def _wait_completed(self):
287
286
f"{ exc !r} " )
288
287
289
288
def wait_stopped (self , start_time ):
289
+ # bpo-38207: MultiprocessTestRunner.stop_workers() called self.stop()
290
+ # which killed the process. Sometimes, killing the process from the
291
+ # main thread does not interrupt popen.communicate() in
292
+ # TestWorkerProcess thread. This loop with a timeout is a workaround
293
+ # for that.
294
+ #
295
+ # Moreover, if this method fails to join the thread, it is likely
296
+ # that Python will hang at exit while calling threading._shutdown()
297
+ # which tries again to join the blocked thread. Regrtest.main()
298
+ # uses EXIT_TIMEOUT to workaround this second bug.
290
299
while True :
291
300
# Write a message every second
292
301
self .join (1.0 )
@@ -353,7 +362,8 @@ def _get_result(self):
353
362
timeout = PROGRESS_UPDATE
354
363
while True :
355
364
if use_faulthandler :
356
- faulthandler .dump_traceback_later (timeout * 2.0 , exit = True )
365
+ faulthandler .dump_traceback_later (MAIN_PROCESS_TIMEOUT ,
366
+ exit = True )
357
367
358
368
# wait for a thread
359
369
try :
0 commit comments