Feature/wer crashdump analysis (#10633) · nginxpre/arangodb@c364291 · GitHub
[go: up one dir, main page]

Skip to content

Commit c364291

Browse files
dothebart, KVS85
authored and committed
Feature/wer crashdump analysis (arangodb#10633)
* add code to check for WER generated crashes
* boom
* bla
* stat core dir
* whats this?
* fix env
* more core fiddling
* cleanup test code
* undo
* undo
* lint
* fix linux gdb logfile creating & reading
* add option to separate crashdumps to second file
1 parent c107229 commit c364291

File tree

4 files changed

+144
-96
lines changed

4 files changed

+144
-96
lines changed

js/client/modules/@arangodb/crash-utils.js

Lines changed: 123 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -28,20 +28,23 @@
2828
const fs = require('fs');
2929
const yaml = require('js-yaml');
3030
const internal = require('internal');
31+
const platform = internal.platform;
32+
const executeExternal = internal.executeExternal;
3133
const executeExternalAndWait = internal.executeExternalAndWait;
3234
const statusExternal = internal.statusExternal;
3335
const killExternal = internal.killExternal;
3436
const sleep = internal.sleep;
3537
const pu = require('@arangodb/process-utils');
3638

37-
let GDB_OUTPUT = '';
3839
const abortSignal = 6;
40+
const termSignal = 15;
3941

40-
const platform = internal.platform;
4142

4243
const RED = internal.COLORS.COLOR_RED;
4344
const RESET = internal.COLORS.COLOR_RESET;
4445

46+
let GDB_OUTPUT = '';
47+
4548
// //////////////////////////////////////////////////////////////////////////////
4649
// / @brief analyzes a core dump using gdb (Unix)
4750
// /
@@ -67,7 +70,13 @@ function analyzeCoreDump (instanceInfo, options, storeArangodPath, pid) {
6770

6871
let command;
6972
command = '(';
70-
command += 'printf \'bt full\\n thread apply all bt\\n\';';
73+
command += 'printf \'' +
74+
'set logging file ' + gdbOutputFile + '\\n' +
75+
'set logging on\\n' +
76+
'bt full\\n' +
77+
'thread apply all bt\\n'+
78+
'\';';
79+
7180
command += 'sleep 10;';
7281
command += 'echo quit;';
7382
command += 'sleep 2';
@@ -78,7 +87,7 @@ function analyzeCoreDump (instanceInfo, options, storeArangodPath, pid) {
7887
} else {
7988
command += options.coreDirectory;
8089
}
81-
command += ' > ' + gdbOutputFile + ' 2>&1';
90+
8291
const args = ['-c', command];
8392
print(JSON.stringify(args));
8493

@@ -146,6 +155,104 @@ Crash analysis of: ` + JSON.stringify(instanceInfo) + '\n';
146155
// / cdb is part of the WinDBG package.
147156
// //////////////////////////////////////////////////////////////////////////////
148157

158+
159+
// //////////////////////////////////////////////////////////////////////////////
160+
// / @brief check whether process does bad on the wintendo
161+
// //////////////////////////////////////////////////////////////////////////////
162+
163+
function runProcdump (options, instanceInfo, rootDir, pid, instantDump = false) {
164+
let procdumpArgs = [ ];
165+
let dumpFile = fs.join(rootDir, 'core_' + pid + '.dmp');
166+
if (options.exceptionFilter != null) {
167+
procdumpArgs = [
168+
'-accepteula',
169+
'-64',
170+
];
171+
if (!instantDump) {
172+
procdumpArgs.push('-e');
173+
procdumpArgs.push(options.exceptionCount);
174+
}
175+
let filters = options.exceptionFilter.split(',');
176+
for (let which in filters) {
177+
procdumpArgs.push('-f');
178+
procdumpArgs.push(filters[which]);
179+
}
180+
procdumpArgs.push('-ma');
181+
procdumpArgs.push(pid);
182+
procdumpArgs.push(dumpFile);
183+
} else {
184+
procdumpArgs = [
185+
'-accepteula',
186+
];
187+
if (!instantDump) {
188+
procdumpArgs.push('-e');
189+
}
190+
procdumpArgs.push('-ma');
191+
procdumpArgs.push(pid);
192+
procdumpArgs.push(dumpFile);
193+
}
194+
try {
195+
if (options.extremeVerbosity) {
196+
print(Date() + " Starting procdump: " + JSON.stringify(procdumpArgs));
197+
}
198+
instanceInfo.coreFilePattern = dumpFile;
199+
if (instantDump) {
200+
// Wait for procdump to have written the dump before we attempt to kill the process:
201+
instanceInfo.monitor = executeExternalAndWait('procdump', procdumpArgs);
202+
} else {
203+
instanceInfo.monitor = executeExternal('procdump', procdumpArgs);
204+
// try to give procdump a little time to catch up with the process
205+
sleep(0.25);
206+
let status = statusExternal(instanceInfo.monitor.pid, false);
207+
if (status.hasOwnProperty('signal')) {
208+
print(RED + 'procdump didn\'t come up: ' + JSON.stringify(status));
209+
instanceInfo.monitor.status = status;
210+
return false;
211+
}
212+
}
213+
} catch (x) {
214+
print(Date() + ' failed to start procdump - is it installed?');
215+
// throw x;
216+
return false;
217+
}
218+
return true;
219+
}
220+
221+
function stopProcdump (options, instanceInfo, force = false) {
222+
if (instanceInfo.hasOwnProperty('monitor') &&
223+
instanceInfo.monitor.pid !== null) {
224+
if (force) {
225+
print(Date() + " sending TERM to procdump to make it exit");
226+
instanceInfo.monitor.status = killExternal(instanceInfo.monitor.pid, termSignal);
227+
} else {
228+
print(Date() + " waiting for procdump to exit");
229+
statusExternal(instanceInfo.monitor.pid, true);
230+
}
231+
instanceInfo.monitor.pid = null;
232+
}
233+
}
234+
235+
function calculateMonitorValues(options, instanceInfo, pid, cmd) {
236+
237+
if (platform.substr(0, 3) === 'win') {
238+
if (process.env.hasOwnProperty('WORKSPACE') &&
239+
fs.isDirectory(fs.join(process.env['WORKSPACE'], 'core'))) {
240+
let spcmd = fs.normalize(cmd).split(fs.pathSeparator);
241+
let executeable = spcmd[spcmd.length - 1];
242+
instanceInfo.coreFilePattern = fs.join(process.env['WORKSPACE'],
243+
'core',
244+
executeable + '.' + pid.toString() + '.dmp');
245+
}
246+
}
247+
}
248+
function isEnabledWindowsMonitor(options, instanceInfo, pid, cmd) {
249+
calculateMonitorValues(options, instanceInfo, pid, cmd);
250+
if (platform.substr(0, 3) === 'win' && !options.disableMonitor) {
251+
return true;
252+
}
253+
return false;
254+
}
255+
149256
function analyzeCoreDumpWindows (instanceInfo) {
150257
let cdbOutputFile = fs.getTempFile();
151258

@@ -275,12 +382,18 @@ function analyzeCrash (binary, instanceInfo, options, checkStr) {
275382

276383
let hint = '';
277384
if (platform.substr(0, 3) === 'win') {
278-
if (!instanceInfo.hasOwnProperty('monitor')) {
385+
if (instanceInfo.hasOwnProperty('monitor')) {
386+
pu.stopProcdump(options, instanceInfo);
387+
}
388+
if (!instanceInfo.hasOwnProperty('coreFilePattern') ) {
279389
print("your process wasn't monitored by procdump, won't have a coredump!");
280390
instanceInfo.exitStatus['gdbHint'] = "coredump unavailable";
281391
return;
392+
} else if (!fs.exists(instanceInfo.coreFilePattern)) {
393+
print("No coredump exists at " + instanceInfo.coreFilePattern);
394+
instanceInfo.exitStatus['gdbHint'] = "coredump unavailable";
395+
return;
282396
}
283-
pu.stopProcdump(options, instanceInfo);
284397
hint = analyzeCoreDumpWindows(instanceInfo);
285398
} else if (platform === 'darwin') {
286399
// fs.copyFile(binary, storeArangodPath);
@@ -295,4 +408,8 @@ function analyzeCrash (binary, instanceInfo, options, checkStr) {
295408

296409
exports.checkMonitorAlive = checkMonitorAlive;
297410
exports.analyzeCrash = analyzeCrash;
411+
exports.runProcdump = runProcdump;
412+
exports.stopProcdump = stopProcdump;
413+
exports.isEnabledWindowsMonitor = isEnabledWindowsMonitor;
414+
exports.calculateMonitorValues = calculateMonitorValues;
298415
Object.defineProperty(exports, 'GDB_OUTPUT', {get: () => GDB_OUTPUT});

js/client/modules/@arangodb/process-utils.js

Lines changed: 12 additions & 89 deletions
Original file line numberDiff line numberDiff line change
@@ -512,87 +512,10 @@ function makeArgsArangod (options, appDir, role, tmpDir) {
512512
return args;
513513
}
514514

515-
516-
// //////////////////////////////////////////////////////////////////////////////
517-
// / @brief check whether process does bad on the wintendo
518-
// //////////////////////////////////////////////////////////////////////////////
519-
520-
function runProcdump (options, instanceInfo, rootDir, pid, instantDump = false) {
521-
let procdumpArgs = [ ];
522-
let dumpFile = fs.join(rootDir, 'core_' + pid + '.dmp');
523-
if (options.exceptionFilter != null) {
524-
procdumpArgs = [
525-
'-accepteula',
526-
'-64',
527-
];
528-
if (!instantDump) {
529-
procdumpArgs.push('-e');
530-
procdumpArgs.push(options.exceptionCount);
531-
}
532-
let filters = options.exceptionFilter.split(',');
533-
for (let which in filters) {
534-
procdumpArgs.push('-f');
535-
procdumpArgs.push(filters[which]);
536-
}
537-
procdumpArgs.push('-ma');
538-
procdumpArgs.push(pid);
539-
procdumpArgs.push(dumpFile);
540-
} else {
541-
procdumpArgs = [
542-
'-accepteula',
543-
];
544-
if (!instantDump) {
545-
procdumpArgs.push('-e');
546-
}
547-
procdumpArgs.push('-ma');
548-
procdumpArgs.push(pid);
549-
procdumpArgs.push(dumpFile);
550-
}
551-
try {
552-
if (options.extremeVerbosity) {
553-
print(Date() + " Starting procdump: " + JSON.stringify(procdumpArgs));
554-
}
555-
instanceInfo.coreFilePattern = dumpFile;
556-
if (instantDump) {
557-
// Wait for procdump to have written the dump before we attempt to kill the process:
558-
instanceInfo.monitor = executeExternalAndWait('procdump', procdumpArgs);
559-
} else {
560-
instanceInfo.monitor = executeExternal('procdump', procdumpArgs);
561-
// try to give procdump a little time to catch up with the process
562-
sleep(0.25);
563-
let status = statusExternal(instanceInfo.monitor.pid, false);
564-
if (status.hasOwnProperty('signal')) {
565-
print(RED + 'procdump didn\'t come up: ' + JSON.stringify(status));
566-
instanceInfo.monitor.status = status;
567-
return false;
568-
}
569-
}
570-
} catch (x) {
571-
print(Date() + ' failed to start procdump - is it installed?');
572-
// throw x;
573-
return false;
574-
}
575-
return true;
576-
}
577-
578-
function stopProcdump (options, instanceInfo, force = false) {
579-
if (instanceInfo.hasOwnProperty('monitor') &&
580-
instanceInfo.monitor.pid !== null) {
581-
if (force) {
582-
print(Date() + " sending TERM to procdump to make it exit");
583-
instanceInfo.monitor.status = killExternal(instanceInfo.monitor.pid, termSignal);
584-
} else {
585-
print(Date() + " waiting for procdump to exit");
586-
statusExternal(instanceInfo.monitor.pid, true);
587-
}
588-
instanceInfo.monitor.pid = null;
589-
}
590-
}
591-
592515
function killWithCoreDump (options, instanceInfo) {
593516
if (platform.substr(0, 3) === 'win' && !options.disableMonitor) {
594-
stopProcdump (options, instanceInfo, true);
595-
runProcdump (options, instanceInfo, instanceInfo.rootDir, instanceInfo.pid, true);
517+
crashUtils.stopProcdump (options, instanceInfo, true);
518+
crashUtils.runProcdump (options, instanceInfo, instanceInfo.rootDir, instanceInfo.pid, true);
596519
}
597520
instanceInfo.exitStatus = killExternal(instanceInfo.pid, abortSignal);
598521
}
@@ -733,14 +656,14 @@ function executeAndWait (cmd, args, options, valgrindTest, rootDir, coreCheck =
733656
res = executeExternal(cmd, args, false, coverageEnvironment());
734657
instanceInfo.pid = res.pid;
735658
instanceInfo.exitStatus = res;
736-
if (runProcdump(options, instanceInfo, rootDir, res.pid)) {
659+
if (crashUtils.runProcdump(options, instanceInfo, rootDir, res.pid)) {
737660
Object.assign(instanceInfo.exitStatus,
738661
statusExternal(res.pid, true, timeout * 1000));
739662
if (instanceInfo.exitStatus.status === 'TIMEOUT') {
740663
print('Timeout while running ' + cmd + ' - will kill it now! ' + JSON.stringify(args));
741664
executeExternalAndWait('netstat', ['-aonb']);
742665
killExternal(res.pid);
743-
stopProcdump(options, instanceInfo);
666+
crashUtils.stopProcdump(options, instanceInfo);
744667
instanceInfo.exitStatus.status = 'ABORTED';
745668
const deltaTime = time() - startTime;
746669
return {
@@ -750,7 +673,7 @@ function executeAndWait (cmd, args, options, valgrindTest, rootDir, coreCheck =
750673
duration: deltaTime
751674
};
752675
}
753-
stopProcdump(options, instanceInfo);
676+
crashUtils.stopProcdump(options, instanceInfo);
754677
} else {
755678
print('Killing ' + cmd + ' - ' + JSON.stringify(args));
756679
res = killExternal(res.pid);
@@ -762,6 +685,7 @@ function executeAndWait (cmd, args, options, valgrindTest, rootDir, coreCheck =
762685
res = executeExternalAndWait(cmd, args, false, timeout*100, coverageEnvironment());
763686
instanceInfo.pid = res.pid;
764687
instanceInfo.exitStatus = res;
688+
crashUtils.calculateMonitorValues(options, instanceInfo, res.pid, cmd);
765689
}
766690
const deltaTime = time() - startTime;
767691

@@ -1486,13 +1410,13 @@ function shutdownInstance (instanceInfo, options, forceTerminate) {
14861410
serverCrashedLocal = true;
14871411
shutdownSuccess = false;
14881412
}
1489-
stopProcdump(options, arangod);
1413+
crashUtils.stopProcdump(options, arangod);
14901414
} else {
14911415
if (arangod.role !== 'agent') {
14921416
nonAgenciesCount--;
14931417
}
14941418
print(Date() + ' Server "' + arangod.role + '" shutdown: Success: pid', arangod.pid);
1495-
stopProcdump(options, arangod);
1419+
crashUtils.stopProcdump(options, arangod);
14961420
return false;
14971421
}
14981422
});
@@ -1918,8 +1842,8 @@ function startArango (protocol, options, addArgs, rootDir, role) {
19181842
instanceInfo.role = role;
19191843
instanceInfo['name'] = role + ' - ' + port;
19201844

1921-
if (platform.substr(0, 3) === 'win' && !options.disableMonitor) {
1922-
if (!runProcdump(options, instanceInfo, rootDir, instanceInfo.pid)) {
1845+
if (crashUtils.isEnabledWindowsMonitor(options, instanceInfo, instanceInfo.pid, ARANGOD_BIN)) {
1846+
if (!crashUtils.runProcdump(options, instanceInfo, rootDir, instanceInfo.pid)) {
19231847
print('Killing ' + ARANGOD_BIN + ' - ' + JSON.stringify(args));
19241848
let res = killExternal(instanceInfo.pid);
19251849
instanceInfo.pid = res.pid;
@@ -2062,8 +1986,8 @@ function reStartInstance(options, instanceInfo, moreArgs) {
20621986

20631987
throw x;
20641988
}
2065-
if (platform.substr(0, 3) === 'win' && !options.disableMonitor) {
2066-
if (!runProcdump(options, oneInstanceInfo, oneInstanceInfo.rootDir, oneInstanceInfo.pid)) {
1989+
if (crashUtils.isEnabledWindowsMonitor(options, oneInstanceInfo, oneInstanceInfo.pid, ARANGOD_BIN)) {
1990+
if (!crashUtils.runProcdump(options, oneInstanceInfo, oneInstanceInfo.rootDir, oneInstanceInfo.pid)) {
20671991
print('Killing ' + ARANGOD_BIN + ' - ' + JSON.stringify(oneInstanceInfo.args));
20681992
let res = killExternal(oneInstanceInfo.pid);
20691993
oneInstanceInfo.pid = res.pid;
@@ -2156,7 +2080,6 @@ exports.coverageEnvironment = coverageEnvironment;
21562080
exports.executeArangod = executeArangod;
21572081
exports.executeAndWait = executeAndWait;
21582082
exports.killRemainingProcesses = killRemainingProcesses;
2159-
exports.stopProcdump = stopProcdump;
21602083

21612084
exports.createBaseConfig = createBaseConfigBuilder;
21622085
exports.run = {

js/client/modules/@arangodb/result-processing.js

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -495,7 +495,7 @@ function unitTestPrettyPrintResults (options, results) {
495495
onlyFailedMessages += failText + '\n';
496496
failText = RED + failText + RESET;
497497
}
498-
if (cu.GDB_OUTPUT !== '') {
498+
if (cu.GDB_OUTPUT !== '' && options.crashAnalysisText === options.testFailureText) {
499499
// write more verbose failures to the testFailureText file
500500
onlyFailedMessages += '\n\n' + cu.GDB_OUTPUT;
501501
}
@@ -511,6 +511,12 @@ ${failedMessages}${color} * Overall state: ${statusMessage}${RESET}${crashText}$
511511
onlyFailedMessages += '\n' + crashedText;
512512
}
513513
fs.write(options.testOutputDirectory + options.testFailureText, onlyFailedMessages);
514+
515+
if (cu.GDB_OUTPUT !== '' && options.crashAnalysisText !== options.testFailureText ) {
516+
// write more verbose failures to the testFailureText file
517+
fs.write(options.testOutputDirectory + options.crashAnalysisText, cu.GDB_OUTPUT);
518+
}
519+
514520
}
515521

516522
// //////////////////////////////////////////////////////////////////////////////

js/client/modules/@arangodb/testing.js

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,7 @@ let optionsDocumentation = [
143143
' - `extraArgs`: list of extra commandline arguments to add to arangod',
144144
'',
145145
' - `testFailureText`: filename of the testsummary file',
146+
' - `crashAnalysisText`: output of debugger in case of crash',
146147
' - `getSockStat`: on linux collect socket stats before shutdown',
147148
' - `verbose`: if set to true, be more verbose',
148149
' - `extremeVerbosity`: if set to true, then there will be more test run',
@@ -215,6 +216,7 @@ const optionsDefaults = {
215216
'walFlushTimeout': 30000,
216217
'writeXmlReport': false,
217218
'testFailureText': 'testfailures.txt',
219+
'crashAnalysisText': 'testfailures.txt',
218220
'testCase': undefined,
219221
'disableMonitor': false,
220222
'disableClusterMonitor': true,

0 commit comments

Comments
 (0)
0