8000 PG-1465 Add ASAN and UBSAN sanitizers by artemgavrilov · Pull Request #285 · percona/postgres · GitHub
[go: up one dir, main page]

Skip to content

PG-1465 Add ASAN and UBSAN sanitizers #285

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 3 commits into
base: TDE_REL_17_STABLE
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/coverage.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ jobs:
run: ci_scripts/setup-keyring-servers.sh

- name: Test postgres with TDE to generate coverage
run: ci_scripts/make-test.sh --tde-only
run: ci_scripts/make-test.sh tde

- name: Collect coverage data
run: find . -type f -name "*.c" ! -path '*libkmip*' | xargs -t gcov -abcfu
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/psp-reusable.yml
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ jobs:
run: src/ci_scripts/setup-keyring-servers.sh

- name: Test postgres
run: src/ci_scripts/${{ inputs.build_script }}-test.sh
run: src/ci_scripts/${{ inputs.build_script }}-test.sh all

- name: Report on test fail
uses: actions/upload-artifact@v4
Expand Down
69 changes: 69 additions & 0 deletions .github/workflows/sanitizers.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
name: Sanitizers
on:
pull_request:
push:
branches:
- TDE_REL_17_STABLE

env:
CC: clang
LD: clang
UBSAN_OPTIONS: log_path=${{ github.workspace }}/sanitize.log print_suppressions=0 print_stacktrace=1 print_summary=1 halt_on_error=1
ASAN_OPTIONS: log_path=${{ github.workspace }}/sanitize.log print_suppressions=0 abort_on_error=1
LSAN_OPTIONS: log_path=${{ github.workspace }}/sanitize.log print_suppressions=0 suppressions=${{ github.workspace }}/ci_scripts/suppressions/lsan.supp
ASAN_SYMBOLIZER_PATH: /usr/bin/llvm-symbolizer-14
EXTRA_REGRESS_OPTS: "--temp-config=${{ github.workspace }}/test_postgresql.conf"
PGCTLTIMEOUT: 120

jobs:
run:
name: Run
runs-on: ubuntu-22.04
steps:
- name: Clone repository
uses: actions/checkout@v4
with:
submodules: recursive

- name: Install dependencies
run: ci_scripts/ubuntu-deps.sh

- name: Build postgres
run: ci_scripts/make-build.sh sanitize

- name: Setup kmip and vault
run: ci_scripts/setup-keyring-servers.sh

- name: Configure test postgres
run: echo 'max_stack_depth=8MB' > test_postgresql.conf

- name: Run tests pg_tde tests
run: ci_scripts/make-test.sh tde

- name: Run PG regression tests
run: ci_scripts/make-test.sh server

- name: Print sanitize logs
if: ${{ !cancelled() }}
run: cat sanitize.log.*

- name: Report on test fail
uses: actions/upload-artifact@v4
if: ${{ failure() }}
with:
name: sanitizers-testlog
path: |
build/testrun
contrib/**/log
contrib/**/regression.diffs
contrib/**/regression.out
contrib/**/results
contrib/**/tmp_check
contrib/**/t/results
src/test/**/log
src/test/**/regression.diffs
src/test/**/regression.out
src/test/**/results
src/test/**/tmp_check
sanitize.log.*
retention-days: 3
35 changes: 27 additions & 8 deletions ci_scripts/make-build.sh
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
#!/bin/bash

ENABLE_COVERAGE=
ARGS=

for arg in "$@"
do
case "$arg" in
--enable-coverage)
ENABLE_COVERAGE="--enable-coverage"
shift;;
ARGS +=" --enable-coverage"
;;
esac
done

Expand All @@ -17,10 +17,29 @@ source "$SCRIPT_DIR/env.sh"

cd "$SCRIPT_DIR/.."

if [ "$1" = "debugoptimized" ]; then
export CFLAGS="-O2"
export CXXFLAGS="-O2"
fi
case "$1" in
debug)
echo "Building with debug option"
ARGS+=" --enable-cassert"
;;

./configure --prefix="$INSTALL_DIR" --enable-debug --enable-cassert --enable-tap-tests $ENABLE_COVERAGE
debugoptimized)
echo "Building with debugoptimized option"
export CFLAGS="-O2"
ARGS+=" --enable-cassert"
;;

sanitize)
echo "Building with sanitize option"
export CFLAGS="-fsanitize=address -fsanitize=undefined -fno-omit-frame-pointer -fno-inline-functions"
;;

*)
echo "Unknown build type: $1"
echo "Please use one of the following: debug, debugoptimized, sanitize"
exit 1
;;
esac

./configure --prefix="$INSTALL_DIR" --enable-debug --enable-tap-tests $ARGS
make install-world -j
44 changes: 26 additions & 18 deletions ci_scripts/make-test.sh
Original file line number Diff line number Diff line change
@@ -1,25 +1,33 @@
#!/bin/bash

set -e
TDE_ONLY=0

for arg in "$@"
do
case "$arg" in
--tde-only)
TDE_ONLY=1
shift;;
esac
done

SCRIPT_DIR="$(cd -- "$(dirname "$0")" >/dev/null 2>&1; pwd -P)"
source "$SCRIPT_DIR/env.sh"

if [ "$TDE_ONLY" -eq 1 ];
then
cd "$SCRIPT_DIR/../contrib/pg_tde"
make -s check
else
cd "$SCRIPT_DIR/.."
make -s check-world
fi

case "$1" in
server)
echo "Run server regression tests"
cd "$SCRIPT_DIR/.."
make -s check
;;

tde)
echo "Run tde tests"
cd "$SCRIPT_DIR/../contrib/pg_tde"
make -s check
;;

all)
echo "Run all tests"
cd "$SCRIPT_DIR/.."
make -s check-world
;;

*)
echo "Unknown test suite: $1"
echo "Please use one of the following: server, tde, all"
exit 1
;;
esac
28 changes: 27 additions & 1 deletion ci_scripts/meson-build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,5 +17,31 @@ source "$SCRIPT_DIR/env.sh"

cd "$SCRIPT_DIR/.."

meson setup build --prefix "$INSTALL_DIR" --buildtype="$1" -Dcassert=true -Dtap_tests=enabled $ENABLE_COVERAGE
BUILD_TYPE=

case "$1" in
debug)
echo "Building with debug option"
BUILD_TYPE=$1
;;

debugoptimized)
echo "Building with debugoptimized option"
BUILD_TYPE=$1
;;

sanitize)
echo "Building with sanitize option"
export CFLAGS="-fsanitize=address -fsanitize=undefined -fno-omit-frame-pointer -fno-inline-functions"
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a reason why you disable inlining? LLVM's documentation says nothing about that as far as I can see.

https://clang.llvm.org/docs/UndefinedBehaviorSanitizer.html#stack-traces-and-report-symbolization

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

https://clang.llvm.org/docs/AddressSanitizer.html#usage

To get a reasonable performance add -O1 or higher. To get nicer stack traces in error messages add -fno-omit-frame-pointer. To get perfect stack traces you may need to disable inlining (just use -O1) and tail call elimination (-fno-optimize-sibling-calls).

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is for ASAN. BTW it looks like it's worth to add -fno-optimize-sibling-calls as well.

BUILD_TYPE=debug
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is the best practices about optimization and sanitizers? Our production build will be -O2 but maybe that fucks up sanitizers.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I didn't see information that -O2 could harm sanitizer checks, however lower optimization levels simplify report reviews. I hope there should be no difference between -O0 and -O2 in terms of leaks and UBs, am I wrong?

;;

*)
echo "Unknown build type: $1"
echo "Please use one of the following: debug, debugoptimized, sanitize"
exit 1
;;
esac

meson setup build --prefix "$INSTALL_DIR" --buildtype="$BUILD_TYPE" -Dcassert=true -Dtap_tests=enabled $ENABLE_COVERAGE
cd build && ninja && ninja install
10000
39 changes: 23 additions & 16 deletions ci_scripts/meson-test.sh
Original file line number Diff line number Diff line change
@@ -1,25 +1,32 @@
#!/bin/bash

set -e
TDE_ONLY=0

for arg in "$@"
do
case "$arg" in
--tde-only)
TDE_ONLY=1
shift;;
esac
done

SCRIPT_DIR="$(cd -- "$(dirname "$0")" >/dev/null 2>&1; pwd -P)"
source "$SCRIPT_DIR/env.sh"

cd "$SCRIPT_DIR/../build"

if [ "$TDE_ONLY" -eq 1 ];
then
meson test --suite setup --suite pg_tde
else
meson test
fi

case "$1" in
server)
echo "Run server regression tests"
meson test --suite setup --suite regress
;;

tde)
echo "Run tde tests"
meson test --suite setup --suite pg_tde
;;

all)
echo "Run all tests"
meson test
;;

*)
echo "Unknown test suite: $1"
echo "Please use one of the following: server, tde, all"
exit 1
;;
esac
15 changes: 15 additions & 0 deletions ci_scripts/suppressions/lsan.supp
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
leak:save_ps_display_args
leak:initdb.c
leak:fe_memutils.c
leak:fe-exec.c
leak:fe-connect.c
leak:pqexpbuffer.c
leak:strdup
leak:preproc.y
leak:pgtypeslib/common.c
leak:ecpglib/memory.c
leak:kmip.c
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can we create a ticket/tickets about this? Some of these supressions definitely seem like issues.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Most of them are not real issues, but "design features" or just allocation functions. I used timescale as a reference and they mute the same things: https://github.com/timescale/timescaledb/blob/main/scripts/suppressions/suppr_leak.txt

But I'm agree that this list should be reviewed. Probably some mutes scope can be reduced. Here is the ticket: https://perconadev.atlassian.net/browse/PG-1588

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The kmip.c one is likely to be an actual bug. But, yeah, that can be outside the scope of this PR:

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, and strdup also seems to be a possible issue, or at least it can hide possible issues, as it suppresses any leaked strings created by strdup - even if it's a leak in postgres itself, we should look at that.


#pg_dump
leak:getSchemaData
leak:dumpDumpableObject
2 changes: 1 addition & 1 deletion contrib/pg_tde/documentation/CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ To run the tests, use the following command:

```
source ci_scripts/setup-keyring-servers.sh
ci_scripts/make-test.sh
ci_scripts/make-test.sh all
```

You can run tests on your local machine with whatever operating system you have. After you submit the pull request, we will check your patch on multiple operating systems.
Expand Down
2 changes: 1 addition & 1 deletion contrib/pg_tde/documentation/docs/contribute.md F438
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ To run the tests, use the following command:

```
source ci_scripts/setup-keyring-servers.sh
ci_scripts/make-test.sh
ci_scripts/make-test.sh all
```

You can run tests on your local machine with whatever operating system you have. After you submit the pull request, we will check your patch on multiple operating systems.
Expand Down
2 changes: 1 addition & 1 deletion contrib/pg_tde/src/catalog/tde_keyring.c
Original file line number Diff line number Diff line change
Expand Up @@ -781,7 +781,7 @@ scan_key_provider_file(ProviderScanType scanType, void *scanKey, Oid dbOid)
providers_list = lappend(providers_list, keyring);
#else
if (providers_list == NULL)
providers_list = palloc_object(SimplePtrList);
providers_list = palloc0_object(SimplePtrList);
simple_ptr_list_append(providers_list, keyring);
#endif
}
Expand Down
1 change: 1 addition & 0 deletions contrib/pg_tde/t/011_unlogged_tables.pl
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@

PGTDE::append_to_result_file("-- kill -9");
$node->kill9();
sleep(3); # sanitizers slow down the kill, let server die in peace
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe I am missing something but how can kill -9 be slow?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here is the thread where exact same problem discussed: https://www.postgresql.org/message-id/flat/1709793.1644523137%40sss.pgh.pa.us#c4caef6ad812ecaceeb5b711b02ee8f4

So as I understood kill -9 could wait for program to terminate syscals/coredumps and it looks like ASAN doing something that slightly delays node death.


PGTDE::append_to_result_file("-- server start");
$node->start;
Expand Down
1 change: 1 addition & 0 deletions contrib/pg_tde/t/012_replication.pl
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@
PGTDE::psql($primary, 'postgres',
"ALTER SYSTEM SET pg_tde.wal_encrypt = 'on';");
$primary->kill9;
sleep(3); # sanitizers slow down the kill, let server die in peace

PGTDE::append_to_result_file("-- primary start");
$primary->start;
Expand Down
7 changes: 7 additions & 0 deletions contrib/pg_tde/t/013_crash_recovery.pl
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@

PGTDE::append_to_result_file("-- kill -9");
$node->kill9;
sleep(3); # sanitizers slow down the kill, let server die in peace

PGTDE::append_to_result_file("-- server start");
$node->start;
Expand All @@ -61,6 +62,8 @@
PGTDE::psql($node, 'postgres', "INSERT INTO test_enc (x) VALUES (3), (4);");
PGTDE::append_to_result_file("-- kill -9");
$node->kill9;
sleep(3); # sanitizers slow down the kill, let server die in peace

PGTDE::append_to_result_file("-- server start");
PGTDE::append_to_result_file(
"-- check that pg_tde_save_principal_key_redo hasn't destroyed a WAL key created during the server start"
Expand All @@ -77,6 +80,8 @@
PGTDE::psql($node, 'postgres', "INSERT INTO test_enc (x) VALUES (5), (6);");
PGTDE::append_to_result_file("-- kill -9");
$node->kill9;
sleep(3); # sanitizers slow down the kill, let server die in peace

PGTDE::append_to_result_file("-- server start");
PGTDE::append_to_result_file(
"-- check that the key rotation hasn't destroyed a WAL key created during the server start"
Expand All @@ -89,6 +94,8 @@
"CREATE TABLE test_enc2 (x int PRIMARY KEY) USING tde_heap;");
PGTDE::append_to_result_file("-- kill -9");
$node->kill9;
sleep(3); # sanitizers slow down the kill, let server die in peace

PGTDE::append_to_result_file("-- server start");
PGTDE::append_to_result_file(
"-- check redo of the smgr internal key creation when the key is on disk"
Expand Down
4 changes: 4 additions & 0 deletions src/test/recovery/t/012_subtransactions.pl
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,10 @@
'postgresql.conf', qq(
max_prepared_transactions = 10
log_checkpoints = true

## Workaround to make tests pass with enabled UBSAN
## https://www.postgresql.org/message-id/flat/1775221.1658699459%40sss.pgh.pa.us#18ac2074d2383f232a20bf8b7b32c526
max_stack_depth = 8MB
));
$node_primary->start;
$node_primary->backup('primary_backup');
Expand Down
0