@@ -176,6 +176,7 @@ static void CheckArchiveTimeout(void);
 static void BgWriterNap(void);
 static bool IsCheckpointOnSchedule(double progress);
 static bool ImmediateCheckpointRequested(void);
+static bool CompactBgwriterRequestQueue(void);

 /* Signal handlers */
@@ -979,14 +980,15 @@ RequestCheckpoint(int flags)
  * use high values for special flags; that's all internal to md.c, which
  * see for details.)
  *
- * If we are unable to pass over the request (at present, this can happen
- * if the shared memory queue is full), we return false. That forces
- * the backend to do its own fsync. We hope that will be even more seldom.
- *
- * Note: we presently make no attempt to eliminate duplicate requests
- * in the requests[] queue. The bgwriter will have to eliminate dups
- * internally anyway, so we may as well avoid holding the lock longer
- * than we have to here.
+ * To avoid holding the lock for longer than necessary, we normally write
+ * to the requests[] queue without checking for duplicates. The bgwriter
+ * will have to eliminate dups internally anyway. However, if we discover
+ * that the queue is full, we make a pass over the entire queue to compact
+ * it. This is somewhat expensive, but the alternative is for the backend
+ * to perform its own fsync, which is far more expensive in practice. It
+ * is theoretically possible a backend fsync might still be necessary, if
+ * the queue is full and contains no duplicate entries. In that case, we
+ * let the backend know by returning false.
  */
 bool
 ForwardFsyncRequest(RelFileNode rnode, BlockNumber segno)
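For context, the caller of ForwardFsyncRequest() is md.c, which must fall back to an immediate, backend-local fsync whenever the hand-off fails. A simplified sketch of that caller, paraphrased from the md.c of this era (not verbatim; the surrounding pendingOpsTable logic is omitted):

    /*
     * In md.c's register_dirty_segment() (paraphrased): try to hand the
     * fsync off to the bgwriter; if the queue is full even after
     * compaction, sync the segment ourselves.
     */
    if (!ForwardFsyncRequest(reln->smgr_rnode, seg->mdfd_segno))
    {
        if (FileSync(seg->mdfd_vfd) < 0)
            ereport(ERROR,
                    (errcode_for_file_access(),
                     errmsg("could not fsync file \"%s\": %m",
                            FilePathName(seg->mdfd_vfd))));
    }

This backend-local FileSync() is the "far more expensive" path the new comment refers to, which is why an occasional full compaction pass is still the cheaper option.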
@@ -1004,8 +1006,15 @@ ForwardFsyncRequest(RelFileNode rnode, BlockNumber segno)
     /* we count non-bgwriter writes even when the request queue overflows */
     BgWriterShmem->num_backend_writes++;

+    /*
+     * If the background writer isn't running or the request queue is full,
+     * the backend will have to perform its own fsync request. But before
+     * forcing that to happen, we can try to compact the background writer
+     * request queue.
+     */
     if (BgWriterShmem->bgwriter_pid == 0 ||
-        BgWriterShmem->num_requests >= BgWriterShmem->max_requests)
+        (BgWriterShmem->num_requests >= BgWriterShmem->max_requests
+         && !CompactBgwriterRequestQueue()))
     {
         LWLockRelease(BgWriterCommLock);
         return false;
@@ -1017,6 +1026,108 @@ ForwardFsyncRequest(RelFileNode rnode, BlockNumber segno)
     return true;
 }

+/*
+ * CompactBgwriterRequestQueue
+ *      Remove duplicates from the request queue to avoid backend fsyncs.
+ *
+ * Although a full fsync request queue is not common, it can lead to severe
+ * performance problems when it does happen. So far, this situation has
+ * only been observed to occur when the system is under heavy write load,
+ * and especially during the "sync" phase of a checkpoint. Without this
+ * logic, each backend begins doing an fsync for every block written, which
+ * gets very expensive and can slow down the whole system.
+ *
+ * Trying to do this every time the queue is full could lose if there
+ * aren't any removable entries. But that should be vanishingly rare in
+ * practice: there's one queue entry per shared buffer.
+ */
+static bool
+CompactBgwriterRequestQueue(void)
+{
+    struct BgWriterSlotMapping {
+        BgWriterRequest request;
+        int         slot;
+    };
+
+    int         n,
+                preserve_count;
+    int         num_skipped = 0;
+    HASHCTL     ctl;
+    HTAB       *htab;
+    bool       *skip_slot;
+
+    /* must hold BgWriterCommLock in exclusive mode */
+    Assert(LWLockHeldByMe(BgWriterCommLock));
+
+    /* Initialize temporary hash table */
+    MemSet(&ctl, 0, sizeof(ctl));
+    ctl.keysize = sizeof(BgWriterRequest);
+    ctl.entrysize = sizeof(struct BgWriterSlotMapping);
+    ctl.hash = tag_hash;
+    htab = hash_create("CompactBgwriterRequestQueue",
+                       BgWriterShmem->num_requests,
+                       &ctl,
+                       HASH_ELEM | HASH_FUNCTION);
+
+    /* Initialize skip_slot array */
+    skip_slot = palloc0(sizeof(bool) * BgWriterShmem->num_requests);
+
+    /*
+     * The basic idea here is that a request can be skipped if it's followed
+     * by a later, identical request. It might seem more sensible to work
+     * backwards from the end of the queue and check whether a request is
+     * *preceded* by an earlier, identical request, in the hopes of doing
+     * less copying. But that might change the semantics, if there's an
+     * intervening FORGET_RELATION_FSYNC or FORGET_DATABASE_FSYNC request,
+     * so we do it this way. It would be possible to be even smarter if we
+     * made the code below understand the specific semantics of such
+     * requests (it could blow away preceding entries that would end up
+     * being cancelled anyhow), but it's not clear that the extra complexity
+     * would buy us anything.
+     */
+    for (n = 0; n < BgWriterShmem->num_requests; ++n)
+    {
+        BgWriterRequest *request;
+        struct BgWriterSlotMapping *slotmap;
+        bool        found;
+
+        request = &BgWriterShmem->requests[n];
+        slotmap = hash_search(htab, request, HASH_ENTER, &found);
+        if (found)
+        {
+            skip_slot[slotmap->slot] = true;
+            ++num_skipped;
+        }
+        slotmap->slot = n;
+    }
+
+    /* Done with the hash table. */
+    hash_destroy(htab);
+
+    /* If no duplicates, we're out of luck. */
+    if (!num_skipped)
+    {
+        pfree(skip_slot);
+        return false;
+    }
+
+    /* We found some duplicates; remove them. */
+    for (n = 0, preserve_count = 0; n < BgWriterShmem->num_requests; ++n)
+    {
+        if (skip_slot[n])
+            continue;
+        BgWriterShmem->requests[preserve_count++] = BgWriterShmem->requests[n];
+    }
+    ereport(DEBUG1,
+            (errmsg("compacted fsync request queue from %d entries to %d entries",
+                    BgWriterShmem->num_requests, preserve_count)));
+    BgWriterShmem->num_requests = preserve_count;
+
+    /* Cleanup. */
+    pfree(skip_slot);
+    return true;
+}
+
 /*
  * AbsorbFsyncRequests
  *      Retrieve queued fsync requests and pass them to local smgr.
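To see the compaction technique in isolation: a slot is marked skippable when a later, identical request exists, and the survivors are then slid down in place, preserving their relative order. Below is a minimal standalone C sketch of that two-pass idea. All names and types here are illustrative, not PostgreSQL code, and the duplicate search is a quadratic scan for brevity where the patch uses a dynahash table to stay O(n):

    #include <stdbool.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* Illustrative stand-in for BgWriterRequest: a (relation, segment) pair. */
    typedef struct
    {
        int rel;
        int segno;
    } Request;

    /*
     * Compact the queue in place, keeping only the last occurrence of each
     * duplicate request, and return the new length. Mirrors the patch's
     * forward scan: an earlier slot is skipped when a later identical
     * request is found.
     */
    static int
    compact_queue(Request *queue, int n)
    {
        bool   *skip = calloc(n, sizeof(bool));
        int     i, j, preserve_count = 0;

        for (i = 0; i < n; ++i)
            for (j = i + 1; j < n; ++j)
                if (memcmp(&queue[i], &queue[j], sizeof(Request)) == 0)
                {
                    skip[i] = true;     /* a later duplicate exists */
                    break;
                }

        for (i = 0; i < n; ++i)
            if (!skip[i])
                queue[preserve_count++] = queue[i];

        free(skip);
        return preserve_count;
    }

    int
    main(void)
    {
        Request q[] = {{1, 0}, {2, 3}, {1, 0}, {2, 3}, {1, 0}, {4, 7}};
        int     n = compact_queue(q, 6);
        int     i;

        /* prints: compacted to 3 entries: (2,3) (1,0) (4,7) */
        printf("compacted to %d entries:", n);
        for (i = 0; i < n; ++i)
            printf(" (%d,%d)", q[i].rel, q[i].segno);
        printf("\n");
        return 0;
    }

Keeping the last occurrence of each duplicate, rather than the first, is what makes the forward scan safe with respect to intervening FORGET_RELATION_FSYNC and FORGET_DATABASE_FSYNC entries, as the comment in the patch explains.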