@@ -306,7 +306,10 @@ static void walkdir(const char *path,
306
306
#ifdef PG_FLUSH_DATA_WORKS
307
307
static void pre_sync_fname (const char * fname , bool isdir , int elevel );
308
308
#endif
309
- static void fsync_fname_ext (const char * fname , bool isdir , int elevel );
309
+ static void datadir_fsync_fname (const char * fname , bool isdir , int elevel );
310
+
311
+ static int fsync_fname_ext (const char * fname , bool isdir , bool ignore_perm , int elevel );
312
+ static int fsync_parent_path (const char * fname , int elevel );
310
313
311
314
312
315
/*
@@ -413,54 +416,158 @@ pg_flush_data(int fd, off_t offset, off_t amount)
413
416
* indicate the OS just doesn't allow/require fsyncing directories.
414
417
*/
415
418
void
416
- fsync_fname (char * fname , bool isdir )
419
+ fsync_fname (const char * fname , bool isdir )
420
+ {
421
+ fsync_fname_ext (fname , isdir , false, ERROR );
422
+ }
423
+
424
+ /*
425
+ * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability
426
+ *
427
+ * This routine ensures that, after returning, the effect of renaming file
428
+ * persists in case of a crash. A crash while this routine is running will
429
+ * leave you with either the pre-existing or the moved file in place of the
430
+ * new file; no mixed state or truncated files are possible.
431
+ *
432
+ * It does so by using fsync on the old filename and the possibly existing
433
+ * target filename before the rename, and the target file and directory after.
434
+ *
435
+ * Note that rename() cannot be used across arbitrary directories, as they
436
+ * might not be on the same filesystem. Therefore this routine does not
437
+ * support renaming across directories.
438
+ *
439
+ * Log errors with the caller specified severity.
440
+ *
441
+ * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
442
+ * valid upon return.
443
+ */
444
+ int
445
+ durable_rename (const char * oldfile , const char * newfile , int elevel )
417
446
{
418
447
int fd ;
419
- int returncode ;
420
448
421
449
/*
422
- * Some OSs require directories to be opened read-only whereas other
423
- * systems don't allow us to fsync files opened read-only; so we need both
424
- * cases here
450
+ * First fsync the old and target path (if it exists), to ensure that they
451
+ * are properly persistent on disk. Syncing the target file is not
452
+ * strictly necessary, but it makes it easier to reason about crashes;
453
+ * because it's then guaranteed that either source or target file exists
454
+ * after a crash.
425
455
*/
426
- if (!isdir )
427
- fd = OpenTransientFile (fname ,
428
- O_RDWR | PG_BINARY ,
429
- S_IRUSR | S_IWUSR );
456
+ if (fsync_fname_ext (oldfile , false, false, elevel ) != 0 )
457
+ return -1 ;
458
+
459
+ fd = OpenTransientFile ((char * ) newfile , PG_BINARY | O_RDWR , 0 );
460
+ if (fd < 0 )
461
+ {
462
+ if (errno != ENOENT )
463
+ {
464
+ ereport (elevel ,
465
+ (errcode_for_file_access (),
466
+ errmsg ("could not open file \"%s\": %m" , newfile )));
467
+ return -1 ;
468
+ }
469
+ }
430
470
else
431
- fd = OpenTransientFile (fname ,
432
- O_RDONLY | PG_BINARY ,
433
- S_IRUSR | S_IWUSR );
471
+ {
472
+ if (pg_fsync (fd ) != 0 )
473
+ {
474
+ int save_errno ;
475
+
476
+ /* close file upon error, might not be in transaction context */
477
+ save_errno = errno ;
478
+ CloseTransientFile (fd );
479
+ errno = save_errno ;
480
+
481
+ ereport (elevel ,
482
+ (errcode_for_file_access (),
483
+ errmsg ("could not fsync file \"%s\": %m" , newfile )));
484
+ return -1 ;
485
+ }
486
+ CloseTransientFile (fd );
487
+ }
488
+
489
+ /* Time to do the real deal... */
490
+ if (rename (oldfile , newfile ) < 0 )
491
+ {
492
+ ereport (elevel ,
493
+ (errcode_for_file_access (),
494
+ errmsg ("could not rename file \"%s\" to \"%s\": %m" ,
495
+ oldfile , newfile )));
496
+ return -1 ;
497
+ }
434
498
435
499
/*
436
- * Some OSs don't allow us to open directories at all (Windows returns
437
- * EACCES)
500
+ * To guarantee renaming the file is persistent, fsync the file with its
501
+ * new name, and its containing directory.
438
502
*/
439
- if (fd < 0 && isdir && ( errno == EISDIR || errno == EACCES ) )
440
- return ;
503
+ if (fsync_fname_ext ( newfile , false, false, elevel ) != 0 )
504
+ return -1 ;
441
505
442
- else if (fd < 0 )
443
- ereport (ERROR ,
444
- (errcode_for_file_access (),
445
- errmsg ("could not open file \"%s\": %m" , fname )));
506
+ if (fsync_parent_path (newfile , elevel ) != 0 )
507
+ return -1 ;
446
508
447
- returncode = pg_fsync (fd );
509
+ return 0 ;
510
+ }
511
+
512
+ /*
513
+ * durable_link_or_rename -- rename a file in a durable manner.
514
+ *
515
+ * Similar to durable_rename(), except that this routine tries (but does not
516
+ * guarantee) not to overwrite the target file.
517
+ *
518
+ * Note that a crash in an unfortunate moment can leave you with two links to
519
+ * the target file.
520
+ *
521
+ * Log errors with the caller specified severity.
522
+ *
523
+ * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
524
+ * valid upon return.
525
+ */
526
+ int
527
+ durable_link_or_rename (const char * oldfile , const char * newfile , int elevel )
528
+ {
529
+ /*
530
+ * Ensure that, if we crash directly after the rename/link, a file with
531
+ * valid contents is moved into place.
532
+ */
533
+ if (fsync_fname_ext (oldfile , false, false, elevel ) != 0 )
534
+ return -1 ;
448
535
449
- /* Some OSs don't allow us to fsync directories at all */
450
- if (returncode != 0 && isdir && errno == EBADF )
536
+ #if HAVE_WORKING_LINK
537
+ if (link ( oldfile , newfile ) < 0 )
451
538
{
452
- CloseTransientFile (fd );
453
- return ;
539
+ ereport (elevel ,
540
+ (errcode_for_file_access (),
541
+ errmsg ("could not link file \"%s\" to \"%s\": %m" ,
542
+ oldfile , newfile )));
543
+ return -1 ;
454
544
}
455
-
456
- if (returncode != 0 )
457
- ereport (ERROR ,
545
+ unlink (oldfile );
546
+ #else
547
+ /* XXX: Add racy file existence check? */
548
+ if (rename (oldfile , newfile ) < 0 )
549
+ {
550
+ ereport (elevel ,
458
551
(errcode_for_file_access (),
459
- errmsg ("could not fsync file \"%s\": %m" , fname )));
552
+ errmsg ("could not rename file \"%s\" to \"%s\": %m" ,
553
+ oldfile , newfile )));
554
+ return -1 ;
555
+ }
556
+ #endif
460
557
461
- CloseTransientFile (fd );
462
- }
558
+ /*
559
+ * Make change persistent in case of an OS crash; both the new entry and
560
+ * its parent directory need to be flushed.
561
+ */
562
+ if (fsync_fname_ext (newfile , false, false, elevel ) != 0 )
563
+ return -1 ;
564
+
565
+ /* Same for parent directory */
566
+ if (fsync_parent_path (newfile , elevel ) != 0 )
567
+ return -1 ;
463
568
569
+ return 0 ;
570
+ }
464
571
465
572
/*
466
573
* InitFileAccess --- initialize this module during backend startup
@@ -2553,10 +2660,10 @@ SyncDataDirectory(void)
2553
2660
* in pg_tblspc, they'll get fsync'd twice. That's not an expected case
2554
2661
* so we don't worry about optimizing it.
2555
2662
*/
2556
- walkdir ("." , fsync_fname_ext , false, LOG );
2663
+ walkdir ("." , datadir_fsync_fname , false, LOG );
2557
2664
if (xlog_is_symlink )
2558
- walkdir ("pg_xlog" , fsync_fname_ext , false, LOG );
2559
- walkdir ("pg_tblspc" , fsync_fname_ext , true, LOG );
2665
+ walkdir ("pg_xlog" , datadir_fsync_fname , false, LOG );
2666
+ walkdir ("pg_tblspc" , datadir_fsync_fname , true, LOG );
2560
2667
}
2561
2668
2562
2669
/*
@@ -2670,15 +2777,26 @@ pre_sync_fname(const char *fname, bool isdir, int elevel)
2670
2777
2671
2778
#endif /* PG_FLUSH_DATA_WORKS */
2672
2779
2780
+ static void
2781
+ datadir_fsync_fname (const char * fname , bool isdir , int elevel )
2782
+ {
2783
+ /*
2784
+ * We want to silently ignore errors about unreadable files. Pass that
2785
+ * desire on to fsync_fname_ext().
2786
+ */
2787
+ fsync_fname_ext (fname , isdir , true, elevel );
2788
+ }
2789
+
2673
2790
/*
2674
2791
* fsync_fname_ext -- Try to fsync a file or directory
2675
2792
*
2676
- * Ignores errors trying to open unreadable files, or trying to fsync
2677
- * directories on systems where that isn't allowed/required, and logs other
2678
- * errors at a caller-specified level.
2793
+ * If ignore_perm is true, ignore errors upon trying to open unreadable
2794
+ * files. Logs other errors at a caller-specified level.
2795
+ *
2796
+ * Returns 0 if the operation succeeded, -1 otherwise.
2679
2797
*/
2680
- static void
2681
- fsync_fname_ext (const char * fname , bool isdir , int elevel )
2798
+ static int
2799
+ fsync_fname_ext (const char * fname , bool isdir , bool ignore_perm , int elevel )
2682
2800
{
2683
2801
int fd ;
2684
2802
int flags ;
@@ -2696,20 +2814,23 @@ fsync_fname_ext(const char *fname, bool isdir, int elevel)
2696
2814
else
2697
2815
flags |= O_RDONLY ;
2698
2816
2817
+ fd = OpenTransientFile ((char * ) fname , flags , 0 );
2818
+
2699
2819
/*
2700
- * Open the file, silently ignoring errors about unreadable files (or
2701
- * unsupported operations, e.g. opening a directory under Windows), and
2702
- * logging others.
2820
+ * Some OSs don't allow us to open directories at all (Windows returns
2821
+ * EACCES); just ignore the error in that case. If desired, also silently
2822
+ * ignore errors about unreadable files. Log others.
2703
2823
*/
2704
- fd = OpenTransientFile ((char * ) fname , flags , 0 );
2705
- if (fd < 0 )
2824
+ if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES ))
2825
+ return 0 ;
2826
+ else if (fd < 0 && ignore_perm && errno == EACCES )
2827
+ return 0 ;
2828
+ else if (fd < 0 )
2706
2829
{
2707
- if (errno == EACCES || (isdir && errno == EISDIR ))
2708
- return ;
2709
2830
ereport (elevel ,
2710
2831
(errcode_for_file_access (),
2711
2832
errmsg ("could not open file \"%s\": %m" , fname )));
2712
- return ;
2833
+ return -1 ;
2713
2834
}
2714
2835
2715
2836
returncode = pg_fsync (fd );
@@ -2719,9 +2840,49 @@ fsync_fname_ext(const char *fname, bool isdir, int elevel)
2719
2840
* those errors. Anything else needs to be logged.
2720
2841
*/
2721
2842
if (returncode != 0 && !(isdir && errno == EBADF ))
2843
+ {
2844
+ int save_errno ;
2845
+
2846
+ /* close file upon error, might not be in transaction context */
2847
+ save_errno = errno ;
2848
+ (void ) CloseTransientFile (fd );
2849
+ errno = save_errno ;
2850
+
2722
2851
ereport (elevel ,
2723
2852
(errcode_for_file_access (),
2724
2853
errmsg ("could not fsync file \"%s\": %m" , fname )));
2854
+ return -1 ;
2855
+ }
2725
2856
2726
2857
(void ) CloseTransientFile (fd );
2858
+
2859
+ return 0 ;
2860
+ }
2861
+
2862
+ /*
2863
+ * fsync_parent_path -- fsync the parent path of a file or directory
2864
+ *
2865
+ * This is aimed at making file operations persistent on disk in case of
2866
+ * an OS crash or power failure.
2867
+ */
2868
+ static int
2869
+ fsync_parent_path (const char * fname , int elevel )
2870
+ {
2871
+ char parentpath [MAXPGPATH ];
2872
+
2873
+ strlcpy (parentpath , fname , MAXPGPATH );
2874
+ get_parent_directory (parentpath );
2875
+
2876
+ /*
2877
+ * get_parent_directory() returns an empty string if the input argument is
2878
+ * just a file name (see comments in path.c), so handle that as being the
2879
+ * current directory.
2880
+ */
2881
+ if (strlen (parentpath ) == 0 )
2882
+ strlcpy (parentpath , "." , MAXPGPATH );
2883
+
2884
+ if (fsync_fname_ext (parentpath , true, false, elevel ) != 0 )
2885
+ return -1 ;
2886
+
2887
+ return 0 ;
2727
2888
}
0 commit comments