@@ -306,7 +306,10 @@ static void walkdir(const char *path,
306
306
#ifdef PG_FLUSH_DATA_WORKS
307
307
static void pre_sync_fname (const char * fname , bool isdir , int elevel );
308
308
#endif
309
- static void fsync_fname_ext (const char * fname , bool isdir , int elevel );
309
+ static void datadir_fsync_fname (const char * fname , bool isdir , int elevel );
310
+
311
+ static int fsync_fname_ext (const char * fname , bool isdir , bool ignore_perm , int elevel );
312
+ static int fsync_parent_path (const char * fname , int elevel );
310
313
311
314
312
315
/*
@@ -413,54 +416,158 @@ pg_flush_data(int fd, off_t offset, off_t amount)
413
416
* indicate the OS just doesn't allow/require fsyncing directories.
414
417
*/
415
418
void
416
- fsync_fname (char * fname , bool isdir )
419
+ fsync_fname (const char * fname , bool isdir )
420
+ {
421
+ fsync_fname_ext (fname , isdir , false, ERROR );
422
+ }
423
+
424
+ /*
425
+ * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability
426
+ *
427
+ * This routine ensures that, after returning, the effect of renaming file
428
+ * persists in case of a crash. A crash while this routine is running will
429
+ * leave you with either the pre-existing or the moved file in place of the
430
+ * new file; no mixed state or truncated files are possible.
431
+ *
432
+ * It does so by using fsync on the old filename and the possibly existing
433
+ * target filename before the rename, and the target file and directory after.
434
+ *
435
+ * Note that rename() cannot be used across arbitrary directories, as they
436
+ * might not be on the same filesystem. Therefore this routine does not
437
+ * support renaming across directories.
438
+ *
439
+ * Log errors with the caller specified severity.
440
+ *
441
+ * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
442
+ * valid upon return.
443
+ */
444
+ int
445
+ durable_rename (const char * oldfile , const char * newfile , int elevel )
417
446
{
418
447
int fd ;
419
- int returncode ;
420
448
421
449
/*
422
- * Some OSs require directories to be opened read-only whereas other
423
- * systems don't allow us to fsync files opened read-only; so we need both
424
- * cases here
450
+ * First fsync the old and target path (if it exists), to ensure that they
451
+ * are properly persistent on disk. Syncing the target file is not
452
+ * strictly necessary, but it makes it easier to reason about crashes;
453
+ * because it's then guaranteed that either source or target file exists
454
+ * after a crash.
425
455
*/
426
- if (!isdir )
427
- fd = OpenTransientFile (fname ,
428
- O_RDWR | PG_BINARY ,
429
- S_IRUSR | S_IWUSR );
456
+ if (fsync_fname_ext (oldfile , false, false, elevel ) != 0 )
457
+ return -1 ;
458
+
459
+ fd = OpenTransientFile ((char * ) newfile , PG_BINARY | O_RDWR , 0 );
460
+ if (fd < 0 )
461
+ {
462
+ if (errno != ENOENT )
463
+ {
464
+ ereport (elevel ,
465
+ (errcode_for_file_access (),
466
+ errmsg ("could not open file \"%s\": %m" , newfile )));
467
+ return -1 ;
468
+ }
469
+ }
430
470
else
431
- fd = OpenTransientFile (fname ,
432
- O_RDONLY | PG_BINARY ,
433
- S_IRUSR | S_IWUSR );
471
+ {
472
+ if (pg_fsync (fd ) != 0 )
473
+ {
474
+ int save_errno ;
475
+
476
+ /* close file upon error, might not be in transaction context */
477
+ save_errno = errno ;
478
+ CloseTransientFile (fd );
479
+ errno = save_errno ;
480
+
481
+ ereport (elevel ,
482
+ (errcode_for_file_access (),
483
+ errmsg ("could not fsync file \"%s\": %m" , newfile )));
484
+ return -1 ;
485
+ }
486
+ CloseTransientFile (fd );
487
+ }
488
+
489
+ /* Time to do the real deal... */
490
+ if (rename (oldfile , newfile ) < 0 )
491
+ {
492
+ ereport (elevel ,
493
+ (errcode_for_file_access (),
494
+ errmsg ("could not rename file \"%s\" to \"%s\": %m" ,
495
+ oldfile , newfile )));
496
+ return -1 ;
497
+ }
434
498
435
499
/*
436
- * Some OSs don't allow us to open directories at all (Windows returns
437
- * EACCES)
500
+ * To guarantee renaming the file is persistent, fsync the file with its
501
+ * new name, and its containing directory.
438
502
*/
439
- if (fd < 0 && isdir && ( errno == EISDIR || errno == EACCES ) )
440
- return ;
503
+ if (fsync_fname_ext ( newfile , false, false, elevel ) != 0 )
504
+ return -1 ;
441
505
442
- else if (fd < 0 )
443
- ereport (ERROR ,
444
- (errcode_for_file_access (),
445
- errmsg ("could not open file \"%s\": %m" , fname )));
506
+ if (fsync_parent_path (newfile , elevel ) != 0 )
507
+ return -1 ;
446
508
447
- returncode = pg_fsync (fd );
509
+ return 0 ;
510
+ }
511
+
512
/*
 * durable_link_or_rename -- rename a file in a durable manner.
 *
 * Similar to durable_rename(), except that this routine tries (but does not
 * guarantee) not to overwrite the target file.
 *
 * Note that a crash in an unfortunate moment can leave you with two links to
 * the target file.
 *
 * oldfile: path of the file to be moved.
 * newfile: destination path.
 * elevel:	severity with which any failure is logged.
 *
 * Returns 0 if the operation succeeded, -1 otherwise.  Note that errno is not
 * valid upon return.
 */
int
durable_link_or_rename(const char *oldfile, const char *newfile, int elevel)
{
	/*
	 * Flush the source file first, so that whatever ends up under the new
	 * name has valid contents even if we crash right after the rename/link.
	 */
	if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
		return -1;

#if HAVE_WORKING_LINK
	/* link() fails rather than replacing an existing target. */
	if (link(oldfile, newfile) < 0)
	{
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not link file \"%s\" to \"%s\": %m",
						oldfile, newfile)));
		return -1;
	}

	/*
	 * Drop the old name.  The result is not checked; if this fails, both
	 * names simply remain in place.
	 */
	(void) unlink(oldfile);
#else
	/* XXX: Add racy file existence check? */
	if (rename(oldfile, newfile) < 0)
	{
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not rename file \"%s\" to \"%s\": %m",
						oldfile, newfile)));
		return -1;
	}
#endif

	/*
	 * Make the change persistent in case of an OS crash: flush the new entry
	 * first and then, only if that worked, its parent directory.
	 */
	if (fsync_fname_ext(newfile, false, false, elevel) != 0 ||
		fsync_parent_path(newfile, elevel) != 0)
		return -1;

	return 0;
}
464
571
465
572
/*
466
573
* InitFileAccess --- initialize this module during backend startup
@@ -2547,10 +2654,10 @@ SyncDataDirectory(void)
2547
2654
* in pg_tblspc, they'll get fsync'd twice. That's not an expected case
2548
2655
* so we don't worry about optimizing it.
2549
2656
*/
2550
- walkdir ("." , fsync_fname_ext , false, LOG );
2657
+ walkdir ("." , datadir_fsync_fname , false, LOG );
2551
2658
if (xlog_is_symlink )
2552
- walkdir ("pg_xlog" , fsync_fname_ext , false, LOG );
2553
- walkdir ("pg_tblspc" , fsync_fname_ext , true, LOG );
2659
+ walkdir ("pg_xlog" , datadir_fsync_fname , false, LOG );
2660
+ walkdir ("pg_tblspc" , datadir_fsync_fname , true, LOG );
2554
2661
}
2555
2662
2556
2663
/*
@@ -2664,15 +2771,26 @@ pre_sync_fname(const char *fname, bool isdir, int elevel)
2664
2771
2665
2772
#endif /* PG_FLUSH_DATA_WORKS */
2666
2773
2774
/*
 * datadir_fsync_fname -- fsync one file or directory, tolerating unreadable
 * entries
 *
 * Has the (fname, isdir, elevel) signature expected of a walkdir() action.
 * Nothing is returned; the status from fsync_fname_ext() is discarded.
 */
static void
datadir_fsync_fname(const char *fname, bool isdir, int elevel)
{
	/*
	 * We want to silently ignore errors about unreadable files.  Pass that
	 * desire on to fsync_fname_ext() by setting ignore_perm.
	 */
	(void) fsync_fname_ext(fname, isdir, true, elevel);
}
2783
+
2667
2784
/*
2668
2785
* fsync_fname_ext -- Try to fsync a file or directory
2669
2786
*
2670
- * Ignores errors trying to open unreadable files, or trying to fsync
2671
- * directories on systems where that isn't allowed/required, and logs other
2672
- * errors at a caller-specified level.
2787
+ * If ignore_perm is true, ignore errors upon trying to open unreadable
2788
+ * files. Logs other errors at a caller-specified level.
2789
+ *
2790
+ * Returns 0 if the operation succeeded, -1 otherwise.
2673
2791
*/
2674
- static void
2675
- fsync_fname_ext (const char * fname , bool isdir , int elevel )
2792
+ static int
2793
+ fsync_fname_ext (const char * fname , bool isdir , bool ignore_perm , int elevel )
2676
2794
{
2677
2795
int fd ;
2678
2796
int flags ;
@@ -2690,20 +2808,23 @@ fsync_fname_ext(const char *fname, bool isdir, int elevel)
2690
2808
else
2691
2809
flags |= O_RDONLY ;
2692
2810
2811
+ fd = OpenTransientFile ((char * ) fname , flags , 0 );
2812
+
2693
2813
/*
2694
- * Open the file, silently ignoring errors about unreadable files (or
2695
- * unsupported operations, e.g. opening a directory under Windows), and
2696
- * logging others.
2814
+ * Some OSs don't allow us to open directories at all (Windows returns
2815
+ * EACCES), just ignore the error in that case. If desired also silently
2816
+ * ignore errors about unreadable files. Log others.
2697
2817
*/
2698
- fd = OpenTransientFile ((char * ) fname , flags , 0 );
2699
- if (fd < 0 )
2818
+ if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES ))
2819
+ return 0 ;
2820
+ else if (fd < 0 && ignore_perm && errno == EACCES )
2821
+ return 0 ;
2822
+ else if (fd < 0 )
2700
2823
{
2701
- if (errno == EACCES || (isdir && errno == EISDIR ))
2702
- return ;
2703
2824
ereport (elevel ,
2704
2825
(errcode_for_file_access (),
2705
2826
errmsg ("could not open file \"%s\": %m" , fname )));
2706
- return ;
2827
+ return -1 ;
2707
2828
}
2708
2829
2709
2830
returncode = pg_fsync (fd );
@@ -2713,9 +2834,49 @@ fsync_fname_ext(const char *fname, bool isdir, int elevel)
2713
2834
* those errors. Anything else needs to be logged.
2714
2835
*/
2715
2836
if (returncode != 0 && !(isdir && errno == EBADF ))
2837
+ {
2838
+ int save_errno ;
2839
+
2840
+ /* close file upon error, might not be in transaction context */
2841
+ save_errno = errno ;
2842
+ (void ) CloseTransientFile (fd );
2843
+ errno = save_errno ;
2844
+
2716
2845
ereport (elevel ,
2717
2846
(errcode_for_file_access (),
2718
2847
errmsg ("could not fsync file \"%s\": %m" , fname )));
2848
+ return -1 ;
2849
+ }
2719
2850
2720
2851
(void ) CloseTransientFile (fd );
2852
+
2853
+ return 0 ;
2854
+ }
2855
+
2856
+ /*
2857
+ * fsync_parent_path -- fsync the parent path of a file or directory
2858
+ *
2859
+ * This is aimed at making file operations persistent on disk in case of
2860
+ * an OS crash or power failure.
2861
+ */
2862
+ static int
2863
+ fsync_parent_path (const char * fname , int elevel )
2864
+ {
2865
+ char parentpath [MAXPGPATH ];
2866
+
2867
+ strlcpy (parentpath , fname , MAXPGPATH );
2868
+ get_parent_directory (parentpath );
2869
+
2870
+ /*
2871
+ * get_parent_directory() returns an empty string if the input argument is
2872
+ * just a file name (see comments in path.c), so handle that as being the
2873
+ * current directory.
2874
+ */
2875
+ if (strlen (parentpath ) == 0 )
2876
+ strlcpy (parentpath , "." , MAXPGPATH );
2877
+
2878
+ if (fsync_fname_ext (parentpath , true, false, elevel ) != 0 )
2879
+ return -1 ;
2880
+
2881
+ return 0 ;
2721
2882
}