diff --git a/Lib/_pyio.py b/Lib/_pyio.py index 75b5ad1b1a47d2..18849b309b8605 100644 --- a/Lib/_pyio.py +++ b/Lib/_pyio.py @@ -242,14 +242,7 @@ def open(file, mode="r", buffering=-1, encoding=None, errors=None, buffering = -1 line_buffering = True if buffering < 0: - buffering = DEFAULT_BUFFER_SIZE - try: - bs = os.fstat(raw.fileno()).st_blksize - except (OSError, AttributeError): - pass - else: - if bs > 1: - buffering = bs + buffering = raw._blksize if buffering < 0: raise ValueError("invalid buffering size") if buffering == 0: @@ -1565,19 +1558,15 @@ def __init__(self, file, mode='r', closefd=True, opener=None): os.set_inheritable(fd, False) self._closefd = closefd - fdfstat = os.fstat(fd) + self._stat_atopen = os.fstat(fd) try: - if stat.S_ISDIR(fdfstat.st_mode): + if stat.S_ISDIR(self._stat_atopen.st_mode): raise IsADirectoryError(errno.EISDIR, os.strerror(errno.EISDIR), file) except AttributeError: # Ignore the AttributeError if stat.S_ISDIR or errno.EISDIR # don't exist. pass - self._blksize = getattr(fdfstat, 'st_blksize', 0) - if self._blksize <= 1: - self._blksize = DEFAULT_BUFFER_SIZE - self._estimated_size = fdfstat.st_size if _setmode: # don't translate newlines (\r\n <=> \n) @@ -1623,6 +1612,17 @@ def __repr__(self): return ('<%s name=%r mode=%r closefd=%r>' % (class_name, name, self.mode, self._closefd)) + @property + def _blksize(self): + if self._stat_atopen is None: + return DEFAULT_BUFFER_SIZE + + blksize = getattr(self._stat_atopen, "st_blksize", 0) + # WASI sets blsize to 0 + if not blksize: + return DEFAULT_BUFFER_SIZE + return blksize + def _checkReadable(self): if not self._readable: raise UnsupportedOperation('File not open for reading') @@ -1655,16 +1655,20 @@ def readall(self): """ self._checkClosed() self._checkReadable() - if self._estimated_size <= 0: + if self._stat_atopen is None or self._stat_atopen.st_size <= 0: bufsize = DEFAULT_BUFFER_SIZE else: - bufsize = self._estimated_size + 1 + # In order to detect end of file, need a read() of at least 1 + # byte which returns size 0. Oversize the buffer by 1 byte so the + # I/O can be completed with two read() calls (one for all data, one + # for EOF) without needing to resize the buffer. + bufsize = self._stat_atopen.st_size + 1 - if self._estimated_size > 65536: + if self._stat_atopen.st_size > 65536: try: pos = os.lseek(self._fd, 0, SEEK_CUR) - if self._estimated_size >= pos: - bufsize = self._estimated_size - pos + 1 + if self._stat_atopen.st_size >= pos: + bufsize = self._stat_atopen.st_size - pos + 1 except OSError: pass @@ -1742,7 +1746,7 @@ def truncate(self, size=None): if size is None: size = self.tell() os.ftruncate(self._fd, size) - self._estimated_size = size + self._stat_atopen = None return size def close(self): diff --git a/Modules/_io/fileio.c b/Modules/_io/fileio.c index 5d9d87d6118a75..865b0e3634f3b4 100644 --- a/Modules/_io/fileio.c +++ b/Modules/_io/fileio.c @@ -74,8 +74,13 @@ typedef struct { signed int seekable : 2; /* -1 means unknown */ unsigned int closefd : 1; char finalizing; - unsigned int blksize; - Py_off_t estimated_size; + /* Stat result which was grabbed at file open, useful for optimizing common + File I/O patterns to be more efficient. This is only guidance / an + estimate, as it is subject to Time-Of-Check to Time-Of-Use (TOCTOU) + issues / bugs. Both the underlying file descriptor and file may be + modified outside of the fileio object / Python (ex. gh-90102, GH-121941, + gh-109523). */ + struct _Py_stat_struct *stat_atopen; PyObject *weakreflist; PyObject *dict; } fileio; @@ -199,8 +204,7 @@ fileio_new(PyTypeObject *type, PyObject *args, PyObject *kwds) self->writable = 0; self->appending = 0; self->seekable = -1; - self->blksize = 0; - self->estimated_size = -1; + self->stat_atopen = NULL; self->closefd = 1; self->weakreflist = NULL; } @@ -256,7 +260,6 @@ _io_FileIO___init___impl(fileio *self, PyObject *nameobj, const char *mode, #elif !defined(MS_WINDOWS) int *atomic_flag_works = NULL; #endif - struct _Py_stat_struct fdfstat; int fstat_result; int async_err = 0; @@ -454,9 +457,13 @@ _io_FileIO___init___impl(fileio *self, PyObject *nameobj, const char *mode, #endif } - self->blksize = DEFAULT_BUFFER_SIZE; + self->stat_atopen = PyMem_New(struct _Py_stat_struct, 1); + if (self->stat_atopen == NULL) { + PyErr_NoMemory(); + goto error; + } Py_BEGIN_ALLOW_THREADS - fstat_result = _Py_fstat_noraise(self->fd, &fdfstat); + fstat_result = _Py_fstat_noraise(self->fd, self->stat_atopen); Py_END_ALLOW_THREADS if (fstat_result < 0) { /* Tolerate fstat() errors other than EBADF. See Issue #25717, where @@ -471,25 +478,21 @@ _io_FileIO___init___impl(fileio *self, PyObject *nameobj, const char *mode, #endif goto error; } + + PyMem_Free(self->stat_atopen); + self->stat_atopen = NULL; } else { #if defined(S_ISDIR) && defined(EISDIR) /* On Unix, open will succeed for directories. In Python, there should be no file objects referring to directories, so we need a check. */ - if (S_ISDIR(fdfstat.st_mode)) { + if (S_ISDIR(self->stat_atopen->st_mode)) { errno = EISDIR; PyErr_SetFromErrnoWithFilenameObject(PyExc_OSError, nameobj); goto error; } #endif /* defined(S_ISDIR) */ -#ifdef HAVE_STRUCT_STAT_ST_BLKSIZE - if (fdfstat.st_blksize > 1) - self->blksize = fdfstat.st_blksize; -#endif /* HAVE_STRUCT_STAT_ST_BLKSIZE */ - if (fdfstat.st_size < PY_SSIZE_T_MAX) { - self->estimated_size = (Py_off_t)fdfstat.st_size; - } } #if defined(MS_WINDOWS) || defined(__CYGWIN__) @@ -521,6 +524,10 @@ _io_FileIO___init___impl(fileio *self, PyObject *nameobj, const char *mode, internal_close(self); _PyErr_ChainExceptions1(exc); } + if (self->stat_atopen != NULL) { + PyMem_Free(self->stat_atopen); + self->stat_atopen = NULL; + } done: #ifdef MS_WINDOWS @@ -553,6 +560,10 @@ fileio_dealloc(fileio *self) if (_PyIOBase_finalize((PyObject *) self) < 0) return; _PyObject_GC_UNTRACK(self); + if (self->stat_atopen != NULL) { + PyMem_Free(self->stat_atopen); + self->stat_atopen = NULL; + } if (self->weakreflist != NULL) PyObject_ClearWeakRefs((PyObject *) self); (void)fileio_clear(self); @@ -725,20 +736,27 @@ _io_FileIO_readall_impl(fileio *self) return err_closed(); } - end = self->estimated_size; + if (self->stat_atopen != NULL && self->stat_atopen->st_size < _PY_READ_MAX) { + end = (Py_off_t)self->stat_atopen->st_size; + } + else { + end = -1; + } if (end <= 0) { /* Use a default size and resize as needed. */ bufsize = SMALLCHUNK; } else { - /* This is probably a real file, so we try to allocate a - buffer one byte larger than the rest of the file. If the - calculation is right then we should get EOF without having - to enlarge the buffer. */ + /* This is probably a real file. */ if (end > _PY_READ_MAX - 1) { bufsize = _PY_READ_MAX; } else { + /* In order to detect end of file, need a read() of at + least 1 byte which returns size 0. Oversize the buffer + by 1 byte so the I/O can be completed with two read() + calls (one for all data, one for EOF) without needing + to resize the buffer. */ bufsize = (size_t)end + 1; } @@ -1094,11 +1112,13 @@ _io_FileIO_truncate_impl(fileio *self, PyTypeObject *cls, PyObject *posobj) return NULL; } - /* Sometimes a large file is truncated. While estimated_size is used as a - estimate, that it is much larger than the actual size can result in a - significant over allocation and sometimes a MemoryError / running out of - memory. */ - self->estimated_size = pos; + /* Since the file was truncated, its size at open is no longer accurate + as an estimate. Clear out the stat result, and rely on dynamic resize + code if a readall is requested. */ + if (self->stat_atopen != NULL) { + PyMem_Free(self->stat_atopen); + self->stat_atopen = NULL; + } return posobj; } @@ -1229,16 +1249,27 @@ get_mode(fileio *self, void *closure) return PyUnicode_FromString(mode_string(self)); } +static PyObject * +get_blksize(fileio *self, void *closure) +{ +#ifdef HAVE_STRUCT_STAT_ST_BLKSIZE + if (self->stat_atopen != NULL && self->stat_atopen->st_blksize > 1) { + return PyLong_FromLong(self->stat_atopen->st_blksize); + } +#endif /* HAVE_STRUCT_STAT_ST_BLKSIZE */ + return PyLong_FromLong(DEFAULT_BUFFER_SIZE); +} + static PyGetSetDef fileio_getsetlist[] = { {"closed", (getter)get_closed, NULL, "True if the file is closed"}, {"closefd", (getter)get_closefd, NULL, "True if the file descriptor will be closed by close()."}, {"mode", (getter)get_mode, NULL, "String giving the file mode"}, + {"_blksize", (getter)get_blksize, NULL, "Stat st_blksize if available"}, {NULL}, }; static PyMemberDef fileio_members[] = { - {"_blksize", Py_T_UINT, offsetof(fileio, blksize), 0}, {"_finalizing", Py_T_BOOL, offsetof(fileio, finalizing), 0}, {"__weaklistoffset__", Py_T_PYSSIZET, offsetof(fileio, weakreflist), Py_READONLY}, {"__dictoffset__", Py_T_PYSSIZET, offsetof(fileio, dict), Py_READONLY},