REF: StringArray._from_sequence

pandas-dev · jreback · Aug 17, 2020 · Aug 2, 2020 · Aug 3, 2020 · Aug 3, 2020
commit df8e4d6498ee63fef64e4d7dd704b65f78e21e12
diff --git a/doc/source/whatsnew/v1.1.1.rst b/doc/source/whatsnew/v1.1.1.rst
@@ -74,6 +74,11 @@ Categorical
 - Bug in :class:`DataFrame` constructor failing to raise ``ValueError`` in some cases when data and index have mismatched lengths (:issue:`33437`)
 -
 
+**Strings**
+
+- fix memory usage issue when instantiating large  :class:`pandas.arrays.StringArray` (:issue:`35499`)
+
+
 .. ---------------------------------------------------------------------------
 
 .. _whatsnew_111.contributors:

diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
@@ -1698,6 +1698,20 @@ cpdef bint is_string_array(ndarray values, bint skipna=False):
     return validator.validate(values)
 
 
+cpdef ndarray ensure_string_array(ndarray values, object na_value):
+    cdef:
+        Py_ssize_t i = 0, n = len(values)
+
+    for i in range(n):
+        val = values[i]
+        if not checknull(val):
+            values[i] = str(val)
+        else:
+            values[i] = na_value
+
+    return values
+
+
 cdef class BytesValidator(Validator):
     cdef inline bint is_value_typed(self, object value) except -1:
         return isinstance(value, bytes)

diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
@@ -177,11 +177,10 @@ class StringArray(PandasArray):
 
     def __init__(self, values, copy=False):
         values = extract_array(values)
-        skip_validation = isinstance(values, type(self))
 
         super().__init__(values, copy=copy)
         self._dtype = StringDtype()
-        if not skip_validation:
+        if not isinstance(values, type(self)):
             self._validate()
 
     def _validate(self):
@@ -195,28 +194,16 @@ def _validate(self):
             )
 
     @classmethod
-    def _from_sequence(cls, scalars, dtype=None, copy=False):
+    def _from_sequence(cls, scalars, dtype=None, copy=True):
         if dtype:
             assert dtype == "string"
 
         result = np.asarray(scalars, dtype="object")
         if copy and result is scalars:
             result = result.copy()
 
-        # Standardize all missing-like values to NA
-        # TODO: it would be nice to do this in _validate / lib.is_string_array
-        # We are already doing a scan over the values there.
-        na_values = isna(result)
-        has_nans = na_values.any()
-        if has_nans and result is scalars:
-            # force a copy now, if we haven't already
-            result = result.copy()
-
-        # convert to str, then to object to avoid dtype like '<U3', then insert na_value
-        result = np.asarray(result, dtype=str)
-        result = np.asarray(result, dtype="object")
-        if has_nans:
-            result[na_values] = StringDtype.na_value
+        # convert non-na-likes to str, and nan-likes to StringDtype.na_value
+        result = lib.ensure_string_array(result, StringDtype.na_value)
 
         return cls(result)
 

diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py
@@ -206,12 +206,16 @@ def test_constructor_raises():
 
 @pytest.mark.parametrize("copy", [True, False])
 def test_from_sequence_no_mutate(copy):
-    a = np.array(["a", np.nan], dtype=object)
-    original = a.copy()
-    result = pd.arrays.StringArray._from_sequence(a, copy=copy)
-    expected = pd.arrays.StringArray(np.array(["a", pd.NA], dtype=object))
+    nan_arr = np.array(["a", np.nan], dtype=object)
+    na_arr = np.array(["a", pd.NA], dtype=object)
+
+    result = pd.arrays.StringArray._from_sequence(nan_arr, copy=copy)
+    expected = pd.arrays.StringArray(na_arr)
+
     tm.assert_extension_array_equal(result, expected)
-    tm.assert_numpy_array_equal(a, original)
+
+    expected = nan_arr if copy else na_arr
+    tm.assert_numpy_array_equal(nan_arr, expected)
 
 
 def test_astype_int():