Backport PR #51871 on branch 2.0.x (ERR: Check that dtype_backend is valid) (#51964)

phofl · web-flow · commit ee7e30c570ec · 2023-03-14T17:43:03.000+01:00
ERR: Check that dtype_backend is valid (#51871)
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -94,6 +94,7 @@
 from pandas.util._decorators import doc
 from pandas.util._exceptions import find_stack_level
 from pandas.util._validators import (
+    check_dtype_backend,
     validate_ascending,
     validate_bool_kwarg,
     validate_fillna_kwargs,
@@ -6534,8 +6535,8 @@ def convert_dtypes(
 
             .. versionadded:: 1.2.0
         dtype_backend : {"numpy_nullable", "pyarrow"}, default "numpy_nullable"
-            Which dtype_backend to use, e.g. whether a DataFrame should have NumPy
-            arrays, nullable dtypes are used for all dtypes that have a nullable
+            Which dtype_backend to use, e.g. whether a DataFrame should use nullable
+            dtypes for all dtypes that have a nullable
             implementation when "numpy_nullable" is set, pyarrow is used for all
             dtypes if "pyarrow" is set.
 
@@ -6654,6 +6655,7 @@ def convert_dtypes(
         2    <NA>
         dtype: string
         """
+        check_dtype_backend(dtype_backend)
         if self.ndim == 1:
             return self._convert_dtypes(
                 infer_objects,
diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py
@@ -991,7 +991,7 @@ def convert_object_array(
     ----------
     content: List[np.ndarray]
     dtype: np.dtype or ExtensionDtype
-    dtype_backend: Controls if nullable dtypes are returned.
+    dtype_backend: Controls if nullable/pyarrow dtypes are returned.
     coerce_float: Cast floats that are integers to int.
 
     Returns
diff --git a/pandas/core/tools/numeric.py b/pandas/core/tools/numeric.py
@@ -10,6 +10,7 @@
     DtypeBackend,
     npt,
 )
+from pandas.util._validators import check_dtype_backend
 
 from pandas.core.dtypes.cast import maybe_downcast_numeric
 from pandas.core.dtypes.common import (
@@ -161,6 +162,8 @@ def to_numeric(
     if errors not in ("ignore", "raise", "coerce"):
         raise ValueError("invalid error value specified")
 
+    check_dtype_backend(dtype_backend)
+
     is_series = False
     is_index = False
     is_scalars = False
diff --git a/pandas/io/clipboards.py b/pandas/io/clipboards.py
@@ -7,6 +7,7 @@
 
 from pandas._libs import lib
 from pandas.util._exceptions import find_stack_level
+from pandas.util._validators import check_dtype_backend
 
 from pandas.core.dtypes.generic import ABCDataFrame
 
@@ -58,6 +59,8 @@ def read_clipboard(
     if encoding is not None and encoding.lower().replace("-", "") != "utf8":
         raise NotImplementedError("reading from clipboard only supports utf-8 encoding")
 
+    check_dtype_backend(dtype_backend)
+
     from pandas.io.clipboard import clipboard_get
     from pandas.io.parsers import read_csv
 
diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py
@@ -45,6 +45,7 @@
     Appender,
     doc,
 )
+from pandas.util._validators import check_dtype_backend
 
 from pandas.core.dtypes.common import (
     is_bool,
@@ -469,6 +470,8 @@ def read_excel(
     storage_options: StorageOptions = None,
     dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
 ) -> DataFrame | dict[IntStrT, DataFrame]:
+    check_dtype_backend(dtype_backend)
+
     should_close = False
     if not isinstance(io, ExcelFile):
         should_close = True
diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py
@@ -16,6 +16,7 @@
 )
 from pandas.compat._optional import import_optional_dependency
 from pandas.util._decorators import doc
+from pandas.util._validators import check_dtype_backend
 
 import pandas as pd
 from pandas.core.api import (
@@ -138,6 +139,8 @@ def read_feather(
     import_optional_dependency("pyarrow")
     from pyarrow import feather
 
+    check_dtype_backend(dtype_backend)
+
     with get_handle(
         path, "rb", storage_options=storage_options, is_text=False
     ) as handles:
diff --git a/pandas/io/html.py b/pandas/io/html.py
@@ -30,6 +30,7 @@
     AbstractMethodError,
     EmptyDataError,
 )
+from pandas.util._validators import check_dtype_backend
 
 from pandas.core.dtypes.common import is_list_like
 
@@ -1204,6 +1205,7 @@ def read_html(
             f'"{extract_links}"'
         )
     validate_header_arg(header)
+    check_dtype_backend(dtype_backend)
 
     io = stringify_path(io)
 
diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py
@@ -42,6 +42,7 @@
 from pandas.compat._optional import import_optional_dependency
 from pandas.errors import AbstractMethodError
 from pandas.util._decorators import doc
+from pandas.util._validators import check_dtype_backend
 
 from pandas.core.dtypes.common import (
     ensure_str,
@@ -744,6 +745,8 @@ def read_json(
     if orient == "table" and convert_axes:
         raise ValueError("cannot pass both convert_axes and orient='table'")
 
+    check_dtype_backend(dtype_backend)
+
     if dtype is None and orient != "table":
         # error: Incompatible types in assignment (expression has type "bool", variable
         # has type "Union[ExtensionDtype, str, dtype[Any], Type[str], Type[float],
@@ -944,14 +947,18 @@ def read(self) -> DataFrame | Series:
             if self.engine == "pyarrow":
                 pyarrow_json = import_optional_dependency("pyarrow.json")
                 pa_table = pyarrow_json.read_json(self.data)
+
+                mapping: type[ArrowDtype] | None | Callable
                 if self.dtype_backend == "pyarrow":
-                    return pa_table.to_pandas(types_mapper=ArrowDtype)
+                    mapping = ArrowDtype
                 elif self.dtype_backend == "numpy_nullable":
                     from pandas.io._util import _arrow_dtype_mapping
 
-                    mapping = _arrow_dtype_mapping()
-                    return pa_table.to_pandas(types_mapper=mapping.get)
-                return pa_table.to_pandas()
+                    mapping = _arrow_dtype_mapping().get
+                else:
+                    mapping = None
+
+                return pa_table.to_pandas(types_mapper=mapping)
             elif self.engine == "ujson":
                 if self.lines:
                     if self.chunksize:
diff --git a/pandas/io/orc.py b/pandas/io/orc.py
@@ -16,6 +16,7 @@
     WriteBuffer,
 )
 from pandas.compat._optional import import_optional_dependency
+from pandas.util._validators import check_dtype_backend
 
 from pandas.core.dtypes.common import (
     is_categorical_dtype,
@@ -78,6 +79,8 @@ def read_orc(
 
     orc = import_optional_dependency("pyarrow.orc")
 
+    check_dtype_backend(dtype_backend)
+
     with get_handle(path, "rb", is_text=False) as handles:
         orc_file = orc.ORCFile(handles.handle)
         pa_table = orc_file.read(columns=columns, **kwargs)
diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py
@@ -22,6 +22,7 @@
 from pandas.errors import AbstractMethodError
 from pandas.util._decorators import doc
 from pandas.util._exceptions import find_stack_level
+from pandas.util._validators import check_dtype_backend
 
 import pandas as pd
 from pandas import (
@@ -513,6 +514,7 @@ def read_parquet(
     DataFrame
     """
     impl = get_engine(engine)
+
     if use_nullable_dtypes is not lib.no_default:
         msg = (
             "The argument 'use_nullable_dtypes' is deprecated and will be removed "
@@ -525,6 +527,7 @@ def read_parquet(
         warnings.warn(msg, FutureWarning, stacklevel=find_stack_level())
     else:
         use_nullable_dtypes = False
+    check_dtype_backend(dtype_backend)
 
     return impl.read(
         path,
diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py
@@ -42,6 +42,7 @@
 )
 from pandas.util._decorators import Appender
 from pandas.util._exceptions import find_stack_level
+from pandas.util._validators import check_dtype_backend
 
 from pandas.core.dtypes.common import (
     is_file_like,
@@ -1346,6 +1347,8 @@ def read_fwf(
     kwds["colspecs"] = colspecs
     kwds["infer_nrows"] = infer_nrows
     kwds["engine"] = "python-fwf"
+
+    check_dtype_backend(dtype_backend)
     kwds["dtype_backend"] = dtype_backend
     return _read(filepath_or_buffer, kwds)
 
@@ -1999,6 +2002,8 @@ def _refine_defaults_read(
     else:
         raise ValueError(f"Argument {on_bad_lines} is invalid for on_bad_lines")
 
+    check_dtype_backend(dtype_backend)
+
     kwds["dtype_backend"] = dtype_backend
 
     return kwds
diff --git a/pandas/io/spss.py b/pandas/io/spss.py
@@ -8,6 +8,7 @@
 
 from pandas._libs import lib
 from pandas.compat._optional import import_optional_dependency
+from pandas.util._validators import check_dtype_backend
 
 from pandas.core.dtypes.inference import is_list_like
 
@@ -51,6 +52,7 @@ def read_spss(
     DataFrame
     """
     pyreadstat = import_optional_dependency("pyreadstat")
+    check_dtype_backend(dtype_backend)
 
     if usecols is not None:
         if not is_list_like(usecols):
diff --git a/pandas/io/sql.py b/pandas/io/sql.py
@@ -45,6 +45,7 @@
     DatabaseError,
 )
 from pandas.util._exceptions import find_stack_level
+from pandas.util._validators import check_dtype_backend
 
 from pandas.core.dtypes.common import (
     is_datetime64tz_dtype,
@@ -326,6 +327,7 @@ def read_sql_table(
     >>> pd.read_sql_table('table_name', 'postgres:///db_name')  # doctest:+SKIP
     """
 
+    check_dtype_backend(dtype_backend)
     if dtype_backend is lib.no_default:
         dtype_backend = "numpy"  # type: ignore[assignment]
 
@@ -457,6 +459,7 @@ def read_sql_query(
     parameter will be converted to UTC.
     """
 
+    check_dtype_backend(dtype_backend)
     if dtype_backend is lib.no_default:
         dtype_backend = "numpy"  # type: ignore[assignment]
 
@@ -621,6 +624,7 @@ def read_sql(
     1           1  2010-11-12
     """
 
+    check_dtype_backend(dtype_backend)
     if dtype_backend is lib.no_default:
         dtype_backend = "numpy"  # type: ignore[assignment]
 
diff --git a/pandas/io/xml.py b/pandas/io/xml.py
@@ -30,6 +30,7 @@
     ParserError,
 )
 from pandas.util._decorators import doc
+from pandas.util._validators import check_dtype_backend
 
 from pandas.core.dtypes.common import is_list_like
 
@@ -1112,6 +1113,7 @@ def read_xml(
     1    circle      360    NaN
     2  triangle      180    3.0
     """
+    check_dtype_backend(dtype_backend)
 
     return _parse(
         path_or_buffer=path_or_buffer,
diff --git a/pandas/tests/frame/methods/test_convert_dtypes.py b/pandas/tests/frame/methods/test_convert_dtypes.py
@@ -124,3 +124,13 @@ def test_pyarrow_dtype_empty_object(self):
         expected = pd.DataFrame(columns=[0])
         result = expected.convert_dtypes(dtype_backend="pyarrow")
         tm.assert_frame_equal(result, expected)
+
+    def test_pyarrow_engine_lines_false(self):
+        # GH 48893
+        df = pd.DataFrame({"a": [1, 2, 3]})
+        msg = (
+            "dtype_backend numpy is invalid, only 'numpy_nullable' and "
+            "'pyarrow' are allowed."
+        )
+        with pytest.raises(ValueError, match=msg):
+            df.convert_dtypes(dtype_backend="numpy")
diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py
@@ -1944,6 +1944,14 @@ def test_read_json_nullable_series(self, string_storage, dtype_backend, orient):
 
         tm.assert_series_equal(result, expected)
 
+    def test_invalid_dtype_backend(self):
+        msg = (
+            "dtype_backend numpy is invalid, only 'numpy_nullable' and "
+            "'pyarrow' are allowed."
+        )
+        with pytest.raises(ValueError, match=msg):
+            read_json("test", dtype_backend="numpy")
+
 
 def test_invalid_engine():
     # GH 48893
diff --git a/pandas/tests/io/parser/test_read_fwf.py b/pandas/tests/io/parser/test_read_fwf.py
@@ -1001,3 +1001,12 @@ def test_dtype_backend(string_storage, dtype_backend):
         expected["i"] = ArrowExtensionArray(pa.array([None, None]))
 
     tm.assert_frame_equal(result, expected)
+
+
+def test_invalid_dtype_backend():
+    msg = (
+        "dtype_backend numpy is invalid, only 'numpy_nullable' and "
+        "'pyarrow' are allowed."
+    )
+    with pytest.raises(ValueError, match=msg):
+        read_fwf("test", dtype_backend="numpy")
diff --git a/pandas/tests/io/parser/test_unsupported.py b/pandas/tests/io/parser/test_unsupported.py
@@ -200,3 +200,13 @@ def test_invalid_file_inputs(request, all_parsers):
 
     with pytest.raises(ValueError, match="Invalid"):
         parser.read_csv([])
+
+
+def test_invalid_dtype_backend(all_parsers):
+    parser = all_parsers
+    msg = (
+        "dtype_backend numpy is invalid, only 'numpy_nullable' and "
+        "'pyarrow' are allowed."
+    )
+    with pytest.raises(ValueError, match=msg):
+        parser.read_csv("test", dtype_backend="numpy")
diff --git a/pandas/tests/io/test_clipboard.py b/pandas/tests/io/test_clipboard.py
@@ -467,3 +467,11 @@ def test_read_clipboard_dtype_backend(
             expected["g"] = ArrowExtensionArray(pa.array([None, None]))
 
         tm.assert_frame_equal(result, expected)
+
+    def test_invalid_dtype_backend(self):
+        msg = (
+            "dtype_backend numpy is invalid, only 'numpy_nullable' and "
+            "'pyarrow' are allowed."
+        )
+        with pytest.raises(ValueError, match=msg):
+            read_clipboard(dtype_backend="numpy")
diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py
@@ -244,3 +244,14 @@ def test_read_feather_dtype_backend(self, string_storage, dtype_backend):
             )
 
         tm.assert_frame_equal(result, expected)
+
+    def test_invalid_dtype_backend(self):
+        msg = (
+            "dtype_backend numpy is invalid, only 'numpy_nullable' and "
+            "'pyarrow' are allowed."
+        )
+        df = pd.DataFrame({"int": list(range(1, 4))})
+        with tm.ensure_clean("tmp.feather") as path:
+            df.to_feather(path)
+            with pytest.raises(ValueError, match=msg):
+                read_feather(path, dtype_backend="numpy")
diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py
@@ -1469,3 +1469,11 @@ def test_extract_links_all_no_header(self):
         result = self.read_html(data, extract_links="all")[0]
         expected = DataFrame([[("Google.com", "https://google.com")]])
         tm.assert_frame_equal(result, expected)
+
+    def test_invalid_dtype_backend(self):
+        msg = (
+            "dtype_backend numpy is invalid, only 'numpy_nullable' and "
+            "'pyarrow' are allowed."
+        )
+        with pytest.raises(ValueError, match=msg):
+            read_html("test", dtype_backend="numpy")
diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py
diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
diff --git a/pandas/tests/io/test_spss.py b/pandas/tests/io/test_spss.py
diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py
diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py
diff --git a/pandas/tests/tools/test_to_numeric.py b/pandas/tests/tools/test_to_numeric.py
diff --git a/pandas/util/_validators.py b/pandas/util/_validators.py

Rendered diff view of the `pandas/io/html.py` hunks (same content as the `diff --git a/pandas/io/html.py b/pandas/io/html.py` section earlier in this document):

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -30,6 +30,7 @@` |
| 30 | 30 | `     AbstractMethodError,` |
| 31 | 31 | `     EmptyDataError,` |
| 32 | 32 | ` )` |
|  | 33 | `+from pandas.util._validators import check_dtype_backend` |
| 33 | 34 |  |
| 34 | 35 | ` from pandas.core.dtypes.common import is_list_like` |
| 35 | 36 |  |
|  |  | `@@ -1204,6 +1205,7 @@ def read_html(` |
| 1204 | 1205 | `             f'"{extract_links}"'` |
| 1205 | 1206 | `         )` |
| 1206 | 1207 | `     validate_header_arg(header)` |
|  | 1208 | `+    check_dtype_backend(dtype_backend)` |
| 1207 | 1209 |  |
| 1208 | 1210 | `     io = stringify_path(io)` |
| 1209 | 1211 |  |