GH-35264: [Python] Interchange protocol: test clean-up (#35530)

AlenkaF · web-flow · commit 14f9bf923a56 · 2023-05-11T21:36:29.000+02:00
The diff is a bit confusing so I will add some notes here: - `test_categorical_roundtrip` is not removed but renamed to `test_pandas_roundtrip_categorical` so that all tests checking `pyarrow` -> `pandas` -> `pyarrow` start with `test_pandas_roundtip_*` - the skip for the `test_pandas_roundtrip_categorical` is removed - `test_pandas_to_pyarrow_categorical_with_missing` is removed as the conversion for categorical with missing values is now checked in `test_pandas_roundtrip_categorical` - `test_roundtrip_pandas_boolean` is removed and the check for boolean has been added to `test_pandas_roundtrip` - `test_pandas_assertion_error_large_string` and `test_pandas_to_pyarrow_string_with_missing` are removed as the update for these is done separately in #35504 * Closes: #35264 Authored-by: Alenka Frim <frim.alenka@gmail.com> Signed-off-by: Joris Van den Bossche <jorisvandenbossche@gmail.com>
diff --git a/python/pyarrow/tests/interchange/test_conversion.py b/python/pyarrow/tests/interchange/test_conversion.py
@@ -99,53 +99,6 @@ def test_offset_of_sliced_array():
     #                        check_index=False, check_names=False)
 
 
-# Currently errors due to string conversion
-# as col.size is called as a property not method in pandas
-# see L255-L257 in pandas/core/interchange/from_dataframe.py
-@pytest.mark.pandas
-def test_categorical_roundtrip():
-    pytest.skip("Bug in pandas implementation")
-
-    if Version(pd.__version__) < Version("1.5.0"):
-        pytest.skip("__dataframe__ added to pandas in 1.5.0")
-
-    arr = ["Mon", "Tue", "Mon", "Wed", "Mon", "Thu", "Fri", "Sat", "Sun"]
-    table = pa.table(
-        {"weekday": pa.array(arr).dictionary_encode()}
-    )
-
-    pandas_df = table.to_pandas()
-    result = pi.from_dataframe(pandas_df)
-
-    # Checking equality for the values
-    # As the dtype of the indices is changed from int32 in pa.Table
-    # to int64 in pandas interchange protocol implementation
-    assert result[0].chunk(0).dictionary == table[0].chunk(0).dictionary
-
-    table_protocol = table.__dataframe__()
-    result_protocol = result.__dataframe__()
-
-    assert table_protocol.num_columns() == result_protocol.num_columns()
-    assert table_protocol.num_rows() == result_protocol.num_rows()
-    assert table_protocol.num_chunks() == result_protocol.num_chunks()
-    assert table_protocol.column_names() == result_protocol.column_names()
-
-    col_table = table_protocol.get_column(0)
-    col_result = result_protocol.get_column(0)
-
-    assert col_result.dtype[0] == DtypeKind.CATEGORICAL
-    assert col_result.dtype[0] == col_table.dtype[0]
-    assert col_result.size == col_table.size
-    assert col_result.offset == col_table.offset
-
-    desc_cat_table = col_result.describe_categorical
-    desc_cat_result = col_result.describe_categorical
-
-    assert desc_cat_table["is_ordered"] == desc_cat_result["is_ordered"]
-    assert desc_cat_table["is_dictionary"] == desc_cat_result["is_dictionary"]
-    assert isinstance(desc_cat_result["categories"]._col, pa.Array)
-
-
 @pytest.mark.pandas
 @pytest.mark.parametrize(
     "uint", [pa.uint8(), pa.uint16(), pa.uint32()]
@@ -170,6 +123,7 @@ def test_pandas_roundtrip(uint, int, float, np_float):
             "a": pa.array(arr, type=uint),
             "b": pa.array(arr, type=int),
             "c": pa.array(np.array(arr, dtype=np_float), type=float),
+            "d": [True, False, True],
         }
     )
     from pandas.api.interchange import (
@@ -189,10 +143,10 @@ def test_pandas_roundtrip(uint, int, float, np_float):
 
 
 @pytest.mark.pandas
-def test_roundtrip_pandas_string():
+def test_pandas_roundtrip_string():
     # See https://github.com/pandas-dev/pandas/issues/50554
     if Version(pd.__version__) < Version("1.6"):
-        pytest.skip(" Column.size() called as a method in pandas 2.0.0")
+        pytest.skip("Column.size() bug in pandas")
 
     arr = ["a", "", "c"]
     table = pa.table({"a": pa.array(arr)})
@@ -218,10 +172,10 @@ def test_roundtrip_pandas_string():
 
 
 @pytest.mark.pandas
-def test_roundtrip_pandas_large_string():
+def test_pandas_roundtrip_large_string():
     # See https://github.com/pandas-dev/pandas/issues/50554
     if Version(pd.__version__) < Version("1.6"):
-        pytest.skip(" Column.size() called as a method in pandas 2.0.0")
+        pytest.skip("Column.size() bug in pandas")
 
     arr = ["a", "", "c"]
     table = pa.table({"a_large": pa.array(arr, type=pa.large_string())})
@@ -255,10 +209,10 @@ def test_roundtrip_pandas_large_string():
 
 
 @pytest.mark.pandas
-def test_roundtrip_pandas_string_with_missing():
+def test_pandas_roundtrip_string_with_missing():
     # See https://github.com/pandas-dev/pandas/issues/50554
     if Version(pd.__version__) < Version("1.6"):
-        pytest.skip(" Column.size() called as a method in pandas 2.0.0")
+        pytest.skip("Column.size() bug in pandas")
 
     arr = ["a", "", "c", None]
     table = pa.table({"a": pa.array(arr),
@@ -287,19 +241,28 @@ def test_roundtrip_pandas_string_with_missing():
 
 
 @pytest.mark.pandas
-def test_roundtrip_pandas_boolean():
-    if Version(pd.__version__) < Version("1.5.0"):
-        pytest.skip("__dataframe__ added to pandas in 1.5.0")
+def test_pandas_roundtrip_categorical():
+    if Version(pd.__version__) < Version("2.0.2"):
+        pytest.skip("Bitmasks not supported in pandas interchange implementation")
 
-    table = pa.table({"a": [True, False, True]})
+    arr = ["Mon", "Tue", "Mon", "Wed", "Mon", "Thu", "Fri", "Sat", None]
+    table = pa.table(
+        {"weekday": pa.array(arr).dictionary_encode()}
+    )
 
     from pandas.api.interchange import (
         from_dataframe as pandas_from_dataframe
     )
     pandas_df = pandas_from_dataframe(table)
     result = pi.from_dataframe(pandas_df)
 
-    assert table.equals(result)
+    assert result["weekday"].to_pylist() == table["weekday"].to_pylist()
+    assert pa.types.is_dictionary(table["weekday"].type)
+    assert pa.types.is_dictionary(result["weekday"].type)
+    assert pa.types.is_string(table["weekday"].chunk(0).dictionary.type)
+    assert pa.types.is_large_string(result["weekday"].chunk(0).dictionary.type)
+    assert pa.types.is_int32(table["weekday"].chunk(0).indices.type)
+    assert pa.types.is_int8(result["weekday"].chunk(0).indices.type)
 
     table_protocol = table.__dataframe__()
     result_protocol = result.__dataframe__()
@@ -309,10 +272,25 @@ def test_roundtrip_pandas_boolean():
     assert table_protocol.num_chunks() == result_protocol.num_chunks()
     assert table_protocol.column_names() == result_protocol.column_names()
 
+    col_table = table_protocol.get_column(0)
+    col_result = result_protocol.get_column(0)
+
+    assert col_result.dtype[0] == DtypeKind.CATEGORICAL
+    assert col_result.dtype[0] == col_table.dtype[0]
+    assert col_result.size() == col_table.size()
+    assert col_result.offset == col_table.offset
+
+    desc_cat_table = col_result.describe_categorical
+    desc_cat_result = col_result.describe_categorical
+
+    assert desc_cat_table["is_ordered"] == desc_cat_result["is_ordered"]
+    assert desc_cat_table["is_dictionary"] == desc_cat_result["is_dictionary"]
+    assert isinstance(desc_cat_result["categories"]._col, pa.Array)
+
 
 @pytest.mark.pandas
 @pytest.mark.parametrize("unit", ['s', 'ms', 'us', 'ns'])
-def test_roundtrip_pandas_datetime(unit):
+def test_pandas_roundtrip_datetime(unit):
     if Version(pd.__version__) < Version("1.5.0"):
         pytest.skip("__dataframe__ added to pandas in 1.5.0")
     from datetime import datetime as dt
@@ -384,45 +362,6 @@ def test_pandas_to_pyarrow_float16_with_missing():
         pi.from_dataframe(df)
 
 
-@pytest.mark.pandas
-def test_pandas_to_pyarrow_string_with_missing():
-    if Version(pd.__version__) < Version("1.5.0"):
-        pytest.skip("__dataframe__ added to pandas in 1.5.0")
-
-    # pandas is using int64 offsets for string dtype so the constructed
-    # pyarrow string column will always be a large_string data type
-    arr = {
-        "Y": ["a", "b", None],  # bool, ColumnNullType.USE_BYTEMASK,
-    }
-    df = pd.DataFrame(arr)
-    expected = pa.table(arr)
-    result = pi.from_dataframe(df)
-
-    assert result[0].to_pylist() == expected[0].to_pylist()
-    assert pa.types.is_string(expected[0].type)
-    assert pa.types.is_large_string(result[0].type)
-
-
-@pytest.mark.pandas
-def test_pandas_to_pyarrow_categorical_with_missing():
-    if Version(pd.__version__) < Version("1.5.0"):
-        pytest.skip("__dataframe__ added to pandas in 1.5.0")
-
-    arr = ["Mon", "Tue", "Mon", "Wed", "Mon", "Thu", "Fri", "Sat", None]
-    df = pd.DataFrame(
-        {"weekday": arr}
-    )
-    df = df.astype("category")
-    result = pi.from_dataframe(df)
-
-    expected_dictionary = ["Fri", "Mon", "Sat", "Thu", "Tue", "Wed"]
-    expected_indices = pa.array([1, 4, 1, 5, 1, 3, 0, 2, None], type=pa.int8())
-
-    assert result[0].to_pylist() == arr
-    assert result[0].chunk(0).dictionary.to_pylist() == expected_dictionary
-    assert result[0].chunk(0).indices.equals(expected_indices)
-
-
 @pytest.mark.parametrize(
     "uint", [pa.uint8(), pa.uint16(), pa.uint32()]
 )