Skip to content

Commit 14f9bf9

Browse files
authored
GH-35264: [Python] Interchange protocol: test clean-up (#35530)
The diff is a bit confusing, so I will add some notes here:

- `test_categorical_roundtrip` is not removed but renamed to `test_pandas_roundtrip_categorical`, so that all tests checking `pyarrow` -> `pandas` -> `pyarrow` start with `test_pandas_roundtrip_*`
- the skip for `test_pandas_roundtrip_categorical` is removed
- `test_pandas_to_pyarrow_categorical_with_missing` is removed, as the conversion for categorical with missing values is now checked in `test_pandas_roundtrip_categorical`
- `test_roundtrip_pandas_boolean` is removed and the check for boolean has been added to `test_pandas_roundtrip`
- `test_pandas_assertion_error_large_string` and `test_pandas_to_pyarrow_string_with_missing` are removed, as the update for these is done separately in #35504

* Closes: #35264

Authored-by: Alenka Frim <[email protected]>
Signed-off-by: Joris Van den Bossche <[email protected]>
1 parent b36ff71 commit 14f9bf9

File tree

1 file changed

+37
-98
lines changed

1 file changed

+37
-98
lines changed

python/pyarrow/tests/interchange/test_conversion.py

Lines changed: 37 additions & 98 deletions
Original file line numberDiff line numberDiff line change
@@ -99,53 +99,6 @@ def test_offset_of_sliced_array():
9999
# check_index=False, check_names=False)
100100

101101

102-
# Currently errors due to string conversion
103-
# as col.size is called as a property not method in pandas
104-
# see L255-L257 in pandas/core/interchange/from_dataframe.py
105-
@pytest.mark.pandas
106-
def test_categorical_roundtrip():
107-
pytest.skip("Bug in pandas implementation")
108-
109-
if Version(pd.__version__) < Version("1.5.0"):
110-
pytest.skip("__dataframe__ added to pandas in 1.5.0")
111-
112-
arr = ["Mon", "Tue", "Mon", "Wed", "Mon", "Thu", "Fri", "Sat", "Sun"]
113-
table = pa.table(
114-
{"weekday": pa.array(arr).dictionary_encode()}
115-
)
116-
117-
pandas_df = table.to_pandas()
118-
result = pi.from_dataframe(pandas_df)
119-
120-
# Checking equality for the values
121-
# As the dtype of the indices is changed from int32 in pa.Table
122-
# to int64 in pandas interchange protocol implementation
123-
assert result[0].chunk(0).dictionary == table[0].chunk(0).dictionary
124-
125-
table_protocol = table.__dataframe__()
126-
result_protocol = result.__dataframe__()
127-
128-
assert table_protocol.num_columns() == result_protocol.num_columns()
129-
assert table_protocol.num_rows() == result_protocol.num_rows()
130-
assert table_protocol.num_chunks() == result_protocol.num_chunks()
131-
assert table_protocol.column_names() == result_protocol.column_names()
132-
133-
col_table = table_protocol.get_column(0)
134-
col_result = result_protocol.get_column(0)
135-
136-
assert col_result.dtype[0] == DtypeKind.CATEGORICAL
137-
assert col_result.dtype[0] == col_table.dtype[0]
138-
assert col_result.size == col_table.size
139-
assert col_result.offset == col_table.offset
140-
141-
desc_cat_table = col_result.describe_categorical
142-
desc_cat_result = col_result.describe_categorical
143-
144-
assert desc_cat_table["is_ordered"] == desc_cat_result["is_ordered"]
145-
assert desc_cat_table["is_dictionary"] == desc_cat_result["is_dictionary"]
146-
assert isinstance(desc_cat_result["categories"]._col, pa.Array)
147-
148-
149102
@pytest.mark.pandas
150103
@pytest.mark.parametrize(
151104
"uint", [pa.uint8(), pa.uint16(), pa.uint32()]
@@ -170,6 +123,7 @@ def test_pandas_roundtrip(uint, int, float, np_float):
170123
"a": pa.array(arr, type=uint),
171124
"b": pa.array(arr, type=int),
172125
"c": pa.array(np.array(arr, dtype=np_float), type=float),
126+
"d": [True, False, True],
173127
}
174128
)
175129
from pandas.api.interchange import (
@@ -189,10 +143,10 @@ def test_pandas_roundtrip(uint, int, float, np_float):
189143

190144

191145
@pytest.mark.pandas
192-
def test_roundtrip_pandas_string():
146+
def test_pandas_roundtrip_string():
193147
# See https://github.com/pandas-dev/pandas/issues/50554
194148
if Version(pd.__version__) < Version("1.6"):
195-
pytest.skip(" Column.size() called as a method in pandas 2.0.0")
149+
pytest.skip("Column.size() bug in pandas")
196150

197151
arr = ["a", "", "c"]
198152
table = pa.table({"a": pa.array(arr)})
@@ -218,10 +172,10 @@ def test_roundtrip_pandas_string():
218172

219173

220174
@pytest.mark.pandas
221-
def test_roundtrip_pandas_large_string():
175+
def test_pandas_roundtrip_large_string():
222176
# See https://github.com/pandas-dev/pandas/issues/50554
223177
if Version(pd.__version__) < Version("1.6"):
224-
pytest.skip(" Column.size() called as a method in pandas 2.0.0")
178+
pytest.skip("Column.size() bug in pandas")
225179

226180
arr = ["a", "", "c"]
227181
table = pa.table({"a_large": pa.array(arr, type=pa.large_string())})
@@ -255,10 +209,10 @@ def test_roundtrip_pandas_large_string():
255209

256210

257211
@pytest.mark.pandas
258-
def test_roundtrip_pandas_string_with_missing():
212+
def test_pandas_roundtrip_string_with_missing():
259213
# See https://github.com/pandas-dev/pandas/issues/50554
260214
if Version(pd.__version__) < Version("1.6"):
261-
pytest.skip(" Column.size() called as a method in pandas 2.0.0")
215+
pytest.skip("Column.size() bug in pandas")
262216

263217
arr = ["a", "", "c", None]
264218
table = pa.table({"a": pa.array(arr),
@@ -287,19 +241,28 @@ def test_roundtrip_pandas_string_with_missing():
287241

288242

289243
@pytest.mark.pandas
290-
def test_roundtrip_pandas_boolean():
291-
if Version(pd.__version__) < Version("1.5.0"):
292-
pytest.skip("__dataframe__ added to pandas in 1.5.0")
244+
def test_pandas_roundtrip_categorical():
245+
if Version(pd.__version__) < Version("2.0.2"):
246+
pytest.skip("Bitmasks not supported in pandas interchange implementation")
293247

294-
table = pa.table({"a": [True, False, True]})
248+
arr = ["Mon", "Tue", "Mon", "Wed", "Mon", "Thu", "Fri", "Sat", None]
249+
table = pa.table(
250+
{"weekday": pa.array(arr).dictionary_encode()}
251+
)
295252

296253
from pandas.api.interchange import (
297254
from_dataframe as pandas_from_dataframe
298255
)
299256
pandas_df = pandas_from_dataframe(table)
300257
result = pi.from_dataframe(pandas_df)
301258

302-
assert table.equals(result)
259+
assert result["weekday"].to_pylist() == table["weekday"].to_pylist()
260+
assert pa.types.is_dictionary(table["weekday"].type)
261+
assert pa.types.is_dictionary(result["weekday"].type)
262+
assert pa.types.is_string(table["weekday"].chunk(0).dictionary.type)
263+
assert pa.types.is_large_string(result["weekday"].chunk(0).dictionary.type)
264+
assert pa.types.is_int32(table["weekday"].chunk(0).indices.type)
265+
assert pa.types.is_int8(result["weekday"].chunk(0).indices.type)
303266

304267
table_protocol = table.__dataframe__()
305268
result_protocol = result.__dataframe__()
@@ -309,10 +272,25 @@ def test_roundtrip_pandas_boolean():
309272
assert table_protocol.num_chunks() == result_protocol.num_chunks()
310273
assert table_protocol.column_names() == result_protocol.column_names()
311274

275+
col_table = table_protocol.get_column(0)
276+
col_result = result_protocol.get_column(0)
277+
278+
assert col_result.dtype[0] == DtypeKind.CATEGORICAL
279+
assert col_result.dtype[0] == col_table.dtype[0]
280+
assert col_result.size() == col_table.size()
281+
assert col_result.offset == col_table.offset
282+
283+
desc_cat_table = col_result.describe_categorical
284+
desc_cat_result = col_result.describe_categorical
285+
286+
assert desc_cat_table["is_ordered"] == desc_cat_result["is_ordered"]
287+
assert desc_cat_table["is_dictionary"] == desc_cat_result["is_dictionary"]
288+
assert isinstance(desc_cat_result["categories"]._col, pa.Array)
289+
312290

313291
@pytest.mark.pandas
314292
@pytest.mark.parametrize("unit", ['s', 'ms', 'us', 'ns'])
315-
def test_roundtrip_pandas_datetime(unit):
293+
def test_pandas_roundtrip_datetime(unit):
316294
if Version(pd.__version__) < Version("1.5.0"):
317295
pytest.skip("__dataframe__ added to pandas in 1.5.0")
318296
from datetime import datetime as dt
@@ -384,45 +362,6 @@ def test_pandas_to_pyarrow_float16_with_missing():
384362
pi.from_dataframe(df)
385363

386364

387-
@pytest.mark.pandas
388-
def test_pandas_to_pyarrow_string_with_missing():
389-
if Version(pd.__version__) < Version("1.5.0"):
390-
pytest.skip("__dataframe__ added to pandas in 1.5.0")
391-
392-
# pandas is using int64 offsets for string dtype so the constructed
393-
# pyarrow string column will always be a large_string data type
394-
arr = {
395-
"Y": ["a", "b", None], # bool, ColumnNullType.USE_BYTEMASK,
396-
}
397-
df = pd.DataFrame(arr)
398-
expected = pa.table(arr)
399-
result = pi.from_dataframe(df)
400-
401-
assert result[0].to_pylist() == expected[0].to_pylist()
402-
assert pa.types.is_string(expected[0].type)
403-
assert pa.types.is_large_string(result[0].type)
404-
405-
406-
@pytest.mark.pandas
407-
def test_pandas_to_pyarrow_categorical_with_missing():
408-
if Version(pd.__version__) < Version("1.5.0"):
409-
pytest.skip("__dataframe__ added to pandas in 1.5.0")
410-
411-
arr = ["Mon", "Tue", "Mon", "Wed", "Mon", "Thu", "Fri", "Sat", None]
412-
df = pd.DataFrame(
413-
{"weekday": arr}
414-
)
415-
df = df.astype("category")
416-
result = pi.from_dataframe(df)
417-
418-
expected_dictionary = ["Fri", "Mon", "Sat", "Thu", "Tue", "Wed"]
419-
expected_indices = pa.array([1, 4, 1, 5, 1, 3, 0, 2, None], type=pa.int8())
420-
421-
assert result[0].to_pylist() == arr
422-
assert result[0].chunk(0).dictionary.to_pylist() == expected_dictionary
423-
assert result[0].chunk(0).indices.equals(expected_indices)
424-
425-
426365
@pytest.mark.parametrize(
427366
"uint", [pa.uint8(), pa.uint16(), pa.uint32()]
428367
)

0 commit comments

Comments
 (0)