 import pytest
-import numpy as np
-from conftest import *
-
-
-@pytest.mark.parametrize("test_data",
-                         [
-                             ({'a': [np.array([1, 2, 3]), np.array([4, 5, 6])],
-                               'b': [np.array([1.5, 2.0, 3.2]), np.array([4.1, 5.7, 6.9])]},
-                              np.object_, None),
-                             ({'a': [1.5, 2.5, 3.5], 'b': [9.2, 10.5, 11.8]}, np.float64, None),
-                             ({'A': [1, 2, 3, 4], 'B': [1, 2, 3, 4]}, np.int64, np.float64)
-                         ],
-                         ids=["array_data", "float_data", "int_data"])
-def test_only_one_data(test_data):
-    data, dtype, new_dtype = test_data
-    columns = list(data.keys())
-    df = constructor_frame(data)
-    df2 = df.__dataframe__()
-    new_dtype = dtype if new_dtype is None else new_dtype
-    assert df.columns.values.tolist() == columns
-    val = len(df[columns[0]])-1
-    column_size = df.size
+import math
+import ctypes
+
+
+@pytest.mark.parametrize(
+    "test_data",
+    [
+        {"a": ["foo", "bar"], "b": ["baz", "qux"]},
+        {"a": [1.5, 2.5, 3.5], "b": [9.2, 10.5, 11.8]},
+        {"A": [1, 2, 3, 4], "B": [1, 2, 3, 4]},
+    ],
+    ids=["str_data", "float_data", "int_data"],
+)
+def test_only_one_dtype(test_data, df_from_dict):
+    columns = list(test_data.keys())
+    df = df_from_dict(test_data)
+    dfX = df.__dataframe__()
+
+    column_size = len(test_data[columns[0]])
     for column in columns:
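+        # a single-dtype column should expose the full length, no nulls and a zero offset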
-        assert df[column].tolist() == df[column].tolist()
-        assert df[column].dtype.type is dtype
-        assert df2.get_column_by_name(column).null_count == 0
-        assert df2.get_column_by_name(column).size == column_size
-        assert df2.get_column_by_name(column).offset == 0
-        assert not df2["x"].is_masked
-        n = np.random.randint(0, val)
-        (df[column])[n] = None
-        assert df[column].dtype.type is new_dtype
-        assert df2.get_column_by_name(column).null_count == 1
-
-
-def test_float_int():
-    df = constructor_frame({'a': [1, 2, 3], 'b': [3, 4, 5],
-                            'c': [1.5, 2.5, 3.5], 'd': [9, 10, 11]})
-    df2 = df.__dataframe__()
-    columns = ['a', 'b', 'c', 'd']
-    assert df.columns.values.tolist() == columns
-    for column in columns:
-        assert df[column].tolist() == df[column].tolist()
-        if column is 'c':
-            assert df[column].dtype.type is np.float64
-        else:
-            assert df[column].dtype.type is np.int64
-
-        assert df2.get_column_by_name(column).null_count == 0
-        assert df2.get_column_by_name(column).size == 3
-        assert df2.get_column_by_name(column).offset == 0
-
-        n = np.random.randint(0, 2)
-        (df[column])[n] = None
-        assert df[column].dtype.type is np.float64
-        assert df2.get_column_by_name(column).null_count == 1
-
-
-def test_mixed_intfloatbool():
-    df = constructor_frame({"x": np.array([True, True, False]),
-                            "y": np.array([1, 2, 0]),
-                            "z": np.array([9.2, 10.5, 11.8])})
-    df2 = df.__dataframe__()
-    columns = ['x', 'y', 'z']
-    assert df.columns.values.tolist() == columns
-    for column in columns:
-        assert df[column].tolist() == df[column].tolist()
-        assert df2.get_column_by_name(column).null_count == 0
-        assert df2.get_column_by_name(column).size == 3
-        assert df2.get_column_by_name(column).offset == 0
-
-    assert df["x"].dtype.type is np.bool_
-    assert df["y"].dtype.type is np.int32
-    assert df["z"].dtype.type is np.float64
-
-    assert df2.get_column_by_name("x")._allow_copy == True
-
-    for column in columns:
-        n = np.random.randint(0, 2)
-        (df[column])[n] = None
-        if column is "x":
-            assert df[column].dtype.type is np.object_
-        else:
-            assert df[column].dtype.type is np.float64
-        assert df2.get_column_by_name(column).null_count == 1
-
-
-def test_string_dtype():
-    df = constructor_frame({"A": ["a", "b", "cdef", "", "g"]})
-    df2 = df.__dataframe__()
-    columns = ['A']
-    assert df.columns.values.tolist() == columns
-    for column in columns:
-        assert df[column].tolist() == df[column].tolist()
-        assert df[column].dtype.type is np.object_
-        assert df2.get_column_by_name(column).null_count == 0
-
-
-def test_categorical():
-    df = constructor_frame({"year": [2012, 2013, 2015, 2019], "weekday": [0, 1, 4, 6]})
-    df = df.categorize("year", min_value=2012, max_value=2019)
-    df = df.categorize("weekday", labels=["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"])
-    # Some detailed testing for correctness of dtype and null handling:
-    col = df.__dataframe__().get_column_by_name("year")
-    assert col.describe_categorical == (False, True, {0: 2012, 1: 2013, 2: 2014, 3: 2015, 4: 2016, 5: 2017, 6: 2018, 7: 2019})
-    assert col.describe_null == (0, None)
-    col2 = df.__dataframe__().get_column_by_name("weekday")
-    assert col2.describe_categorical == (False, True, {0: "Mon", 1: "Tue", 2: "Wed", 3: "Thu", 4: "Fri", 5: "Sat", 6: "Sun"})
-    assert col2.describe_null == (0, None)
-
-
-def test_dataframe():
-    df = constructor_frame({"x": [True, True, False], "y": [1, 2, 0], "z": [9.2, 10.5, 11.8]})
-    df2 = df.__dataframe__()
-    assert df2._allow_copy == True
-    assert df2.num_columns() == 3
-    assert df2.num_rows() == 3
-    assert df2.num_chunks() == 1
-    assert df2.column_names() == ["x", "y", "z"]
-    assert df2.select_columns((0, 2))._df[:, 0].tolist() == df2.select_columns_by_name(("x", "z"))._df[:, 0].tolist()
-    assert df2.select_columns((0, 2))._df[:, 1].tolist() == df2.select_columns_by_name(("x", "z"))._df[:, 1].tolist()
-
-
-def test_chunks():
-    df = constructor_frame({"x": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]})
-    df2 = df.__dataframe__()
-    chunk_iter = iter(df2.get_chunks(3))
-    chunk = next(chunk_iter)
-    assert chunk.num_rows() == 4
-    chunk = next(chunk_iter)
-    assert chunk.num_rows() == 4
-    chunk = next(chunk_iter)
-    assert chunk.num_rows() == 2
-    with pytest.raises(StopIteration):
-        chunk = next(chunk_iter)
-
-
-def test_get_chunks():
-    df = constructor_frame({"x": [1]})
-    df2 = df.__dataframe__()
-    assert df2.get_chunks() == 1
+        assert dfX.get_column_by_name(column).null_count == 0
+        assert dfX.get_column_by_name(column).size == column_size
+        assert dfX.get_column_by_name(column).offset == 0
+
+
+def test_float_int(df_from_dict):
+    df = df_from_dict(
+        {
+            "a": [1, 2, 3],
+            "b": [3, 4, 5],
+            "c": [1.5, 2.5, 3.5],
+            "d": [9, 10, 11],
+            "e": [True, False, True],
+            "f": ["a", "", "c"],
+        }
+    )
+    dfX = df.__dataframe__()
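+    # expected dtype "kind" per column, following the interchange protocol's
+    # DtypeKind enum: 0 = INT, 2 = FLOAT, 20 = BOOL, 21 = STRING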
+    columns = {"a": 0, "b": 0, "c": 2, "d": 0, "e": 20, "f": 21}
+
+    for column, kind in columns.items():
+        colX = dfX.get_column_by_name(column)
+        assert colX.null_count == 0
+        assert colX.size == 3
+        assert colX.offset == 0
+
+        assert colX.dtype[0] == kind
+
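+    # dtype is a (kind, bit-width, format, endianness) tuple; a float64 column
+    # should report a bit-width of 64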
+    assert dfX.get_column_by_name("c").dtype[1] == 64
+
+
+def test_na_float(df_from_dict):
+    df = df_from_dict({"a": [1.0, math.nan, 2.0]})
+    dfX = df.__dataframe__()
+    colX = dfX.get_column_by_name("a")
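+    # the NaN entry is expected to show up in the column's null count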
+    assert colX.null_count == 1
+
+
+def test_noncategorical(df_from_dict):
+    df = df_from_dict({"a": [1, 2, 3]})
+    dfX = df.__dataframe__()
+    colX = dfX.get_column_by_name("a")
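+    # describe_categorical is only defined for categorical columns and should
+    # raise TypeError for any other dtype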
+    with pytest.raises(TypeError):
+        colX.describe_categorical
+
+
+def test_categorical(df_from_dict):
+    df = df_from_dict(
+        {"weekday": ["Mon", "Tue", "Mon", "Wed", "Mon", "Thu", "Fri", "Sat", "Sun"]},
+        is_categorical=True,
+    )
+
+    colX = df.__dataframe__().get_column_by_name("weekday")
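+    # in this version of the protocol describe_categorical returns an
+    # (is_ordered, is_dictionary, mapping) tuple; only the flags are checked here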
+    is_ordered, is_dictionary, _ = colX.describe_categorical
+    assert isinstance(is_ordered, bool)
+    assert isinstance(is_dictionary, bool)
+
+
+def test_dataframe(df_from_dict):
+    df = df_from_dict(
+        {"x": [True, True, False], "y": [1, 2, 0], "z": [9.2, 10.5, 11.8]}
+    )
+    dfX = df.__dataframe__()
+
+    assert dfX.num_columns() == 3
+    assert dfX.num_rows() == 3
+    assert dfX.num_chunks() == 1
+    assert dfX.column_names() == ["x", "y", "z"]
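+    # selecting columns by position and by name should yield the same columns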
+    assert (
+        dfX.select_columns((0, 2)).column_names()
+        == dfX.select_columns_by_name(("x", "z")).column_names()
+    )
+
+
+@pytest.mark.parametrize(["size", "n_chunks"], [(10, 3), (12, 3), (12, 5)])
+def test_df_get_chunks(size, n_chunks, df_from_dict):
+    df = df_from_dict({"x": list(range(size))})
+    dfX = df.__dataframe__()
+    chunks = list(dfX.get_chunks(n_chunks))
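+    # the frame must split into exactly n_chunks chunks that together cover every row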
+    assert len(chunks) == n_chunks
+    assert sum(chunk.num_rows() for chunk in chunks) == size
+
+
+@pytest.mark.parametrize(["size", "n_chunks"], [(10, 3), (12, 3), (12, 5)])
+def test_column_get_chunks(size, n_chunks, df_from_dict):
+    df = df_from_dict({"x": list(range(size))})
+    dfX = df.__dataframe__()
+    chunks = list(dfX.get_column(0).get_chunks(n_chunks))
+    assert len(chunks) == n_chunks
+    assert sum(chunk.size for chunk in chunks) == size
+
+
+def test_get_columns(df_from_dict):
+    df = df_from_dict({"a": [0, 1], "b": [2.5, 3.5]})
+    dfX = df.__dataframe__()
+    for colX in dfX.get_columns():
+        assert colX.size == 2
+        assert colX.num_chunks() == 1
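+    # DtypeKind codes: column "a" holds integers (kind 0), column "b" floats (kind 2)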
+    assert dfX.get_column(0).dtype[0] == 0
+    assert dfX.get_column(1).dtype[0] == 2
+
+
+def test_buffer(df_from_dict):
+    arr = [0, 1, -1]
+    df = df_from_dict({"a": arr})
+    dfX = df.__dataframe__()
+    colX = dfX.get_column(0)
+    bufX = colX.get_buffers()
+
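+    # get_buffers() returns a dict; its "data" entry is a (buffer, dtype) pair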
+    dataBuf, dataDtype = bufX["data"]
+
+    assert dataBuf.bufsize > 0
+    assert dataBuf.ptr != 0
+    device, _ = dataBuf.__dlpack_device__()
+
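+    # dtype kind 0 means an integer column per the protocol's DtypeKind enum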
+    assert dataDtype[0] == 0
+
+    if device == 1:  # CPU-only as we're going to directly read memory here
+        bitwidth = dataDtype[1]
+        ctype = {
+            8: ctypes.c_int8,
+            16: ctypes.c_int16,
+            32: ctypes.c_int32,
+            64: ctypes.c_int64,
+        }[bitwidth]
+
+        for idx, truth in enumerate(arr):
+            val = ctype.from_address(dataBuf.ptr + idx * (bitwidth // 8)).value
+            assert val == truth, f"Buffer at index {idx} mismatch"