test_rank.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348
  1. import numpy as np
  2. from numpy.testing import assert_equal, assert_array_equal
  3. import pytest
  4. from scipy import stats
  5. from scipy.conftest import skip_xp_invalid_arg
  6. from scipy.stats import rankdata, tiecorrect
  7. from scipy._lib._array_api import xp_assert_equal, make_xp_test_case
  8. skip_xp_backends = pytest.mark.skip_xp_backends
  9. class TestTieCorrect:
  10. def test_empty(self):
  11. """An empty array requires no correction, should return 1.0."""
  12. ranks = np.array([], dtype=np.float64)
  13. c = tiecorrect(ranks)
  14. assert_equal(c, 1.0)
  15. def test_one(self):
  16. """A single element requires no correction, should return 1.0."""
  17. ranks = np.array([1.0], dtype=np.float64)
  18. c = tiecorrect(ranks)
  19. assert_equal(c, 1.0)
  20. def test_no_correction(self):
  21. """Arrays with no ties require no correction."""
  22. ranks = np.arange(2.0)
  23. c = tiecorrect(ranks)
  24. assert_equal(c, 1.0)
  25. ranks = np.arange(3.0)
  26. c = tiecorrect(ranks)
  27. assert_equal(c, 1.0)
  28. def test_basic(self):
  29. """Check a few basic examples of the tie correction factor."""
  30. # One tie of two elements
  31. ranks = np.array([1.0, 2.5, 2.5])
  32. c = tiecorrect(ranks)
  33. T = 2.0
  34. N = ranks.size
  35. expected = 1.0 - (T**3 - T) / (N**3 - N)
  36. assert_equal(c, expected)
  37. # One tie of two elements (same as above, but tie is not at the end)
  38. ranks = np.array([1.5, 1.5, 3.0])
  39. c = tiecorrect(ranks)
  40. T = 2.0
  41. N = ranks.size
  42. expected = 1.0 - (T**3 - T) / (N**3 - N)
  43. assert_equal(c, expected)
  44. # One tie of three elements
  45. ranks = np.array([1.0, 3.0, 3.0, 3.0])
  46. c = tiecorrect(ranks)
  47. T = 3.0
  48. N = ranks.size
  49. expected = 1.0 - (T**3 - T) / (N**3 - N)
  50. assert_equal(c, expected)
  51. # Two ties, lengths 2 and 3.
  52. ranks = np.array([1.5, 1.5, 4.0, 4.0, 4.0])
  53. c = tiecorrect(ranks)
  54. T1 = 2.0
  55. T2 = 3.0
  56. N = ranks.size
  57. expected = 1.0 - ((T1**3 - T1) + (T2**3 - T2)) / (N**3 - N)
  58. assert_equal(c, expected)
  59. def test_overflow(self):
  60. ntie, k = 2000, 5
  61. a = np.repeat(np.arange(k), ntie)
  62. n = a.size # ntie * k
  63. out = tiecorrect(rankdata(a))
  64. assert_equal(out, 1.0 - k * (ntie**3 - ntie) / float(n**3 - n))
  65. @make_xp_test_case(stats.rankdata)
  66. class TestRankData:
  67. def desired_dtype(self, method='average', has_nans=False, *, xp):
  68. if has_nans:
  69. return xp.asarray(1.).dtype
  70. return xp.asarray(1.).dtype if method=='average' else xp.asarray(1).dtype
  71. def test_empty(self, xp):
  72. """stats.rankdata of empty array should return an empty array."""
  73. a = xp.asarray([], dtype=xp.int64)
  74. r = rankdata(a)
  75. xp_assert_equal(r, xp.asarray([], dtype=self.desired_dtype(xp=xp)))
  76. def test_list(self):
  77. # test that NumPy still accepts lists
  78. r = rankdata([])
  79. assert_array_equal(r, np.array([]))
  80. r = rankdata([40, 10, 30, 10, 50])
  81. assert_equal(r, [4.0, 1.5, 3.0, 1.5, 5.0])
  82. @pytest.mark.parametrize("shape", [(0, 1, 2)])
  83. @pytest.mark.parametrize("axis", [None, *range(3)])
  84. def test_empty_multidim(self, shape, axis, xp):
  85. a = xp.empty(shape, dtype=xp.int64)
  86. r = rankdata(a, axis=axis)
  87. expected_shape = (0,) if axis is None else shape
  88. xp_assert_equal(r, xp.empty(expected_shape, dtype=self.desired_dtype(xp=xp)))
  89. def test_one(self, xp):
  90. """Check stats.rankdata with an array of length 1."""
  91. data = [100]
  92. a = xp.asarray(data, dtype=xp.int64)
  93. r = rankdata(a)
  94. xp_assert_equal(r, xp.asarray([1.0], dtype=self.desired_dtype(xp=xp)))
  95. def test_basic(self, xp):
  96. """Basic tests of stats.rankdata."""
  97. desired_dtype = self.desired_dtype(xp=xp)
  98. data = [100, 10, 50]
  99. expected = xp.asarray([3.0, 1.0, 2.0], dtype=desired_dtype)
  100. a = xp.asarray(data, dtype=xp.int64)
  101. r = rankdata(a)
  102. xp_assert_equal(r, expected)
  103. data = [40, 10, 30, 10, 50]
  104. expected = xp.asarray([4.0, 1.5, 3.0, 1.5, 5.0], dtype=desired_dtype)
  105. a = xp.asarray(data, dtype=xp.int64)
  106. r = rankdata(a)
  107. xp_assert_equal(r, expected)
  108. data = [20, 20, 20, 10, 10, 10]
  109. expected = xp.asarray([5.0, 5.0, 5.0, 2.0, 2.0, 2.0], dtype=desired_dtype)
  110. a = xp.asarray(data, dtype=xp.int64)
  111. r = rankdata(a)
  112. xp_assert_equal(r, expected)
  113. # # The docstring states explicitly that the argument is flattened.
  114. a2d = xp.reshape(a, (2, 3))
  115. r = rankdata(a2d)
  116. xp_assert_equal(r, expected)
  117. @skip_xp_invalid_arg
  118. def test_rankdata_object_string(self):
  119. def min_rank(a):
  120. return [1 + sum(i < j for i in a) for j in a]
  121. def max_rank(a):
  122. return [sum(i <= j for i in a) for j in a]
  123. def ordinal_rank(a):
  124. return min_rank([(x, i) for i, x in enumerate(a)])
  125. def average_rank(a):
  126. return [(i + j) / 2.0 for i, j in zip(min_rank(a), max_rank(a))]
  127. def dense_rank(a):
  128. b = np.unique(a)
  129. return [1 + sum(i < j for i in b) for j in a]
  130. rankf = dict(min=min_rank, max=max_rank, ordinal=ordinal_rank,
  131. average=average_rank, dense=dense_rank)
  132. def check_ranks(a):
  133. for method in 'min', 'max', 'dense', 'ordinal', 'average':
  134. out = rankdata(a, method=method)
  135. assert_array_equal(out, rankf[method](a))
  136. val = ['foo', 'bar', 'qux', 'xyz', 'abc', 'efg', 'ace', 'qwe', 'qaz']
  137. check_ranks(np.random.choice(val, 200))
  138. check_ranks(np.random.choice(val, 200).astype('object'))
  139. val = np.array([0, 1, 2, 2.718, 3, 3.141], dtype='object')
  140. check_ranks(np.random.choice(val, 200).astype('object'))
  141. @pytest.mark.skip_xp_backends("torch", reason="`take_along_axis` fails with uint64")
  142. def test_large_uint(self, xp):
  143. data = xp.asarray([2**60, 2**60+1], dtype=xp.uint64)
  144. r = rankdata(data)
  145. xp_assert_equal(r, xp.asarray([1.0, 2.0], dtype=self.desired_dtype(xp=xp)))
  146. def test_large_int(self, xp):
  147. data = xp.asarray([2**60, 2**60+1], dtype=xp.int64)
  148. r = rankdata(data)
  149. xp_assert_equal(r, xp.asarray([1.0, 2.0], dtype=self.desired_dtype(xp=xp)))
  150. data = xp.asarray([2**60, -2**60+1], dtype=xp.int64)
  151. r = rankdata(data)
  152. xp_assert_equal(r, xp.asarray([2.0, 1.0], dtype=self.desired_dtype(xp=xp)))
  153. @pytest.mark.parametrize('n', [10000, 100000, 1000000])
  154. def test_big_tie(self, n, xp):
  155. data = xp.ones(n)
  156. r = rankdata(data)
  157. expected_rank = 0.5 * (n + 1)
  158. ref = xp.asarray(expected_rank * data, dtype=self.desired_dtype(xp=xp))
  159. xp_assert_equal(r, ref)
  160. def test_axis(self, xp):
  161. data = xp.asarray([[0, 2, 1], [4, 2, 2]])
  162. expected0 = xp.asarray([[1., 1.5, 1.], [2., 1.5, 2.]])
  163. r0 = rankdata(data, axis=0)
  164. xp_assert_equal(r0, expected0)
  165. expected1 = xp.asarray([[1., 3., 2.], [3., 1.5, 1.5]])
  166. r1 = rankdata(data, axis=1)
  167. xp_assert_equal(r1, expected1)
  168. methods= ["average", "min", "max", "dense", "ordinal"]
  169. @pytest.mark.parametrize("axis", [0, 1])
  170. @pytest.mark.parametrize("method", methods)
  171. def test_size_0_axis(self, axis, method, xp):
  172. shape = (3, 0)
  173. desired_dtype = self.desired_dtype(method, xp=xp)
  174. data = xp.zeros(shape)
  175. r = rankdata(data, method=method, axis=axis)
  176. assert_equal(r.shape, shape)
  177. assert_equal(r.dtype, desired_dtype)
  178. xp_assert_equal(r, xp.empty(shape, dtype=desired_dtype))
  179. @pytest.mark.parametrize('axis', range(3))
  180. @pytest.mark.parametrize('method', methods)
  181. def test_nan_policy_omit_3d(self, axis, method):
  182. shape = (20, 21, 22)
  183. rng = np.random.RandomState(23983242)
  184. a = rng.random(size=shape)
  185. i = rng.random(size=shape) < 0.4
  186. j = rng.random(size=shape) < 0.1
  187. k = rng.random(size=shape) < 0.1
  188. a[i] = np.nan
  189. a[j] = -np.inf
  190. a[k] - np.inf
  191. def rank_1d_omit(a, method):
  192. out = np.zeros_like(a)
  193. i = np.isnan(a)
  194. a_compressed = a[~i]
  195. res = rankdata(a_compressed, method)
  196. out[~i] = res
  197. out[i] = np.nan
  198. return out
  199. def rank_omit(a, method, axis):
  200. return np.apply_along_axis(lambda a: rank_1d_omit(a, method),
  201. axis, a)
  202. res = rankdata(a, method, axis=axis, nan_policy='omit')
  203. res0 = rank_omit(a, method, axis=axis)
  204. assert_array_equal(res, res0)
  205. def test_nan_policy_2d_axis_none(self):
  206. # 2 2d-array test with axis=None
  207. data = [[0, np.nan, 3],
  208. [4, 2, np.nan],
  209. [1, 2, 2]]
  210. assert_array_equal(rankdata(data, axis=None, nan_policy='omit'),
  211. [1., np.nan, 6., 7., 4., np.nan, 2., 4., 4.])
  212. assert_array_equal(rankdata(data, axis=None, nan_policy='propagate'),
  213. [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan,
  214. np.nan, np.nan, np.nan])
  215. def test_nan_policy_raise(self):
  216. # 1 1d-array test
  217. data = [0, 2, 3, -2, np.nan, np.nan]
  218. with pytest.raises(ValueError, match="The input contains nan"):
  219. rankdata(data, nan_policy='raise')
  220. # 2 2d-array test
  221. data = [[0, np.nan, 3],
  222. [4, 2, np.nan],
  223. [np.nan, 2, 2]]
  224. with pytest.raises(ValueError, match="The input contains nan"):
  225. rankdata(data, axis=0, nan_policy="raise")
  226. with pytest.raises(ValueError, match="The input contains nan"):
  227. rankdata(data, axis=1, nan_policy="raise")
  228. def test_nan_policy_propagate(self):
  229. # 1 1d-array test
  230. data = [0, 2, 3, -2, np.nan, np.nan]
  231. assert_array_equal(rankdata(data, nan_policy='propagate'),
  232. [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan])
  233. # 2 2d-array test
  234. data = [[0, np.nan, 3],
  235. [4, 2, np.nan],
  236. [1, 2, 2]]
  237. assert_array_equal(rankdata(data, axis=0, nan_policy='propagate'),
  238. [[1, np.nan, np.nan],
  239. [3, np.nan, np.nan],
  240. [2, np.nan, np.nan]])
  241. assert_array_equal(rankdata(data, axis=1, nan_policy='propagate'),
  242. [[np.nan, np.nan, np.nan],
  243. [np.nan, np.nan, np.nan],
  244. [1, 2.5, 2.5]])
  245. _rankdata_cases = (
  246. # values, method, expected
  247. ([], 'average', []),
  248. ([], 'min', []),
  249. ([], 'max', []),
  250. ([], 'dense', []),
  251. ([], 'ordinal', []),
  252. #
  253. ([100], 'average', [1.0]),
  254. ([100], 'min', [1.0]),
  255. ([100], 'max', [1.0]),
  256. ([100], 'dense', [1.0]),
  257. ([100], 'ordinal', [1.0]),
  258. #
  259. ([100, 100, 100], 'average', [2.0, 2.0, 2.0]),
  260. ([100, 100, 100], 'min', [1.0, 1.0, 1.0]),
  261. ([100, 100, 100], 'max', [3.0, 3.0, 3.0]),
  262. ([100, 100, 100], 'dense', [1.0, 1.0, 1.0]),
  263. ([100, 100, 100], 'ordinal', [1.0, 2.0, 3.0]),
  264. #
  265. ([100, 300, 200], 'average', [1.0, 3.0, 2.0]),
  266. ([100, 300, 200], 'min', [1.0, 3.0, 2.0]),
  267. ([100, 300, 200], 'max', [1.0, 3.0, 2.0]),
  268. ([100, 300, 200], 'dense', [1.0, 3.0, 2.0]),
  269. ([100, 300, 200], 'ordinal', [1.0, 3.0, 2.0]),
  270. #
  271. ([100, 200, 300, 200], 'average', [1.0, 2.5, 4.0, 2.5]),
  272. ([100, 200, 300, 200], 'min', [1.0, 2.0, 4.0, 2.0]),
  273. ([100, 200, 300, 200], 'max', [1.0, 3.0, 4.0, 3.0]),
  274. ([100, 200, 300, 200], 'dense', [1.0, 2.0, 3.0, 2.0]),
  275. ([100, 200, 300, 200], 'ordinal', [1.0, 2.0, 4.0, 3.0]),
  276. #
  277. ([100, 200, 300, 200, 100], 'average', [1.5, 3.5, 5.0, 3.5, 1.5]),
  278. ([100, 200, 300, 200, 100], 'min', [1.0, 3.0, 5.0, 3.0, 1.0]),
  279. ([100, 200, 300, 200, 100], 'max', [2.0, 4.0, 5.0, 4.0, 2.0]),
  280. ([100, 200, 300, 200, 100], 'dense', [1.0, 2.0, 3.0, 2.0, 1.0]),
  281. ([100, 200, 300, 200, 100], 'ordinal', [1.0, 3.0, 5.0, 4.0, 2.0]),
  282. #
  283. ([10] * 30, 'ordinal', np.arange(1.0, 31.0)),
  284. )
  285. @pytest.mark.parametrize('case', _rankdata_cases)
  286. def test_cases(self, case, xp):
  287. values, method, expected = case
  288. r = rankdata(xp.asarray(values), method=method)
  289. ref = xp.asarray(expected, dtype=self.desired_dtype(method, xp=xp))
  290. xp_assert_equal(r, ref)