test_vq.py 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437
  1. import math
  2. import sys
  3. import warnings
  4. from copy import deepcopy
  5. from threading import Lock
  6. import numpy as np
  7. from numpy.testing import assert_array_equal
  8. import pytest
  9. from pytest import raises as assert_raises
  10. from scipy.cluster.vq import (kmeans, kmeans2, py_vq, vq, whiten,
  11. ClusterError, _krandinit)
  12. from scipy.cluster import _vq
  13. from scipy.sparse._sputils import matrix
  14. from scipy._lib import array_api_extra as xpx
  15. from scipy._lib._array_api import (
  16. SCIPY_ARRAY_API, eager_warns, is_lazy_array, make_xp_test_case,
  17. xp_copy, xp_assert_close, xp_assert_equal
  18. )
  19. xfail_xp_backends = pytest.mark.xfail_xp_backends
  20. skip_xp_backends = pytest.mark.skip_xp_backends
  21. TESTDATA_2D = np.array([
  22. -2.2, 1.17, -1.63, 1.69, -2.04, 4.38, -3.09, 0.95, -1.7, 4.79, -1.68, 0.68,
  23. -2.26, 3.34, -2.29, 2.55, -1.72, -0.72, -1.99, 2.34, -2.75, 3.43, -2.45,
  24. 2.41, -4.26, 3.65, -1.57, 1.87, -1.96, 4.03, -3.01, 3.86, -2.53, 1.28,
  25. -4.0, 3.95, -1.62, 1.25, -3.42, 3.17, -1.17, 0.12, -3.03, -0.27, -2.07,
  26. -0.55, -1.17, 1.34, -2.82, 3.08, -2.44, 0.24, -1.71, 2.48, -5.23, 4.29,
  27. -2.08, 3.69, -1.89, 3.62, -2.09, 0.26, -0.92, 1.07, -2.25, 0.88, -2.25,
  28. 2.02, -4.31, 3.86, -2.03, 3.42, -2.76, 0.3, -2.48, -0.29, -3.42, 3.21,
  29. -2.3, 1.73, -2.84, 0.69, -1.81, 2.48, -5.24, 4.52, -2.8, 1.31, -1.67,
  30. -2.34, -1.18, 2.17, -2.17, 2.82, -1.85, 2.25, -2.45, 1.86, -6.79, 3.94,
  31. -2.33, 1.89, -1.55, 2.08, -1.36, 0.93, -2.51, 2.74, -2.39, 3.92, -3.33,
  32. 2.99, -2.06, -0.9, -2.83, 3.35, -2.59, 3.05, -2.36, 1.85, -1.69, 1.8,
  33. -1.39, 0.66, -2.06, 0.38, -1.47, 0.44, -4.68, 3.77, -5.58, 3.44, -2.29,
  34. 2.24, -1.04, -0.38, -1.85, 4.23, -2.88, 0.73, -2.59, 1.39, -1.34, 1.75,
  35. -1.95, 1.3, -2.45, 3.09, -1.99, 3.41, -5.55, 5.21, -1.73, 2.52, -2.17,
  36. 0.85, -2.06, 0.49, -2.54, 2.07, -2.03, 1.3, -3.23, 3.09, -1.55, 1.44,
  37. -0.81, 1.1, -2.99, 2.92, -1.59, 2.18, -2.45, -0.73, -3.12, -1.3, -2.83,
  38. 0.2, -2.77, 3.24, -1.98, 1.6, -4.59, 3.39, -4.85, 3.75, -2.25, 1.71, -3.28,
  39. 3.38, -1.74, 0.88, -2.41, 1.92, -2.24, 1.19, -2.48, 1.06, -1.68, -0.62,
  40. -1.3, 0.39, -1.78, 2.35, -3.54, 2.44, -1.32, 0.66, -2.38, 2.76, -2.35,
  41. 3.95, -1.86, 4.32, -2.01, -1.23, -1.79, 2.76, -2.13, -0.13, -5.25, 3.84,
  42. -2.24, 1.59, -4.85, 2.96, -2.41, 0.01, -0.43, 0.13, -3.92, 2.91, -1.75,
  43. -0.53, -1.69, 1.69, -1.09, 0.15, -2.11, 2.17, -1.53, 1.22, -2.1, -0.86,
  44. -2.56, 2.28, -3.02, 3.33, -1.12, 3.86, -2.18, -1.19, -3.03, 0.79, -0.83,
  45. 0.97, -3.19, 1.45, -1.34, 1.28, -2.52, 4.22, -4.53, 3.22, -1.97, 1.75,
  46. -2.36, 3.19, -0.83, 1.53, -1.59, 1.86, -2.17, 2.3, -1.63, 2.71, -2.03,
  47. 3.75, -2.57, -0.6, -1.47, 1.33, -1.95, 0.7, -1.65, 1.27, -1.42, 1.09, -3.0,
  48. 3.87, -2.51, 3.06, -2.6, 0.74, -1.08, -0.03, -2.44, 1.31, -2.65, 2.99,
  49. -1.84, 1.65, -4.76, 3.75, -2.07, 3.98, -2.4, 2.67, -2.21, 1.49, -1.21,
  50. 1.22, -5.29, 2.38, -2.85, 2.28, -5.6, 3.78, -2.7, 0.8, -1.81, 3.5, -3.75,
  51. 4.17, -1.29, 2.99, -5.92, 3.43, -1.83, 1.23, -1.24, -1.04, -2.56, 2.37,
  52. -3.26, 0.39, -4.63, 2.51, -4.52, 3.04, -1.7, 0.36, -1.41, 0.04, -2.1, 1.0,
  53. -1.87, 3.78, -4.32, 3.59, -2.24, 1.38, -1.99, -0.22, -1.87, 1.95, -0.84,
  54. 2.17, -5.38, 3.56, -1.27, 2.9, -1.79, 3.31, -5.47, 3.85, -1.44, 3.69,
  55. -2.02, 0.37, -1.29, 0.33, -2.34, 2.56, -1.74, -1.27, -1.97, 1.22, -2.51,
  56. -0.16, -1.64, -0.96, -2.99, 1.4, -1.53, 3.31, -2.24, 0.45, -2.46, 1.71,
  57. -2.88, 1.56, -1.63, 1.46, -1.41, 0.68, -1.96, 2.76, -1.61,
  58. 2.11]).reshape((200, 2))
  59. # Global data
  60. X = np.array([[3.0, 3], [4, 3], [4, 2],
  61. [9, 2], [5, 1], [6, 2], [9, 4],
  62. [5, 2], [5, 4], [7, 4], [6, 5]])
  63. CODET1 = np.array([[3.0000, 3.0000],
  64. [6.2000, 4.0000],
  65. [5.8000, 1.8000]])
  66. CODET2 = np.array([[11.0/3, 8.0/3],
  67. [6.7500, 4.2500],
  68. [6.2500, 1.7500]])
  69. LABEL1 = np.array([0, 1, 2, 2, 2, 2, 1, 2, 1, 1, 1])
  70. @make_xp_test_case(whiten)
  71. class TestWhiten:
  72. def test_whiten(self, xp):
  73. desired = xp.asarray([[5.08738849, 2.97091878],
  74. [3.19909255, 0.69660580],
  75. [4.51041982, 0.02640918],
  76. [4.38567074, 0.95120889],
  77. [2.32191480, 1.63195503]])
  78. obs = xp.asarray([[0.98744510, 0.82766775],
  79. [0.62093317, 0.19406729],
  80. [0.87545741, 0.00735733],
  81. [0.85124403, 0.26499712],
  82. [0.45067590, 0.45464607]])
  83. xp_assert_close(whiten(obs), desired, rtol=1e-5)
  84. def test_whiten_zero_std(self, xp):
  85. desired = xp.asarray([[0., 1.0, 2.86666544],
  86. [0., 1.0, 1.32460034],
  87. [0., 1.0, 3.74382172]])
  88. obs = xp.asarray([[0., 1., 0.74109533],
  89. [0., 1., 0.34243798],
  90. [0., 1., 0.96785929]])
  91. with eager_warns(RuntimeWarning, match="Some columns have standard...", xp=xp):
  92. actual = whiten(obs)
  93. xp_assert_close(actual, desired, rtol=1e-5)
  94. @pytest.mark.filterwarnings("ignore:invalid value encountered:RuntimeWarning:dask")
  95. @pytest.mark.parametrize("bad_value", [math.nan, math.inf, -math.inf])
  96. def test_whiten_not_finite(self, bad_value, xp):
  97. obs = xp.asarray([[0.98744510, bad_value],
  98. [0.62093317, 0.19406729],
  99. [0.87545741, 0.00735733],
  100. [0.85124403, 0.26499712],
  101. [0.45067590, 0.45464607]])
  102. if is_lazy_array(obs):
  103. desired = xp.asarray([[5.08738849, math.nan],
  104. [3.19909255, math.nan],
  105. [4.51041982, math.nan],
  106. [4.38567074, math.nan],
  107. [2.32191480, math.nan]])
  108. xp_assert_close(whiten(obs), desired, rtol=1e-5)
  109. else:
  110. assert_raises(ValueError, whiten, obs)
  111. @pytest.mark.skipif(SCIPY_ARRAY_API,
  112. reason='`np.matrix` unsupported in array API mode')
  113. def test_whiten_not_finite_matrix(self):
  114. for bad_value in np.nan, np.inf, -np.inf:
  115. obs = matrix([[0.98744510, bad_value],
  116. [0.62093317, 0.19406729],
  117. [0.87545741, 0.00735733],
  118. [0.85124403, 0.26499712],
  119. [0.45067590, 0.45464607]])
  120. assert_raises(ValueError, whiten, obs)
  121. @make_xp_test_case(vq)
  122. class TestVq:
  123. def test_py_vq(self, xp):
  124. initc = np.concatenate([[X[0]], [X[1]], [X[2]]])
  125. # label1.dtype varies between int32 and int64 over platforms
  126. label1 = py_vq(xp.asarray(X), xp.asarray(initc))[0]
  127. xp_assert_equal(label1, xp.asarray(LABEL1, dtype=xp.int64),
  128. check_dtype=False)
  129. @pytest.mark.skipif(SCIPY_ARRAY_API,
  130. reason='`np.matrix` unsupported in array API mode')
  131. def test_py_vq_matrix(self):
  132. initc = np.concatenate([[X[0]], [X[1]], [X[2]]])
  133. # label1.dtype varies between int32 and int64 over platforms
  134. label1 = py_vq(matrix(X), matrix(initc))[0]
  135. assert_array_equal(label1, LABEL1)
  136. def test_vq(self, xp):
  137. initc = np.concatenate([[X[0]], [X[1]], [X[2]]])
  138. label1, _ = _vq.vq(X, initc)
  139. assert_array_equal(label1, LABEL1)
  140. _, _ = vq(xp.asarray(X), xp.asarray(initc))
  141. @pytest.mark.skipif(SCIPY_ARRAY_API,
  142. reason='`np.matrix` unsupported in array API mode')
  143. def test_vq_matrix(self):
  144. initc = np.concatenate([[X[0]], [X[1]], [X[2]]])
  145. label1, _ = _vq.vq(matrix(X), matrix(initc))
  146. assert_array_equal(label1, LABEL1)
  147. _, _ = vq(matrix(X), matrix(initc))
  148. def test_vq_1d(self, xp):
  149. # Test special rank 1 vq algo, python implementation.
  150. data = X[:, 0]
  151. initc = data[:3]
  152. a, b = _vq.vq(data, initc)
  153. data = xp.asarray(data)
  154. initc = xp.asarray(initc)
  155. ta, tb = py_vq(data[:, np.newaxis], initc[:, np.newaxis])
  156. # ta.dtype varies between int32 and int64 over platforms
  157. xp_assert_equal(ta, xp.asarray(a, dtype=xp.int64), check_dtype=False)
  158. xp_assert_equal(tb, xp.asarray(b))
  159. def test__vq_sametype(self):
  160. a = np.asarray([1.0, 2.0])
  161. b = a.astype(np.float32)
  162. assert_raises(TypeError, _vq.vq, a, b)
  163. def test__vq_invalid_type(self):
  164. a = np.asarray([1, 2], dtype=int)
  165. assert_raises(TypeError, _vq.vq, a, a)
  166. def test_vq_large_nfeat(self, xp):
  167. X = np.random.rand(20, 20)
  168. code_book = np.random.rand(3, 20)
  169. codes0, dis0 = _vq.vq(X, code_book)
  170. codes1, dis1 = py_vq(
  171. xp.asarray(X), xp.asarray(code_book)
  172. )
  173. xp_assert_close(dis1, xp.asarray(dis0), rtol=1e-5)
  174. # codes1.dtype varies between int32 and int64 over platforms
  175. xp_assert_equal(codes1, xp.asarray(codes0, dtype=xp.int64), check_dtype=False)
  176. X = X.astype(np.float32)
  177. code_book = code_book.astype(np.float32)
  178. codes0, dis0 = _vq.vq(X, code_book)
  179. codes1, dis1 = py_vq(
  180. xp.asarray(X), xp.asarray(code_book)
  181. )
  182. xp_assert_close(dis1, xp.asarray(dis0, dtype=xp.float64), rtol=1e-5)
  183. # codes1.dtype varies between int32 and int64 over platforms
  184. xp_assert_equal(codes1, xp.asarray(codes0, dtype=xp.int64), check_dtype=False)
  185. def test_vq_large_features(self, xp):
  186. X = np.random.rand(10, 5) * 1000000
  187. code_book = np.random.rand(2, 5) * 1000000
  188. codes0, dis0 = _vq.vq(X, code_book)
  189. codes1, dis1 = py_vq(
  190. xp.asarray(X), xp.asarray(code_book)
  191. )
  192. xp_assert_close(dis1, xp.asarray(dis0), rtol=1e-5)
  193. # codes1.dtype varies between int32 and int64 over platforms
  194. xp_assert_equal(codes1, xp.asarray(codes0, dtype=xp.int64), check_dtype=False)
  195. # Whole class skipped on GPU for now;
  196. # once pdist/cdist are hooked up for CuPy, more tests will work
  197. @make_xp_test_case(kmeans, kmeans2)
  198. class TestKMeans:
  199. def test_large_features(self, xp):
  200. # Generate a data set with large values, and run kmeans on it to
  201. # (regression for 1077).
  202. d = 300
  203. n = 100
  204. m1 = np.random.randn(d)
  205. m2 = np.random.randn(d)
  206. x = 10000 * np.random.randn(n, d) - 20000 * m1
  207. y = 10000 * np.random.randn(n, d) + 20000 * m2
  208. data = np.empty((x.shape[0] + y.shape[0], d), np.float64)
  209. data[:x.shape[0]] = x
  210. data[x.shape[0]:] = y
  211. # use `seed` to ensure backwards compatibility after SPEC7
  212. kmeans(xp.asarray(data), 2, seed=1)
  213. def test_kmeans_simple(self, xp):
  214. rng = np.random.default_rng(54321)
  215. initc = np.concatenate([[X[0]], [X[1]], [X[2]]])
  216. code1 = kmeans(xp.asarray(X), xp.asarray(initc), iter=1, rng=rng)[0]
  217. xp_assert_close(code1, xp.asarray(CODET2))
  218. @pytest.mark.skipif(SCIPY_ARRAY_API,
  219. reason='`np.matrix` unsupported in array API mode')
  220. def test_kmeans_simple_matrix(self):
  221. rng = np.random.default_rng(54321)
  222. initc = np.concatenate([[X[0]], [X[1]], [X[2]]])
  223. code1 = kmeans(matrix(X), matrix(initc), iter=1, rng=rng)[0]
  224. xp_assert_close(code1, CODET2)
  225. def test_kmeans_lost_cluster(self, xp):
  226. # This will cause kmeans to have a cluster with no points.
  227. data = xp.asarray(TESTDATA_2D)
  228. initk = xp.asarray([[-1.8127404, -0.67128041],
  229. [2.04621601, 0.07401111],
  230. [-2.31149087, -0.05160469]])
  231. kmeans(data, initk)
  232. with warnings.catch_warnings():
  233. warnings.filterwarnings(
  234. "ignore",
  235. ("One of the clusters is empty. Re-run kmeans with a different "
  236. "initialization"),
  237. UserWarning,
  238. )
  239. kmeans2(data, initk, missing='warn')
  240. assert_raises(ClusterError, kmeans2, data, initk, missing='raise')
  241. def test_kmeans2_simple(self, xp):
  242. rng = np.random.default_rng(12345678)
  243. initc = xp.asarray(np.concatenate([[X[0]], [X[1]], [X[2]]]))
  244. arrays = [xp.asarray] if SCIPY_ARRAY_API else [np.asarray, matrix]
  245. for tp in arrays:
  246. code1 = kmeans2(tp(X), tp(initc), iter=1, rng=rng)[0]
  247. code2 = kmeans2(tp(X), tp(initc), iter=2, rng=rng)[0]
  248. xp_assert_close(code1, xp.asarray(CODET1))
  249. xp_assert_close(code2, xp.asarray(CODET2))
  250. @pytest.mark.skipif(SCIPY_ARRAY_API,
  251. reason='`np.matrix` unsupported in array API mode')
  252. def test_kmeans2_simple_matrix(self):
  253. rng = np.random.default_rng(12345678)
  254. initc = np.concatenate([[X[0]], [X[1]], [X[2]]])
  255. code1 = kmeans2(matrix(X), matrix(initc), iter=1, rng=rng)[0]
  256. code2 = kmeans2(matrix(X), matrix(initc), iter=2, rng=rng)[0]
  257. xp_assert_close(code1, CODET1)
  258. xp_assert_close(code2, CODET2)
  259. def test_kmeans2_rank1(self, xp):
  260. data = xp.asarray(TESTDATA_2D)
  261. data1 = data[:, 0]
  262. initc = data1[:3]
  263. code = xp_copy(initc, xp=xp)
  264. # use `seed` to ensure backwards compatibility after SPEC7
  265. kmeans2(data1, code, iter=1, seed=1)[0]
  266. kmeans2(data1, code, iter=2)[0]
  267. def test_kmeans2_rank1_2(self, xp):
  268. data = xp.asarray(TESTDATA_2D)
  269. data1 = data[:, 0]
  270. kmeans2(data1, 2, iter=1)
  271. def test_kmeans2_high_dim(self, xp):
  272. # test kmeans2 when the number of dimensions exceeds the number
  273. # of input points
  274. data = xp.asarray(TESTDATA_2D)
  275. data = xp.reshape(data, (20, 20))[:10, :]
  276. kmeans2(data, 2)
  277. def test_kmeans2_init(self, xp):
  278. rng = np.random.default_rng(12345678)
  279. data = xp.asarray(TESTDATA_2D)
  280. k = 3
  281. kmeans2(data, k, minit='points', rng=rng)
  282. kmeans2(data[:, 1], k, minit='points', rng=rng) # special case (1-D)
  283. kmeans2(data, k, minit='++', rng=rng)
  284. kmeans2(data[:, 1], k, minit='++', rng=rng) # special case (1-D)
  285. # minit='random' can give warnings, filter those
  286. with warnings.catch_warnings():
  287. warnings.filterwarnings("ignore", "One of the clusters is empty. Re-run.")
  288. kmeans2(data, k, minit='random', rng=rng)
  289. kmeans2(data[:, 1], k, minit='random', rng=rng) # special case (1-D)
  290. @pytest.fixture
  291. def krand_lock(self):
  292. return Lock()
  293. @xfail_xp_backends('dask.array', reason="Wrong answer")
  294. @pytest.mark.skipif(sys.platform == 'win32',
  295. reason='Fails with MemoryError in Wine.')
  296. def test_krandinit(self, xp, krand_lock):
  297. data = xp.asarray(TESTDATA_2D)
  298. datas = [xp.reshape(data, (200, 2)),
  299. xp.reshape(data, (20, 20))[:10, :]]
  300. k = int(1e6)
  301. with krand_lock:
  302. for data in datas:
  303. rng = np.random.default_rng(1234)
  304. init = _krandinit(data, k, rng, xp)
  305. orig_cov = xpx.cov(data.T, xp=xp)
  306. init_cov = xpx.cov(init.T, xp=xp)
  307. xp_assert_close(orig_cov, init_cov, atol=1.1e-2)
  308. def test_kmeans2_empty(self, xp):
  309. # Regression test for gh-1032.
  310. assert_raises(ValueError, kmeans2, xp.asarray([]), 2)
  311. def test_kmeans_0k(self, xp):
  312. # Regression test for gh-1073: fail when k arg is 0.
  313. assert_raises(ValueError, kmeans, xp.asarray(X), 0)
  314. assert_raises(ValueError, kmeans2, xp.asarray(X), 0)
  315. assert_raises(ValueError, kmeans2, xp.asarray(X), xp.asarray([]))
  316. def test_kmeans_large_thres(self, xp):
  317. # Regression test for gh-1774
  318. x = xp.asarray([1, 2, 3, 4, 10], dtype=xp.float64)
  319. res = kmeans(x, 1, thresh=1e16)
  320. xp_assert_close(res[0], xp.asarray([4.], dtype=xp.float64))
  321. xp_assert_close(res[1], xp.asarray(2.3999999999999999, dtype=xp.float64)[()])
  322. def test_kmeans2_kpp_low_dim(self, xp):
  323. # Regression test for gh-11462
  324. rng = np.random.default_rng(2358792345678234568)
  325. prev_res = xp.asarray([[-1.95266667, 0.898],
  326. [-3.153375, 3.3945]], dtype=xp.float64)
  327. res, _ = kmeans2(xp.asarray(TESTDATA_2D), 2, minit='++', rng=rng)
  328. xp_assert_close(res, prev_res)
  329. def test_kmeans2_kpp_high_dim(self, xp):
  330. # Regression test for gh-11462
  331. rng = np.random.default_rng(23587923456834568)
  332. n_dim = 100
  333. size = 10
  334. centers = np.vstack([5 * np.ones(n_dim),
  335. -5 * np.ones(n_dim)])
  336. data = np.vstack([
  337. rng.multivariate_normal(centers[0], np.eye(n_dim), size=size),
  338. rng.multivariate_normal(centers[1], np.eye(n_dim), size=size)
  339. ])
  340. data = xp.asarray(data)
  341. res, _ = kmeans2(data, 2, minit='++', rng=rng)
  342. xp_assert_equal(xp.sign(res), xp.sign(xp.asarray(centers)))
  343. def test_kmeans_diff_convergence(self, xp):
  344. # Regression test for gh-8727
  345. obs = xp.asarray([-3, -1, 0, 1, 1, 8], dtype=xp.float64)
  346. res = kmeans(obs, xp.asarray([-3., 0.99]))
  347. xp_assert_close(res[0], xp.asarray([-0.4, 8.], dtype=xp.float64))
  348. xp_assert_close(res[1], xp.asarray(1.0666666666666667, dtype=xp.float64)[()])
  349. def test_kmeans_and_kmeans2_random_seed(self, xp):
  350. seed_list = [
  351. 1234, np.random.RandomState(1234), np.random.default_rng(1234)
  352. ]
  353. for seed in seed_list:
  354. seed1 = deepcopy(seed)
  355. seed2 = deepcopy(seed)
  356. data = xp.asarray(TESTDATA_2D)
  357. # test for kmeans
  358. res1, _ = kmeans(data, 2, seed=seed1)
  359. res2, _ = kmeans(data, 2, seed=seed2)
  360. xp_assert_close(res1, res2) # should be same results
  361. # test for kmeans2
  362. for minit in ["random", "points", "++"]:
  363. res1, _ = kmeans2(data, 2, minit=minit, seed=seed1)
  364. res2, _ = kmeans2(data, 2, minit=minit, seed=seed2)
  365. xp_assert_close(res1, res2) # should be same results