# test_arffread.py (13 KB)
  1. import datetime
  2. import os
  3. import sys
  4. from os.path import join as pjoin
  5. from io import StringIO
  6. import numpy as np
  7. from numpy.testing import (assert_array_almost_equal,
  8. assert_array_equal, assert_equal, assert_)
  9. from pytest import raises as assert_raises
  10. from scipy.io.arff import loadarff
  11. from scipy.io.arff._arffread import read_header, ParseArffError
  12. data_path = pjoin(os.path.dirname(__file__), 'data')
  13. test1 = pjoin(data_path, 'test1.arff')
  14. test2 = pjoin(data_path, 'test2.arff')
  15. test3 = pjoin(data_path, 'test3.arff')
  16. test4 = pjoin(data_path, 'test4.arff')
  17. test5 = pjoin(data_path, 'test5.arff')
  18. test6 = pjoin(data_path, 'test6.arff')
  19. test7 = pjoin(data_path, 'test7.arff')
  20. test8 = pjoin(data_path, 'test8.arff')
  21. test9 = pjoin(data_path, 'test9.arff')
  22. test10 = pjoin(data_path, 'test10.arff')
  23. test11 = pjoin(data_path, 'test11.arff')
  24. test_quoted_nominal = pjoin(data_path, 'quoted_nominal.arff')
  25. test_quoted_nominal_spaces = pjoin(data_path, 'quoted_nominal_spaces.arff')
  26. expect4_data = [(0.1, 0.2, 0.3, 0.4, 'class1'),
  27. (-0.1, -0.2, -0.3, -0.4, 'class2'),
  28. (1, 2, 3, 4, 'class3')]
  29. expected_types = ['numeric', 'numeric', 'numeric', 'numeric', 'nominal']
  30. missing = pjoin(data_path, 'missing.arff')
  31. expect_missing_raw = np.array([[1, 5], [2, 4], [np.nan, np.nan]])
  32. expect_missing = np.empty(3, [('yop', float), ('yap', float)])
  33. expect_missing['yop'] = expect_missing_raw[:, 0]
  34. expect_missing['yap'] = expect_missing_raw[:, 1]
  35. class TestData:
  36. def test1(self):
  37. # Parsing trivial file with nothing.
  38. self._test(test4)
  39. def test2(self):
  40. # Parsing trivial file with some comments in the data section.
  41. self._test(test5)
  42. def test3(self):
  43. # Parsing trivial file with nominal attribute of 1 character.
  44. self._test(test6)
  45. def test4(self):
  46. # Parsing trivial file with trailing spaces in attribute declaration.
  47. self._test(test11)
  48. def _test(self, test_file):
  49. data, meta = loadarff(test_file)
  50. for i in range(len(data)):
  51. for j in range(4):
  52. assert_array_almost_equal(expect4_data[i][j], data[i][j])
  53. assert_equal(meta.types(), expected_types)
  54. def test_filelike(self):
  55. # Test reading from file-like object (StringIO)
  56. with open(test1) as f1:
  57. data1, meta1 = loadarff(f1)
  58. with open(test1) as f2:
  59. data2, meta2 = loadarff(StringIO(f2.read()))
  60. assert_(data1 == data2)
  61. assert_(repr(meta1) == repr(meta2))
  62. def test_path(self):
  63. # Test reading from `pathlib.Path` object
  64. from pathlib import Path
  65. with open(test1) as f1:
  66. data1, meta1 = loadarff(f1)
  67. data2, meta2 = loadarff(Path(test1))
  68. assert_(data1 == data2)
  69. assert_(repr(meta1) == repr(meta2))
  70. class TestMissingData:
  71. def test_missing(self):
  72. data, meta = loadarff(missing)
  73. for i in ['yop', 'yap']:
  74. assert_array_almost_equal(data[i], expect_missing[i])
  75. class TestNoData:
  76. def test_nodata(self):
  77. # The file nodata.arff has no data in the @DATA section.
  78. # Reading it should result in an array with length 0.
  79. nodata_filename = os.path.join(data_path, 'nodata.arff')
  80. data, meta = loadarff(nodata_filename)
  81. if sys.byteorder == 'big':
  82. end = '>'
  83. else:
  84. end = '<'
  85. expected_dtype = np.dtype([('sepallength', f'{end}f8'),
  86. ('sepalwidth', f'{end}f8'),
  87. ('petallength', f'{end}f8'),
  88. ('petalwidth', f'{end}f8'),
  89. ('class', 'S15')])
  90. assert_equal(data.dtype, expected_dtype)
  91. assert_equal(data.size, 0)
  92. class TestHeader:
  93. def test_type_parsing(self):
  94. # Test parsing type of attribute from their value.
  95. with open(test2) as ofile:
  96. rel, attrs = read_header(ofile)
  97. expected = ['numeric', 'numeric', 'numeric', 'numeric', 'numeric',
  98. 'numeric', 'string', 'string', 'nominal', 'nominal']
  99. for i in range(len(attrs)):
  100. assert_(attrs[i].type_name == expected[i])
  101. def test_badtype_parsing(self):
  102. # Test parsing wrong type of attribute from their value.
  103. def badtype_read():
  104. with open(test3) as ofile:
  105. _, _ = read_header(ofile)
  106. assert_raises(ParseArffError, badtype_read)
  107. def test_fullheader1(self):
  108. # Parsing trivial header with nothing.
  109. with open(test1) as ofile:
  110. rel, attrs = read_header(ofile)
  111. # Test relation
  112. assert_(rel == 'test1')
  113. # Test numerical attributes
  114. assert_(len(attrs) == 5)
  115. for i in range(4):
  116. assert_(attrs[i].name == f'attr{i}')
  117. assert_(attrs[i].type_name == 'numeric')
  118. # Test nominal attribute
  119. assert_(attrs[4].name == 'class')
  120. assert_(attrs[4].values == ('class0', 'class1', 'class2', 'class3'))
  121. def test_dateheader(self):
  122. with open(test7) as ofile:
  123. rel, attrs = read_header(ofile)
  124. assert_(rel == 'test7')
  125. assert_(len(attrs) == 5)
  126. assert_(attrs[0].name == 'attr_year')
  127. assert_(attrs[0].date_format == '%Y')
  128. assert_(attrs[1].name == 'attr_month')
  129. assert_(attrs[1].date_format == '%Y-%m')
  130. assert_(attrs[2].name == 'attr_date')
  131. assert_(attrs[2].date_format == '%Y-%m-%d')
  132. assert_(attrs[3].name == 'attr_datetime_local')
  133. assert_(attrs[3].date_format == '%Y-%m-%d %H:%M')
  134. assert_(attrs[4].name == 'attr_datetime_missing')
  135. assert_(attrs[4].date_format == '%Y-%m-%d %H:%M')
  136. def test_dateheader_unsupported(self):
  137. def read_dateheader_unsupported():
  138. with open(test8) as ofile:
  139. _, _ = read_header(ofile)
  140. assert_raises(ValueError, read_dateheader_unsupported)
  141. class TestDateAttribute:
  142. def setup_method(self):
  143. self.data, self.meta = loadarff(test7)
  144. def test_year_attribute(self):
  145. expected = np.array([
  146. '1999',
  147. '2004',
  148. '1817',
  149. '2100',
  150. '2013',
  151. '1631'
  152. ], dtype='datetime64[Y]')
  153. assert_array_equal(self.data["attr_year"], expected)
  154. def test_month_attribute(self):
  155. expected = np.array([
  156. '1999-01',
  157. '2004-12',
  158. '1817-04',
  159. '2100-09',
  160. '2013-11',
  161. '1631-10'
  162. ], dtype='datetime64[M]')
  163. assert_array_equal(self.data["attr_month"], expected)
  164. def test_date_attribute(self):
  165. expected = np.array([
  166. '1999-01-31',
  167. '2004-12-01',
  168. '1817-04-28',
  169. '2100-09-10',
  170. '2013-11-30',
  171. '1631-10-15'
  172. ], dtype='datetime64[D]')
  173. assert_array_equal(self.data["attr_date"], expected)
  174. def test_datetime_local_attribute(self):
  175. expected = np.array([
  176. datetime.datetime(year=1999, month=1, day=31, hour=0, minute=1),
  177. datetime.datetime(year=2004, month=12, day=1, hour=23, minute=59),
  178. datetime.datetime(year=1817, month=4, day=28, hour=13, minute=0),
  179. datetime.datetime(year=2100, month=9, day=10, hour=12, minute=0),
  180. datetime.datetime(year=2013, month=11, day=30, hour=4, minute=55),
  181. datetime.datetime(year=1631, month=10, day=15, hour=20, minute=4)
  182. ], dtype='datetime64[m]')
  183. assert_array_equal(self.data["attr_datetime_local"], expected)
  184. def test_datetime_missing(self):
  185. expected = np.array([
  186. 'nat',
  187. '2004-12-01T23:59',
  188. 'nat',
  189. 'nat',
  190. '2013-11-30T04:55',
  191. '1631-10-15T20:04'
  192. ], dtype='datetime64[m]')
  193. assert_array_equal(self.data["attr_datetime_missing"], expected)
  194. def test_datetime_timezone(self):
  195. assert_raises(ParseArffError, loadarff, test8)
  196. class TestRelationalAttribute:
  197. def setup_method(self):
  198. self.data, self.meta = loadarff(test9)
  199. def test_attributes(self):
  200. assert_equal(len(self.meta._attributes), 1)
  201. relational = list(self.meta._attributes.values())[0]
  202. assert_equal(relational.name, 'attr_date_number')
  203. assert_equal(relational.type_name, 'relational')
  204. assert_equal(len(relational.attributes), 2)
  205. assert_equal(relational.attributes[0].name,
  206. 'attr_date')
  207. assert_equal(relational.attributes[0].type_name,
  208. 'date')
  209. assert_equal(relational.attributes[1].name,
  210. 'attr_number')
  211. assert_equal(relational.attributes[1].type_name,
  212. 'numeric')
  213. def test_data(self):
  214. dtype_instance = [('attr_date', 'datetime64[D]'),
  215. ('attr_number', np.float64)]
  216. expected = [
  217. np.array([('1999-01-31', 1), ('1935-11-27', 10)],
  218. dtype=dtype_instance),
  219. np.array([('2004-12-01', 2), ('1942-08-13', 20)],
  220. dtype=dtype_instance),
  221. np.array([('1817-04-28', 3)],
  222. dtype=dtype_instance),
  223. np.array([('2100-09-10', 4), ('1957-04-17', 40),
  224. ('1721-01-14', 400)],
  225. dtype=dtype_instance),
  226. np.array([('2013-11-30', 5)],
  227. dtype=dtype_instance),
  228. np.array([('1631-10-15', 6)],
  229. dtype=dtype_instance)
  230. ]
  231. for i in range(len(self.data["attr_date_number"])):
  232. assert_array_equal(self.data["attr_date_number"][i],
  233. expected[i])
  234. class TestRelationalAttributeLong:
  235. def setup_method(self):
  236. self.data, self.meta = loadarff(test10)
  237. def test_attributes(self):
  238. assert_equal(len(self.meta._attributes), 1)
  239. relational = list(self.meta._attributes.values())[0]
  240. assert_equal(relational.name, 'attr_relational')
  241. assert_equal(relational.type_name, 'relational')
  242. assert_equal(len(relational.attributes), 1)
  243. assert_equal(relational.attributes[0].name,
  244. 'attr_number')
  245. assert_equal(relational.attributes[0].type_name, 'numeric')
  246. def test_data(self):
  247. dtype_instance = [('attr_number', np.float64)]
  248. expected = np.array([(n,) for n in range(30000)],
  249. dtype=dtype_instance)
  250. assert_array_equal(self.data["attr_relational"][0],
  251. expected)
  252. class TestQuotedNominal:
  253. """
  254. Regression test for issue #10232:
  255. Exception in loadarff with quoted nominal attributes.
  256. """
  257. def setup_method(self):
  258. self.data, self.meta = loadarff(test_quoted_nominal)
  259. def test_attributes(self):
  260. assert_equal(len(self.meta._attributes), 2)
  261. age, smoker = self.meta._attributes.values()
  262. assert_equal(age.name, 'age')
  263. assert_equal(age.type_name, 'numeric')
  264. assert_equal(smoker.name, 'smoker')
  265. assert_equal(smoker.type_name, 'nominal')
  266. assert_equal(smoker.values, ['yes', 'no'])
  267. def test_data(self):
  268. age_dtype_instance = np.float64
  269. smoker_dtype_instance = '<S3'
  270. age_expected = np.array([
  271. 18,
  272. 24,
  273. 44,
  274. 56,
  275. 89,
  276. 11,
  277. ], dtype=age_dtype_instance)
  278. smoker_expected = np.array([
  279. 'no',
  280. 'yes',
  281. 'no',
  282. 'no',
  283. 'yes',
  284. 'no',
  285. ], dtype=smoker_dtype_instance)
  286. assert_array_equal(self.data["age"], age_expected)
  287. assert_array_equal(self.data["smoker"], smoker_expected)
  288. class TestQuotedNominalSpaces:
  289. """
  290. Regression test for issue #10232:
  291. Exception in loadarff with quoted nominal attributes.
  292. """
  293. def setup_method(self):
  294. self.data, self.meta = loadarff(test_quoted_nominal_spaces)
  295. def test_attributes(self):
  296. assert_equal(len(self.meta._attributes), 2)
  297. age, smoker = self.meta._attributes.values()
  298. assert_equal(age.name, 'age')
  299. assert_equal(age.type_name, 'numeric')
  300. assert_equal(smoker.name, 'smoker')
  301. assert_equal(smoker.type_name, 'nominal')
  302. assert_equal(smoker.values, [' yes', 'no '])
  303. def test_data(self):
  304. age_dtype_instance = np.float64
  305. smoker_dtype_instance = '<S5'
  306. age_expected = np.array([
  307. 18,
  308. 24,
  309. 44,
  310. 56,
  311. 89,
  312. 11,
  313. ], dtype=age_dtype_instance)
  314. smoker_expected = np.array([
  315. 'no ',
  316. ' yes',
  317. 'no ',
  318. 'no ',
  319. ' yes',
  320. 'no ',
  321. ], dtype=smoker_dtype_instance)
  322. assert_array_equal(self.data["age"], age_expected)
  323. assert_array_equal(self.data["smoker"], smoker_expected)