test__iotools.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358
  1. import time
  2. from datetime import date
  3. import pytest
  4. import numpy as np
  5. from numpy.lib._iotools import (
  6. LineSplitter,
  7. NameValidator,
  8. StringConverter,
  9. easy_dtype,
  10. flatten_dtype,
  11. has_nested_fields,
  12. )
  13. from numpy.testing import assert_, assert_allclose, assert_equal, assert_raises
  14. class TestLineSplitter:
  15. "Tests the LineSplitter class."
  16. def test_no_delimiter(self):
  17. "Test LineSplitter w/o delimiter"
  18. strg = " 1 2 3 4 5 # test"
  19. test = LineSplitter()(strg)
  20. assert_equal(test, ['1', '2', '3', '4', '5'])
  21. test = LineSplitter('')(strg)
  22. assert_equal(test, ['1', '2', '3', '4', '5'])
  23. def test_space_delimiter(self):
  24. "Test space delimiter"
  25. strg = " 1 2 3 4 5 # test"
  26. test = LineSplitter(' ')(strg)
  27. assert_equal(test, ['1', '2', '3', '4', '', '5'])
  28. test = LineSplitter(' ')(strg)
  29. assert_equal(test, ['1 2 3 4', '5'])
  30. def test_tab_delimiter(self):
  31. "Test tab delimiter"
  32. strg = " 1\t 2\t 3\t 4\t 5 6"
  33. test = LineSplitter('\t')(strg)
  34. assert_equal(test, ['1', '2', '3', '4', '5 6'])
  35. strg = " 1 2\t 3 4\t 5 6"
  36. test = LineSplitter('\t')(strg)
  37. assert_equal(test, ['1 2', '3 4', '5 6'])
  38. def test_other_delimiter(self):
  39. "Test LineSplitter on delimiter"
  40. strg = "1,2,3,4,,5"
  41. test = LineSplitter(',')(strg)
  42. assert_equal(test, ['1', '2', '3', '4', '', '5'])
  43. #
  44. strg = " 1,2,3,4,,5 # test"
  45. test = LineSplitter(',')(strg)
  46. assert_equal(test, ['1', '2', '3', '4', '', '5'])
  47. # gh-11028 bytes comment/delimiters should get encoded
  48. strg = b" 1,2,3,4,,5 % test"
  49. test = LineSplitter(delimiter=b',', comments=b'%')(strg)
  50. assert_equal(test, ['1', '2', '3', '4', '', '5'])
  51. def test_constant_fixed_width(self):
  52. "Test LineSplitter w/ fixed-width fields"
  53. strg = " 1 2 3 4 5 # test"
  54. test = LineSplitter(3)(strg)
  55. assert_equal(test, ['1', '2', '3', '4', '', '5', ''])
  56. #
  57. strg = " 1 3 4 5 6# test"
  58. test = LineSplitter(20)(strg)
  59. assert_equal(test, ['1 3 4 5 6'])
  60. #
  61. strg = " 1 3 4 5 6# test"
  62. test = LineSplitter(30)(strg)
  63. assert_equal(test, ['1 3 4 5 6'])
  64. def test_variable_fixed_width(self):
  65. strg = " 1 3 4 5 6# test"
  66. test = LineSplitter((3, 6, 6, 3))(strg)
  67. assert_equal(test, ['1', '3', '4 5', '6'])
  68. #
  69. strg = " 1 3 4 5 6# test"
  70. test = LineSplitter((6, 6, 9))(strg)
  71. assert_equal(test, ['1', '3 4', '5 6'])
  72. # -----------------------------------------------------------------------------
  73. class TestNameValidator:
  74. def test_case_sensitivity(self):
  75. "Test case sensitivity"
  76. names = ['A', 'a', 'b', 'c']
  77. test = NameValidator().validate(names)
  78. assert_equal(test, ['A', 'a', 'b', 'c'])
  79. test = NameValidator(case_sensitive=False).validate(names)
  80. assert_equal(test, ['A', 'A_1', 'B', 'C'])
  81. test = NameValidator(case_sensitive='upper').validate(names)
  82. assert_equal(test, ['A', 'A_1', 'B', 'C'])
  83. test = NameValidator(case_sensitive='lower').validate(names)
  84. assert_equal(test, ['a', 'a_1', 'b', 'c'])
  85. # check exceptions
  86. assert_raises(ValueError, NameValidator, case_sensitive='foobar')
  87. def test_excludelist(self):
  88. "Test excludelist"
  89. names = ['dates', 'data', 'Other Data', 'mask']
  90. validator = NameValidator(excludelist=['dates', 'data', 'mask'])
  91. test = validator.validate(names)
  92. assert_equal(test, ['dates_', 'data_', 'Other_Data', 'mask_'])
  93. def test_missing_names(self):
  94. "Test validate missing names"
  95. namelist = ('a', 'b', 'c')
  96. validator = NameValidator()
  97. assert_equal(validator(namelist), ['a', 'b', 'c'])
  98. namelist = ('', 'b', 'c')
  99. assert_equal(validator(namelist), ['f0', 'b', 'c'])
  100. namelist = ('a', 'b', '')
  101. assert_equal(validator(namelist), ['a', 'b', 'f0'])
  102. namelist = ('', 'f0', '')
  103. assert_equal(validator(namelist), ['f1', 'f0', 'f2'])
  104. def test_validate_nb_names(self):
  105. "Test validate nb names"
  106. namelist = ('a', 'b', 'c')
  107. validator = NameValidator()
  108. assert_equal(validator(namelist, nbfields=1), ('a',))
  109. assert_equal(validator(namelist, nbfields=5, defaultfmt="g%i"),
  110. ['a', 'b', 'c', 'g0', 'g1'])
  111. def test_validate_wo_names(self):
  112. "Test validate no names"
  113. namelist = None
  114. validator = NameValidator()
  115. assert_(validator(namelist) is None)
  116. assert_equal(validator(namelist, nbfields=3), ['f0', 'f1', 'f2'])
  117. # -----------------------------------------------------------------------------
  118. def _bytes_to_date(s):
  119. return date(*time.strptime(s, "%Y-%m-%d")[:3])
  120. class TestStringConverter:
  121. "Test StringConverter"
  122. def test_creation(self):
  123. "Test creation of a StringConverter"
  124. converter = StringConverter(int, -99999)
  125. assert_equal(converter._status, 1)
  126. assert_equal(converter.default, -99999)
  127. def test_upgrade(self):
  128. "Tests the upgrade method."
  129. converter = StringConverter()
  130. assert_equal(converter._status, 0)
  131. # test int
  132. assert_equal(converter.upgrade('0'), 0)
  133. assert_equal(converter._status, 1)
  134. # On systems where long defaults to 32-bit, the statuses will be
  135. # offset by one, so we check for this here.
  136. import numpy._core.numeric as nx
  137. status_offset = int(nx.dtype(nx.int_).itemsize < nx.dtype(nx.int64).itemsize)
  138. # test int > 2**32
  139. assert_equal(converter.upgrade('17179869184'), 17179869184)
  140. assert_equal(converter._status, 1 + status_offset)
  141. # test float
  142. assert_allclose(converter.upgrade('0.'), 0.0)
  143. assert_equal(converter._status, 2 + status_offset)
  144. # test complex
  145. assert_equal(converter.upgrade('0j'), complex('0j'))
  146. assert_equal(converter._status, 3 + status_offset)
  147. # test str
  148. # note that the longdouble type has been skipped, so the
  149. # _status increases by 2. Everything should succeed with
  150. # unicode conversion (8).
  151. for s in ['a', b'a']:
  152. res = converter.upgrade(s)
  153. assert_(type(res) is str)
  154. assert_equal(res, 'a')
  155. assert_equal(converter._status, 8 + status_offset)
  156. def test_missing(self):
  157. "Tests the use of missing values."
  158. converter = StringConverter(missing_values=('missing',
  159. 'missed'))
  160. converter.upgrade('0')
  161. assert_equal(converter('0'), 0)
  162. assert_equal(converter(''), converter.default)
  163. assert_equal(converter('missing'), converter.default)
  164. assert_equal(converter('missed'), converter.default)
  165. try:
  166. converter('miss')
  167. except ValueError:
  168. pass
  169. @pytest.mark.thread_unsafe(reason="monkeypatches StringConverter")
  170. def test_upgrademapper(self):
  171. "Tests updatemapper"
  172. dateparser = _bytes_to_date
  173. _original_mapper = StringConverter._mapper[:]
  174. try:
  175. StringConverter.upgrade_mapper(dateparser, date(2000, 1, 1))
  176. convert = StringConverter(dateparser, date(2000, 1, 1))
  177. test = convert('2001-01-01')
  178. assert_equal(test, date(2001, 1, 1))
  179. test = convert('2009-01-01')
  180. assert_equal(test, date(2009, 1, 1))
  181. test = convert('')
  182. assert_equal(test, date(2000, 1, 1))
  183. finally:
  184. StringConverter._mapper = _original_mapper
  185. def test_string_to_object(self):
  186. "Make sure that string-to-object functions are properly recognized"
  187. old_mapper = StringConverter._mapper[:] # copy of list
  188. conv = StringConverter(_bytes_to_date)
  189. assert_equal(conv._mapper, old_mapper)
  190. assert_(hasattr(conv, 'default'))
  191. def test_keep_default(self):
  192. "Make sure we don't lose an explicit default"
  193. converter = StringConverter(None, missing_values='',
  194. default=-999)
  195. converter.upgrade('3.14159265')
  196. assert_equal(converter.default, -999)
  197. assert_equal(converter.type, np.dtype(float))
  198. #
  199. converter = StringConverter(
  200. None, missing_values='', default=0)
  201. converter.upgrade('3.14159265')
  202. assert_equal(converter.default, 0)
  203. assert_equal(converter.type, np.dtype(float))
  204. def test_keep_default_zero(self):
  205. "Check that we don't lose a default of 0"
  206. converter = StringConverter(int, default=0,
  207. missing_values="N/A")
  208. assert_equal(converter.default, 0)
  209. def test_keep_missing_values(self):
  210. "Check that we're not losing missing values"
  211. converter = StringConverter(int, default=0,
  212. missing_values="N/A")
  213. assert_equal(
  214. converter.missing_values, {'', 'N/A'})
  215. def test_int64_dtype(self):
  216. "Check that int64 integer types can be specified"
  217. converter = StringConverter(np.int64, default=0)
  218. val = "-9223372036854775807"
  219. assert_(converter(val) == -9223372036854775807)
  220. val = "9223372036854775807"
  221. assert_(converter(val) == 9223372036854775807)
  222. def test_uint64_dtype(self):
  223. "Check that uint64 integer types can be specified"
  224. converter = StringConverter(np.uint64, default=0)
  225. val = "9223372043271415339"
  226. assert_(converter(val) == 9223372043271415339)
  227. class TestMiscFunctions:
  228. def test_has_nested_dtype(self):
  229. "Test has_nested_dtype"
  230. ndtype = np.dtype(float)
  231. assert_equal(has_nested_fields(ndtype), False)
  232. ndtype = np.dtype([('A', '|S3'), ('B', float)])
  233. assert_equal(has_nested_fields(ndtype), False)
  234. ndtype = np.dtype([('A', int), ('B', [('BA', float), ('BB', '|S1')])])
  235. assert_equal(has_nested_fields(ndtype), True)
  236. def test_easy_dtype(self):
  237. "Test ndtype on dtypes"
  238. # Simple case
  239. ndtype = float
  240. assert_equal(easy_dtype(ndtype), np.dtype(float))
  241. # As string w/o names
  242. ndtype = "i4, f8"
  243. assert_equal(easy_dtype(ndtype),
  244. np.dtype([('f0', "i4"), ('f1', "f8")]))
  245. # As string w/o names but different default format
  246. assert_equal(easy_dtype(ndtype, defaultfmt="field_%03i"),
  247. np.dtype([('field_000', "i4"), ('field_001', "f8")]))
  248. # As string w/ names
  249. ndtype = "i4, f8"
  250. assert_equal(easy_dtype(ndtype, names="a, b"),
  251. np.dtype([('a', "i4"), ('b', "f8")]))
  252. # As string w/ names (too many)
  253. ndtype = "i4, f8"
  254. assert_equal(easy_dtype(ndtype, names="a, b, c"),
  255. np.dtype([('a', "i4"), ('b', "f8")]))
  256. # As string w/ names (not enough)
  257. ndtype = "i4, f8"
  258. assert_equal(easy_dtype(ndtype, names=", b"),
  259. np.dtype([('f0', "i4"), ('b', "f8")]))
  260. # ... (with different default format)
  261. assert_equal(easy_dtype(ndtype, names="a", defaultfmt="f%02i"),
  262. np.dtype([('a', "i4"), ('f00', "f8")]))
  263. # As list of tuples w/o names
  264. ndtype = [('A', int), ('B', float)]
  265. assert_equal(easy_dtype(ndtype), np.dtype([('A', int), ('B', float)]))
  266. # As list of tuples w/ names
  267. assert_equal(easy_dtype(ndtype, names="a,b"),
  268. np.dtype([('a', int), ('b', float)]))
  269. # As list of tuples w/ not enough names
  270. assert_equal(easy_dtype(ndtype, names="a"),
  271. np.dtype([('a', int), ('f0', float)]))
  272. # As list of tuples w/ too many names
  273. assert_equal(easy_dtype(ndtype, names="a,b,c"),
  274. np.dtype([('a', int), ('b', float)]))
  275. # As list of types w/o names
  276. ndtype = (int, float, float)
  277. assert_equal(easy_dtype(ndtype),
  278. np.dtype([('f0', int), ('f1', float), ('f2', float)]))
  279. # As list of types w names
  280. ndtype = (int, float, float)
  281. assert_equal(easy_dtype(ndtype, names="a, b, c"),
  282. np.dtype([('a', int), ('b', float), ('c', float)]))
  283. # As simple dtype w/ names
  284. ndtype = np.dtype(float)
  285. assert_equal(easy_dtype(ndtype, names="a, b, c"),
  286. np.dtype([(_, float) for _ in ('a', 'b', 'c')]))
  287. # As simple dtype w/o names (but multiple fields)
  288. ndtype = np.dtype(float)
  289. assert_equal(
  290. easy_dtype(ndtype, names=['', '', ''], defaultfmt="f%02i"),
  291. np.dtype([(_, float) for _ in ('f00', 'f01', 'f02')]))
  292. def test_flatten_dtype(self):
  293. "Testing flatten_dtype"
  294. # Standard dtype
  295. dt = np.dtype([("a", "f8"), ("b", "f8")])
  296. dt_flat = flatten_dtype(dt)
  297. assert_equal(dt_flat, [float, float])
  298. # Recursive dtype
  299. dt = np.dtype([("a", [("aa", '|S1'), ("ab", '|S2')]), ("b", int)])
  300. dt_flat = flatten_dtype(dt)
  301. assert_equal(dt_flat, [np.dtype('|S1'), np.dtype('|S2'), int])
  302. # dtype with shaped fields
  303. dt = np.dtype([("a", (float, 2)), ("b", (int, 3))])
  304. dt_flat = flatten_dtype(dt)
  305. assert_equal(dt_flat, [float, int])
  306. dt_flat = flatten_dtype(dt, True)
  307. assert_equal(dt_flat, [float] * 2 + [int] * 3)
  308. # dtype w/ titles
  309. dt = np.dtype([(("a", "A"), "f8"), (("b", "B"), "f8")])
  310. dt_flat = flatten_dtype(dt)
  311. assert_equal(dt_flat, [float, float])