recfunctions.py 58 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652165316541655165616571658165916601661166216631664166516661667166816691670167116721673167416751676167716781679168016811682168316841685
  1. """
  2. Collection of utilities to manipulate structured arrays.
  3. Most of these functions were initially implemented by John Hunter for
  4. matplotlib. They have been rewritten and extended for convenience.
  5. """
  6. import itertools
  7. import numpy as np
  8. import numpy.ma as ma
  9. import numpy.ma.mrecords as mrec
  10. from numpy._core.overrides import array_function_dispatch
  11. from numpy.lib._iotools import _is_string_like
  12. __all__ = [
  13. 'append_fields', 'apply_along_fields', 'assign_fields_by_name',
  14. 'drop_fields', 'find_duplicates', 'flatten_descr',
  15. 'get_fieldstructure', 'get_names', 'get_names_flat',
  16. 'join_by', 'merge_arrays', 'rec_append_fields',
  17. 'rec_drop_fields', 'rec_join', 'recursive_fill_fields',
  18. 'rename_fields', 'repack_fields', 'require_fields',
  19. 'stack_arrays', 'structured_to_unstructured', 'unstructured_to_structured',
  20. ]
  21. def _recursive_fill_fields_dispatcher(input, output):
  22. return (input, output)
  23. @array_function_dispatch(_recursive_fill_fields_dispatcher)
  24. def recursive_fill_fields(input, output):
  25. """
  26. Fills fields from output with fields from input,
  27. with support for nested structures.
  28. Parameters
  29. ----------
  30. input : ndarray
  31. Input array.
  32. output : ndarray
  33. Output array.
  34. Notes
  35. -----
  36. * `output` should be at least the same size as `input`
  37. Examples
  38. --------
  39. >>> import numpy as np
  40. >>> from numpy.lib import recfunctions as rfn
  41. >>> a = np.array([(1, 10.), (2, 20.)], dtype=[('A', np.int64), ('B', np.float64)])
  42. >>> b = np.zeros((3,), dtype=a.dtype)
  43. >>> rfn.recursive_fill_fields(a, b)
  44. array([(1, 10.), (2, 20.), (0, 0.)], dtype=[('A', '<i8'), ('B', '<f8')])
  45. """
  46. newdtype = output.dtype
  47. for field in newdtype.names:
  48. try:
  49. current = input[field]
  50. except ValueError:
  51. continue
  52. if current.dtype.names is not None:
  53. recursive_fill_fields(current, output[field])
  54. else:
  55. output[field][:len(current)] = current
  56. return output
  57. def _get_fieldspec(dtype):
  58. """
  59. Produce a list of name/dtype pairs corresponding to the dtype fields
  60. Similar to dtype.descr, but the second item of each tuple is a dtype, not a
  61. string. As a result, this handles subarray dtypes
  62. Can be passed to the dtype constructor to reconstruct the dtype, noting that
  63. this (deliberately) discards field offsets.
  64. Examples
  65. --------
  66. >>> import numpy as np
  67. >>> dt = np.dtype([(('a', 'A'), np.int64), ('b', np.double, 3)])
  68. >>> dt.descr
  69. [(('a', 'A'), '<i8'), ('b', '<f8', (3,))]
  70. >>> _get_fieldspec(dt)
  71. [(('a', 'A'), dtype('int64')), ('b', dtype(('<f8', (3,))))]
  72. """
  73. if dtype.names is None:
  74. # .descr returns a nameless field, so we should too
  75. return [('', dtype)]
  76. else:
  77. fields = ((name, dtype.fields[name]) for name in dtype.names)
  78. # keep any titles, if present
  79. return [
  80. (name if len(f) == 2 else (f[2], name), f[0])
  81. for name, f in fields
  82. ]
  83. def get_names(adtype):
  84. """
  85. Returns the field names of the input datatype as a tuple. Input datatype
  86. must have fields otherwise error is raised.
  87. Parameters
  88. ----------
  89. adtype : dtype
  90. Input datatype
  91. Examples
  92. --------
  93. >>> import numpy as np
  94. >>> from numpy.lib import recfunctions as rfn
  95. >>> rfn.get_names(np.empty((1,), dtype=[('A', int)]).dtype)
  96. ('A',)
  97. >>> rfn.get_names(np.empty((1,), dtype=[('A',int), ('B', float)]).dtype)
  98. ('A', 'B')
  99. >>> adtype = np.dtype([('a', int), ('b', [('ba', int), ('bb', int)])])
  100. >>> rfn.get_names(adtype)
  101. ('a', ('b', ('ba', 'bb')))
  102. """
  103. listnames = []
  104. names = adtype.names
  105. for name in names:
  106. current = adtype[name]
  107. if current.names is not None:
  108. listnames.append((name, tuple(get_names(current))))
  109. else:
  110. listnames.append(name)
  111. return tuple(listnames)
  112. def get_names_flat(adtype):
  113. """
  114. Returns the field names of the input datatype as a tuple. Input datatype
  115. must have fields otherwise error is raised.
  116. Nested structure are flattened beforehand.
  117. Parameters
  118. ----------
  119. adtype : dtype
  120. Input datatype
  121. Examples
  122. --------
  123. >>> import numpy as np
  124. >>> from numpy.lib import recfunctions as rfn
  125. >>> rfn.get_names_flat(np.empty((1,), dtype=[('A', int)]).dtype) is None
  126. False
  127. >>> rfn.get_names_flat(np.empty((1,), dtype=[('A',int), ('B', str)]).dtype)
  128. ('A', 'B')
  129. >>> adtype = np.dtype([('a', int), ('b', [('ba', int), ('bb', int)])])
  130. >>> rfn.get_names_flat(adtype)
  131. ('a', 'b', 'ba', 'bb')
  132. """
  133. listnames = []
  134. names = adtype.names
  135. for name in names:
  136. listnames.append(name)
  137. current = adtype[name]
  138. if current.names is not None:
  139. listnames.extend(get_names_flat(current))
  140. return tuple(listnames)
  141. def flatten_descr(ndtype):
  142. """
  143. Flatten a structured data-type description.
  144. Examples
  145. --------
  146. >>> import numpy as np
  147. >>> from numpy.lib import recfunctions as rfn
  148. >>> ndtype = np.dtype([('a', '<i4'), ('b', [('ba', '<f8'), ('bb', '<i4')])])
  149. >>> rfn.flatten_descr(ndtype)
  150. (('a', dtype('int32')), ('ba', dtype('float64')), ('bb', dtype('int32')))
  151. """
  152. names = ndtype.names
  153. if names is None:
  154. return (('', ndtype),)
  155. else:
  156. descr = []
  157. for field in names:
  158. (typ, _) = ndtype.fields[field]
  159. if typ.names is not None:
  160. descr.extend(flatten_descr(typ))
  161. else:
  162. descr.append((field, typ))
  163. return tuple(descr)
  164. def _zip_dtype(seqarrays, flatten=False):
  165. newdtype = []
  166. if flatten:
  167. for a in seqarrays:
  168. newdtype.extend(flatten_descr(a.dtype))
  169. else:
  170. for a in seqarrays:
  171. current = a.dtype
  172. if current.names is not None and len(current.names) == 1:
  173. # special case - dtypes of 1 field are flattened
  174. newdtype.extend(_get_fieldspec(current))
  175. else:
  176. newdtype.append(('', current))
  177. return np.dtype(newdtype)
  178. def _zip_descr(seqarrays, flatten=False):
  179. """
  180. Combine the dtype description of a series of arrays.
  181. Parameters
  182. ----------
  183. seqarrays : sequence of arrays
  184. Sequence of arrays
  185. flatten : {boolean}, optional
  186. Whether to collapse nested descriptions.
  187. """
  188. return _zip_dtype(seqarrays, flatten=flatten).descr
  189. def get_fieldstructure(adtype, lastname=None, parents=None,):
  190. """
  191. Returns a dictionary with fields indexing lists of their parent fields.
  192. This function is used to simplify access to fields nested in other fields.
  193. Parameters
  194. ----------
  195. adtype : np.dtype
  196. Input datatype
  197. lastname : optional
  198. Last processed field name (used internally during recursion).
  199. parents : dictionary
  200. Dictionary of parent fields (used internally during recursion).
  201. Examples
  202. --------
  203. >>> import numpy as np
  204. >>> from numpy.lib import recfunctions as rfn
  205. >>> ndtype = np.dtype([('A', int),
  206. ... ('B', [('BA', int),
  207. ... ('BB', [('BBA', int), ('BBB', int)])])])
  208. >>> rfn.get_fieldstructure(ndtype)
  209. ... # XXX: possible regression, order of BBA and BBB is swapped
  210. {'A': [], 'B': [], 'BA': ['B'], 'BB': ['B'], 'BBA': ['B', 'BB'], 'BBB': ['B', 'BB']}
  211. """
  212. if parents is None:
  213. parents = {}
  214. names = adtype.names
  215. for name in names:
  216. current = adtype[name]
  217. if current.names is not None:
  218. if lastname:
  219. parents[name] = [lastname, ]
  220. else:
  221. parents[name] = []
  222. parents.update(get_fieldstructure(current, name, parents))
  223. else:
  224. lastparent = list((parents.get(lastname, []) or []))
  225. if lastparent:
  226. lastparent.append(lastname)
  227. elif lastname:
  228. lastparent = [lastname, ]
  229. parents[name] = lastparent or []
  230. return parents
  231. def _izip_fields_flat(iterable):
  232. """
  233. Returns an iterator of concatenated fields from a sequence of arrays,
  234. collapsing any nested structure.
  235. """
  236. for element in iterable:
  237. if isinstance(element, np.void):
  238. yield from _izip_fields_flat(tuple(element))
  239. else:
  240. yield element
  241. def _izip_fields(iterable):
  242. """
  243. Returns an iterator of concatenated fields from a sequence of arrays.
  244. """
  245. for element in iterable:
  246. if (hasattr(element, '__iter__') and
  247. not isinstance(element, str)):
  248. yield from _izip_fields(element)
  249. elif isinstance(element, np.void) and len(tuple(element)) == 1:
  250. # this statement is the same from the previous expression
  251. yield from _izip_fields(element)
  252. else:
  253. yield element
  254. def _izip_records(seqarrays, fill_value=None, flatten=True):
  255. """
  256. Returns an iterator of concatenated items from a sequence of arrays.
  257. Parameters
  258. ----------
  259. seqarrays : sequence of arrays
  260. Sequence of arrays.
  261. fill_value : {None, integer}
  262. Value used to pad shorter iterables.
  263. flatten : {True, False},
  264. Whether to
  265. """
  266. # Should we flatten the items, or just use a nested approach
  267. if flatten:
  268. zipfunc = _izip_fields_flat
  269. else:
  270. zipfunc = _izip_fields
  271. for tup in itertools.zip_longest(*seqarrays, fillvalue=fill_value):
  272. yield tuple(zipfunc(tup))
  273. def _fix_output(output, usemask=True, asrecarray=False):
  274. """
  275. Private function: return a recarray, a ndarray, a MaskedArray
  276. or a MaskedRecords depending on the input parameters
  277. """
  278. if not isinstance(output, ma.MaskedArray):
  279. usemask = False
  280. if usemask:
  281. if asrecarray:
  282. output = output.view(mrec.MaskedRecords)
  283. else:
  284. output = ma.filled(output)
  285. if asrecarray:
  286. output = output.view(np.recarray)
  287. return output
  288. def _fix_defaults(output, defaults=None):
  289. """
  290. Update the fill_value and masked data of `output`
  291. from the default given in a dictionary defaults.
  292. """
  293. names = output.dtype.names
  294. (data, mask, fill_value) = (output.data, output.mask, output.fill_value)
  295. for (k, v) in (defaults or {}).items():
  296. if k in names:
  297. fill_value[k] = v
  298. data[k][mask[k]] = v
  299. return output
  300. def _merge_arrays_dispatcher(seqarrays, fill_value=None, flatten=None,
  301. usemask=None, asrecarray=None):
  302. return seqarrays
  303. @array_function_dispatch(_merge_arrays_dispatcher)
  304. def merge_arrays(seqarrays, fill_value=-1, flatten=False,
  305. usemask=False, asrecarray=False):
  306. """
  307. Merge arrays field by field.
  308. Parameters
  309. ----------
  310. seqarrays : sequence of ndarrays
  311. Sequence of arrays
  312. fill_value : {float}, optional
  313. Filling value used to pad missing data on the shorter arrays.
  314. flatten : {False, True}, optional
  315. Whether to collapse nested fields.
  316. usemask : {False, True}, optional
  317. Whether to return a masked array or not.
  318. asrecarray : {False, True}, optional
  319. Whether to return a recarray (MaskedRecords) or not.
  320. Examples
  321. --------
  322. >>> import numpy as np
  323. >>> from numpy.lib import recfunctions as rfn
  324. >>> rfn.merge_arrays((np.array([1, 2]), np.array([10., 20., 30.])))
  325. array([( 1, 10.), ( 2, 20.), (-1, 30.)],
  326. dtype=[('f0', '<i8'), ('f1', '<f8')])
  327. >>> rfn.merge_arrays((np.array([1, 2], dtype=np.int64),
  328. ... np.array([10., 20., 30.])), usemask=False)
  329. array([(1, 10.0), (2, 20.0), (-1, 30.0)],
  330. dtype=[('f0', '<i8'), ('f1', '<f8')])
  331. >>> rfn.merge_arrays((np.array([1, 2]).view([('a', np.int64)]),
  332. ... np.array([10., 20., 30.])),
  333. ... usemask=False, asrecarray=True)
  334. rec.array([( 1, 10.), ( 2, 20.), (-1, 30.)],
  335. dtype=[('a', '<i8'), ('f1', '<f8')])
  336. Notes
  337. -----
  338. * Without a mask, the missing value will be filled with something,
  339. depending on what its corresponding type:
  340. * ``-1`` for integers
  341. * ``-1.0`` for floating point numbers
  342. * ``'-'`` for characters
  343. * ``'-1'`` for strings
  344. * ``True`` for boolean values
  345. * XXX: I just obtained these values empirically
  346. """
  347. # Only one item in the input sequence ?
  348. if (len(seqarrays) == 1):
  349. seqarrays = np.asanyarray(seqarrays[0])
  350. # Do we have a single ndarray as input ?
  351. if isinstance(seqarrays, (np.ndarray, np.void)):
  352. seqdtype = seqarrays.dtype
  353. # Make sure we have named fields
  354. if seqdtype.names is None:
  355. seqdtype = np.dtype([('', seqdtype)])
  356. if not flatten or _zip_dtype((seqarrays,), flatten=True) == seqdtype:
  357. # Minimal processing needed: just make sure everything's a-ok
  358. seqarrays = seqarrays.ravel()
  359. # Find what type of array we must return
  360. if usemask:
  361. if asrecarray:
  362. seqtype = mrec.MaskedRecords
  363. else:
  364. seqtype = ma.MaskedArray
  365. elif asrecarray:
  366. seqtype = np.recarray
  367. else:
  368. seqtype = np.ndarray
  369. return seqarrays.view(dtype=seqdtype, type=seqtype)
  370. else:
  371. seqarrays = (seqarrays,)
  372. else:
  373. # Make sure we have arrays in the input sequence
  374. seqarrays = [np.asanyarray(_m) for _m in seqarrays]
  375. # Find the sizes of the inputs and their maximum
  376. sizes = tuple(a.size for a in seqarrays)
  377. maxlength = max(sizes)
  378. # Get the dtype of the output (flattening if needed)
  379. newdtype = _zip_dtype(seqarrays, flatten=flatten)
  380. # Initialize the sequences for data and mask
  381. seqdata = []
  382. seqmask = []
  383. # If we expect some kind of MaskedArray, make a special loop.
  384. if usemask:
  385. for (a, n) in zip(seqarrays, sizes):
  386. nbmissing = (maxlength - n)
  387. # Get the data and mask
  388. data = a.ravel().__array__()
  389. mask = ma.getmaskarray(a).ravel()
  390. # Get the filling value (if needed)
  391. if nbmissing:
  392. fval = mrec._check_fill_value(fill_value, a.dtype)
  393. if isinstance(fval, (np.ndarray, np.void)):
  394. if len(fval.dtype) == 1:
  395. fval = fval.item()[0]
  396. fmsk = True
  397. else:
  398. fval = np.array(fval, dtype=a.dtype, ndmin=1)
  399. fmsk = np.ones((1,), dtype=mask.dtype)
  400. else:
  401. fval = None
  402. fmsk = True
  403. # Store an iterator padding the input to the expected length
  404. seqdata.append(itertools.chain(data, [fval] * nbmissing))
  405. seqmask.append(itertools.chain(mask, [fmsk] * nbmissing))
  406. # Create an iterator for the data
  407. data = tuple(_izip_records(seqdata, flatten=flatten))
  408. output = ma.array(np.fromiter(data, dtype=newdtype, count=maxlength),
  409. mask=list(_izip_records(seqmask, flatten=flatten)))
  410. if asrecarray:
  411. output = output.view(mrec.MaskedRecords)
  412. else:
  413. # Same as before, without the mask we don't need...
  414. for (a, n) in zip(seqarrays, sizes):
  415. nbmissing = (maxlength - n)
  416. data = a.ravel().__array__()
  417. if nbmissing:
  418. fval = mrec._check_fill_value(fill_value, a.dtype)
  419. if isinstance(fval, (np.ndarray, np.void)):
  420. if len(fval.dtype) == 1:
  421. fval = fval.item()[0]
  422. else:
  423. fval = np.array(fval, dtype=a.dtype, ndmin=1)
  424. else:
  425. fval = None
  426. seqdata.append(itertools.chain(data, [fval] * nbmissing))
  427. output = np.fromiter(tuple(_izip_records(seqdata, flatten=flatten)),
  428. dtype=newdtype, count=maxlength)
  429. if asrecarray:
  430. output = output.view(np.recarray)
  431. # And we're done...
  432. return output
  433. def _drop_fields_dispatcher(base, drop_names, usemask=None, asrecarray=None):
  434. return (base,)
  435. @array_function_dispatch(_drop_fields_dispatcher)
  436. def drop_fields(base, drop_names, usemask=True, asrecarray=False):
  437. """
  438. Return a new array with fields in `drop_names` dropped.
  439. Nested fields are supported.
  440. Parameters
  441. ----------
  442. base : array
  443. Input array
  444. drop_names : string or sequence
  445. String or sequence of strings corresponding to the names of the
  446. fields to drop.
  447. usemask : {False, True}, optional
  448. Whether to return a masked array or not.
  449. asrecarray : string or sequence, optional
  450. Whether to return a recarray or a mrecarray (`asrecarray=True`) or
  451. a plain ndarray or masked array with flexible dtype. The default
  452. is False.
  453. Examples
  454. --------
  455. >>> import numpy as np
  456. >>> from numpy.lib import recfunctions as rfn
  457. >>> a = np.array([(1, (2, 3.0)), (4, (5, 6.0))],
  458. ... dtype=[('a', np.int64), ('b', [('ba', np.double), ('bb', np.int64)])])
  459. >>> rfn.drop_fields(a, 'a')
  460. array([((2., 3),), ((5., 6),)],
  461. dtype=[('b', [('ba', '<f8'), ('bb', '<i8')])])
  462. >>> rfn.drop_fields(a, 'ba')
  463. array([(1, (3,)), (4, (6,))], dtype=[('a', '<i8'), ('b', [('bb', '<i8')])])
  464. >>> rfn.drop_fields(a, ['ba', 'bb'])
  465. array([(1,), (4,)], dtype=[('a', '<i8')])
  466. """
  467. if _is_string_like(drop_names):
  468. drop_names = [drop_names]
  469. else:
  470. drop_names = set(drop_names)
  471. def _drop_descr(ndtype, drop_names):
  472. names = ndtype.names
  473. newdtype = []
  474. for name in names:
  475. current = ndtype[name]
  476. if name in drop_names:
  477. continue
  478. if current.names is not None:
  479. descr = _drop_descr(current, drop_names)
  480. if descr:
  481. newdtype.append((name, descr))
  482. else:
  483. newdtype.append((name, current))
  484. return newdtype
  485. newdtype = _drop_descr(base.dtype, drop_names)
  486. output = np.empty(base.shape, dtype=newdtype)
  487. output = recursive_fill_fields(base, output)
  488. return _fix_output(output, usemask=usemask, asrecarray=asrecarray)
  489. def _keep_fields(base, keep_names, usemask=True, asrecarray=False):
  490. """
  491. Return a new array keeping only the fields in `keep_names`,
  492. and preserving the order of those fields.
  493. Parameters
  494. ----------
  495. base : array
  496. Input array
  497. keep_names : string or sequence
  498. String or sequence of strings corresponding to the names of the
  499. fields to keep. Order of the names will be preserved.
  500. usemask : {False, True}, optional
  501. Whether to return a masked array or not.
  502. asrecarray : string or sequence, optional
  503. Whether to return a recarray or a mrecarray (`asrecarray=True`) or
  504. a plain ndarray or masked array with flexible dtype. The default
  505. is False.
  506. """
  507. newdtype = [(n, base.dtype[n]) for n in keep_names]
  508. output = np.empty(base.shape, dtype=newdtype)
  509. output = recursive_fill_fields(base, output)
  510. return _fix_output(output, usemask=usemask, asrecarray=asrecarray)
  511. def _rec_drop_fields_dispatcher(base, drop_names):
  512. return (base,)
  513. @array_function_dispatch(_rec_drop_fields_dispatcher)
  514. def rec_drop_fields(base, drop_names):
  515. """
  516. Returns a new numpy.recarray with fields in `drop_names` dropped.
  517. """
  518. return drop_fields(base, drop_names, usemask=False, asrecarray=True)
  519. def _rename_fields_dispatcher(base, namemapper):
  520. return (base,)
  521. @array_function_dispatch(_rename_fields_dispatcher)
  522. def rename_fields(base, namemapper):
  523. """
  524. Rename the fields from a flexible-datatype ndarray or recarray.
  525. Nested fields are supported.
  526. Parameters
  527. ----------
  528. base : ndarray
  529. Input array whose fields must be modified.
  530. namemapper : dictionary
  531. Dictionary mapping old field names to their new version.
  532. Examples
  533. --------
  534. >>> import numpy as np
  535. >>> from numpy.lib import recfunctions as rfn
  536. >>> a = np.array([(1, (2, [3.0, 30.])), (4, (5, [6.0, 60.]))],
  537. ... dtype=[('a', int),('b', [('ba', float), ('bb', (float, 2))])])
  538. >>> rfn.rename_fields(a, {'a':'A', 'bb':'BB'})
  539. array([(1, (2., [ 3., 30.])), (4, (5., [ 6., 60.]))],
  540. dtype=[('A', '<i8'), ('b', [('ba', '<f8'), ('BB', '<f8', (2,))])])
  541. """
  542. def _recursive_rename_fields(ndtype, namemapper):
  543. newdtype = []
  544. for name in ndtype.names:
  545. newname = namemapper.get(name, name)
  546. current = ndtype[name]
  547. if current.names is not None:
  548. newdtype.append(
  549. (newname, _recursive_rename_fields(current, namemapper))
  550. )
  551. else:
  552. newdtype.append((newname, current))
  553. return newdtype
  554. newdtype = _recursive_rename_fields(base.dtype, namemapper)
  555. return base.view(newdtype)
  556. def _append_fields_dispatcher(base, names, data, dtypes=None,
  557. fill_value=None, usemask=None, asrecarray=None):
  558. yield base
  559. yield from data
  560. @array_function_dispatch(_append_fields_dispatcher)
  561. def append_fields(base, names, data, dtypes=None,
  562. fill_value=-1, usemask=True, asrecarray=False):
  563. """
  564. Add new fields to an existing array.
  565. The names of the fields are given with the `names` arguments,
  566. the corresponding values with the `data` arguments.
  567. If a single field is appended, `names`, `data` and `dtypes` do not have
  568. to be lists but just values.
  569. Parameters
  570. ----------
  571. base : array
  572. Input array to extend.
  573. names : string, sequence
  574. String or sequence of strings corresponding to the names
  575. of the new fields.
  576. data : array or sequence of arrays
  577. Array or sequence of arrays storing the fields to add to the base.
  578. dtypes : sequence of datatypes, optional
  579. Datatype or sequence of datatypes.
  580. If None, the datatypes are estimated from the `data`.
  581. fill_value : {float}, optional
  582. Filling value used to pad missing data on the shorter arrays.
  583. usemask : {False, True}, optional
  584. Whether to return a masked array or not.
  585. asrecarray : {False, True}, optional
  586. Whether to return a recarray (MaskedRecords) or not.
  587. """
  588. # Check the names
  589. if isinstance(names, (tuple, list)):
  590. if len(names) != len(data):
  591. msg = "The number of arrays does not match the number of names"
  592. raise ValueError(msg)
  593. elif isinstance(names, str):
  594. names = [names, ]
  595. data = [data, ]
  596. #
  597. if dtypes is None:
  598. data = [np.array(a, copy=None, subok=True) for a in data]
  599. data = [a.view([(name, a.dtype)]) for (name, a) in zip(names, data)]
  600. else:
  601. if not isinstance(dtypes, (tuple, list)):
  602. dtypes = [dtypes, ]
  603. if len(data) != len(dtypes):
  604. if len(dtypes) == 1:
  605. dtypes = dtypes * len(data)
  606. else:
  607. msg = "The dtypes argument must be None, a dtype, or a list."
  608. raise ValueError(msg)
  609. data = [np.array(a, copy=None, subok=True, dtype=d).view([(n, d)])
  610. for (a, n, d) in zip(data, names, dtypes)]
  611. #
  612. base = merge_arrays(base, usemask=usemask, fill_value=fill_value)
  613. if len(data) > 1:
  614. data = merge_arrays(data, flatten=True, usemask=usemask,
  615. fill_value=fill_value)
  616. else:
  617. data = data.pop()
  618. #
  619. output = ma.masked_all(
  620. max(len(base), len(data)),
  621. dtype=_get_fieldspec(base.dtype) + _get_fieldspec(data.dtype))
  622. output = recursive_fill_fields(base, output)
  623. output = recursive_fill_fields(data, output)
  624. #
  625. return _fix_output(output, usemask=usemask, asrecarray=asrecarray)
  626. def _rec_append_fields_dispatcher(base, names, data, dtypes=None):
  627. yield base
  628. yield from data
  629. @array_function_dispatch(_rec_append_fields_dispatcher)
  630. def rec_append_fields(base, names, data, dtypes=None):
  631. """
  632. Add new fields to an existing array.
  633. The names of the fields are given with the `names` arguments,
  634. the corresponding values with the `data` arguments.
  635. If a single field is appended, `names`, `data` and `dtypes` do not have
  636. to be lists but just values.
  637. Parameters
  638. ----------
  639. base : array
  640. Input array to extend.
  641. names : string, sequence
  642. String or sequence of strings corresponding to the names
  643. of the new fields.
  644. data : array or sequence of arrays
  645. Array or sequence of arrays storing the fields to add to the base.
  646. dtypes : sequence of datatypes, optional
  647. Datatype or sequence of datatypes.
  648. If None, the datatypes are estimated from the `data`.
  649. See Also
  650. --------
  651. append_fields
  652. Returns
  653. -------
  654. appended_array : np.recarray
  655. """
  656. return append_fields(base, names, data=data, dtypes=dtypes,
  657. asrecarray=True, usemask=False)
  658. def _repack_fields_dispatcher(a, align=None, recurse=None):
  659. return (a,)
  660. @array_function_dispatch(_repack_fields_dispatcher)
  661. def repack_fields(a, align=False, recurse=False):
  662. """
  663. Re-pack the fields of a structured array or dtype in memory.
  664. The memory layout of structured datatypes allows fields at arbitrary
  665. byte offsets. This means the fields can be separated by padding bytes,
  666. their offsets can be non-monotonically increasing, and they can overlap.
  667. This method removes any overlaps and reorders the fields in memory so they
  668. have increasing byte offsets, and adds or removes padding bytes depending
  669. on the `align` option, which behaves like the `align` option to
  670. `numpy.dtype`.
  671. If `align=False`, this method produces a "packed" memory layout in which
  672. each field starts at the byte the previous field ended, and any padding
  673. bytes are removed.
  674. If `align=True`, this methods produces an "aligned" memory layout in which
  675. each field's offset is a multiple of its alignment, and the total itemsize
  676. is a multiple of the largest alignment, by adding padding bytes as needed.
  677. Parameters
  678. ----------
  679. a : ndarray or dtype
  680. array or dtype for which to repack the fields.
  681. align : boolean
  682. If true, use an "aligned" memory layout, otherwise use a "packed" layout.
  683. recurse : boolean
  684. If True, also repack nested structures.
  685. Returns
  686. -------
  687. repacked : ndarray or dtype
  688. Copy of `a` with fields repacked, or `a` itself if no repacking was
  689. needed.
  690. Examples
  691. --------
  692. >>> import numpy as np
  693. >>> from numpy.lib import recfunctions as rfn
  694. >>> def print_offsets(d):
  695. ... print("offsets:", [d.fields[name][1] for name in d.names])
  696. ... print("itemsize:", d.itemsize)
  697. ...
  698. >>> dt = np.dtype('u1, <i8, <f8', align=True)
  699. >>> dt
  700. dtype({'names': ['f0', 'f1', 'f2'], 'formats': ['u1', '<i8', '<f8'], \
  701. 'offsets': [0, 8, 16], 'itemsize': 24}, align=True)
  702. >>> print_offsets(dt)
  703. offsets: [0, 8, 16]
  704. itemsize: 24
  705. >>> packed_dt = rfn.repack_fields(dt)
  706. >>> packed_dt
  707. dtype([('f0', 'u1'), ('f1', '<i8'), ('f2', '<f8')])
  708. >>> print_offsets(packed_dt)
  709. offsets: [0, 1, 9]
  710. itemsize: 17
  711. """
  712. if not isinstance(a, np.dtype):
  713. dt = repack_fields(a.dtype, align=align, recurse=recurse)
  714. return a.astype(dt, copy=False)
  715. if a.names is None:
  716. return a
  717. fieldinfo = []
  718. for name in a.names:
  719. tup = a.fields[name]
  720. if recurse:
  721. fmt = repack_fields(tup[0], align=align, recurse=True)
  722. else:
  723. fmt = tup[0]
  724. if len(tup) == 3:
  725. name = (tup[2], name)
  726. fieldinfo.append((name, fmt))
  727. dt = np.dtype(fieldinfo, align=align)
  728. return np.dtype((a.type, dt))
  729. def _get_fields_and_offsets(dt, offset=0):
  730. """
  731. Returns a flat list of (dtype, count, offset) tuples of all the
  732. scalar fields in the dtype "dt", including nested fields, in left
  733. to right order.
  734. """
  735. # counts up elements in subarrays, including nested subarrays, and returns
  736. # base dtype and count
  737. def count_elem(dt):
  738. count = 1
  739. while dt.shape != ():
  740. for size in dt.shape:
  741. count *= size
  742. dt = dt.base
  743. return dt, count
  744. fields = []
  745. for name in dt.names:
  746. field = dt.fields[name]
  747. f_dt, f_offset = field[0], field[1]
  748. f_dt, n = count_elem(f_dt)
  749. if f_dt.names is None:
  750. fields.append((np.dtype((f_dt, (n,))), n, f_offset + offset))
  751. else:
  752. subfields = _get_fields_and_offsets(f_dt, f_offset + offset)
  753. size = f_dt.itemsize
  754. for i in range(n):
  755. if i == 0:
  756. # optimization: avoid list comprehension if no subarray
  757. fields.extend(subfields)
  758. else:
  759. fields.extend([(d, c, o + i*size) for d, c, o in subfields])
  760. return fields
  761. def _common_stride(offsets, counts, itemsize):
  762. """
  763. Returns the stride between the fields, or None if the stride is not
  764. constant. The values in "counts" designate the lengths of
  765. subarrays. Subarrays are treated as many contiguous fields, with
  766. always positive stride.
  767. """
  768. if len(offsets) <= 1:
  769. return itemsize
  770. negative = offsets[1] < offsets[0] # negative stride
  771. if negative:
  772. # reverse, so offsets will be ascending
  773. it = zip(reversed(offsets), reversed(counts))
  774. else:
  775. it = zip(offsets, counts)
  776. prev_offset = None
  777. stride = None
  778. for offset, count in it:
  779. if count != 1: # subarray: always c-contiguous
  780. if negative:
  781. return None # subarrays can never have a negative stride
  782. if stride is None:
  783. stride = itemsize
  784. if stride != itemsize:
  785. return None
  786. end_offset = offset + (count - 1) * itemsize
  787. else:
  788. end_offset = offset
  789. if prev_offset is not None:
  790. new_stride = offset - prev_offset
  791. if stride is None:
  792. stride = new_stride
  793. if stride != new_stride:
  794. return None
  795. prev_offset = end_offset
  796. if negative:
  797. return -stride
  798. return stride
  799. def _structured_to_unstructured_dispatcher(arr, dtype=None, copy=None,
  800. casting=None):
  801. return (arr,)
  802. @array_function_dispatch(_structured_to_unstructured_dispatcher)
  803. def structured_to_unstructured(arr, dtype=None, copy=False, casting='unsafe'):
  804. """
  805. Converts an n-D structured array into an (n+1)-D unstructured array.
  806. The new array will have a new last dimension equal in size to the
  807. number of field-elements of the input array. If not supplied, the output
  808. datatype is determined from the numpy type promotion rules applied to all
  809. the field datatypes.
  810. Nested fields, as well as each element of any subarray fields, all count
  811. as a single field-elements.
  812. Parameters
  813. ----------
  814. arr : ndarray
  815. Structured array or dtype to convert. Cannot contain object datatype.
  816. dtype : dtype, optional
  817. The dtype of the output unstructured array.
  818. copy : bool, optional
  819. If true, always return a copy. If false, a view is returned if
  820. possible, such as when the `dtype` and strides of the fields are
  821. suitable and the array subtype is one of `numpy.ndarray`,
  822. `numpy.recarray` or `numpy.memmap`.
  823. .. versionchanged:: 1.25.0
  824. A view can now be returned if the fields are separated by a
  825. uniform stride.
  826. casting : {'no', 'equiv', 'safe', 'same_kind', 'unsafe'}, optional
  827. See casting argument of `numpy.ndarray.astype`. Controls what kind of
  828. data casting may occur.
  829. Returns
  830. -------
  831. unstructured : ndarray
  832. Unstructured array with one more dimension.
  833. Examples
  834. --------
  835. >>> import numpy as np
  836. >>> from numpy.lib import recfunctions as rfn
  837. >>> a = np.zeros(4, dtype=[('a', 'i4'), ('b', 'f4,u2'), ('c', 'f4', 2)])
  838. >>> a
  839. array([(0, (0., 0), [0., 0.]), (0, (0., 0), [0., 0.]),
  840. (0, (0., 0), [0., 0.]), (0, (0., 0), [0., 0.])],
  841. dtype=[('a', '<i4'), ('b', [('f0', '<f4'), ('f1', '<u2')]), ('c', '<f4', (2,))])
  842. >>> rfn.structured_to_unstructured(a)
  843. array([[0., 0., 0., 0., 0.],
  844. [0., 0., 0., 0., 0.],
  845. [0., 0., 0., 0., 0.],
  846. [0., 0., 0., 0., 0.]])
  847. >>> b = np.array([(1, 2, 5), (4, 5, 7), (7, 8 ,11), (10, 11, 12)],
  848. ... dtype=[('x', 'i4'), ('y', 'f4'), ('z', 'f8')])
  849. >>> np.mean(rfn.structured_to_unstructured(b[['x', 'z']]), axis=-1)
  850. array([ 3. , 5.5, 9. , 11. ])
  851. """
  852. if arr.dtype.names is None:
  853. raise ValueError('arr must be a structured array')
  854. fields = _get_fields_and_offsets(arr.dtype)
  855. n_fields = len(fields)
  856. if n_fields == 0 and dtype is None:
  857. raise ValueError("arr has no fields. Unable to guess dtype")
  858. elif n_fields == 0:
  859. # too many bugs elsewhere for this to work now
  860. raise NotImplementedError("arr with no fields is not supported")
  861. dts, counts, offsets = zip(*fields)
  862. names = ['f{}'.format(n) for n in range(n_fields)]
  863. if dtype is None:
  864. out_dtype = np.result_type(*[dt.base for dt in dts])
  865. else:
  866. out_dtype = np.dtype(dtype)
  867. # Use a series of views and casts to convert to an unstructured array:
  868. # first view using flattened fields (doesn't work for object arrays)
  869. # Note: dts may include a shape for subarrays
  870. flattened_fields = np.dtype({'names': names,
  871. 'formats': dts,
  872. 'offsets': offsets,
  873. 'itemsize': arr.dtype.itemsize})
  874. arr = arr.view(flattened_fields)
  875. # we only allow a few types to be unstructured by manipulating the
  876. # strides, because we know it won't work with, for example, np.matrix nor
  877. # np.ma.MaskedArray.
  878. can_view = type(arr) in (np.ndarray, np.recarray, np.memmap)
  879. if (not copy) and can_view and all(dt.base == out_dtype for dt in dts):
  880. # all elements have the right dtype already; if they have a common
  881. # stride, we can just return a view
  882. common_stride = _common_stride(offsets, counts, out_dtype.itemsize)
  883. if common_stride is not None:
  884. wrap = arr.__array_wrap__
  885. new_shape = arr.shape + (sum(counts), out_dtype.itemsize)
  886. new_strides = arr.strides + (abs(common_stride), 1)
  887. arr = arr[..., np.newaxis].view(np.uint8) # view as bytes
  888. arr = arr[..., min(offsets):] # remove the leading unused data
  889. arr = np.lib.stride_tricks.as_strided(arr,
  890. new_shape,
  891. new_strides,
  892. subok=True)
  893. # cast and drop the last dimension again
  894. arr = arr.view(out_dtype)[..., 0]
  895. if common_stride < 0:
  896. arr = arr[..., ::-1] # reverse, if the stride was negative
  897. if type(arr) is not type(wrap.__self__):
  898. # Some types (e.g. recarray) turn into an ndarray along the
  899. # way, so we have to wrap it again in order to match the
  900. # behavior with copy=True.
  901. arr = wrap(arr)
  902. return arr
  903. # next cast to a packed format with all fields converted to new dtype
  904. packed_fields = np.dtype({'names': names,
  905. 'formats': [(out_dtype, dt.shape) for dt in dts]})
  906. arr = arr.astype(packed_fields, copy=copy, casting=casting)
  907. # finally is it safe to view the packed fields as the unstructured type
  908. return arr.view((out_dtype, (sum(counts),)))
  909. def _unstructured_to_structured_dispatcher(arr, dtype=None, names=None,
  910. align=None, copy=None, casting=None):
  911. return (arr,)
  912. @array_function_dispatch(_unstructured_to_structured_dispatcher)
  913. def unstructured_to_structured(arr, dtype=None, names=None, align=False,
  914. copy=False, casting='unsafe'):
  915. """
  916. Converts an n-D unstructured array into an (n-1)-D structured array.
  917. The last dimension of the input array is converted into a structure, with
  918. number of field-elements equal to the size of the last dimension of the
  919. input array. By default all output fields have the input array's dtype, but
  920. an output structured dtype with an equal number of fields-elements can be
  921. supplied instead.
  922. Nested fields, as well as each element of any subarray fields, all count
  923. towards the number of field-elements.
  924. Parameters
  925. ----------
  926. arr : ndarray
  927. Unstructured array or dtype to convert.
  928. dtype : dtype, optional
  929. The structured dtype of the output array
  930. names : list of strings, optional
  931. If dtype is not supplied, this specifies the field names for the output
  932. dtype, in order. The field dtypes will be the same as the input array.
  933. align : boolean, optional
  934. Whether to create an aligned memory layout.
  935. copy : bool, optional
  936. See copy argument to `numpy.ndarray.astype`. If true, always return a
  937. copy. If false, and `dtype` requirements are satisfied, a view is
  938. returned.
  939. casting : {'no', 'equiv', 'safe', 'same_kind', 'unsafe'}, optional
  940. See casting argument of `numpy.ndarray.astype`. Controls what kind of
  941. data casting may occur.
  942. Returns
  943. -------
  944. structured : ndarray
  945. Structured array with fewer dimensions.
  946. Examples
  947. --------
  948. >>> import numpy as np
  949. >>> from numpy.lib import recfunctions as rfn
  950. >>> dt = np.dtype([('a', 'i4'), ('b', 'f4,u2'), ('c', 'f4', 2)])
  951. >>> a = np.arange(20).reshape((4,5))
  952. >>> a
  953. array([[ 0, 1, 2, 3, 4],
  954. [ 5, 6, 7, 8, 9],
  955. [10, 11, 12, 13, 14],
  956. [15, 16, 17, 18, 19]])
  957. >>> rfn.unstructured_to_structured(a, dt)
  958. array([( 0, ( 1., 2), [ 3., 4.]), ( 5, ( 6., 7), [ 8., 9.]),
  959. (10, (11., 12), [13., 14.]), (15, (16., 17), [18., 19.])],
  960. dtype=[('a', '<i4'), ('b', [('f0', '<f4'), ('f1', '<u2')]), ('c', '<f4', (2,))])
  961. """
  962. if arr.shape == ():
  963. raise ValueError('arr must have at least one dimension')
  964. n_elem = arr.shape[-1]
  965. if n_elem == 0:
  966. # too many bugs elsewhere for this to work now
  967. raise NotImplementedError("last axis with size 0 is not supported")
  968. if dtype is None:
  969. if names is None:
  970. names = ['f{}'.format(n) for n in range(n_elem)]
  971. out_dtype = np.dtype([(n, arr.dtype) for n in names], align=align)
  972. fields = _get_fields_and_offsets(out_dtype)
  973. dts, counts, offsets = zip(*fields)
  974. else:
  975. if names is not None:
  976. raise ValueError("don't supply both dtype and names")
  977. # if dtype is the args of np.dtype, construct it
  978. dtype = np.dtype(dtype)
  979. # sanity check of the input dtype
  980. fields = _get_fields_and_offsets(dtype)
  981. if len(fields) == 0:
  982. dts, counts, offsets = [], [], []
  983. else:
  984. dts, counts, offsets = zip(*fields)
  985. if n_elem != sum(counts):
  986. raise ValueError('The length of the last dimension of arr must '
  987. 'be equal to the number of fields in dtype')
  988. out_dtype = dtype
  989. if align and not out_dtype.isalignedstruct:
  990. raise ValueError("align was True but dtype is not aligned")
  991. names = ['f{}'.format(n) for n in range(len(fields))]
  992. # Use a series of views and casts to convert to a structured array:
  993. # first view as a packed structured array of one dtype
  994. packed_fields = np.dtype({'names': names,
  995. 'formats': [(arr.dtype, dt.shape) for dt in dts]})
  996. arr = np.ascontiguousarray(arr).view(packed_fields)
  997. # next cast to an unpacked but flattened format with varied dtypes
  998. flattened_fields = np.dtype({'names': names,
  999. 'formats': dts,
  1000. 'offsets': offsets,
  1001. 'itemsize': out_dtype.itemsize})
  1002. arr = arr.astype(flattened_fields, copy=copy, casting=casting)
  1003. # finally view as the final nested dtype and remove the last axis
  1004. return arr.view(out_dtype)[..., 0]
  1005. def _apply_along_fields_dispatcher(func, arr):
  1006. return (arr,)
  1007. @array_function_dispatch(_apply_along_fields_dispatcher)
  1008. def apply_along_fields(func, arr):
  1009. """
  1010. Apply function 'func' as a reduction across fields of a structured array.
  1011. This is similar to `numpy.apply_along_axis`, but treats the fields of a
  1012. structured array as an extra axis. The fields are all first cast to a
  1013. common type following the type-promotion rules from `numpy.result_type`
  1014. applied to the field's dtypes.
  1015. Parameters
  1016. ----------
  1017. func : function
  1018. Function to apply on the "field" dimension. This function must
  1019. support an `axis` argument, like `numpy.mean`, `numpy.sum`, etc.
  1020. arr : ndarray
  1021. Structured array for which to apply func.
  1022. Returns
  1023. -------
  1024. out : ndarray
  1025. Result of the reduction operation
  1026. Examples
  1027. --------
  1028. >>> import numpy as np
  1029. >>> from numpy.lib import recfunctions as rfn
  1030. >>> b = np.array([(1, 2, 5), (4, 5, 7), (7, 8 ,11), (10, 11, 12)],
  1031. ... dtype=[('x', 'i4'), ('y', 'f4'), ('z', 'f8')])
  1032. >>> rfn.apply_along_fields(np.mean, b)
  1033. array([ 2.66666667, 5.33333333, 8.66666667, 11. ])
  1034. >>> rfn.apply_along_fields(np.mean, b[['x', 'z']])
  1035. array([ 3. , 5.5, 9. , 11. ])
  1036. """
  1037. if arr.dtype.names is None:
  1038. raise ValueError('arr must be a structured array')
  1039. uarr = structured_to_unstructured(arr)
  1040. return func(uarr, axis=-1)
  1041. # works and avoids axis requirement, but very, very slow:
  1042. #return np.apply_along_axis(func, -1, uarr)
  1043. def _assign_fields_by_name_dispatcher(dst, src, zero_unassigned=None):
  1044. return dst, src
  1045. @array_function_dispatch(_assign_fields_by_name_dispatcher)
  1046. def assign_fields_by_name(dst, src, zero_unassigned=True):
  1047. """
  1048. Assigns values from one structured array to another by field name.
  1049. Normally in numpy >= 1.14, assignment of one structured array to another
  1050. copies fields "by position", meaning that the first field from the src is
  1051. copied to the first field of the dst, and so on, regardless of field name.
  1052. This function instead copies "by field name", such that fields in the dst
  1053. are assigned from the identically named field in the src. This applies
  1054. recursively for nested structures. This is how structure assignment worked
  1055. in numpy >= 1.6 to <= 1.13.
  1056. Parameters
  1057. ----------
  1058. dst : ndarray
  1059. src : ndarray
  1060. The source and destination arrays during assignment.
  1061. zero_unassigned : bool, optional
  1062. If True, fields in the dst for which there was no matching
  1063. field in the src are filled with the value 0 (zero). This
  1064. was the behavior of numpy <= 1.13. If False, those fields
  1065. are not modified.
  1066. """
  1067. if dst.dtype.names is None:
  1068. dst[...] = src
  1069. return
  1070. for name in dst.dtype.names:
  1071. if name not in src.dtype.names:
  1072. if zero_unassigned:
  1073. dst[name] = 0
  1074. else:
  1075. assign_fields_by_name(dst[name], src[name],
  1076. zero_unassigned)
  1077. def _require_fields_dispatcher(array, required_dtype):
  1078. return (array,)
  1079. @array_function_dispatch(_require_fields_dispatcher)
  1080. def require_fields(array, required_dtype):
  1081. """
  1082. Casts a structured array to a new dtype using assignment by field-name.
  1083. This function assigns from the old to the new array by name, so the
  1084. value of a field in the output array is the value of the field with the
  1085. same name in the source array. This has the effect of creating a new
  1086. ndarray containing only the fields "required" by the required_dtype.
  1087. If a field name in the required_dtype does not exist in the
  1088. input array, that field is created and set to 0 in the output array.
  1089. Parameters
  1090. ----------
  1091. a : ndarray
  1092. array to cast
  1093. required_dtype : dtype
  1094. datatype for output array
  1095. Returns
  1096. -------
  1097. out : ndarray
  1098. array with the new dtype, with field values copied from the fields in
  1099. the input array with the same name
  1100. Examples
  1101. --------
  1102. >>> import numpy as np
  1103. >>> from numpy.lib import recfunctions as rfn
  1104. >>> a = np.ones(4, dtype=[('a', 'i4'), ('b', 'f8'), ('c', 'u1')])
  1105. >>> rfn.require_fields(a, [('b', 'f4'), ('c', 'u1')])
  1106. array([(1., 1), (1., 1), (1., 1), (1., 1)],
  1107. dtype=[('b', '<f4'), ('c', 'u1')])
  1108. >>> rfn.require_fields(a, [('b', 'f4'), ('newf', 'u1')])
  1109. array([(1., 0), (1., 0), (1., 0), (1., 0)],
  1110. dtype=[('b', '<f4'), ('newf', 'u1')])
  1111. """
  1112. out = np.empty(array.shape, dtype=required_dtype)
  1113. assign_fields_by_name(out, array)
  1114. return out
  1115. def _stack_arrays_dispatcher(arrays, defaults=None, usemask=None,
  1116. asrecarray=None, autoconvert=None):
  1117. return arrays
  1118. @array_function_dispatch(_stack_arrays_dispatcher)
  1119. def stack_arrays(arrays, defaults=None, usemask=True, asrecarray=False,
  1120. autoconvert=False):
  1121. """
  1122. Superposes arrays fields by fields
  1123. Parameters
  1124. ----------
  1125. arrays : array or sequence
  1126. Sequence of input arrays.
  1127. defaults : dictionary, optional
  1128. Dictionary mapping field names to the corresponding default values.
  1129. usemask : {True, False}, optional
  1130. Whether to return a MaskedArray (or MaskedRecords is
  1131. `asrecarray==True`) or a ndarray.
  1132. asrecarray : {False, True}, optional
  1133. Whether to return a recarray (or MaskedRecords if `usemask==True`)
  1134. or just a flexible-type ndarray.
  1135. autoconvert : {False, True}, optional
  1136. Whether automatically cast the type of the field to the maximum.
  1137. Examples
  1138. --------
  1139. >>> import numpy as np
  1140. >>> from numpy.lib import recfunctions as rfn
  1141. >>> x = np.array([1, 2,])
  1142. >>> rfn.stack_arrays(x) is x
  1143. True
  1144. >>> z = np.array([('A', 1), ('B', 2)], dtype=[('A', '|S3'), ('B', float)])
  1145. >>> zz = np.array([('a', 10., 100.), ('b', 20., 200.), ('c', 30., 300.)],
  1146. ... dtype=[('A', '|S3'), ('B', np.double), ('C', np.double)])
  1147. >>> test = rfn.stack_arrays((z,zz))
  1148. >>> test
  1149. masked_array(data=[(b'A', 1.0, --), (b'B', 2.0, --), (b'a', 10.0, 100.0),
  1150. (b'b', 20.0, 200.0), (b'c', 30.0, 300.0)],
  1151. mask=[(False, False, True), (False, False, True),
  1152. (False, False, False), (False, False, False),
  1153. (False, False, False)],
  1154. fill_value=(b'N/A', 1e+20, 1e+20),
  1155. dtype=[('A', 'S3'), ('B', '<f8'), ('C', '<f8')])
  1156. """
  1157. if isinstance(arrays, np.ndarray):
  1158. return arrays
  1159. elif len(arrays) == 1:
  1160. return arrays[0]
  1161. seqarrays = [np.asanyarray(a).ravel() for a in arrays]
  1162. nrecords = [len(a) for a in seqarrays]
  1163. ndtype = [a.dtype for a in seqarrays]
  1164. fldnames = [d.names for d in ndtype]
  1165. #
  1166. dtype_l = ndtype[0]
  1167. newdescr = _get_fieldspec(dtype_l)
  1168. names = [n for n, d in newdescr]
  1169. for dtype_n in ndtype[1:]:
  1170. for fname, fdtype in _get_fieldspec(dtype_n):
  1171. if fname not in names:
  1172. newdescr.append((fname, fdtype))
  1173. names.append(fname)
  1174. else:
  1175. nameidx = names.index(fname)
  1176. _, cdtype = newdescr[nameidx]
  1177. if autoconvert:
  1178. newdescr[nameidx] = (fname, max(fdtype, cdtype))
  1179. elif fdtype != cdtype:
  1180. raise TypeError("Incompatible type '%s' <> '%s'" %
  1181. (cdtype, fdtype))
  1182. # Only one field: use concatenate
  1183. if len(newdescr) == 1:
  1184. output = ma.concatenate(seqarrays)
  1185. else:
  1186. #
  1187. output = ma.masked_all((np.sum(nrecords),), newdescr)
  1188. offset = np.cumsum(np.r_[0, nrecords])
  1189. seen = []
  1190. for (a, n, i, j) in zip(seqarrays, fldnames, offset[:-1], offset[1:]):
  1191. names = a.dtype.names
  1192. if names is None:
  1193. output['f%i' % len(seen)][i:j] = a
  1194. else:
  1195. for name in n:
  1196. output[name][i:j] = a[name]
  1197. if name not in seen:
  1198. seen.append(name)
  1199. #
  1200. return _fix_output(_fix_defaults(output, defaults),
  1201. usemask=usemask, asrecarray=asrecarray)
  1202. def _find_duplicates_dispatcher(
  1203. a, key=None, ignoremask=None, return_index=None):
  1204. return (a,)
  1205. @array_function_dispatch(_find_duplicates_dispatcher)
  1206. def find_duplicates(a, key=None, ignoremask=True, return_index=False):
  1207. """
  1208. Find the duplicates in a structured array along a given key
  1209. Parameters
  1210. ----------
  1211. a : array-like
  1212. Input array
  1213. key : {string, None}, optional
  1214. Name of the fields along which to check the duplicates.
  1215. If None, the search is performed by records
  1216. ignoremask : {True, False}, optional
  1217. Whether masked data should be discarded or considered as duplicates.
  1218. return_index : {False, True}, optional
  1219. Whether to return the indices of the duplicated values.
  1220. Examples
  1221. --------
  1222. >>> import numpy as np
  1223. >>> from numpy.lib import recfunctions as rfn
  1224. >>> ndtype = [('a', int)]
  1225. >>> a = np.ma.array([1, 1, 1, 2, 2, 3, 3],
  1226. ... mask=[0, 0, 1, 0, 0, 0, 1]).view(ndtype)
  1227. >>> rfn.find_duplicates(a, ignoremask=True, return_index=True)
  1228. (masked_array(data=[(1,), (1,), (2,), (2,)],
  1229. mask=[(False,), (False,), (False,), (False,)],
  1230. fill_value=(999999,),
  1231. dtype=[('a', '<i8')]), array([0, 1, 3, 4]))
  1232. """
  1233. a = np.asanyarray(a).ravel()
  1234. # Get a dictionary of fields
  1235. fields = get_fieldstructure(a.dtype)
  1236. # Get the sorting data (by selecting the corresponding field)
  1237. base = a
  1238. if key:
  1239. for f in fields[key]:
  1240. base = base[f]
  1241. base = base[key]
  1242. # Get the sorting indices and the sorted data
  1243. sortidx = base.argsort()
  1244. sortedbase = base[sortidx]
  1245. sorteddata = sortedbase.filled()
  1246. # Compare the sorting data
  1247. flag = (sorteddata[:-1] == sorteddata[1:])
  1248. # If masked data must be ignored, set the flag to false where needed
  1249. if ignoremask:
  1250. sortedmask = sortedbase.recordmask
  1251. flag[sortedmask[1:]] = False
  1252. flag = np.concatenate(([False], flag))
  1253. # We need to take the point on the left as well (else we're missing it)
  1254. flag[:-1] = flag[:-1] + flag[1:]
  1255. duplicates = a[sortidx][flag]
  1256. if return_index:
  1257. return (duplicates, sortidx[flag])
  1258. else:
  1259. return duplicates
  1260. def _join_by_dispatcher(
  1261. key, r1, r2, jointype=None, r1postfix=None, r2postfix=None,
  1262. defaults=None, usemask=None, asrecarray=None):
  1263. return (r1, r2)
@array_function_dispatch(_join_by_dispatcher)
def join_by(key, r1, r2, jointype='inner', r1postfix='1', r2postfix='2',
            defaults=None, usemask=True, asrecarray=False):
    """
    Join arrays `r1` and `r2` on key `key`.

    The key should be either a string or a sequence of strings corresponding
    to the fields used to join the arrays. An exception is raised if the
    `key` field cannot be found in the two input arrays. Neither `r1` nor
    `r2` should have any duplicates along `key`: the presence of duplicates
    will make the output quite unreliable. Note that duplicates are not
    looked for by the algorithm.

    Parameters
    ----------
    key : {string, sequence}
        A string or a sequence of strings corresponding to the fields used
        for comparison.
    r1, r2 : arrays
        Structured arrays.
    jointype : {'inner', 'outer', 'leftouter'}, optional
        If 'inner', returns the elements common to both r1 and r2.
        If 'outer', returns the common elements as well as the elements of
        r1 not in r2 and the elements of r2 not in r1.
        If 'leftouter', returns the common elements and the elements of r1
        not in r2.
    r1postfix : string, optional
        String appended to the names of the fields of r1 that are present
        in r2 but are not part of the key.
    r2postfix : string, optional
        String appended to the names of the fields of r2 that are present
        in r1 but are not part of the key.
    defaults : {dictionary}, optional
        Dictionary mapping field names to the corresponding default values.
    usemask : {True, False}, optional
        Whether to return a MaskedArray (or MaskedRecords is
        `asrecarray==True`) or a ndarray.
    asrecarray : {False, True}, optional
        Whether to return a recarray (or MaskedRecords if `usemask==True`)
        or just a flexible-type ndarray.

    Raises
    ------
    ValueError
        If `jointype` is not one of the supported values, if `key` contains
        a duplicate, if a key field is missing from `r1` or `r2`, or if
        `r1` and `r2` share non-key field names while both postfixes are
        empty.

    Notes
    -----
    * The output is sorted along the key.
    * A temporary array is formed by dropping the fields not in the key for
      the two arrays and concatenating the result. This array is then
      sorted, and the common entries selected. The output is constructed by
      filling the fields with the selected entries. Matching is not
      preserved if there are some duplicates...

    """
    # Check jointype
    if jointype not in ('inner', 'outer', 'leftouter'):
        raise ValueError(
                "The 'jointype' argument should be in 'inner', "
                "'outer' or 'leftouter' (got '%s' instead)" % jointype
                )
    # If we have a single key, put it in a tuple
    if isinstance(key, str):
        key = (key,)

    # Check the keys: no duplicates, and every key present in both inputs
    if len(set(key)) != len(key):
        # report the first key that shows up again later in the sequence
        dup = next(x for n,x in enumerate(key) if x in key[n+1:])
        raise ValueError("duplicate join key %r" % dup)
    for name in key:
        if name not in r1.dtype.names:
            raise ValueError('r1 does not have key field %r' % name)
        if name not in r2.dtype.names:
            raise ValueError('r2 does not have key field %r' % name)

    # Make sure we work with ravelled arrays
    r1 = r1.ravel()
    r2 = r2.ravel()
    # Fixme: nb2 below is never used. Commenting out for pyflakes.
    # (nb1, nb2) = (len(r1), len(r2))
    # nb1 separates the two inputs in the concatenated index space below:
    # indices < nb1 refer to r1, indices >= nb1 refer to r2 (shifted by nb1).
    nb1 = len(r1)
    (r1names, r2names) = (r1.dtype.names, r2.dtype.names)

    # Check the names for collision: non-key names shared by both inputs
    # need at least one postfix to be told apart in the output
    collisions = (set(r1names) & set(r2names)) - set(key)
    if collisions and not (r1postfix or r2postfix):
        msg = "r1 and r2 contain common names, r1postfix and r2postfix "
        msg += "can't both be empty"
        raise ValueError(msg)

    # Make temporary arrays of just the keys
    #  (use order of keys in `r1` for back-compatibility)
    key1 = [ n for n in r1names if n in key ]
    r1k = _keep_fields(r1, key1)
    r2k = _keep_fields(r2, key1)

    # Concatenate the two arrays for comparison
    aux = ma.concatenate((r1k, r2k))
    idx_sort = aux.argsort(order=key)
    aux = aux[idx_sort]
    #
    # Get the common keys: after sorting, a key present in both inputs shows
    # up as two equal neighbours. Flag the right-hand entry of each pair ...
    flag_in = ma.concatenate(([False], aux[1:] == aux[:-1]))
    # ... and the left-hand one as well
    flag_in[:-1] = flag_in[1:] + flag_in[:-1]
    idx_in = idx_sort[flag_in]
    idx_1 = idx_in[(idx_in < nb1)]
    idx_2 = idx_in[(idx_in >= nb1)] - nb1
    # r?cmn: number of common entries; r?spc: number of entries specific to
    # one input that the join type keeps
    (r1cmn, r2cmn) = (len(idx_1), len(idx_2))
    if jointype == 'inner':
        (r1spc, r2spc) = (0, 0)
    elif jointype == 'outer':
        idx_out = idx_sort[~flag_in]
        idx_1 = np.concatenate((idx_1, idx_out[(idx_out < nb1)]))
        idx_2 = np.concatenate((idx_2, idx_out[(idx_out >= nb1)] - nb1))
        (r1spc, r2spc) = (len(idx_1) - r1cmn, len(idx_2) - r2cmn)
    elif jointype == 'leftouter':
        idx_out = idx_sort[~flag_in]
        idx_1 = np.concatenate((idx_1, idx_out[(idx_out < nb1)]))
        (r1spc, r2spc) = (len(idx_1) - r1cmn, 0)
    # Select the entries from each input
    (s1, s2) = (r1[idx_1], r2[idx_2])
    #
    # Build the new description of the output array .......
    # Start with the key fields
    ndtype = _get_fieldspec(r1k.dtype)

    # Add the fields from r1
    for fname, fdtype in _get_fieldspec(r1.dtype):
        if fname not in key:
            ndtype.append((fname, fdtype))

    # Add the fields from r2
    for fname, fdtype in _get_fieldspec(r2.dtype):
        # Have we seen the current name already ?
        # we need to rebuild this list every time
        # (ndtype may have been modified by the previous iteration)
        names = list(name for name, dtype in ndtype)
        try:
            nameidx = names.index(fname)
        except ValueError:
            # ... we haven't: just add the description to the current list
            ndtype.append((fname, fdtype))
        else:
            # collision
            _, cdtype = ndtype[nameidx]
            if fname in key:
                # The current field is part of the key: take the largest dtype
                ndtype[nameidx] = (fname, max(fdtype, cdtype))
            else:
                # The current field is not part of the key: add the suffixes,
                # and place the new field adjacent to the old one
                ndtype[nameidx:nameidx + 1] = [
                    (fname + r1postfix, cdtype),
                    (fname + r2postfix, fdtype)
                ]
    # Rebuild a dtype from the new fields
    ndtype = np.dtype(ndtype)
    # Find the largest nb of common fields :
    # r1cmn and r2cmn should be equal, but...
    cmn = max(r1cmn, r2cmn)
    # Construct an empty (fully masked) array. Row layout:
    # [common entries | entries only in r1 | entries only in r2]
    output = ma.masked_all((cmn + r1spc + r2spc,), dtype=ndtype)
    names = output.dtype.names
    for f in r1names:
        selected = s1[f]
        # Use the postfixed name if the field was renamed in the output
        if f not in names or (f in r2names and not r2postfix and f not in key):
            f += r1postfix
        current = output[f]
        current[:r1cmn] = selected[:r1cmn]
        if jointype in ('outer', 'leftouter'):
            current[cmn:cmn + r1spc] = selected[r1cmn:]
    for f in r2names:
        selected = s2[f]
        # Use the postfixed name if the field was renamed in the output
        if f not in names or (f in r1names and not r1postfix and f not in key):
            f += r2postfix
        current = output[f]
        current[:r2cmn] = selected[:r2cmn]
        if (jointype == 'outer') and r2spc:
            # entries only in r2 occupy the last r2spc rows
            current[-r2spc:] = selected[r2cmn:]
    # Sort and finalize the output
    output.sort(order=key)
    kwargs = dict(usemask=usemask, asrecarray=asrecarray)
    return _fix_output(_fix_defaults(output, defaults), **kwargs)
  1431. def _rec_join_dispatcher(
  1432. key, r1, r2, jointype=None, r1postfix=None, r2postfix=None,
  1433. defaults=None):
  1434. return (r1, r2)
  1435. @array_function_dispatch(_rec_join_dispatcher)
  1436. def rec_join(key, r1, r2, jointype='inner', r1postfix='1', r2postfix='2',
  1437. defaults=None):
  1438. """
  1439. Join arrays `r1` and `r2` on keys.
  1440. Alternative to join_by, that always returns a np.recarray.
  1441. See Also
  1442. --------
  1443. join_by : equivalent function
  1444. """
  1445. kwargs = dict(jointype=jointype, r1postfix=r1postfix, r2postfix=r2postfix,
  1446. defaults=defaults, usemask=False, asrecarray=True)
  1447. return join_by(key, r1, r2, **kwargs)
# The decorator is only needed while the functions above are being defined;
# remove the name from the module's globals afterwards.
del array_function_dispatch