pthreadpool.h 97 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560
  1. #if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
  2. #ifndef PTHREADPOOL_H_
  3. #define PTHREADPOOL_H_
  4. #include <stddef.h>
  5. #include <stdint.h>
  6. typedef struct pthreadpool* pthreadpool_t;
  7. typedef void (*pthreadpool_task_1d_t)(void*, size_t);
  8. typedef void (*pthreadpool_task_1d_with_thread_t)(void*, size_t, size_t);
  9. typedef void (*pthreadpool_task_1d_tile_1d_t)(void*, size_t, size_t);
  10. typedef void (*pthreadpool_task_2d_t)(void*, size_t, size_t);
  11. typedef void (*pthreadpool_task_2d_with_thread_t)(void*, size_t, size_t, size_t);
  12. typedef void (*pthreadpool_task_2d_tile_1d_t)(void*, size_t, size_t, size_t);
  13. typedef void (*pthreadpool_task_2d_tile_2d_t)(void*, size_t, size_t, size_t, size_t);
  14. typedef void (*pthreadpool_task_3d_t)(void*, size_t, size_t, size_t);
  15. typedef void (*pthreadpool_task_3d_tile_1d_t)(void*, size_t, size_t, size_t, size_t);
  16. typedef void (*pthreadpool_task_3d_tile_1d_with_thread_t)(void*, size_t, size_t, size_t, size_t, size_t);
  17. typedef void (*pthreadpool_task_3d_tile_2d_t)(void*, size_t, size_t, size_t, size_t, size_t);
  18. typedef void (*pthreadpool_task_4d_t)(void*, size_t, size_t, size_t, size_t);
  19. typedef void (*pthreadpool_task_4d_tile_1d_t)(void*, size_t, size_t, size_t, size_t, size_t);
  20. typedef void (*pthreadpool_task_4d_tile_2d_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t);
  21. typedef void (*pthreadpool_task_5d_t)(void*, size_t, size_t, size_t, size_t, size_t);
  22. typedef void (*pthreadpool_task_5d_tile_1d_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t);
  23. typedef void (*pthreadpool_task_5d_tile_2d_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t, size_t);
  24. typedef void (*pthreadpool_task_6d_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t);
  25. typedef void (*pthreadpool_task_6d_tile_1d_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t, size_t);
  26. typedef void (*pthreadpool_task_6d_tile_2d_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t, size_t, size_t);
  27. typedef void (*pthreadpool_task_1d_with_id_t)(void*, uint32_t, size_t);
  28. typedef void (*pthreadpool_task_2d_tile_1d_with_id_t)(void*, uint32_t, size_t, size_t, size_t);
  29. typedef void (*pthreadpool_task_2d_tile_2d_with_id_t)(void*, uint32_t, size_t, size_t, size_t, size_t);
  30. typedef void (*pthreadpool_task_3d_tile_1d_with_id_t)(void*, uint32_t, size_t, size_t, size_t, size_t);
  31. typedef void (*pthreadpool_task_3d_tile_2d_with_id_t)(void*, uint32_t, size_t, size_t, size_t, size_t, size_t);
  32. typedef void (*pthreadpool_task_4d_tile_2d_with_id_t)(void*, uint32_t, size_t, size_t, size_t, size_t, size_t, size_t);
  33. typedef void (*pthreadpool_task_2d_tile_1d_with_id_with_thread_t)(void*, uint32_t, size_t, size_t, size_t, size_t);
  34. typedef void (*pthreadpool_task_3d_tile_1d_with_id_with_thread_t)(void*, uint32_t, size_t, size_t, size_t, size_t, size_t);
  35. /**
  36. * Disable support for denormalized numbers to the maximum extent possible for
  37. * the duration of the computation.
  38. *
  39. * Handling denormalized floating-point numbers is often implemented in
  40. * microcode, and incurs significant performance degradation. This hint
  41. * instructs the thread pool to disable support for denormalized numbers before
  42. * running the computation by manipulating architecture-specific control
  43. * registers, and restore the initial value of control registers after the
  44. * computation is complete. The thread pool temporarily disables denormalized
  45. * numbers on all threads involved in the computation (i.e. the caller threads,
  46. * and potentially worker threads).
  47. *
  48. * Disabling denormalized numbers may have a small negative effect on results'
  49. * accuracy. As various architectures differ in capabilities to control
  50. * processing of denormalized numbers, using this flag may also hurt results'
  51. * reproducibility across different instruction set architectures.
  52. */
  53. #define PTHREADPOOL_FLAG_DISABLE_DENORMALS 0x00000001
  54. /**
  55. * Yield worker threads to the system scheduler after the operation is finished.
  56. *
  57. * Force workers to use kernel wait (instead of active spin-wait by default) for
  58. * new commands after this command is processed. This flag affects only the
  59. * immediate next operation on this thread pool. To make the thread pool always
  60. * use kernel wait, pass this flag to all parallelization functions.
  61. */
  62. #define PTHREADPOOL_FLAG_YIELD_WORKERS 0x00000002
  63. #ifdef __cplusplus
  64. extern "C" {
  65. #endif
  66. /**
  67. * Create a thread pool with the specified number of threads.
  68. *
  69. * @param threads_count the number of threads in the thread pool.
  70. * A value of 0 has special interpretation: it creates a thread pool with as
  71. * many threads as there are logical processors in the system.
  72. *
  73. * @returns A pointer to an opaque thread pool object if the call is
  74. * successful, or NULL pointer if the call failed.
  75. */
  76. pthreadpool_t pthreadpool_create(size_t threads_count);
  77. /**
  78. * Query the number of threads in a thread pool.
  79. *
  80. * @param threadpool the thread pool to query.
  81. *
  82. * @returns The number of threads in the thread pool.
  83. */
  84. size_t pthreadpool_get_threads_count(pthreadpool_t threadpool);
  85. /**
  86. * Process items on a 1D grid.
  87. *
  88. * The function implements a parallel version of the following snippet:
  89. *
  90. * for (size_t i = 0; i < range; i++)
  91. * function(context, i);
  92. *
  93. * When the function returns, all items have been processed and the thread pool
  94. * is ready for a new task.
  95. *
  96. * @note If multiple threads call this function with the same thread pool, the
  97. * calls are serialized.
  98. *
  99. * @param threadpool the thread pool to use for parallelisation. If threadpool
  100. * is NULL, all items are processed serially on the calling thread.
  101. * @param function the function to call for each item.
  102. * @param context the first argument passed to the specified function.
  103. * @param range the number of items on the 1D grid to process. The
  104. * specified function will be called once for each item.
  105. * @param flags a bitwise combination of zero or more optional flags
  106. * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
  107. */
  108. void pthreadpool_parallelize_1d(
  109. pthreadpool_t threadpool,
  110. pthreadpool_task_1d_t function,
  111. void* context,
  112. size_t range,
  113. uint32_t flags);
  114. /**
  115. * Process items on a 1D grid passing along the current thread id.
  116. *
  117. * The function implements a parallel version of the following snippet:
  118. *
  119. * for (size_t i = 0; i < range; i++)
  120. * function(context, thread_index, i);
  121. *
  122. * When the function returns, all items have been processed and the thread pool
  123. * is ready for a new task.
  124. *
  125. * @note If multiple threads call this function with the same thread pool, the
  126. * calls are serialized.
  127. *
  128. * @param threadpool the thread pool to use for parallelisation. If threadpool
  129. * is NULL, all items are processed serially on the calling thread.
  130. * @param function the function to call for each item.
  131. * @param context the first argument passed to the specified function.
  132. * @param range the number of items on the 1D grid to process. The
  133. * specified function will be called once for each item.
  134. * @param flags a bitwise combination of zero or more optional flags
  135. * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
  136. */
  137. void pthreadpool_parallelize_1d_with_thread(
  138. pthreadpool_t threadpool,
  139. pthreadpool_task_1d_with_thread_t function,
  140. void* context,
  141. size_t range,
  142. uint32_t flags);
  143. /**
  144. * Process items on a 1D grid using a microarchitecture-aware task function.
  145. *
  146. * The function implements a parallel version of the following snippet:
  147. *
  148. * uint32_t uarch_index = cpuinfo_initialize() ?
  149. * cpuinfo_get_current_uarch_index() : default_uarch_index;
  150. * if (uarch_index > max_uarch_index) uarch_index = default_uarch_index;
  151. * for (size_t i = 0; i < range; i++)
  152. * function(context, uarch_index, i);
  153. *
  154. * When the function returns, all items have been processed and the thread pool
  155. * is ready for a new task.
  156. *
  157. * @note If multiple threads call this function with the same thread pool, the
  158. * calls are serialized.
  159. *
  160. * @param threadpool the thread pool to use for parallelisation. If
  161. * threadpool is NULL, all items are processed serially on the calling
  162. * thread.
  163. * @param function the function to call for each item.
  164. * @param context the first argument passed to the specified
  165. * function.
  166. * @param default_uarch_index the microarchitecture index to use when
  167. * pthreadpool is configured without cpuinfo, cpuinfo initialization failed,
  168. * or index returned by cpuinfo_get_current_uarch_index() exceeds the
  169. * max_uarch_index value.
  170. * @param max_uarch_index the maximum microarchitecture index expected by
  171. * the specified function. If the index returned by
  172. * cpuinfo_get_current_uarch_index() exceeds this value, default_uarch_index
  173. * will be used instead. default_uarch_index can exceed max_uarch_index.
  174. * @param range the number of items on the 1D grid to process.
  175. * The specified function will be called once for each item.
  176. * @param flags a bitwise combination of zero or more optional
  177. * flags (PTHREADPOOL_FLAG_DISABLE_DENORMALS or
  178. * PTHREADPOOL_FLAG_YIELD_WORKERS)
  179. */
  180. void pthreadpool_parallelize_1d_with_uarch(
  181. pthreadpool_t threadpool,
  182. pthreadpool_task_1d_with_id_t function,
  183. void* context,
  184. uint32_t default_uarch_index,
  185. uint32_t max_uarch_index,
  186. size_t range,
  187. uint32_t flags);
  188. /**
  189. * Process items on a 1D grid with specified maximum tile size.
  190. *
  191. * The function implements a parallel version of the following snippet:
  192. *
  193. * for (size_t i = 0; i < range; i += tile)
  194. * function(context, i, min(range - i, tile));
  195. *
  196. * When the call returns, all items have been processed and the thread pool is
  197. * ready for a new task.
  198. *
  199. * @note If multiple threads call this function with the same thread pool,
  200. * the calls are serialized.
  201. *
  202. * @param threadpool the thread pool to use for parallelisation. If threadpool
  203. * is NULL, all items are processed serially on the calling thread.
  204. * @param function the function to call for each tile.
  205. * @param context the first argument passed to the specified function.
  206. * @param range the number of items on the 1D grid to process.
  207. * @param tile the maximum number of items on the 1D grid to process in
  208. * one function call.
  209. * @param flags a bitwise combination of zero or more optional flags
  210. * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
  211. */
  212. void pthreadpool_parallelize_1d_tile_1d(
  213. pthreadpool_t threadpool,
  214. pthreadpool_task_1d_tile_1d_t function,
  215. void* context,
  216. size_t range,
  217. size_t tile,
  218. uint32_t flags);
  219. /**
  220. * Process items on a 2D grid.
  221. *
  222. * The function implements a parallel version of the following snippet:
  223. *
  224. * for (size_t i = 0; i < range_i; i++)
  225. * for (size_t j = 0; j < range_j; j++)
  226. * function(context, i, j);
  227. *
  228. * When the function returns, all items have been processed and the thread pool
  229. * is ready for a new task.
  230. *
  231. * @note If multiple threads call this function with the same thread pool, the
  232. * calls are serialized.
  233. *
  234. * @param threadpool the thread pool to use for parallelisation. If threadpool
  235. * is NULL, all items are processed serially on the calling thread.
  236. * @param function the function to call for each item.
  237. * @param context the first argument passed to the specified function.
  238. * @param range_i the number of items to process along the first dimension
  239. * of the 2D grid.
  240. * @param range_j the number of items to process along the second dimension
  241. * of the 2D grid.
  242. * @param flags a bitwise combination of zero or more optional flags
  243. * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
  244. */
  245. void pthreadpool_parallelize_2d(
  246. pthreadpool_t threadpool,
  247. pthreadpool_task_2d_t function,
  248. void* context,
  249. size_t range_i,
  250. size_t range_j,
  251. uint32_t flags);
  252. /**
  253. * Process items on a 2D grid passing along the current thread id.
  254. *
  255. * The function implements a parallel version of the following snippet:
  256. *
  257. * for (size_t i = 0; i < range_i; i++)
  258. * for (size_t j = 0; j < range_j; j++)
  259. * function(context, thread_index, i, j);
  260. *
  261. * When the function returns, all items have been processed and the thread pool
  262. * is ready for a new task.
  263. *
  264. * @note If multiple threads call this function with the same thread pool, the
  265. * calls are serialized.
  266. *
  267. * @param threadpool the thread pool to use for parallelisation. If threadpool
  268. * is NULL, all items are processed serially on the calling thread.
  269. * @param function the function to call for each item.
  270. * @param context the first argument passed to the specified function.
  271. * @param range_i the number of items to process along the first dimension
  272. * of the 2D grid.
  273. * @param range_j the number of items to process along the second dimension
  274. * of the 2D grid.
  275. * @param flags a bitwise combination of zero or more optional flags
  276. * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
  277. */
  278. void pthreadpool_parallelize_2d_with_thread(
  279. pthreadpool_t threadpool,
  280. pthreadpool_task_2d_with_thread_t function,
  281. void* context,
  282. size_t range_i,
  283. size_t range_j,
  284. uint32_t flags);
  285. /**
  286. * Process items on a 2D grid with the specified maximum tile size along the
  287. * last grid dimension.
  288. *
  289. * The function implements a parallel version of the following snippet:
  290. *
  291. * for (size_t i = 0; i < range_i; i++)
  292. * for (size_t j = 0; j < range_j; j += tile_j)
  293. * function(context, i, j, min(range_j - j, tile_j));
  294. *
  295. * When the function returns, all items have been processed and the thread pool
  296. * is ready for a new task.
  297. *
  298. * @note If multiple threads call this function with the same thread pool, the
  299. * calls are serialized.
  300. *
  301. * @param threadpool the thread pool to use for parallelisation. If threadpool
  302. * is NULL, all items are processed serially on the calling thread.
  303. * @param function the function to call for each tile.
  304. * @param context the first argument passed to the specified function.
  305. * @param range_i the number of items to process along the first dimension
  306. * of the 2D grid.
  307. * @param range_j the number of items to process along the second dimension
  308. * of the 2D grid.
  309. * @param tile_j the maximum number of items along the second dimension of
  310. * the 2D grid to process in one function call.
  311. * @param flags a bitwise combination of zero or more optional flags
  312. * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
  313. */
  314. void pthreadpool_parallelize_2d_tile_1d(
  315. pthreadpool_t threadpool,
  316. pthreadpool_task_2d_tile_1d_t function,
  317. void* context,
  318. size_t range_i,
  319. size_t range_j,
  320. size_t tile_j,
  321. uint32_t flags);
  322. /**
  323. * Process items on a 2D grid with the specified maximum tile size along the
  324. * last grid dimension using a microarchitecture-aware task function.
  325. *
  326. * The function implements a parallel version of the following snippet:
  327. *
  328. * uint32_t uarch_index = cpuinfo_initialize() ?
  329. * cpuinfo_get_current_uarch_index() : default_uarch_index;
  330. * if (uarch_index > max_uarch_index) uarch_index = default_uarch_index;
  331. * for (size_t i = 0; i < range_i; i++)
  332. * for (size_t j = 0; j < range_j; j += tile_j)
  333. * function(context, uarch_index, i, j, min(range_j - j, tile_j));
  334. *
  335. * When the function returns, all items have been processed and the thread pool
  336. * is ready for a new task.
  337. *
  338. * @note If multiple threads call this function with the same thread pool, the
  339. * calls are serialized.
  340. *
  341. * @param threadpool the thread pool to use for parallelisation. If threadpool
  342. * is NULL, all items are processed serially on the calling thread.
  343. * @param function the function to call for each tile.
  344. * @param context the first argument passed to the specified function.
  345. * @param default_uarch_index the microarchitecture index to use when
  346. * pthreadpool is configured without cpuinfo, cpuinfo initialization failed,
  347. * or index returned by cpuinfo_get_current_uarch_index() exceeds the
  348. * max_uarch_index value.
  349. * @param max_uarch_index the maximum microarchitecture index expected by
  350. * the specified function. If the index returned by
  351. * cpuinfo_get_current_uarch_index() exceeds this value, default_uarch_index
  352. * will be used instead. default_uarch_index can exceed max_uarch_index.
  353. * @param range_i the number of items to process along the first dimension
  354. * of the 2D grid.
  355. * @param range_j the number of items to process along the second dimension
  356. * of the 2D grid.
  357. * @param tile_j the maximum number of items along the second dimension of
  358. * the 2D grid to process in one function call.
  359. * @param flags a bitwise combination of zero or more optional flags
  360. * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
  361. */
  362. void pthreadpool_parallelize_2d_tile_1d_with_uarch(
  363. pthreadpool_t threadpool,
  364. pthreadpool_task_2d_tile_1d_with_id_t function,
  365. void* context,
  366. uint32_t default_uarch_index,
  367. uint32_t max_uarch_index,
  368. size_t range_i,
  369. size_t range_j,
  370. size_t tile_j,
  371. uint32_t flags);
  372. /**
  373. * Process items on a 2D grid with the specified maximum tile size along the
  374. * last grid dimension using a microarchitecture-aware task function and passing
  375. * along the current thread id.
  376. *
  377. * The function implements a parallel version of the following snippet:
  378. *
  379. * uint32_t uarch_index = cpuinfo_initialize() ?
  380. * cpuinfo_get_current_uarch_index() : default_uarch_index;
  381. * if (uarch_index > max_uarch_index) uarch_index = default_uarch_index;
  382. * for (size_t i = 0; i < range_i; i++)
  383. * for (size_t j = 0; j < range_j; j += tile_j)
  384. * function(context, uarch_index, thread_index, i, j, min(range_j - j, tile_j));
  385. *
  386. * When the function returns, all items have been processed and the thread pool
  387. * is ready for a new task.
  388. *
  389. * @note If multiple threads call this function with the same thread pool, the
  390. * calls are serialized.
  391. *
  392. * @param threadpool the thread pool to use for parallelisation. If threadpool
  393. * is NULL, all items are processed serially on the calling thread.
  394. * @param function the function to call for each tile.
  395. * @param context the first argument passed to the specified function.
  396. * @param default_uarch_index the microarchitecture index to use when
  397. * pthreadpool is configured without cpuinfo, cpuinfo initialization failed,
  398. * or index returned by cpuinfo_get_current_uarch_index() exceeds the
  399. * max_uarch_index value.
  400. * @param max_uarch_index the maximum microarchitecture index expected by
  401. * the specified function. If the index returned by
  402. * cpuinfo_get_current_uarch_index() exceeds this value, default_uarch_index
  403. * will be used instead. default_uarch_index can exceed max_uarch_index.
  404. * @param range_i the number of items to process along the first dimension
  405. * of the 2D grid.
  406. * @param range_j the number of items to process along the second dimension
  407. * of the 2D grid.
  408. * @param tile_j the maximum number of items along the second dimension of
  409. * the 2D grid to process in one function call.
  410. * @param flags a bitwise combination of zero or more optional flags
  411. * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
  412. */
  413. void pthreadpool_parallelize_2d_tile_1d_with_uarch_with_thread(
  414. pthreadpool_t threadpool,
  415. pthreadpool_task_2d_tile_1d_with_id_with_thread_t function,
  416. void* context,
  417. uint32_t default_uarch_index,
  418. uint32_t max_uarch_index,
  419. size_t range_i,
  420. size_t range_j,
  421. size_t tile_j,
  422. uint32_t flags);
  423. /**
  424. * Process items on a 2D grid with the specified maximum tile size along each
  425. * grid dimension.
  426. *
  427. * The function implements a parallel version of the following snippet:
  428. *
  429. * for (size_t i = 0; i < range_i; i += tile_i)
  430. * for (size_t j = 0; j < range_j; j += tile_j)
  431. * function(context, i, j,
  432. * min(range_i - i, tile_i), min(range_j - j, tile_j));
  433. *
  434. * When the function returns, all items have been processed and the thread pool
  435. * is ready for a new task.
  436. *
  437. * @note If multiple threads call this function with the same thread pool, the
  438. * calls are serialized.
  439. *
  440. * @param threadpool the thread pool to use for parallelisation. If threadpool
  441. * is NULL, all items are processed serially on the calling thread.
  442. * @param function the function to call for each tile.
  443. * @param context the first argument passed to the specified function.
  444. * @param range_i the number of items to process along the first dimension
  445. * of the 2D grid.
  446. * @param range_j the number of items to process along the second dimension
  447. * of the 2D grid.
  448. * @param tile_i the maximum number of items along the first dimension of
  449. * the 2D grid to process in one function call.
  450. * @param tile_j the maximum number of items along the second dimension of
  451. * the 2D grid to process in one function call.
  452. * @param flags a bitwise combination of zero or more optional flags
  453. * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
  454. */
  455. void pthreadpool_parallelize_2d_tile_2d(
  456. pthreadpool_t threadpool,
  457. pthreadpool_task_2d_tile_2d_t function,
  458. void* context,
  459. size_t range_i,
  460. size_t range_j,
  461. size_t tile_i,
  462. size_t tile_j,
  463. uint32_t flags);
  464. /**
  465. * Process items on a 2D grid with the specified maximum tile size along each
  466. * grid dimension using a microarchitecture-aware task function.
  467. *
  468. * The function implements a parallel version of the following snippet:
  469. *
  470. * uint32_t uarch_index = cpuinfo_initialize() ?
  471. * cpuinfo_get_current_uarch_index() : default_uarch_index;
  472. * if (uarch_index > max_uarch_index) uarch_index = default_uarch_index;
  473. * for (size_t i = 0; i < range_i; i += tile_i)
  474. * for (size_t j = 0; j < range_j; j += tile_j)
  475. * function(context, uarch_index, i, j,
  476. * min(range_i - i, tile_i), min(range_j - j, tile_j));
  477. *
  478. * When the function returns, all items have been processed and the thread pool
  479. * is ready for a new task.
  480. *
  481. * @note If multiple threads call this function with the same thread pool, the
  482. * calls are serialized.
  483. *
  484. * @param threadpool the thread pool to use for parallelisation. If
  485. * threadpool is NULL, all items are processed serially on the calling
  486. * thread.
  487. * @param function the function to call for each tile.
  488. * @param context the first argument passed to the specified
  489. * function.
  490. * @param default_uarch_index the microarchitecture index to use when
  491. * pthreadpool is configured without cpuinfo,
  492. * cpuinfo initialization failed, or index returned
  493. * by cpuinfo_get_current_uarch_index() exceeds
  494. * the max_uarch_index value.
  495. * @param max_uarch_index the maximum microarchitecture index expected
  496. * by the specified function. If the index returned
  497. * by cpuinfo_get_current_uarch_index() exceeds this
  498. * value, default_uarch_index will be used instead.
  499. * default_uarch_index can exceed max_uarch_index.
  500. * @param range_i the number of items to process along the first
  501. * dimension of the 2D grid.
  502. * @param range_j the number of items to process along the second
  503. * dimension of the 2D grid.
  504. * @param tile_i the maximum number of items along the first
  505. * dimension of the 2D grid to process in one function call.
  506. * @param tile_j the maximum number of items along the second
  507. * dimension of the 2D grid to process in one function call.
  508. * @param flags a bitwise combination of zero or more optional
  509. * flags (PTHREADPOOL_FLAG_DISABLE_DENORMALS or
  510. * PTHREADPOOL_FLAG_YIELD_WORKERS)
  511. */
  512. void pthreadpool_parallelize_2d_tile_2d_with_uarch(
  513. pthreadpool_t threadpool,
  514. pthreadpool_task_2d_tile_2d_with_id_t function,
  515. void* context,
  516. uint32_t default_uarch_index,
  517. uint32_t max_uarch_index,
  518. size_t range_i,
  519. size_t range_j,
  520. size_t tile_i,
  521. size_t tile_j,
  522. uint32_t flags);
  523. /**
  524. * Process items on a 3D grid.
  525. *
  526. * The function implements a parallel version of the following snippet:
  527. *
  528. * for (size_t i = 0; i < range_i; i++)
  529. * for (size_t j = 0; j < range_j; j++)
  530. * for (size_t k = 0; k < range_k; k++)
  531. * function(context, i, j, k);
  532. *
  533. * When the function returns, all items have been processed and the thread pool
  534. * is ready for a new task.
  535. *
  536. * @note If multiple threads call this function with the same thread pool, the
  537. * calls are serialized.
  538. *
  539. * @param threadpool the thread pool to use for parallelisation. If threadpool
  540. * is NULL, all items are processed serially on the calling thread.
  541. * @param function the function to call for each item.
  542. * @param context the first argument passed to the specified function.
  543. * @param range_i the number of items to process along the first dimension
  544. * of the 3D grid.
  545. * @param range_j the number of items to process along the second dimension
  546. * of the 3D grid.
  547. * @param range_k the number of items to process along the third dimension
  548. * of the 3D grid.
  549. * @param flags a bitwise combination of zero or more optional flags
  550. * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
  551. */
  552. void pthreadpool_parallelize_3d(
  553. pthreadpool_t threadpool,
  554. pthreadpool_task_3d_t function,
  555. void* context,
  556. size_t range_i,
  557. size_t range_j,
  558. size_t range_k,
  559. uint32_t flags);
  560. /**
  561. * Process items on a 3D grid with the specified maximum tile size along the
  562. * last grid dimension.
  563. *
  564. * The function implements a parallel version of the following snippet:
  565. *
  566. * for (size_t i = 0; i < range_i; i++)
  567. * for (size_t j = 0; j < range_j; j++)
  568. * for (size_t k = 0; k < range_k; k += tile_k)
  569. * function(context, i, j, k, min(range_k - k, tile_k));
  570. *
  571. * When the function returns, all items have been processed and the thread pool
  572. * is ready for a new task.
  573. *
  574. * @note If multiple threads call this function with the same thread pool, the
  575. * calls are serialized.
  576. *
  577. * @param threadpool the thread pool to use for parallelisation. If threadpool
  578. * is NULL, all items are processed serially on the calling thread.
  579. * @param function the function to call for each tile; it receives context, the tile start indices and the actual tile size.
  580. * @param context the first argument passed to the specified function.
  581. * @param range_i the number of items to process along the first dimension
  582. * of the 3D grid.
  583. * @param range_j the number of items to process along the second dimension
  584. * of the 3D grid.
  585. * @param range_k the number of items to process along the third dimension
  586. * of the 3D grid.
  587. * @param tile_k the maximum number of items along the third dimension of
  588. * the 3D grid to process in one function call.
  589. * @param flags a bitwise combination of zero or more optional flags
  590. * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
  591. */
  592. void pthreadpool_parallelize_3d_tile_1d(
  593. pthreadpool_t threadpool,
  594. pthreadpool_task_3d_tile_1d_t function,
  595. void* context,
  596. size_t range_i,
  597. size_t range_j,
  598. size_t range_k,
  599. size_t tile_k,
  600. uint32_t flags);
  601. /**
  602. * Process items on a 3D grid with the specified maximum tile size along the
  603. * last grid dimension and passing along the current thread id.
  604. *
  605. * The function implements a parallel version of the following snippet:
  606. *
  607. * for (size_t i = 0; i < range_i; i++)
  608. * for (size_t j = 0; j < range_j; j++)
  609. * for (size_t k = 0; k < range_k; k += tile_k)
  610. * function(context, thread_index, i, j, k, min(range_k - k, tile_k));
  611. *
  612. * When the function returns, all items have been processed and the thread pool
  613. * is ready for a new task.
  614. *
  615. * @note If multiple threads call this function with the same thread pool, the
  616. * calls are serialized.
  617. *
  618. * @param threadpool the thread pool to use for parallelisation. If threadpool
  619. * is NULL, all items are processed serially on the calling thread.
  620. * @param function the function to call for each tile; thread_index is the id of the thread executing the call.
  621. * @param context the first argument passed to the specified function.
  622. * @param range_i the number of items to process along the first dimension
  623. * of the 3D grid.
  624. * @param range_j the number of items to process along the second dimension
  625. * of the 3D grid.
  626. * @param range_k the number of items to process along the third dimension
  627. * of the 3D grid.
  628. * @param tile_k the maximum number of items along the third dimension of
  629. * the 3D grid to process in one function call.
  630. * @param flags a bitwise combination of zero or more optional flags
  631. * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
  632. */
  633. void pthreadpool_parallelize_3d_tile_1d_with_thread(
  634. pthreadpool_t threadpool,
  635. pthreadpool_task_3d_tile_1d_with_thread_t function,
  636. void* context,
  637. size_t range_i,
  638. size_t range_j,
  639. size_t range_k,
  640. size_t tile_k,
  641. uint32_t flags);
  642. /**
  643. * Process items on a 3D grid with the specified maximum tile size along the
  644. * last grid dimension using a microarchitecture-aware task function.
  645. *
  646. * The function implements a parallel version of the following snippet:
  647. *
  648. * uint32_t uarch_index = cpuinfo_initialize() ?
  649. * cpuinfo_get_current_uarch_index() : default_uarch_index;
  650. * if (uarch_index > max_uarch_index) uarch_index = default_uarch_index;
  651. * for (size_t i = 0; i < range_i; i++)
  652. * for (size_t j = 0; j < range_j; j++)
  653. * for (size_t k = 0; k < range_k; k += tile_k)
  654. * function(context, uarch_index, i, j, k, min(range_k - k, tile_k));
  655. *
  656. * When the function returns, all items have been processed and the thread pool
  657. * is ready for a new task.
  658. *
  659. * @note If multiple threads call this function with the same thread pool, the
  660. * calls are serialized.
  661. *
  662. * @param threadpool the thread pool to use for parallelisation. If
  663. * threadpool is NULL, all items are processed serially on the calling
  664. * thread.
  665. * @param function the function to call for each tile; it receives uarch_index right after context.
  666. * @param context the first argument passed to the specified
  667. * function.
  668. * @param default_uarch_index the microarchitecture index to use when
  669. * pthreadpool is configured without cpuinfo, cpuinfo initialization failed,
  670. * or index returned by cpuinfo_get_current_uarch_index() exceeds the
  671. * max_uarch_index value.
  672. * @param max_uarch_index the maximum microarchitecture index expected by
  673. * the specified function. If the index returned by
  674. * cpuinfo_get_current_uarch_index() exceeds this value, default_uarch_index
  675. * will be used instead. default_uarch_index can exceed max_uarch_index.
  676. * @param range_i the number of items to process along the first
  677. * dimension of the 3D grid.
  678. * @param range_j the number of items to process along the second
  679. * dimension of the 3D grid.
  680. * @param range_k the number of items to process along the third
  681. * dimension of the 3D grid.
  682. * @param tile_k the maximum number of items along the third
  683. * dimension of the 3D grid to process in one function call.
  684. * @param flags a bitwise combination of zero or more optional
  685. * flags (PTHREADPOOL_FLAG_DISABLE_DENORMALS or
  686. * PTHREADPOOL_FLAG_YIELD_WORKERS)
  687. */
  688. void pthreadpool_parallelize_3d_tile_1d_with_uarch(
  689. pthreadpool_t threadpool,
  690. pthreadpool_task_3d_tile_1d_with_id_t function,
  691. void* context,
  692. uint32_t default_uarch_index,
  693. uint32_t max_uarch_index,
  694. size_t range_i,
  695. size_t range_j,
  696. size_t range_k,
  697. size_t tile_k,
  698. uint32_t flags);
  699. /**
  700. * Process items on a 3D grid with the specified maximum tile size along the
  701. * last grid dimension using a microarchitecture-aware task function and passing
  702. * along the current thread id.
  703. *
  704. * The function implements a parallel version of the following snippet:
  705. *
  706. * uint32_t uarch_index = cpuinfo_initialize() ?
  707. * cpuinfo_get_current_uarch_index() : default_uarch_index;
  708. * if (uarch_index > max_uarch_index) uarch_index = default_uarch_index;
  709. * for (size_t i = 0; i < range_i; i++)
  710. * for (size_t j = 0; j < range_j; j++)
  711. * for (size_t k = 0; k < range_k; k += tile_k)
  712. * function(context, uarch_index, thread_index, i, j, k, min(range_k - k, tile_k));
  713. *
  714. * When the function returns, all items have been processed and the thread pool
  715. * is ready for a new task.
  716. *
  717. * @note If multiple threads call this function with the same thread pool, the
  718. * calls are serialized.
  719. *
  720. * @param threadpool the thread pool to use for parallelisation. If
  721. * threadpool is NULL, all items are processed serially on the calling
  722. * thread.
  723. * @param function the function to call for each tile; it receives uarch_index and thread_index (the id of the thread executing the call) right after context.
  724. * @param context the first argument passed to the specified
  725. * function.
  726. * @param default_uarch_index the microarchitecture index to use when
  727. * pthreadpool is configured without cpuinfo, cpuinfo initialization failed,
  728. * or index returned by cpuinfo_get_current_uarch_index() exceeds the
  729. * max_uarch_index value.
  730. * @param max_uarch_index the maximum microarchitecture index expected by
  731. * the specified function. If the index returned by
  732. * cpuinfo_get_current_uarch_index() exceeds this value, default_uarch_index
  733. * will be used instead. default_uarch_index can exceed max_uarch_index.
  734. * @param range_i the number of items to process along the first
  735. * dimension of the 3D grid.
  736. * @param range_j the number of items to process along the second
  737. * dimension of the 3D grid.
  738. * @param range_k the number of items to process along the third
  739. * dimension of the 3D grid.
  740. * @param tile_k the maximum number of items along the third
  741. * dimension of the 3D grid to process in one function call.
  742. * @param flags a bitwise combination of zero or more optional
  743. * flags (PTHREADPOOL_FLAG_DISABLE_DENORMALS or
  744. * PTHREADPOOL_FLAG_YIELD_WORKERS)
  745. */
  746. void pthreadpool_parallelize_3d_tile_1d_with_uarch_with_thread(
  747. pthreadpool_t threadpool,
  748. pthreadpool_task_3d_tile_1d_with_id_with_thread_t function,
  749. void* context,
  750. uint32_t default_uarch_index,
  751. uint32_t max_uarch_index,
  752. size_t range_i,
  753. size_t range_j,
  754. size_t range_k,
  755. size_t tile_k,
  756. uint32_t flags);
  757. /**
  758. * Process items on a 3D grid with the specified maximum tile size along the
  759. * last two grid dimensions.
  760. *
  761. * The function implements a parallel version of the following snippet:
  762. *
  763. * for (size_t i = 0; i < range_i; i++)
  764. * for (size_t j = 0; j < range_j; j += tile_j)
  765. * for (size_t k = 0; k < range_k; k += tile_k)
  766. * function(context, i, j, k,
  767. * min(range_j - j, tile_j), min(range_k - k, tile_k));
  768. *
  769. * When the function returns, all items have been processed and the thread pool
  770. * is ready for a new task.
  771. *
  772. * @note If multiple threads call this function with the same thread pool, the
  773. * calls are serialized.
  774. *
  775. * @param threadpool the thread pool to use for parallelisation. If threadpool
  776. * is NULL, all items are processed serially on the calling thread.
  777. * @param function the function to call for each tile; it receives context, the tile start indices and the actual tile sizes.
  778. * @param context the first argument passed to the specified function.
  779. * @param range_i the number of items to process along the first dimension
  780. * of the 3D grid.
  781. * @param range_j the number of items to process along the second dimension
  782. * of the 3D grid.
  783. * @param range_k the number of items to process along the third dimension
  784. * of the 3D grid.
  785. * @param tile_j the maximum number of items along the second dimension of
  786. * the 3D grid to process in one function call.
  787. * @param tile_k the maximum number of items along the third dimension of
  788. * the 3D grid to process in one function call.
  789. * @param flags a bitwise combination of zero or more optional flags
  790. * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
  791. */
  792. void pthreadpool_parallelize_3d_tile_2d(
  793. pthreadpool_t threadpool,
  794. pthreadpool_task_3d_tile_2d_t function,
  795. void* context,
  796. size_t range_i,
  797. size_t range_j,
  798. size_t range_k,
  799. size_t tile_j,
  800. size_t tile_k,
  801. uint32_t flags);
  802. /**
  803. * Process items on a 3D grid with the specified maximum tile size along the
  804. * last two grid dimensions using a microarchitecture-aware task function.
  805. *
  806. * The function implements a parallel version of the following snippet:
  807. *
  808. * uint32_t uarch_index = cpuinfo_initialize() ?
  809. * cpuinfo_get_current_uarch_index() : default_uarch_index;
  810. * if (uarch_index > max_uarch_index) uarch_index = default_uarch_index;
  811. * for (size_t i = 0; i < range_i; i++)
  812. * for (size_t j = 0; j < range_j; j += tile_j)
  813. * for (size_t k = 0; k < range_k; k += tile_k)
  814. * function(context, uarch_index, i, j, k,
  815. * min(range_j - j, tile_j), min(range_k - k, tile_k));
  816. *
  817. * When the function returns, all items have been processed and the thread pool
  818. * is ready for a new task.
  819. *
  820. * @note If multiple threads call this function with the same thread pool, the
  821. * calls are serialized.
  822. *
  823. * @param threadpool the thread pool to use for parallelisation. If
  824. * threadpool is NULL, all items are processed serially on the calling
  825. * thread.
  826. * @param function the function to call for each tile; it receives uarch_index right after context.
  827. * @param context the first argument passed to the specified
  828. * function.
  829. * @param default_uarch_index the microarchitecture index to use when
  830. * pthreadpool is configured without cpuinfo, cpuinfo initialization failed,
  831. * or index returned by cpuinfo_get_current_uarch_index() exceeds the
  832. * max_uarch_index value.
  833. * @param max_uarch_index the maximum microarchitecture index expected by
  834. * the specified function. If the index returned by
  835. * cpuinfo_get_current_uarch_index() exceeds this value, default_uarch_index
  836. * will be used instead. default_uarch_index can exceed max_uarch_index.
  837. * @param range_i the number of items to process along the first
  838. * dimension of the 3D grid.
  839. * @param range_j the number of items to process along the second
  840. * dimension of the 3D grid.
  841. * @param range_k the number of items to process along the third
  842. * dimension of the 3D grid.
  843. * @param tile_j the maximum number of items along the second
  844. * dimension of the 3D grid to process in one function call.
  845. * @param tile_k the maximum number of items along the third
  846. * dimension of the 3D grid to process in one function call.
  847. * @param flags a bitwise combination of zero or more optional
  848. * flags (PTHREADPOOL_FLAG_DISABLE_DENORMALS or
  849. * PTHREADPOOL_FLAG_YIELD_WORKERS)
  850. */
  851. void pthreadpool_parallelize_3d_tile_2d_with_uarch(
  852. pthreadpool_t threadpool,
  853. pthreadpool_task_3d_tile_2d_with_id_t function,
  854. void* context,
  855. uint32_t default_uarch_index,
  856. uint32_t max_uarch_index,
  857. size_t range_i,
  858. size_t range_j,
  859. size_t range_k,
  860. size_t tile_j,
  861. size_t tile_k,
  862. uint32_t flags);
  863. /**
  864. * Process items on a 4D grid.
  865. *
  866. * The function implements a parallel version of the following snippet:
  867. *
  868. * for (size_t i = 0; i < range_i; i++)
  869. * for (size_t j = 0; j < range_j; j++)
  870. * for (size_t k = 0; k < range_k; k++)
  871. * for (size_t l = 0; l < range_l; l++)
  872. * function(context, i, j, k, l);
  873. *
  874. * When the function returns, all items have been processed and the thread pool
  875. * is ready for a new task.
  876. *
  877. * @note If multiple threads call this function with the same thread pool, the
  878. * calls are serialized.
  879. *
  880. * @param threadpool the thread pool to use for parallelisation. If threadpool
  881. * is NULL, all items are processed serially on the calling thread.
  882. * @param function the function to call for each item.
  883. * @param context the first argument passed to the specified function.
  884. * @param range_i the number of items to process along the first dimension
  885. * of the 4D grid.
  886. * @param range_j the number of items to process along the second dimension
  887. * of the 4D grid.
  888. * @param range_k the number of items to process along the third dimension
  889. * of the 4D grid.
  890. * @param range_l the number of items to process along the fourth dimension
  891. * of the 4D grid.
  892. * @param flags a bitwise combination of zero or more optional flags
  893. * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
  894. */
  895. void pthreadpool_parallelize_4d(
  896. pthreadpool_t threadpool,
  897. pthreadpool_task_4d_t function,
  898. void* context,
  899. size_t range_i,
  900. size_t range_j,
  901. size_t range_k,
  902. size_t range_l,
  903. uint32_t flags);
  904. /**
  905. * Process items on a 4D grid with the specified maximum tile size along the
  906. * last grid dimension.
  907. *
  908. * The function implements a parallel version of the following snippet:
  909. *
  910. * for (size_t i = 0; i < range_i; i++)
  911. * for (size_t j = 0; j < range_j; j++)
  912. * for (size_t k = 0; k < range_k; k++)
  913. * for (size_t l = 0; l < range_l; l += tile_l)
  914. * function(context, i, j, k, l, min(range_l - l, tile_l));
  915. *
  916. * When the function returns, all items have been processed and the thread pool
  917. * is ready for a new task.
  918. *
  919. * @note If multiple threads call this function with the same thread pool, the
  920. * calls are serialized.
  921. *
  922. * @param threadpool the thread pool to use for parallelisation. If threadpool
  923. * is NULL, all items are processed serially on the calling thread.
  924. * @param function the function to call for each tile; it receives context, the tile start indices and the actual tile size.
  925. * @param context the first argument passed to the specified function.
  926. * @param range_i the number of items to process along the first dimension
  927. * of the 4D grid.
  928. * @param range_j the number of items to process along the second dimension
  929. * of the 4D grid.
  930. * @param range_k the number of items to process along the third dimension
  931. * of the 4D grid.
  932. * @param range_l the number of items to process along the fourth dimension
  933. * of the 4D grid.
  934. * @param tile_l the maximum number of items along the fourth dimension of
  935. * the 4D grid to process in one function call.
  936. * @param flags a bitwise combination of zero or more optional flags
  937. * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
  938. */
  939. void pthreadpool_parallelize_4d_tile_1d(
  940. pthreadpool_t threadpool,
  941. pthreadpool_task_4d_tile_1d_t function,
  942. void* context,
  943. size_t range_i,
  944. size_t range_j,
  945. size_t range_k,
  946. size_t range_l,
  947. size_t tile_l,
  948. uint32_t flags);
  949. /**
  950. * Process items on a 4D grid with the specified maximum tile size along the
  951. * last two grid dimensions.
  952. *
  953. * The function implements a parallel version of the following snippet:
  954. *
  955. * for (size_t i = 0; i < range_i; i++)
  956. * for (size_t j = 0; j < range_j; j++)
  957. * for (size_t k = 0; k < range_k; k += tile_k)
  958. * for (size_t l = 0; l < range_l; l += tile_l)
  959. * function(context, i, j, k, l,
  960. * min(range_k - k, tile_k), min(range_l - l, tile_l));
  961. *
  962. * When the function returns, all items have been processed and the thread pool
  963. * is ready for a new task.
  964. *
  965. * @note If multiple threads call this function with the same thread pool, the
  966. * calls are serialized.
  967. *
  968. * @param threadpool the thread pool to use for parallelisation. If threadpool
  969. * is NULL, all items are processed serially on the calling thread.
  970. * @param function the function to call for each tile; it receives context, the tile start indices and the actual tile sizes.
  971. * @param context the first argument passed to the specified function.
  972. * @param range_i the number of items to process along the first dimension
  973. * of the 4D grid.
  974. * @param range_j the number of items to process along the second dimension
  975. * of the 4D grid.
  976. * @param range_k the number of items to process along the third dimension
  977. * of the 4D grid.
  978. * @param range_l the number of items to process along the fourth dimension
  979. * of the 4D grid.
  980. * @param tile_k the maximum number of items along the third dimension of
  981. * the 4D grid to process in one function call.
  982. * @param tile_l the maximum number of items along the fourth dimension of
  983. * the 4D grid to process in one function call.
  984. * @param flags a bitwise combination of zero or more optional flags
  985. * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
  986. */
  987. void pthreadpool_parallelize_4d_tile_2d(
  988. pthreadpool_t threadpool,
  989. pthreadpool_task_4d_tile_2d_t function,
  990. void* context,
  991. size_t range_i,
  992. size_t range_j,
  993. size_t range_k,
  994. size_t range_l,
  995. size_t tile_k,
  996. size_t tile_l,
  997. uint32_t flags);
  998. /**
  999. * Process items on a 4D grid with the specified maximum tile size along the
  1000. * last two grid dimensions using a microarchitecture-aware task function.
  1001. *
  1002. * The function implements a parallel version of the following snippet:
  1003. *
  1004. * uint32_t uarch_index = cpuinfo_initialize() ?
  1005. * cpuinfo_get_current_uarch_index() : default_uarch_index;
  1006. * if (uarch_index > max_uarch_index) uarch_index = default_uarch_index;
  1007. * for (size_t i = 0; i < range_i; i++)
  1008. * for (size_t j = 0; j < range_j; j++)
  1009. * for (size_t k = 0; k < range_k; k += tile_k)
  1010. * for (size_t l = 0; l < range_l; l += tile_l)
  1011. * function(context, uarch_index, i, j, k, l,
  1012. * min(range_k - k, tile_k), min(range_l - l, tile_l));
  1013. *
  1014. * When the function returns, all items have been processed and the thread pool
  1015. * is ready for a new task.
  1016. *
  1017. * @note If multiple threads call this function with the same thread pool, the
  1018. * calls are serialized.
  1019. *
  1020. * @param threadpool the thread pool to use for parallelisation. If
  1021. * threadpool is NULL, all items are processed serially on the calling
  1022. * thread.
  1023. * @param function the function to call for each tile; it receives uarch_index right after context.
  1024. * @param context the first argument passed to the specified
  1025. * function.
  1026. * @param default_uarch_index the microarchitecture index to use when
  1027. * pthreadpool is configured without cpuinfo, cpuinfo initialization failed,
  1028. * or index returned by cpuinfo_get_current_uarch_index() exceeds the
  1029. * max_uarch_index value.
  1030. * @param max_uarch_index the maximum microarchitecture index expected by
  1031. * the specified function. If the index returned by
  1032. * cpuinfo_get_current_uarch_index() exceeds this value, default_uarch_index
  1033. * will be used instead. default_uarch_index can exceed max_uarch_index.
  1034. * @param range_i the number of items to process along the first
  1035. * dimension of the 4D grid.
  1036. * @param range_j the number of items to process along the second
  1037. * dimension of the 4D grid.
  1038. * @param range_k the number of items to process along the third
  1039. * dimension of the 4D grid.
  1040. * @param range_l the number of items to process along the fourth
  1041. * dimension of the 4D grid.
  1042. * @param tile_k the maximum number of items along the third
  1043. * dimension of the 4D grid to process in one function call.
  1044. * @param tile_l the maximum number of items along the fourth
  1045. * dimension of the 4D grid to process in one function call.
  1046. * @param flags a bitwise combination of zero or more optional
  1047. * flags (PTHREADPOOL_FLAG_DISABLE_DENORMALS or
  1048. * PTHREADPOOL_FLAG_YIELD_WORKERS)
  1049. */
  1050. void pthreadpool_parallelize_4d_tile_2d_with_uarch(
  1051. pthreadpool_t threadpool,
  1052. pthreadpool_task_4d_tile_2d_with_id_t function,
  1053. void* context,
  1054. uint32_t default_uarch_index,
  1055. uint32_t max_uarch_index,
  1056. size_t range_i,
  1057. size_t range_j,
  1058. size_t range_k,
  1059. size_t range_l,
  1060. size_t tile_k,
  1061. size_t tile_l,
  1062. uint32_t flags);
  1063. /**
  1064. * Process items on a 5D grid.
  1065. *
  1066. * The function implements a parallel version of the following snippet:
  1067. *
  1068. * for (size_t i = 0; i < range_i; i++)
  1069. * for (size_t j = 0; j < range_j; j++)
  1070. * for (size_t k = 0; k < range_k; k++)
  1071. * for (size_t l = 0; l < range_l; l++)
  1072. * for (size_t m = 0; m < range_m; m++)
  1073. * function(context, i, j, k, l, m);
  1074. *
  1075. * When the function returns, all items have been processed and the thread pool
  1076. * is ready for a new task.
  1077. *
  1078. * @note If multiple threads call this function with the same thread pool, the
  1079. * calls are serialized.
  1080. *
  1081. * @param threadpool the thread pool to use for parallelisation. If threadpool
  1082. * is NULL, all items are processed serially on the calling thread.
  1083. * @param function the function to call for each item.
  1084. * @param context the first argument passed to the specified function.
  1085. * @param range_i the number of items to process along the first dimension
  1086. * of the 5D grid.
  1087. * @param range_j the number of items to process along the second dimension
  1088. * of the 5D grid.
  1089. * @param range_k the number of items to process along the third dimension
  1090. * of the 5D grid.
  1091. * @param range_l the number of items to process along the fourth dimension
  1092. * of the 5D grid.
  1093. * @param range_m the number of items to process along the fifth dimension
  1094. * of the 5D grid.
  1095. * @param flags a bitwise combination of zero or more optional flags
  1096. * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
  1097. */
  1098. void pthreadpool_parallelize_5d(
  1099. pthreadpool_t threadpool,
  1100. pthreadpool_task_5d_t function,
  1101. void* context,
  1102. size_t range_i,
  1103. size_t range_j,
  1104. size_t range_k,
  1105. size_t range_l,
  1106. size_t range_m,
  1107. uint32_t flags);
  1108. /**
  1109. * Process items on a 5D grid with the specified maximum tile size along the
  1110. * last grid dimension.
  1111. *
  1112. * The function implements a parallel version of the following snippet:
  1113. *
  1114. * for (size_t i = 0; i < range_i; i++)
  1115. * for (size_t j = 0; j < range_j; j++)
  1116. * for (size_t k = 0; k < range_k; k++)
  1117. * for (size_t l = 0; l < range_l; l++)
  1118. * for (size_t m = 0; m < range_m; m += tile_m)
  1119. * function(context, i, j, k, l, m, min(range_m - m, tile_m));
  1120. *
  1121. * When the function returns, all items have been processed and the thread pool
  1122. * is ready for a new task.
  1123. *
  1124. * @note If multiple threads call this function with the same thread pool, the
  1125. * calls are serialized.
  1126. *
  1127. * @param threadpool the thread pool to use for parallelisation. If threadpool
  1128. * is NULL, all items are processed serially on the calling thread.
  1129. * @param function the function to call for each tile; it receives context, the tile start indices and the actual tile size.
  1130. * @param context the first argument passed to the specified function.
  1131. * @param range_i the number of items to process along the first dimension
  1132. * of the 5D grid.
  1133. * @param range_j the number of items to process along the second dimension
  1134. * of the 5D grid.
  1135. * @param range_k the number of items to process along the third dimension
  1136. * of the 5D grid.
  1137. * @param range_l the number of items to process along the fourth dimension
  1138. * of the 5D grid.
  1139. * @param range_m the number of items to process along the fifth dimension
  1140. * of the 5D grid.
  1141. * @param tile_m the maximum number of items along the fifth dimension of
  1142. * the 5D grid to process in one function call.
  1143. * @param flags a bitwise combination of zero or more optional flags
  1144. * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
  1145. */
  1146. void pthreadpool_parallelize_5d_tile_1d(
  1147. pthreadpool_t threadpool,
  1148. pthreadpool_task_5d_tile_1d_t function,
  1149. void* context,
  1150. size_t range_i,
  1151. size_t range_j,
  1152. size_t range_k,
  1153. size_t range_l,
  1154. size_t range_m,
  1155. size_t tile_m,
  1156. uint32_t flags);
  1157. /**
  1158. * Process items on a 5D grid with the specified maximum tile size along the
  1159. * last two grid dimensions.
  1160. *
  1161. * The function implements a parallel version of the following snippet:
  1162. *
  1163. * for (size_t i = 0; i < range_i; i++)
  1164. * for (size_t j = 0; j < range_j; j++)
  1165. * for (size_t k = 0; k < range_k; k++)
  1166. * for (size_t l = 0; l < range_l; l += tile_l)
  1167. * for (size_t m = 0; m < range_m; m += tile_m)
  1168. * function(context, i, j, k, l, m,
  1169. * min(range_l - l, tile_l), min(range_m - m, tile_m));
  1170. *
  1171. * When the function returns, all items have been processed and the thread pool
  1172. * is ready for a new task.
  1173. *
  1174. * @note If multiple threads call this function with the same thread pool, the
  1175. * calls are serialized.
  1176. *
  1177. * @param threadpool the thread pool to use for parallelisation. If threadpool
  1178. * is NULL, all items are processed serially on the calling thread.
  1179. * @param function the function to call for each tile.
  1180. * @param context the first argument passed to the specified function.
  1181. * @param range_i the number of items to process along the first dimension
  1182. * of the 5D grid.
  1183. * @param range_j the number of items to process along the second dimension
  1184. * of the 5D grid.
  1185. * @param range_k the number of items to process along the third dimension
  1186. * of the 5D grid.
  1187. * @param range_l the number of items to process along the fourth dimension
  1188. * of the 5D grid.
  1189. * @param range_m the number of items to process along the fifth dimension
  1190. * of the 5D grid.
  1191. * @param tile_l the maximum number of items along the fourth dimension of
  1192. * the 5D grid to process in one function call.
  1193. * @param tile_m the maximum number of items along the fifth dimension of
  1194. * the 5D grid to process in one function call.
  1195. * @param flags a bitwise combination of zero or more optional flags
  1196. * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
  1197. */
  1198. void pthreadpool_parallelize_5d_tile_2d(
  1199. pthreadpool_t threadpool,
  1200. pthreadpool_task_5d_tile_2d_t function,
  1201. void* context,
  1202. size_t range_i,
  1203. size_t range_j,
  1204. size_t range_k,
  1205. size_t range_l,
  1206. size_t range_m,
  1207. size_t tile_l,
  1208. size_t tile_m,
  1209. uint32_t flags);
  1210. /**
  1211. * Process items on a 6D grid.
  1212. *
  1213. * The function implements a parallel version of the following snippet:
  1214. *
  1215. * for (size_t i = 0; i < range_i; i++)
  1216. * for (size_t j = 0; j < range_j; j++)
  1217. * for (size_t k = 0; k < range_k; k++)
  1218. * for (size_t l = 0; l < range_l; l++)
  1219. * for (size_t m = 0; m < range_m; m++)
  1220. * for (size_t n = 0; n < range_n; n++)
  1221. * function(context, i, j, k, l, m, n);
  1222. *
  1223. * When the function returns, all items have been processed and the thread pool
  1224. * is ready for a new task.
  1225. *
  1226. * @note If multiple threads call this function with the same thread pool, the
  1227. * calls are serialized.
  1228. *
  1229. * @param threadpool the thread pool to use for parallelisation. If threadpool
  1230. * is NULL, all items are processed serially on the calling thread.
  1231. * @param function the function to call for each tile.
  1232. * @param context the first argument passed to the specified function.
  1233. * @param range_i the number of items to process along the first dimension
  1234. * of the 6D grid.
  1235. * @param range_j the number of items to process along the second dimension
  1236. * of the 6D grid.
  1237. * @param range_k the number of items to process along the third dimension
  1238. * of the 6D grid.
  1239. * @param range_l the number of items to process along the fourth dimension
  1240. * of the 6D grid.
  1241. * @param range_m the number of items to process along the fifth dimension
  1242. * of the 6D grid.
  1243. * @param range_n the number of items to process along the sixth dimension
  1244. * of the 6D grid.
  1245. * @param tile_n the maximum number of items along the sixth dimension of
  1246. * the 6D grid to process in one function call.
  1247. * @param flags a bitwise combination of zero or more optional flags
  1248. * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
  1249. */
  1250. void pthreadpool_parallelize_6d(
  1251. pthreadpool_t threadpool,
  1252. pthreadpool_task_6d_t function,
  1253. void* context,
  1254. size_t range_i,
  1255. size_t range_j,
  1256. size_t range_k,
  1257. size_t range_l,
  1258. size_t range_m,
  1259. size_t range_n,
  1260. uint32_t flags);
  1261. /**
  1262. * Process items on a 6D grid with the specified maximum tile size along the
  1263. * last grid dimension.
  1264. *
  1265. * The function implements a parallel version of the following snippet:
  1266. *
  1267. * for (size_t i = 0; i < range_i; i++)
  1268. * for (size_t j = 0; j < range_j; j++)
  1269. * for (size_t k = 0; k < range_k; k++)
  1270. * for (size_t l = 0; l < range_l; l++)
  1271. * for (size_t m = 0; m < range_m; m++)
  1272. * for (size_t n = 0; n < range_n; n += tile_n)
  1273. * function(context, i, j, k, l, m, n, min(range_n - n, tile_n));
  1274. *
  1275. * When the function returns, all items have been processed and the thread pool
  1276. * is ready for a new task.
  1277. *
  1278. * @note If multiple threads call this function with the same thread pool, the
  1279. * calls are serialized.
  1280. *
  1281. * @param threadpool the thread pool to use for parallelisation. If threadpool
  1282. * is NULL, all items are processed serially on the calling thread.
  1283. * @param function the function to call for each tile.
  1284. * @param context the first argument passed to the specified function.
  1285. * @param range_i the number of items to process along the first dimension
  1286. * of the 6D grid.
  1287. * @param range_j the number of items to process along the second dimension
  1288. * of the 6D grid.
  1289. * @param range_k the number of items to process along the third dimension
  1290. * of the 6D grid.
  1291. * @param range_l the number of items to process along the fourth dimension
  1292. * of the 6D grid.
  1293. * @param range_m the number of items to process along the fifth dimension
  1294. * of the 6D grid.
  1295. * @param range_n the number of items to process along the sixth dimension
  1296. * of the 6D grid.
  1297. * @param tile_n the maximum number of items along the sixth dimension of
  1298. * the 6D grid to process in one function call.
  1299. * @param flags a bitwise combination of zero or more optional flags
  1300. * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
  1301. */
  1302. void pthreadpool_parallelize_6d_tile_1d(
  1303. pthreadpool_t threadpool,
  1304. pthreadpool_task_6d_tile_1d_t function,
  1305. void* context,
  1306. size_t range_i,
  1307. size_t range_j,
  1308. size_t range_k,
  1309. size_t range_l,
  1310. size_t range_m,
  1311. size_t range_n,
  1312. size_t tile_n,
  1313. uint32_t flags);
  1314. /**
  1315. * Process items on a 6D grid with the specified maximum tile size along the
  1316. * last two grid dimensions.
  1317. *
  1318. * The function implements a parallel version of the following snippet:
  1319. *
  1320. * for (size_t i = 0; i < range_i; i++)
  1321. * for (size_t j = 0; j < range_j; j++)
  1322. * for (size_t k = 0; k < range_k; k++)
  1323. * for (size_t l = 0; l < range_l; l++)
  1324. * for (size_t m = 0; m < range_m; m += tile_m)
  1325. * for (size_t n = 0; n < range_n; n += tile_n)
  1326. * function(context, i, j, k, l, m, n,
  1327. * min(range_m - m, tile_m), min(range_n - n, tile_n));
  1328. *
  1329. * When the function returns, all items have been processed and the thread pool
  1330. * is ready for a new task.
  1331. *
  1332. * @note If multiple threads call this function with the same thread pool, the
  1333. * calls are serialized.
  1334. *
  1335. * @param threadpool the thread pool to use for parallelisation. If threadpool
  1336. * is NULL, all items are processed serially on the calling thread.
  1337. * @param function the function to call for each tile.
  1338. * @param context the first argument passed to the specified function.
  1339. * @param range_i the number of items to process along the first dimension
  1340. * of the 6D grid.
  1341. * @param range_j the number of items to process along the second dimension
  1342. * of the 6D grid.
  1343. * @param range_k the number of items to process along the third dimension
  1344. * of the 6D grid.
  1345. * @param range_l the number of items to process along the fourth dimension
  1346. * of the 6D grid.
  1347. * @param range_m the number of items to process along the fifth dimension
  1348. * of the 6D grid.
  1349. * @param range_n the number of items to process along the sixth dimension
  1350. * of the 6D grid.
  1351. * @param tile_m the maximum number of items along the fifth dimension of
  1352. * the 6D grid to process in one function call.
  1353. * @param tile_n the maximum number of items along the sixth dimension of
  1354. * the 6D grid to process in one function call.
  1355. * @param flags a bitwise combination of zero or more optional flags
  1356. * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
  1357. */
  1358. void pthreadpool_parallelize_6d_tile_2d(
  1359. pthreadpool_t threadpool,
  1360. pthreadpool_task_6d_tile_2d_t function,
  1361. void* context,
  1362. size_t range_i,
  1363. size_t range_j,
  1364. size_t range_k,
  1365. size_t range_l,
  1366. size_t range_m,
  1367. size_t range_n,
  1368. size_t tile_m,
  1369. size_t tile_n,
  1370. uint32_t flags);
  1371. /**
  1372. * Terminates threads in the thread pool and releases associated resources.
  1373. *
  1374. * @warning Accessing the thread pool after a call to this function constitutes
  1375. * undefined behaviour and may cause data corruption.
  1376. *
  1377. * @param[in,out] threadpool The thread pool to destroy.
  1378. */
  1379. void pthreadpool_destroy(pthreadpool_t threadpool);
  1380. #ifndef PTHREADPOOL_NO_DEPRECATED_API
  1381. /* Legacy API for compatibility with pre-existing users (e.g. NNPACK) */
  1382. #if defined(__GNUC__)
  1383. #define PTHREADPOOL_DEPRECATED __attribute__((__deprecated__))
  1384. #else
  1385. #define PTHREADPOOL_DEPRECATED
  1386. #endif
  1387. typedef void (*pthreadpool_function_1d_t)(void*, size_t);
  1388. typedef void (*pthreadpool_function_1d_tiled_t)(void*, size_t, size_t);
  1389. typedef void (*pthreadpool_function_2d_t)(void*, size_t, size_t);
  1390. typedef void (*pthreadpool_function_2d_tiled_t)(void*, size_t, size_t, size_t, size_t);
  1391. typedef void (*pthreadpool_function_3d_tiled_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t);
  1392. typedef void (*pthreadpool_function_4d_tiled_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t, size_t, size_t);
  1393. void pthreadpool_compute_1d(
  1394. pthreadpool_t threadpool,
  1395. pthreadpool_function_1d_t function,
  1396. void* argument,
  1397. size_t range) PTHREADPOOL_DEPRECATED;
  1398. void pthreadpool_compute_1d_tiled(
  1399. pthreadpool_t threadpool,
  1400. pthreadpool_function_1d_tiled_t function,
  1401. void* argument,
  1402. size_t range,
  1403. size_t tile) PTHREADPOOL_DEPRECATED;
  1404. void pthreadpool_compute_2d(
  1405. pthreadpool_t threadpool,
  1406. pthreadpool_function_2d_t function,
  1407. void* argument,
  1408. size_t range_i,
  1409. size_t range_j) PTHREADPOOL_DEPRECATED;
  1410. void pthreadpool_compute_2d_tiled(
  1411. pthreadpool_t threadpool,
  1412. pthreadpool_function_2d_tiled_t function,
  1413. void* argument,
  1414. size_t range_i,
  1415. size_t range_j,
  1416. size_t tile_i,
  1417. size_t tile_j) PTHREADPOOL_DEPRECATED;
  1418. void pthreadpool_compute_3d_tiled(
  1419. pthreadpool_t threadpool,
  1420. pthreadpool_function_3d_tiled_t function,
  1421. void* argument,
  1422. size_t range_i,
  1423. size_t range_j,
  1424. size_t range_k,
  1425. size_t tile_i,
  1426. size_t tile_j,
  1427. size_t tile_k) PTHREADPOOL_DEPRECATED;
  1428. void pthreadpool_compute_4d_tiled(
  1429. pthreadpool_t threadpool,
  1430. pthreadpool_function_4d_tiled_t function,
  1431. void* argument,
  1432. size_t range_i,
  1433. size_t range_j,
  1434. size_t range_k,
  1435. size_t range_l,
  1436. size_t tile_i,
  1437. size_t tile_j,
  1438. size_t tile_k,
  1439. size_t tile_l) PTHREADPOOL_DEPRECATED;
  1440. #endif /* PTHREADPOOL_NO_DEPRECATED_API */
  1441. #ifdef __cplusplus
  1442. } /* extern "C" */
  1443. #endif
  1444. #ifdef __cplusplus
  namespace libpthreadpool {
  namespace detail {

  /*
   * Trampolines used by the C++ functor overloads of pthreadpool_parallelize_*.
   *
   * Each trampoline has the shape of the matching C task callback: it receives
   * the functor's address as an untyped pointer, casts it back to `const T*`
   * and invokes the functor with the remaining arguments, forwarded unchanged
   * and in the same order. Because the functor is invoked through a pointer to
   * const, its call operator must be usable on a const object.
   *
   * For the tiled variants the `start_*` arguments are the first index of the
   * tile along a tiled dimension and the `tile_*` arguments are the extent of
   * the tile along that dimension.
   *
   * These are deliberately NOT placed in an unnamed namespace: this is a
   * header, and an unnamed namespace would give every including translation
   * unit its own internal-linkage copy of each instantiation. As ordinary
   * function templates they may be defined in multiple translation units and
   * identical instantiations can be shared.
   */

  /* 1D grid */

  template<class T>
  void call_wrapper_1d(void* functor, size_t i) {
      (*static_cast<const T*>(functor))(i);
  }

  template<class T>
  void call_wrapper_1d_tile_1d(void* functor, size_t start_i, size_t tile_i) {
      (*static_cast<const T*>(functor))(start_i, tile_i);
  }

  /* 2D grid */

  template<class T>
  void call_wrapper_2d(void* functor, size_t i, size_t j) {
      (*static_cast<const T*>(functor))(i, j);
  }

  template<class T>
  void call_wrapper_2d_tile_1d(
      void* functor,
      size_t i, size_t start_j,
      size_t tile_j)
  {
      (*static_cast<const T*>(functor))(i, start_j, tile_j);
  }

  template<class T>
  void call_wrapper_2d_tile_2d(
      void* functor,
      size_t start_i, size_t start_j,
      size_t tile_i, size_t tile_j)
  {
      (*static_cast<const T*>(functor))(start_i, start_j, tile_i, tile_j);
  }

  /* 3D grid */

  template<class T>
  void call_wrapper_3d(void* functor, size_t i, size_t j, size_t k) {
      (*static_cast<const T*>(functor))(i, j, k);
  }

  template<class T>
  void call_wrapper_3d_tile_1d(
      void* functor,
      size_t i, size_t j, size_t start_k,
      size_t tile_k)
  {
      (*static_cast<const T*>(functor))(i, j, start_k, tile_k);
  }

  template<class T>
  void call_wrapper_3d_tile_2d(
      void* functor,
      size_t i, size_t start_j, size_t start_k,
      size_t tile_j, size_t tile_k)
  {
      (*static_cast<const T*>(functor))(i, start_j, start_k, tile_j, tile_k);
  }

  /* 4D grid */

  template<class T>
  void call_wrapper_4d(void* functor, size_t i, size_t j, size_t k, size_t l) {
      (*static_cast<const T*>(functor))(i, j, k, l);
  }

  template<class T>
  void call_wrapper_4d_tile_1d(
      void* functor,
      size_t i, size_t j, size_t k, size_t start_l,
      size_t tile_l)
  {
      (*static_cast<const T*>(functor))(i, j, k, start_l, tile_l);
  }

  template<class T>
  void call_wrapper_4d_tile_2d(
      void* functor,
      size_t i, size_t j, size_t start_k, size_t start_l,
      size_t tile_k, size_t tile_l)
  {
      (*static_cast<const T*>(functor))(i, j, start_k, start_l, tile_k, tile_l);
  }

  /* 5D grid */

  template<class T>
  void call_wrapper_5d(void* functor, size_t i, size_t j, size_t k, size_t l, size_t m) {
      (*static_cast<const T*>(functor))(i, j, k, l, m);
  }

  template<class T>
  void call_wrapper_5d_tile_1d(
      void* functor,
      size_t i, size_t j, size_t k, size_t l, size_t start_m,
      size_t tile_m)
  {
      (*static_cast<const T*>(functor))(i, j, k, l, start_m, tile_m);
  }

  template<class T>
  void call_wrapper_5d_tile_2d(
      void* functor,
      size_t i, size_t j, size_t k, size_t start_l, size_t start_m,
      size_t tile_l, size_t tile_m)
  {
      (*static_cast<const T*>(functor))(i, j, k, start_l, start_m, tile_l, tile_m);
  }

  /* 6D grid */

  template<class T>
  void call_wrapper_6d(void* functor, size_t i, size_t j, size_t k, size_t l, size_t m, size_t n) {
      (*static_cast<const T*>(functor))(i, j, k, l, m, n);
  }

  template<class T>
  void call_wrapper_6d_tile_1d(
      void* functor,
      size_t i, size_t j, size_t k, size_t l, size_t m, size_t start_n,
      size_t tile_n)
  {
      (*static_cast<const T*>(functor))(i, j, k, l, m, start_n, tile_n);
  }

  template<class T>
  void call_wrapper_6d_tile_2d(
      void* functor,
      size_t i, size_t j, size_t k, size_t l, size_t start_m, size_t start_n,
      size_t tile_m, size_t tile_n)
  {
      (*static_cast<const T*>(functor))(i, j, k, l, start_m, start_n, tile_m, tile_n);
  }

  } /* namespace detail */
  } /* namespace libpthreadpool */
  1548. /**
  1549. * Process items on a 1D grid.
  1550. *
  1551. * The function implements a parallel version of the following snippet:
  1552. *
  1553. * for (size_t i = 0; i < range; i++)
  1554. * functor(i);
  1555. *
  1556. * When the function returns, all items have been processed and the thread pool
  1557. * is ready for a new task.
  1558. *
  1559. * @note If multiple threads call this function with the same thread pool, the
  1560. * calls are serialized.
  1561. *
  1562. * @param threadpool the thread pool to use for parallelisation. If threadpool
  1563. * is NULL, all items are processed serially on the calling thread.
  1564. * @param functor the functor to call for each item.
  1565. * @param range the number of items on the 1D grid to process. The
  1566. * specified functor will be called once for each item.
  1567. * @param flags a bitwise combination of zero or more optional flags
  1568. * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
  1569. */
  1570. template<class T>
  1571. inline void pthreadpool_parallelize_1d(
  1572. pthreadpool_t threadpool,
  1573. const T& functor,
  1574. size_t range,
  1575. uint32_t flags = 0)
  1576. {
  1577. pthreadpool_parallelize_1d(
  1578. threadpool,
  1579. &libpthreadpool::detail::call_wrapper_1d<const T>,
  1580. const_cast<void*>(static_cast<const void*>(&functor)),
  1581. range,
  1582. flags);
  1583. }
  1584. /**
  1585. * Process items on a 1D grid with specified maximum tile size.
  1586. *
  1587. * The function implements a parallel version of the following snippet:
  1588. *
  1589. * for (size_t i = 0; i < range; i += tile)
  1590. * functor(i, min(range - i, tile));
  1591. *
  1592. * When the call returns, all items have been processed and the thread pool is
  1593. * ready for a new task.
  1594. *
  1595. * @note If multiple threads call this function with the same thread pool,
  1596. * the calls are serialized.
  1597. *
  1598. * @param threadpool the thread pool to use for parallelisation. If threadpool
  1599. * is NULL, all items are processed serially on the calling thread.
  1600. * @param functor the functor to call for each tile.
  1601. * @param range the number of items on the 1D grid to process.
  1602. * @param tile the maximum number of items on the 1D grid to process in
  1603. * one functor call.
  1604. * @param flags a bitwise combination of zero or more optional flags
  1605. * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
  1606. */
  1607. template<class T>
  1608. inline void pthreadpool_parallelize_1d_tile_1d(
  1609. pthreadpool_t threadpool,
  1610. const T& functor,
  1611. size_t range,
  1612. size_t tile,
  1613. uint32_t flags = 0)
  1614. {
  1615. pthreadpool_parallelize_1d_tile_1d(
  1616. threadpool,
  1617. &libpthreadpool::detail::call_wrapper_1d_tile_1d<const T>,
  1618. const_cast<void*>(static_cast<const void*>(&functor)),
  1619. range,
  1620. tile,
  1621. flags);
  1622. }
  1623. /**
  1624. * Process items on a 2D grid.
  1625. *
  1626. * The function implements a parallel version of the following snippet:
  1627. *
  1628. * for (size_t i = 0; i < range_i; i++)
  1629. * for (size_t j = 0; j < range_j; j++)
  1630. * functor(i, j);
  1631. *
  1632. * When the function returns, all items have been processed and the thread pool
  1633. * is ready for a new task.
  1634. *
  1635. * @note If multiple threads call this function with the same thread pool, the
  1636. * calls are serialized.
  1637. *
  1638. * @param threadpool the thread pool to use for parallelisation. If threadpool
  1639. * is NULL, all items are processed serially on the calling thread.
  1640. * @param functor the functor to call for each item.
  1641. * @param range_i the number of items to process along the first dimension
  1642. * of the 2D grid.
  1643. * @param range_j the number of items to process along the second dimension
  1644. * of the 2D grid.
  1645. * @param flags a bitwise combination of zero or more optional flags
  1646. * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
  1647. */
  1648. template<class T>
  1649. inline void pthreadpool_parallelize_2d(
  1650. pthreadpool_t threadpool,
  1651. const T& functor,
  1652. size_t range_i,
  1653. size_t range_j,
  1654. uint32_t flags = 0)
  1655. {
  1656. pthreadpool_parallelize_2d(
  1657. threadpool,
  1658. &libpthreadpool::detail::call_wrapper_2d<const T>,
  1659. const_cast<void*>(static_cast<const void*>(&functor)),
  1660. range_i,
  1661. range_j,
  1662. flags);
  1663. }
  1664. /**
  1665. * Process items on a 2D grid with the specified maximum tile size along the
  1666. * last grid dimension.
  1667. *
  1668. * The function implements a parallel version of the following snippet:
  1669. *
  1670. * for (size_t i = 0; i < range_i; i++)
  1671. * for (size_t j = 0; j < range_j; j += tile_j)
  1672. * functor(i, j, min(range_j - j, tile_j));
  1673. *
  1674. * When the function returns, all items have been processed and the thread pool
  1675. * is ready for a new task.
  1676. *
  1677. * @note If multiple threads call this function with the same thread pool, the
  1678. * calls are serialized.
  1679. *
  1680. * @param threadpool the thread pool to use for parallelisation. If threadpool
  1681. * is NULL, all items are processed serially on the calling thread.
  1682. * @param functor the functor to call for each tile.
  1683. * @param range_i the number of items to process along the first dimension
  1684. * of the 2D grid.
  1685. * @param range_j the number of items to process along the second dimension
  1686. * of the 2D grid.
  1687. * @param tile_j the maximum number of items along the second dimension of
  1688. * the 2D grid to process in one functor call.
  1689. * @param flags a bitwise combination of zero or more optional flags
  1690. * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
  1691. */
  1692. template<class T>
  1693. inline void pthreadpool_parallelize_2d_tile_1d(
  1694. pthreadpool_t threadpool,
  1695. const T& functor,
  1696. size_t range_i,
  1697. size_t range_j,
  1698. size_t tile_j,
  1699. uint32_t flags = 0)
  1700. {
  1701. pthreadpool_parallelize_2d_tile_1d(
  1702. threadpool,
  1703. &libpthreadpool::detail::call_wrapper_2d_tile_1d<const T>,
  1704. const_cast<void*>(static_cast<const void*>(&functor)),
  1705. range_i,
  1706. range_j,
  1707. tile_j,
  1708. flags);
  1709. }
  1710. /**
  1711. * Process items on a 2D grid with the specified maximum tile size along each
  1712. * grid dimension.
  1713. *
  1714. * The function implements a parallel version of the following snippet:
  1715. *
  1716. * for (size_t i = 0; i < range_i; i += tile_i)
  1717. * for (size_t j = 0; j < range_j; j += tile_j)
  1718. * functor(i, j,
  1719. * min(range_i - i, tile_i), min(range_j - j, tile_j));
  1720. *
  1721. * When the function returns, all items have been processed and the thread pool
  1722. * is ready for a new task.
  1723. *
  1724. * @note If multiple threads call this function with the same thread pool, the
  1725. * calls are serialized.
  1726. *
  1727. * @param threadpool the thread pool to use for parallelisation. If threadpool
  1728. * is NULL, all items are processed serially on the calling thread.
  1729. * @param functor the functor to call for each tile.
  1730. * @param range_i the number of items to process along the first dimension
  1731. * of the 2D grid.
  1732. * @param range_j the number of items to process along the second dimension
  1733. * of the 2D grid.
  1734. * @param tile_j the maximum number of items along the first dimension of
  1735. * the 2D grid to process in one functor call.
  1736. * @param tile_j the maximum number of items along the second dimension of
  1737. * the 2D grid to process in one functor call.
  1738. * @param flags a bitwise combination of zero or more optional flags
  1739. * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
  1740. */
  1741. template<class T>
  1742. inline void pthreadpool_parallelize_2d_tile_2d(
  1743. pthreadpool_t threadpool,
  1744. const T& functor,
  1745. size_t range_i,
  1746. size_t range_j,
  1747. size_t tile_i,
  1748. size_t tile_j,
  1749. uint32_t flags = 0)
  1750. {
  1751. pthreadpool_parallelize_2d_tile_2d(
  1752. threadpool,
  1753. &libpthreadpool::detail::call_wrapper_2d_tile_2d<const T>,
  1754. const_cast<void*>(static_cast<const void*>(&functor)),
  1755. range_i,
  1756. range_j,
  1757. tile_i,
  1758. tile_j,
  1759. flags);
  1760. }
  1761. /**
  1762. * Process items on a 3D grid.
  1763. *
  1764. * The function implements a parallel version of the following snippet:
  1765. *
  1766. * for (size_t i = 0; i < range_i; i++)
  1767. * for (size_t j = 0; j < range_j; j++)
  1768. * for (size_t k = 0; k < range_k; k++)
  1769. * functor(i, j, k);
  1770. *
  1771. * When the function returns, all items have been processed and the thread pool
  1772. * is ready for a new task.
  1773. *
  1774. * @note If multiple threads call this function with the same thread pool, the
  1775. * calls are serialized.
  1776. *
  1777. * @param threadpool the thread pool to use for parallelisation. If threadpool
  1778. * is NULL, all items are processed serially on the calling thread.
  1779. * @param functor the functor to call for each tile.
  1780. * @param range_i the number of items to process along the first dimension
  1781. * of the 3D grid.
  1782. * @param range_j the number of items to process along the second dimension
  1783. * of the 3D grid.
  1784. * @param range_k the number of items to process along the third dimension
  1785. * of the 3D grid.
  1786. * @param flags a bitwise combination of zero or more optional flags
  1787. * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
  1788. */
  1789. template<class T>
  1790. inline void pthreadpool_parallelize_3d(
  1791. pthreadpool_t threadpool,
  1792. const T& functor,
  1793. size_t range_i,
  1794. size_t range_j,
  1795. size_t range_k,
  1796. uint32_t flags = 0)
  1797. {
  1798. pthreadpool_parallelize_3d(
  1799. threadpool,
  1800. &libpthreadpool::detail::call_wrapper_3d<const T>,
  1801. const_cast<void*>(static_cast<const void*>(&functor)),
  1802. range_i,
  1803. range_j,
  1804. range_k,
  1805. flags);
  1806. }
  1807. /**
  1808. * Process items on a 3D grid with the specified maximum tile size along the
  1809. * last grid dimension.
  1810. *
  1811. * The function implements a parallel version of the following snippet:
  1812. *
  1813. * for (size_t i = 0; i < range_i; i++)
  1814. * for (size_t j = 0; j < range_j; j++)
  1815. * for (size_t k = 0; k < range_k; k += tile_k)
  1816. * functor(i, j, k, min(range_k - k, tile_k));
  1817. *
  1818. * When the function returns, all items have been processed and the thread pool
  1819. * is ready for a new task.
  1820. *
  1821. * @note If multiple threads call this function with the same thread pool, the
  1822. * calls are serialized.
  1823. *
  1824. * @param threadpool the thread pool to use for parallelisation. If threadpool
  1825. * is NULL, all items are processed serially on the calling thread.
  1826. * @param functor the functor to call for each tile.
  1827. * @param range_i the number of items to process along the first dimension
  1828. * of the 3D grid.
  1829. * @param range_j the number of items to process along the second dimension
  1830. * of the 3D grid.
  1831. * @param range_k the number of items to process along the third dimension
  1832. * of the 3D grid.
  1833. * @param tile_k the maximum number of items along the third dimension of
  1834. * the 3D grid to process in one functor call.
  1835. * @param flags a bitwise combination of zero or more optional flags
  1836. * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
  1837. */
  1838. template<class T>
  1839. inline void pthreadpool_parallelize_3d_tile_1d(
  1840. pthreadpool_t threadpool,
  1841. const T& functor,
  1842. size_t range_i,
  1843. size_t range_j,
  1844. size_t range_k,
  1845. size_t tile_k,
  1846. uint32_t flags = 0)
  1847. {
  1848. pthreadpool_parallelize_3d_tile_1d(
  1849. threadpool,
  1850. &libpthreadpool::detail::call_wrapper_3d_tile_1d<const T>,
  1851. const_cast<void*>(static_cast<const void*>(&functor)),
  1852. range_i,
  1853. range_j,
  1854. range_k,
  1855. tile_k,
  1856. flags);
  1857. }
  1858. /**
  1859. * Process items on a 3D grid with the specified maximum tile size along the
  1860. * last two grid dimensions.
  1861. *
  1862. * The function implements a parallel version of the following snippet:
  1863. *
  1864. * for (size_t i = 0; i < range_i; i++)
  1865. * for (size_t j = 0; j < range_j; j += tile_j)
  1866. * for (size_t k = 0; k < range_k; k += tile_k)
  1867. * functor(i, j, k,
  1868. * min(range_j - j, tile_j), min(range_k - k, tile_k));
  1869. *
  1870. * When the function returns, all items have been processed and the thread pool
  1871. * is ready for a new task.
  1872. *
  1873. * @note If multiple threads call this function with the same thread pool, the
  1874. * calls are serialized.
  1875. *
  1876. * @param threadpool the thread pool to use for parallelisation. If threadpool
  1877. * is NULL, all items are processed serially on the calling thread.
  1878. * @param functor the functor to call for each tile.
  1879. * @param range_i the number of items to process along the first dimension
  1880. * of the 3D grid.
  1881. * @param range_j the number of items to process along the second dimension
  1882. * of the 3D grid.
  1883. * @param range_k the number of items to process along the third dimension
  1884. * of the 3D grid.
  1885. * @param tile_j the maximum number of items along the second dimension of
  1886. * the 3D grid to process in one functor call.
  1887. * @param tile_k the maximum number of items along the third dimension of
  1888. * the 3D grid to process in one functor call.
  1889. * @param flags a bitwise combination of zero or more optional flags
  1890. * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
  1891. */
  1892. template<class T>
  1893. inline void pthreadpool_parallelize_3d_tile_2d(
  1894. pthreadpool_t threadpool,
  1895. const T& functor,
  1896. size_t range_i,
  1897. size_t range_j,
  1898. size_t range_k,
  1899. size_t tile_j,
  1900. size_t tile_k,
  1901. uint32_t flags = 0)
  1902. {
  1903. pthreadpool_parallelize_3d_tile_2d(
  1904. threadpool,
  1905. &libpthreadpool::detail::call_wrapper_3d_tile_2d<const T>,
  1906. const_cast<void*>(static_cast<const void*>(&functor)),
  1907. range_i,
  1908. range_j,
  1909. range_k,
  1910. tile_j,
  1911. tile_k,
  1912. flags);
  1913. }
  1914. /**
  1915. * Process items on a 4D grid.
  1916. *
  1917. * The function implements a parallel version of the following snippet:
  1918. *
  1919. * for (size_t i = 0; i < range_i; i++)
  1920. * for (size_t j = 0; j < range_j; j++)
  1921. * for (size_t k = 0; k < range_k; k++)
  1922. * for (size_t l = 0; l < range_l; l++)
  1923. * functor(i, j, k, l);
  1924. *
  1925. * When the function returns, all items have been processed and the thread pool
  1926. * is ready for a new task.
  1927. *
  1928. * @note If multiple threads call this function with the same thread pool, the
  1929. * calls are serialized.
  1930. *
  1931. * @param threadpool the thread pool to use for parallelisation. If threadpool
  1932. * is NULL, all items are processed serially on the calling thread.
  1933. * @param functor the functor to call for each tile.
  1934. * @param range_i the number of items to process along the first dimension
  1935. * of the 4D grid.
  1936. * @param range_j the number of items to process along the second dimension
  1937. * of the 4D grid.
  1938. * @param range_k the number of items to process along the third dimension
  1939. * of the 4D grid.
  1940. * @param range_l the number of items to process along the fourth dimension
  1941. * of the 4D grid.
  1942. * @param flags a bitwise combination of zero or more optional flags
  1943. * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
  1944. */
  1945. template<class T>
  1946. inline void pthreadpool_parallelize_4d(
  1947. pthreadpool_t threadpool,
  1948. const T& functor,
  1949. size_t range_i,
  1950. size_t range_j,
  1951. size_t range_k,
  1952. size_t range_l,
  1953. uint32_t flags = 0)
  1954. {
  1955. pthreadpool_parallelize_4d(
  1956. threadpool,
  1957. &libpthreadpool::detail::call_wrapper_4d<const T>,
  1958. const_cast<void*>(static_cast<const void*>(&functor)),
  1959. range_i,
  1960. range_j,
  1961. range_k,
  1962. range_l,
  1963. flags);
  1964. }
  1965. /**
  1966. * Process items on a 4D grid with the specified maximum tile size along the
  1967. * last grid dimension.
  1968. *
  1969. * The function implements a parallel version of the following snippet:
  1970. *
  1971. * for (size_t i = 0; i < range_i; i++)
  1972. * for (size_t j = 0; j < range_j; j++)
  1973. * for (size_t k = 0; k < range_k; k++)
  1974. * for (size_t l = 0; l < range_l; l += tile_l)
  1975. * functor(i, j, k, l, min(range_l - l, tile_l));
  1976. *
  1977. * When the function returns, all items have been processed and the thread pool
  1978. * is ready for a new task.
  1979. *
  1980. * @note If multiple threads call this function with the same thread pool, the
  1981. * calls are serialized.
  1982. *
  1983. * @param threadpool the thread pool to use for parallelisation. If threadpool
  1984. * is NULL, all items are processed serially on the calling thread.
  1985. * @param functor the functor to call for each tile.
  1986. * @param range_i the number of items to process along the first dimension
  1987. * of the 4D grid.
  1988. * @param range_j the number of items to process along the second dimension
  1989. * of the 4D grid.
  1990. * @param range_k the number of items to process along the third dimension
  1991. * of the 4D grid.
  1992. * @param range_l the number of items to process along the fourth dimension
  1993. * of the 4D grid.
  1994. * @param tile_l the maximum number of items along the fourth dimension of
  1995. * the 4D grid to process in one functor call.
  1996. * @param flags a bitwise combination of zero or more optional flags
  1997. * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
  1998. */
  1999. template<class T>
  2000. inline void pthreadpool_parallelize_4d_tile_1d(
  2001. pthreadpool_t threadpool,
  2002. const T& functor,
  2003. size_t range_i,
  2004. size_t range_j,
  2005. size_t range_k,
  2006. size_t range_l,
  2007. size_t tile_l,
  2008. uint32_t flags = 0)
  2009. {
  2010. pthreadpool_parallelize_4d_tile_1d(
  2011. threadpool,
  2012. &libpthreadpool::detail::call_wrapper_4d_tile_1d<const T>,
  2013. const_cast<void*>(static_cast<const void*>(&functor)),
  2014. range_i,
  2015. range_j,
  2016. range_k,
  2017. range_l,
  2018. tile_l,
  2019. flags);
  2020. }
  2021. /**
  2022. * Process items on a 4D grid with the specified maximum tile size along the
  2023. * last two grid dimensions.
  2024. *
  2025. * The function implements a parallel version of the following snippet:
  2026. *
  2027. * for (size_t i = 0; i < range_i; i++)
  2028. * for (size_t j = 0; j < range_j; j++)
  2029. * for (size_t k = 0; k < range_k; k += tile_k)
  2030. * for (size_t l = 0; l < range_l; l += tile_l)
  2031. * functor(i, j, k, l,
  2032. * min(range_k - k, tile_k), min(range_l - l, tile_l));
  2033. *
  2034. * When the function returns, all items have been processed and the thread pool
  2035. * is ready for a new task.
  2036. *
  2037. * @note If multiple threads call this function with the same thread pool, the
  2038. * calls are serialized.
  2039. *
  2040. * @param threadpool the thread pool to use for parallelisation. If threadpool
  2041. * is NULL, all items are processed serially on the calling thread.
  2042. * @param functor the functor to call for each tile.
  2043. * @param range_i the number of items to process along the first dimension
  2044. * of the 4D grid.
  2045. * @param range_j the number of items to process along the second dimension
  2046. * of the 4D grid.
  2047. * @param range_k the number of items to process along the third dimension
  2048. * of the 4D grid.
  2049. * @param range_l the number of items to process along the fourth dimension
  2050. * of the 4D grid.
  2051. * @param tile_k the maximum number of items along the third dimension of
  2052. * the 4D grid to process in one functor call.
  2053. * @param tile_l the maximum number of items along the fourth dimension of
  2054. * the 4D grid to process in one functor call.
  2055. * @param flags a bitwise combination of zero or more optional flags
  2056. * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
  2057. */
  2058. template<class T>
  2059. inline void pthreadpool_parallelize_4d_tile_2d(
  2060. pthreadpool_t threadpool,
  2061. const T& functor,
  2062. size_t range_i,
  2063. size_t range_j,
  2064. size_t range_k,
  2065. size_t range_l,
  2066. size_t tile_k,
  2067. size_t tile_l,
  2068. uint32_t flags = 0)
  2069. {
  2070. pthreadpool_parallelize_4d_tile_2d(
  2071. threadpool,
  2072. &libpthreadpool::detail::call_wrapper_4d_tile_2d<const T>,
  2073. const_cast<void*>(static_cast<const void*>(&functor)),
  2074. range_i,
  2075. range_j,
  2076. range_k,
  2077. range_l,
  2078. tile_k,
  2079. tile_l,
  2080. flags);
  2081. }
  2082. /**
  2083. * Process items on a 5D grid.
  2084. *
  2085. * The function implements a parallel version of the following snippet:
  2086. *
  2087. * for (size_t i = 0; i < range_i; i++)
  2088. * for (size_t j = 0; j < range_j; j++)
  2089. * for (size_t k = 0; k < range_k; k++)
  2090. * for (size_t l = 0; l < range_l; l++)
  2091. * for (size_t m = 0; m < range_m; m++)
  2092. * functor(i, j, k, l, m);
  2093. *
  2094. * When the function returns, all items have been processed and the thread pool
  2095. * is ready for a new task.
  2096. *
  2097. * @note If multiple threads call this function with the same thread pool, the
  2098. * calls are serialized.
  2099. *
  2100. * @param threadpool the thread pool to use for parallelisation. If threadpool
  2101. * is NULL, all items are processed serially on the calling thread.
  2102. * @param functor the functor to call for each tile.
  2103. * @param range_i the number of items to process along the first dimension
  2104. * of the 5D grid.
  2105. * @param range_j the number of items to process along the second dimension
  2106. * of the 5D grid.
  2107. * @param range_k the number of items to process along the third dimension
  2108. * of the 5D grid.
  2109. * @param range_l the number of items to process along the fourth dimension
  2110. * of the 5D grid.
  2111. * @param range_m the number of items to process along the fifth dimension
  2112. * of the 5D grid.
  2113. * @param flags a bitwise combination of zero or more optional flags
  2114. * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
  2115. */
  2116. template<class T>
  2117. inline void pthreadpool_parallelize_5d(
  2118. pthreadpool_t threadpool,
  2119. const T& functor,
  2120. size_t range_i,
  2121. size_t range_j,
  2122. size_t range_k,
  2123. size_t range_l,
  2124. size_t range_m,
  2125. uint32_t flags = 0)
  2126. {
  2127. pthreadpool_parallelize_5d(
  2128. threadpool,
  2129. &libpthreadpool::detail::call_wrapper_5d<const T>,
  2130. const_cast<void*>(static_cast<const void*>(&functor)),
  2131. range_i,
  2132. range_j,
  2133. range_k,
  2134. range_l,
  2135. range_m,
  2136. flags);
  2137. }
  2138. /**
  2139. * Process items on a 5D grid with the specified maximum tile size along the
  2140. * last grid dimension.
  2141. *
  2142. * The function implements a parallel version of the following snippet:
  2143. *
  2144. * for (size_t i = 0; i < range_i; i++)
  2145. * for (size_t j = 0; j < range_j; j++)
  2146. * for (size_t k = 0; k < range_k; k++)
  2147. * for (size_t l = 0; l < range_l; l++)
  2148. * for (size_t m = 0; m < range_m; m += tile_m)
  2149. * functor(i, j, k, l, m, min(range_m - m, tile_m));
  2150. *
  2151. * When the function returns, all items have been processed and the thread pool
  2152. * is ready for a new task.
  2153. *
  2154. * @note If multiple threads call this function with the same thread pool, the
  2155. * calls are serialized.
  2156. *
  2157. * @param threadpool the thread pool to use for parallelisation. If threadpool
  2158. * is NULL, all items are processed serially on the calling thread.
  2159. * @param functor the functor to call for each tile.
  2160. * @param range_i the number of items to process along the first dimension
  2161. * of the 5D grid.
  2162. * @param range_j the number of items to process along the second dimension
  2163. * of the 5D grid.
  2164. * @param range_k the number of items to process along the third dimension
  2165. * of the 5D grid.
  2166. * @param range_l the number of items to process along the fourth dimension
  2167. * of the 5D grid.
  2168. * @param range_m the number of items to process along the fifth dimension
  2169. * of the 5D grid.
  2170. * @param tile_m the maximum number of items along the fifth dimension of
  2171. * the 5D grid to process in one functor call.
  2172. * @param flags a bitwise combination of zero or more optional flags
  2173. * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
  2174. */
  2175. template<class T>
  2176. inline void pthreadpool_parallelize_5d_tile_1d(
  2177. pthreadpool_t threadpool,
  2178. const T& functor,
  2179. size_t range_i,
  2180. size_t range_j,
  2181. size_t range_k,
  2182. size_t range_l,
  2183. size_t range_m,
  2184. size_t tile_m,
  2185. uint32_t flags = 0)
  2186. {
  2187. pthreadpool_parallelize_5d_tile_1d(
  2188. threadpool,
  2189. &libpthreadpool::detail::call_wrapper_5d_tile_1d<const T>,
  2190. const_cast<void*>(static_cast<const void*>(&functor)),
  2191. range_i,
  2192. range_j,
  2193. range_k,
  2194. range_l,
  2195. range_m,
  2196. tile_m,
  2197. flags);
  2198. }
  2199. /**
  2200. * Process items on a 5D grid with the specified maximum tile size along the
  2201. * last two grid dimensions.
  2202. *
  2203. * The function implements a parallel version of the following snippet:
  2204. *
  2205. * for (size_t i = 0; i < range_i; i++)
  2206. * for (size_t j = 0; j < range_j; j++)
  2207. * for (size_t k = 0; k < range_k; k++)
  2208. * for (size_t l = 0; l < range_l; l += tile_l)
  2209. * for (size_t m = 0; m < range_m; m += tile_m)
  2210. * functor(i, j, k, l, m,
  2211. * min(range_l - l, tile_l), min(range_m - m, tile_m));
  2212. *
  2213. * When the function returns, all items have been processed and the thread pool
  2214. * is ready for a new task.
  2215. *
  2216. * @note If multiple threads call this function with the same thread pool, the
  2217. * calls are serialized.
  2218. *
  2219. * @param threadpool the thread pool to use for parallelisation. If threadpool
  2220. * is NULL, all items are processed serially on the calling thread.
  2221. * @param functor the functor to call for each tile.
  2222. * @param range_i the number of items to process along the first dimension
  2223. * of the 5D grid.
  2224. * @param range_j the number of items to process along the second dimension
  2225. * of the 5D grid.
  2226. * @param range_k the number of items to process along the third dimension
  2227. * of the 5D grid.
  2228. * @param range_l the number of items to process along the fourth dimension
  2229. * of the 5D grid.
  2230. * @param range_m the number of items to process along the fifth dimension
  2231. * of the 5D grid.
  2232. * @param tile_l the maximum number of items along the fourth dimension of
  2233. * the 5D grid to process in one functor call.
  2234. * @param tile_m the maximum number of items along the fifth dimension of
  2235. * the 5D grid to process in one functor call.
  2236. * @param flags a bitwise combination of zero or more optional flags
  2237. * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
  2238. */
  2239. template<class T>
  2240. inline void pthreadpool_parallelize_5d_tile_2d(
  2241. pthreadpool_t threadpool,
  2242. const T& functor,
  2243. size_t range_i,
  2244. size_t range_j,
  2245. size_t range_k,
  2246. size_t range_l,
  2247. size_t range_m,
  2248. size_t tile_l,
  2249. size_t tile_m,
  2250. uint32_t flags = 0)
  2251. {
  2252. pthreadpool_parallelize_5d_tile_2d(
  2253. threadpool,
  2254. &libpthreadpool::detail::call_wrapper_5d_tile_2d<const T>,
  2255. const_cast<void*>(static_cast<const void*>(&functor)),
  2256. range_i,
  2257. range_j,
  2258. range_k,
  2259. range_l,
  2260. range_m,
  2261. tile_l,
  2262. tile_m,
  2263. flags);
  2264. }
  2265. /**
  2266. * Process items on a 6D grid.
  2267. *
  2268. * The function implements a parallel version of the following snippet:
  2269. *
  2270. * for (size_t i = 0; i < range_i; i++)
  2271. * for (size_t j = 0; j < range_j; j++)
  2272. * for (size_t k = 0; k < range_k; k++)
  2273. * for (size_t l = 0; l < range_l; l++)
  2274. * for (size_t m = 0; m < range_m; m++)
  2275. * for (size_t n = 0; n < range_n; n++)
  2276. * functor(i, j, k, l, m, n);
  2277. *
  2278. * When the function returns, all items have been processed and the thread pool
  2279. * is ready for a new task.
  2280. *
  2281. * @note If multiple threads call this function with the same thread pool, the
  2282. * calls are serialized.
  2283. *
  2284. * @param threadpool the thread pool to use for parallelisation. If threadpool
  2285. * is NULL, all items are processed serially on the calling thread.
  2286. * @param functor the functor to call for each tile.
  2287. * @param range_i the number of items to process along the first dimension
  2288. * of the 6D grid.
  2289. * @param range_j the number of items to process along the second dimension
  2290. * of the 6D grid.
  2291. * @param range_k the number of items to process along the third dimension
  2292. * of the 6D grid.
  2293. * @param range_l the number of items to process along the fourth dimension
  2294. * of the 6D grid.
  2295. * @param range_m the number of items to process along the fifth dimension
  2296. * of the 6D grid.
  2297. * @param range_n the number of items to process along the sixth dimension
  2298. * of the 6D grid.
  2299. * @param tile_n the maximum number of items along the sixth dimension of
  2300. * the 6D grid to process in one functor call.
  2301. * @param flags a bitwise combination of zero or more optional flags
  2302. * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
  2303. */
  2304. template<class T>
  2305. inline void pthreadpool_parallelize_6d(
  2306. pthreadpool_t threadpool,
  2307. const T& functor,
  2308. size_t range_i,
  2309. size_t range_j,
  2310. size_t range_k,
  2311. size_t range_l,
  2312. size_t range_m,
  2313. size_t range_n,
  2314. uint32_t flags = 0)
  2315. {
  2316. pthreadpool_parallelize_6d(
  2317. threadpool,
  2318. &libpthreadpool::detail::call_wrapper_6d<const T>,
  2319. const_cast<void*>(static_cast<const void*>(&functor)),
  2320. range_i,
  2321. range_j,
  2322. range_k,
  2323. range_l,
  2324. range_m,
  2325. range_n,
  2326. flags);
  2327. }
  2328. /**
  2329. * Process items on a 6D grid with the specified maximum tile size along the
  2330. * last grid dimension.
  2331. *
  2332. * The function implements a parallel version of the following snippet:
  2333. *
  2334. * for (size_t i = 0; i < range_i; i++)
  2335. * for (size_t j = 0; j < range_j; j++)
  2336. * for (size_t k = 0; k < range_k; k++)
  2337. * for (size_t l = 0; l < range_l; l++)
  2338. * for (size_t m = 0; m < range_m; m++)
  2339. * for (size_t n = 0; n < range_n; n += tile_n)
  2340. * functor(i, j, k, l, m, n, min(range_n - n, tile_n));
  2341. *
  2342. * When the function returns, all items have been processed and the thread pool
  2343. * is ready for a new task.
  2344. *
  2345. * @note If multiple threads call this function with the same thread pool, the
  2346. * calls are serialized.
  2347. *
  2348. * @param threadpool the thread pool to use for parallelisation. If threadpool
  2349. * is NULL, all items are processed serially on the calling thread.
  2350. * @param functor the functor to call for each tile.
  2351. * @param range_i the number of items to process along the first dimension
  2352. * of the 6D grid.
  2353. * @param range_j the number of items to process along the second dimension
  2354. * of the 6D grid.
  2355. * @param range_k the number of items to process along the third dimension
  2356. * of the 6D grid.
  2357. * @param range_l the number of items to process along the fourth dimension
  2358. * of the 6D grid.
  2359. * @param range_m the number of items to process along the fifth dimension
  2360. * of the 6D grid.
  2361. * @param range_n the number of items to process along the sixth dimension
  2362. * of the 6D grid.
  2363. * @param tile_n the maximum number of items along the sixth dimension of
  2364. * the 6D grid to process in one functor call.
  2365. * @param flags a bitwise combination of zero or more optional flags
  2366. * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
  2367. */
  2368. template<class T>
  2369. inline void pthreadpool_parallelize_6d_tile_1d(
  2370. pthreadpool_t threadpool,
  2371. const T& functor,
  2372. size_t range_i,
  2373. size_t range_j,
  2374. size_t range_k,
  2375. size_t range_l,
  2376. size_t range_m,
  2377. size_t range_n,
  2378. size_t tile_n,
  2379. uint32_t flags = 0)
  2380. {
  2381. pthreadpool_parallelize_6d_tile_1d(
  2382. threadpool,
  2383. &libpthreadpool::detail::call_wrapper_6d_tile_1d<const T>,
  2384. const_cast<void*>(static_cast<const void*>(&functor)),
  2385. range_i,
  2386. range_j,
  2387. range_k,
  2388. range_l,
  2389. range_m,
  2390. range_n,
  2391. tile_n,
  2392. flags);
  2393. }
  2394. /**
  2395. * Process items on a 6D grid with the specified maximum tile size along the
  2396. * last two grid dimensions.
  2397. *
  2398. * The function implements a parallel version of the following snippet:
  2399. *
  2400. * for (size_t i = 0; i < range_i; i++)
  2401. * for (size_t j = 0; j < range_j; j++)
  2402. * for (size_t k = 0; k < range_k; k++)
  2403. * for (size_t l = 0; l < range_l; l++)
  2404. * for (size_t m = 0; m < range_m; m += tile_m)
  2405. * for (size_t n = 0; n < range_n; n += tile_n)
  2406. * functor(i, j, k, l, m, n,
  2407. * min(range_m - m, tile_m), min(range_n - n, tile_n));
  2408. *
  2409. * When the function returns, all items have been processed and the thread pool
  2410. * is ready for a new task.
  2411. *
  2412. * @note If multiple threads call this function with the same thread pool, the
  2413. * calls are serialized.
  2414. *
  2415. * @param threadpool the thread pool to use for parallelisation. If threadpool
  2416. * is NULL, all items are processed serially on the calling thread.
  2417. * @param functor the functor to call for each tile.
  2418. * @param range_i the number of items to process along the first dimension
  2419. * of the 6D grid.
  2420. * @param range_j the number of items to process along the second dimension
  2421. * of the 6D grid.
  2422. * @param range_k the number of items to process along the third dimension
  2423. * of the 6D grid.
  2424. * @param range_l the number of items to process along the fourth dimension
  2425. * of the 6D grid.
  2426. * @param range_m the number of items to process along the fifth dimension
  2427. * of the 6D grid.
  2428. * @param range_n the number of items to process along the sixth dimension
  2429. * of the 6D grid.
  2430. * @param tile_m the maximum number of items along the fifth dimension of
  2431. * the 6D grid to process in one functor call.
  2432. * @param tile_n the maximum number of items along the sixth dimension of
  2433. * the 6D grid to process in one functor call.
  2434. * @param flags a bitwise combination of zero or more optional flags
  2435. * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
  2436. */
  2437. template<class T>
  2438. inline void pthreadpool_parallelize_6d_tile_2d(
  2439. pthreadpool_t threadpool,
  2440. const T& functor,
  2441. size_t range_i,
  2442. size_t range_j,
  2443. size_t range_k,
  2444. size_t range_l,
  2445. size_t range_m,
  2446. size_t range_n,
  2447. size_t tile_m,
  2448. size_t tile_n,
  2449. uint32_t flags = 0)
  2450. {
  2451. pthreadpool_parallelize_6d_tile_2d(
  2452. threadpool,
  2453. &libpthreadpool::detail::call_wrapper_6d_tile_2d<const T>,
  2454. const_cast<void*>(static_cast<const void*>(&functor)),
  2455. range_i,
  2456. range_j,
  2457. range_k,
  2458. range_l,
  2459. range_m,
  2460. range_n,
  2461. tile_m,
  2462. tile_n,
  2463. flags);
  2464. }
  2465. #endif /* __cplusplus */
  2466. #endif /* PTHREADPOOL_H_ */
  2467. #else
  2468. #error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
  2469. #endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)