XPUEvent.h 5.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183
  1. #if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
  2. #pragma once
  3. #include <c10/xpu/XPUStream.h>
  4. namespace c10::xpu {
  5. /*
  6. * XPUEvent are movable not copyable wrappers around SYCL event. XPUEvent are
  7. * constructed lazily when first recorded. It has a device, and this device is
  8. * acquired from the first recording stream. Later streams that record the event
  9. * must match the same device.
  10. *
  11. * Currently, XPUEvent does NOT support to export an inter-process event from
  12. * another process via inter-process communication(IPC). So it means that
  13. * inter-process communication for event handles between different processes is
  14. * not available. This could impact some applications that rely on cross-process
  15. * synchronization and communication.
  16. */
  17. struct XPUEvent {
  18. // Constructors
  19. XPUEvent(bool enable_timing = false) noexcept
  20. : enable_timing_{enable_timing} {}
  21. ~XPUEvent() {
  22. if (isCreated()) {
  23. const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
  24. if (C10_UNLIKELY(interp)) {
  25. (*interp)->trace_gpu_event_deletion(
  26. c10::kXPU, reinterpret_cast<uintptr_t>(event_.get()));
  27. }
  28. }
  29. }
  30. C10_DISABLE_COPY_AND_ASSIGN(XPUEvent);
  31. XPUEvent(XPUEvent&& other) = default;
  32. XPUEvent& operator=(XPUEvent&& other) = default;
  33. operator sycl::event&() const {
  34. return event();
  35. }
  36. std::optional<c10::Device> device() const {
  37. if (isCreated()) {
  38. return c10::Device(c10::kXPU, device_index_);
  39. } else {
  40. return std::nullopt;
  41. }
  42. }
  43. inline bool isCreated() const {
  44. return (event_.get() != nullptr);
  45. }
  46. DeviceIndex device_index() const {
  47. return device_index_;
  48. }
  49. sycl::event& event() const {
  50. return *event_;
  51. }
  52. bool query() const {
  53. using namespace sycl::info;
  54. if (!isCreated()) {
  55. return true;
  56. }
  57. return event().get_info<event::command_execution_status>() ==
  58. event_command_status::complete;
  59. }
  60. void record() {
  61. record(getCurrentXPUStream());
  62. }
  63. void recordOnce(const XPUStream& stream) {
  64. if (!isCreated()) {
  65. record(stream);
  66. }
  67. }
  68. void record(const XPUStream& stream) {
  69. if (!isCreated()) {
  70. device_index_ = stream.device_index();
  71. assignEvent(stream.queue());
  72. const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
  73. if (C10_UNLIKELY(interp)) {
  74. (*interp)->trace_gpu_event_creation(
  75. c10::kXPU, reinterpret_cast<uintptr_t>(event_.get()));
  76. }
  77. } else {
  78. TORCH_CHECK(
  79. device_index_ == stream.device_index(),
  80. "Event device ",
  81. device_index_,
  82. " does not match recording stream's device ",
  83. stream.device_index(),
  84. ".");
  85. reassignEvent(stream.queue());
  86. }
  87. const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
  88. if (C10_UNLIKELY(interp)) {
  89. (*interp)->trace_gpu_event_record(
  90. c10::kXPU,
  91. reinterpret_cast<uintptr_t>(event_.get()),
  92. reinterpret_cast<uintptr_t>(&stream.queue()));
  93. }
  94. }
  95. void block(const XPUStream& stream) {
  96. if (isCreated()) {
  97. std::vector<sycl::event> event_list{event()};
  98. // Make this stream wait until event_ is completed.
  99. stream.queue().ext_oneapi_submit_barrier(event_list);
  100. const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
  101. if (C10_UNLIKELY(interp)) {
  102. (*interp)->trace_gpu_event_wait(
  103. c10::kXPU,
  104. reinterpret_cast<uintptr_t>(event_.get()),
  105. reinterpret_cast<uintptr_t>(&stream.queue()));
  106. }
  107. }
  108. }
  109. double elapsed_time(const XPUEvent& other) const {
  110. TORCH_CHECK(
  111. isCreated() && other.isCreated(),
  112. "Both events must be recorded before calculating elapsed time.");
  113. TORCH_CHECK(
  114. query() && other.query(),
  115. "Both events must be completed before calculating elapsed time.");
  116. TORCH_CHECK(
  117. enable_timing_ && other.enable_timing_,
  118. "Both events must be created with argument 'enable_timing=True'.");
  119. using namespace sycl::info::event_profiling;
  120. // Block until both of the recorded events are completed.
  121. uint64_t end_time_ns = other.event().get_profiling_info<command_end>();
  122. uint64_t start_time_ns = event().get_profiling_info<command_end>();
  123. // Return the eplased time in milliseconds.
  124. return 1e-6 *
  125. (static_cast<double>(end_time_ns) - static_cast<double>(start_time_ns));
  126. }
  127. void synchronize() const {
  128. if (isCreated()) {
  129. const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
  130. if (C10_UNLIKELY(interp)) {
  131. (*interp)->trace_gpu_event_synchronization(
  132. c10::kXPU, reinterpret_cast<uintptr_t>(event_.get()));
  133. }
  134. event().wait_and_throw();
  135. }
  136. }
  137. private:
  138. void assignEvent(sycl::queue& queue) {
  139. if (enable_timing_) {
  140. event_ = std::make_unique<sycl::event>(
  141. sycl::ext::oneapi::experimental::submit_profiling_tag(queue));
  142. } else {
  143. event_ = std::make_unique<sycl::event>(queue.ext_oneapi_submit_barrier());
  144. }
  145. }
  146. void reassignEvent(sycl::queue& queue) {
  147. event_.reset();
  148. assignEvent(queue);
  149. }
  150. bool enable_timing_ = false;
  151. c10::DeviceIndex device_index_ = -1;
  152. // Only need to track the last event, as events in an in-order queue are
  153. // executed sequentially.
  154. std::unique_ptr<sycl::event> event_;
  155. };
  156. } // namespace c10::xpu
  157. #else
  158. #error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
  159. #endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)