yichael
/
xhs-note-crawling


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296
							#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
#pragma once

#include <cuda_runtime_api.h>

#include <c10/core/DeviceGuard.h>
#include <c10/core/Stream.h>
#include <c10/cuda/CUDAFunctions.h>
#include <c10/util/Exception.h>

/*
 * Stream pool note.
 *
 * A CUDAStream is an abstraction of an actual cuStream on the GPU. CUDAStreams
 * are backed by cuStreams, but they use several pools to minimize the costs
 * associated with creating, retaining, and destroying cuStreams.
 *
 * There are three pools per device, and a device's pools are lazily created.
 *
 * The first pool contains only the default stream. When the default stream
 * is requested it's returned.
 *
 * The second pool is the "low priority" or "default priority" streams. In
 * HIP builds there is no distinction between streams in this pool and streams
 * in the third pool (below). There are 32 of these streams per device, and
 * when a stream is requested one of these streams is returned round-robin.
 * That is, the first stream requested is at index 0, the second at index 1...
 * to index 31, then index 0 again.
 *
 * This means that if 33 low priority streams are requested, the first and
 * last streams requested are actually the same stream (under the covers)
 * and kernels enqueued on them cannot run concurrently.
 *
 * The third pool is the "high priority" streams. The third pool acts like
 * the second pool except the streams are created with a higher priority.
 *
 * These pools suggest that stream users should prefer many short-lived streams,
 * as the cost of acquiring and releasing streams is effectively zero. If
 * many longer-lived streams are required in performance critical scenarios
 * then the functionality here may need to be extended to allow, for example,
 * "reserving" a subset of the pool so that other streams do not accidentally
 * overlap the performance critical streams.
 *
 * Note: although the notion of "current stream for device" is thread local
 * (every OS thread has a separate current stream, as one might expect),
 * the stream pool is global across all threads; stream 0 is always stream 0
 * no matter which thread you use it on.  Multiple threads can synchronize
 * on the same stream.  Although the CUDA documentation is not very clear
 * on the matter, streams are thread safe; e.g., it is safe to enqueue
 * a kernel on the same stream from two different threads.
 */

namespace c10::cuda {

static constexpr int max_compile_time_stream_priorities = 4;

// Value object representing a CUDA stream.  This is just a wrapper
// around c10::Stream, but it comes with a little extra CUDA-specific
// functionality (conversion to cudaStream_t), and a guarantee that
// the wrapped c10::Stream really is a CUDA stream.
class C10_CUDA_API CUDAStream {
 public:
  enum Unchecked { UNCHECKED };

  /// Construct a CUDAStream from a Stream.  This construction is checked,
  /// and will raise an error if the Stream is not, in fact, a CUDA stream.
  explicit CUDAStream(Stream stream) : stream_(stream) {
    TORCH_CHECK(stream_.device_type() == DeviceType::CUDA);
  }

  /// Construct a CUDAStream from a Stream with no error checking.
  /// This constructor uses the "named" constructor idiom, and can
  /// be invoked as: CUDAStream(CUDAStream::UNCHECKED, stream)
  explicit CUDAStream(Unchecked /*unused*/, Stream stream) : stream_(stream) {}

  bool operator==(const CUDAStream& other) const noexcept {
    return unwrap() == other.unwrap();
  }

  bool operator!=(const CUDAStream& other) const noexcept {
    return unwrap() != other.unwrap();
  }

  /// Implicit conversion to cudaStream_t.
  operator cudaStream_t() const {
    return stream();
  }

  /// Implicit conversion to Stream (a.k.a., forget that the stream is a
  /// CUDA stream).
  operator Stream() const {
    return unwrap();
  }

  /// Used to avoid baking in device type explicitly to Python-side API.
  DeviceType device_type() const {
    return DeviceType::CUDA;
  }

  /// Get the CUDA device index that this stream is associated with.
  DeviceIndex device_index() const {
    return stream_.device_index();
  }

  /// Get the full Device that this stream is associated with.  The Device
  /// is guaranteed to be a CUDA device.
  Device device() const {
    return Device(DeviceType::CUDA, device_index());
  }

  /// Return the stream ID corresponding to this particular stream.
  StreamId id() const {
    return stream_.id();
  }

  bool query() const;

  void synchronize() const;

  int priority() const {
    DeviceGuard guard{stream_.device()};
    int priority = 0;
    C10_CUDA_CHECK(cudaStreamGetPriority(stream(), &priority));
    return priority;
  }

  /// Explicit conversion to cudaStream_t.
  cudaStream_t stream() const;

  /// Explicit conversion to Stream.
  Stream unwrap() const {
    return stream_;
  }

  /// Reversibly pack a CUDAStream into a struct representation.
  /// Previously the stream's data was packed into a single int64_t,
  /// as it was assumed the fields would not require more than
  /// 64 bits of storage in total.
  /// See https://github.com/pytorch/pytorch/issues/75854
  /// for more information regarding newer platforms that may violate
  /// this assumption.
  ///
  /// The CUDAStream can be unpacked using unpack().
  struct c10::StreamData3 pack3() const {
    return stream_.pack3();
  }

  // Unpack a CUDAStream from the 3 fields generated by pack().
  static CUDAStream unpack3(
      StreamId stream_id,
      DeviceIndex device_index,
      DeviceType device_type) {
    return CUDAStream(Stream::unpack3(stream_id, device_index, device_type));
  }

  static std::tuple<int, int> priority_range() {
    // Note: this returns the range of priority **supported by PyTorch**, not
    // the range of priority **supported by CUDA**. The former is a subset of
    // the latter.
    int least_priority = 0, greatest_priority = 0;
    C10_CUDA_CHECK(
        cudaDeviceGetStreamPriorityRange(&least_priority, &greatest_priority));
#ifdef USE_ROCM
    // See Note [HIP stream priorities]
    TORCH_INTERNAL_ASSERT(
        least_priority == 1, "Unexpected HIP stream priority range");
    least_priority = 0;
#else
    TORCH_INTERNAL_ASSERT(
        least_priority == 0, "Unexpected CUDA stream priority range");
#endif
    TORCH_INTERNAL_ASSERT(
        greatest_priority <= -1, "Unexpected CUDA stream priority range");
    greatest_priority = std::max(
        -c10::cuda::max_compile_time_stream_priorities + 1, greatest_priority);
    return std::make_tuple(least_priority, greatest_priority);
  }

  // Deleted for now; use CUDAEvent::block instead
  // void synchronize_with(const CUDAEvent& event) const;

 private:
  Stream stream_;
};

/**
 * Get a new stream from the CUDA stream pool.  You can think of this
 * as "creating" a new stream, but no such creation actually happens;
 * instead, streams are preallocated from the pool and returned in a
 * round-robin fashion.
 *
 * You can request a stream from the high priority pool by setting
 * isHighPriority to true, or a stream for a specific device by setting device
 * (defaulting to the current CUDA stream.)
 */
C10_API CUDAStream
getStreamFromPool(const bool isHighPriority = false, DeviceIndex device = -1);
// no default priority to disambiguate overloads
C10_API CUDAStream
getStreamFromPool(const int priority, DeviceIndex device = -1);

/**
 * Get a CUDAStream from a externally allocated one.
 *
 * This is mainly for interoperability with different libraries where we
 * want to operate on a non-torch allocated stream for data exchange or similar
 * purposes
 */
C10_API CUDAStream
getStreamFromExternal(cudaStream_t ext_stream, DeviceIndex device_index);

/**
 * Get the default CUDA stream, for the passed CUDA device, or for the
 * current device if no device index is passed.  The default stream is
 * where most computation occurs when you aren't explicitly using
 * streams.
 */
C10_API CUDAStream getDefaultCUDAStream(DeviceIndex device_index = -1);

/**
 * Get the current CUDA stream, for the passed CUDA device, or for the
 * current device if no device index is passed.  The current CUDA stream
 * will usually be the default CUDA stream for the device, but it may
 * be different if someone called 'setCurrentCUDAStream' or used 'StreamGuard'
 * or 'CUDAStreamGuard'.
 */
C10_API CUDAStream getCurrentCUDAStream(DeviceIndex device_index = -1);

/**
 * Set the current stream on the device of the passed in stream to be
 * the passed in stream.  Yes, you read that right: this function
 * has *nothing* to do with the current device: it toggles the current
 * stream of the device of the passed stream.
 *
 * Confused?  Avoid using this function; prefer using 'CUDAStreamGuard' instead
 * (which will switch both your current device and current stream in the way you
 * expect, and reset it back to its original state afterwards).
 */
C10_API void setCurrentCUDAStream(CUDAStream stream);

C10_API std::ostream& operator<<(std::ostream& stream, const CUDAStream& s);

} // namespace c10::cuda

// hipify v2 backward compat in external projects
#ifdef USE_ROCM
namespace c10::hip {
using c10::cuda::getStreamFromExternal;
using c10::cuda::getStreamFromPool;
// must use inline wrappers instead of reference aliases due to default args
inline c10::cuda::CUDAStream getDefaultHIPStream(
    DeviceIndex device_index = -1) {
  return c10::cuda::getDefaultCUDAStream(device_index);
}
inline c10::cuda::CUDAStream getCurrentHIPStream(
    DeviceIndex device_index = -1) {
  return c10::cuda::getCurrentCUDAStream(device_index);
}
inline auto& setCurrentHIPStream = c10::cuda::setCurrentCUDAStream;
inline c10::cuda::CUDAStream getStreamFromPoolMasqueradingAsCUDA(
    const bool isHighPriority = false,
    DeviceIndex device = -1) {
  return c10::cuda::getStreamFromPool(isHighPriority, device);
}
inline c10::cuda::CUDAStream getStreamFromPoolMasqueradingAsCUDA(
    const int priority,
    DeviceIndex device = -1) {
  return c10::cuda::getStreamFromPool(priority, device);
}
inline auto& getStreamFromExternalMasqueradingAsCUDA =
    c10::cuda::getStreamFromExternal;
inline c10::cuda::CUDAStream getDefaultHIPStreamMasqueradingAsCUDA(
    DeviceIndex device_index = -1) {
  return c10::cuda::getDefaultCUDAStream(device_index);
}
inline c10::cuda::CUDAStream getCurrentHIPStreamMasqueradingAsCUDA(
    DeviceIndex device_index = -1) {
  return c10::cuda::getCurrentCUDAStream(device_index);
}
inline auto& setCurrentHIPStreamMasqueradingAsCUDA =
    c10::cuda::setCurrentCUDAStream;
} // namespace c10::hip
#endif

namespace std {
template <>
struct hash<c10::cuda::CUDAStream> {
  size_t operator()(c10::cuda::CUDAStream s) const noexcept {
    return std::hash<c10::Stream>{}(s.unwrap());
  }
};
} // namespace std

#else
#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)