Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
213 changes: 43 additions & 170 deletions paddle/phi/api/include/compat/ATen/core/TensorAccessor.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,203 +18,76 @@

#pragma once

#include <torch/headeronly/core/TensorAccessor.h>

#include <c10/macros/Macros.h>
#include <c10/util/ArrayRef.h>
#include <c10/util/Exception.h>

#include <cstddef>
#include <cstdint>
#include <type_traits>

namespace at {
template <typename T>
struct DefaultPtrTraits {
typedef T* PtrType;
};

using torch::headeronly::DefaultPtrTraits;
#if defined(__CUDACC__) || defined(__HIPCC__)
using torch::headeronly::RestrictPtrTraits;
#endif

template <typename T,
size_t N,
template <typename U> class PtrTraits = DefaultPtrTraits,
typename index_t = int64_t>
class TensorAccessorBase {
public:
typedef typename PtrTraits<T>::PtrType PtrType;

C10_HOST_DEVICE TensorAccessorBase(PtrType data_,
const index_t* sizes_,
const index_t* strides_) // NOLINT
: data_(data_), sizes_(sizes_), strides_(strides_) {} // NOLINT
C10_HOST IntArrayRef sizes() const { return IntArrayRef(sizes_, N); }
C10_HOST IntArrayRef strides() const { return IntArrayRef(strides_, N); }
C10_HOST_DEVICE index_t stride(index_t i) const { return strides_[i]; }
C10_HOST_DEVICE index_t size(index_t i) const { return sizes_[i]; }
C10_HOST_DEVICE PtrType data() { return data_; }
C10_HOST_DEVICE const PtrType data() const { return data_; }

protected:
PtrType data_;
const index_t* sizes_;
const index_t* strides_;
};
using TensorAccessorBase = torch::headeronly::detail::
TensorAccessorBase<c10::IntArrayRef, T, N, PtrTraits, index_t>;

// The `TensorAccessor` is typically instantiated for CPU `Tensor`s using
// `Tensor.accessor<T, N>()`.
// For CUDA `Tensor`s, `GenericPackedTensorAccessor` is used on the host and
// only indexing on the device uses `TensorAccessor`s.
template <typename T,
size_t N,
template <typename U> class PtrTraits = DefaultPtrTraits,
typename index_t = int64_t>
class TensorAccessor : public TensorAccessorBase<T, N, PtrTraits, index_t> {
public:
typedef typename PtrTraits<T>::PtrType PtrType;

C10_HOST_DEVICE TensorAccessor(PtrType data_,
const index_t* sizes_,
const index_t* strides_)
: TensorAccessorBase<T, N, PtrTraits, index_t>(data_, sizes_, strides_) {}

C10_HOST_DEVICE TensorAccessor<T, N - 1, PtrTraits, index_t> operator[](
index_t i) {
return TensorAccessor<T, N - 1, PtrTraits, index_t>(
this->data_ + this->strides_[0] * i,
this->sizes_ + 1,
this->strides_ + 1);
}

C10_HOST_DEVICE const TensorAccessor<T, N - 1, PtrTraits, index_t> operator[](
index_t i) const {
return TensorAccessor<T, N - 1, PtrTraits, index_t>(
this->data_ + this->strides_[0] * i,
this->sizes_ + 1,
this->strides_ + 1);
using TensorAccessor = torch::headeronly::detail::
TensorAccessor<c10::IntArrayRef, T, N, PtrTraits, index_t>;

namespace detail {

template <size_t N, typename index_t>
struct IndexBoundsCheck {
explicit IndexBoundsCheck(index_t i) {
TORCH_CHECK(0 <= i && i < index_t{N},
"Index ",
i,
" is not within bounds of a tensor of dimension ",
N);
}
};

template <typename T, template <typename U> class PtrTraits, typename index_t>
class TensorAccessor<T, 1, PtrTraits, index_t>
: public TensorAccessorBase<T, 1, PtrTraits, index_t> {
public:
typedef typename PtrTraits<T>::PtrType PtrType;

C10_HOST_DEVICE TensorAccessor(PtrType data_,
const index_t* sizes_,
const index_t* strides_)
: TensorAccessorBase<T, 1, PtrTraits, index_t>(data_, sizes_, strides_) {}
C10_HOST_DEVICE T& operator[](index_t i) {
return this->data_[this->strides_[0] * i];
}
C10_HOST_DEVICE const T& operator[](index_t i) const {
return this->data_[this->strides_[0] * i];
}
};
} // namespace detail

// GenericPackedTensorAccessorBase and GenericPackedTensorAccessor are used
// for CUDA `Tensor`s on the host. In contrast to `TensorAccessor`s, they
// copy the strides and sizes on instantiation (on the host) in order to
// transfer them to the device when calling kernels. On the device, indexing of
// multidimensional tensors delegates to `TensorAccessor`s. Use RestrictPtrTraits as
// PtrTraits if you want the tensor's data pointer to be marked as __restrict__.
// Instantiation from data, sizes, strides is only needed on the host and
// std::copy isn't available on the device, so those functions are host only.
template <typename T,
size_t N,
template <typename U> class PtrTraits = DefaultPtrTraits,
typename index_t = int64_t>
class GenericPackedTensorAccessorBase {
public:
typedef typename PtrTraits<T>::PtrType PtrType;
C10_HOST GenericPackedTensorAccessorBase(PtrType data_arg,
const index_t* sizes_arg,
const index_t* strides_arg)
: data_(data_arg) {
std::copy(sizes_arg, sizes_arg + N, std::begin(this->sizes_));
std::copy(strides_arg, strides_arg + N, std::begin(this->strides_));
}

// if index_t is not int64_t, we want to have an int64_t constructor
template <typename source_index_t,
class = std::enable_if_t<std::is_same_v<source_index_t, int64_t>>>
C10_HOST GenericPackedTensorAccessorBase(PtrType data_arg,
const source_index_t* sizes_arg,
const source_index_t* strides_arg)
: data_(data_arg) {
for (size_t i = 0; i < N; ++i) {
this->sizes_[i] = sizes_arg[i];
this->strides_[i] = strides_arg[i];
}
}

C10_HOST_DEVICE index_t stride(index_t i) const { return strides_[i]; }
C10_HOST_DEVICE index_t size(index_t i) const { return sizes_[i]; }
C10_HOST_DEVICE PtrType data() { return data_; }
C10_HOST_DEVICE const PtrType data() const { return data_; }

protected:
PtrType data_;
// NOLINTNEXTLINE
index_t sizes_[N];
// NOLINTNEXTLINE
index_t strides_[N];
C10_HOST void bounds_check_(index_t i) const {
TORCH_CHECK_INDEX(0 <= i && i < index_t{N},
"Index ",
i,
" is not within bounds of a tensor of dimension ",
N);
}
};
using GenericPackedTensorAccessorBase =
torch::headeronly::detail::GenericPackedTensorAccessorBase<
detail::IndexBoundsCheck<N, index_t>,
T,
N,
PtrTraits,
index_t>;

template <typename T,
size_t N,
template <typename U> class PtrTraits = DefaultPtrTraits,
typename index_t = int64_t>
class GenericPackedTensorAccessor
: public GenericPackedTensorAccessorBase<T, N, PtrTraits, index_t> {
public:
typedef typename PtrTraits<T>::PtrType PtrType;

C10_HOST GenericPackedTensorAccessor(PtrType data_,
const index_t* sizes_,
const index_t* strides_)
: GenericPackedTensorAccessorBase<T, N, PtrTraits, index_t>(
data_, sizes_, strides_) {}

// if index_t is not int64_t, we want to have an int64_t constructor
template <typename source_index_t,
class = std::enable_if_t<std::is_same_v<source_index_t, int64_t>>>
C10_HOST GenericPackedTensorAccessor(PtrType data_,
const source_index_t* sizes_,
const source_index_t* strides_)
: GenericPackedTensorAccessorBase<T, N, PtrTraits, index_t>(
data_, sizes_, strides_) {}

C10_DEVICE TensorAccessor<T, N - 1, PtrTraits, index_t> operator[](
index_t i) {
index_t* new_sizes = this->sizes_ + 1;
index_t* new_strides = this->strides_ + 1;
return TensorAccessor<T, N - 1, PtrTraits, index_t>(
this->data_ + this->strides_[0] * i, new_sizes, new_strides);
}

C10_DEVICE const TensorAccessor<T, N - 1, PtrTraits, index_t> operator[](
index_t i) const {
const index_t* new_sizes = this->sizes_ + 1;
const index_t* new_strides = this->strides_ + 1;
return TensorAccessor<T, N - 1, PtrTraits, index_t>(
this->data_ + this->strides_[0] * i, new_sizes, new_strides);
}

/// Returns a PackedTensorAccessor of the same dimension after transposing the
/// two dimensions given. Does not actually move elements; transposition is
/// made by permuting the size/stride arrays. If the dimensions are not valid,
/// asserts.
C10_HOST GenericPackedTensorAccessor<T, N, PtrTraits, index_t> transpose(
index_t dim1, index_t dim2) const {
this->bounds_check_(dim1);
this->bounds_check_(dim2);
GenericPackedTensorAccessor<T, N, PtrTraits, index_t> result(
this->data_, this->sizes_, this->strides_);
std::swap(result.strides_[dim1], result.strides_[dim2]);
std::swap(result.sizes_[dim1], result.sizes_[dim2]);
return result;
}
};
using GenericPackedTensorAccessor =
torch::headeronly::detail::GenericPackedTensorAccessor<
TensorAccessor<T, N - 1, PtrTraits, index_t>,
detail::IndexBoundsCheck<N, index_t>,
T,
N,
PtrTraits,
index_t>;

template <typename T,
size_t N,
Expand Down
51 changes: 4 additions & 47 deletions paddle/phi/api/include/compat/c10/core/DeviceType.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,30 +14,16 @@

#pragma once

#include <cstdint>
#include <functional>
// If you modified DeviceType in this file, please also sync your changes into
// torch/headeronly/core/DeviceType.h.
#include <torch/headeronly/core/DeviceType.h>

Comment on lines +17 to +20
Copy link

Copilot AI Apr 13, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

c10/core/DeviceType.h previously re-exported c10::{DeviceType,kCUDA,kCPU,...} into namespace torch, and this PR removes those aliases. torch/all.h brings in c10/core/DeviceType.h via c10/core/Device.h, and code/tests (e.g. test/cpp/compat/ATen_basic_test.cc) use torch::kCUDA; this will stop compiling. Please restore the namespace torch { using c10::DeviceType; using c10::kCUDA; ... } re-exports (either here or in the included headeronly DeviceType header) to maintain the existing API surface.

Copilot uses AI. Check for mistakes.
#include <ostream>

#include "paddle/phi/common/place.h"

namespace c10 {

enum class DeviceType : int8_t {
CPU = 0,
CUDA = 1,
XPU = 12,
IPU = 18,
CUSTOM = 20,
PrivateUse1 = CUSTOM,
};

constexpr DeviceType kCUDA = DeviceType::CUDA;
constexpr DeviceType kCPU = DeviceType::CPU;
constexpr DeviceType kCUSTOM = DeviceType::CUSTOM;
constexpr DeviceType kXPU = DeviceType::XPU;
constexpr DeviceType kIPU = DeviceType::IPU;
constexpr DeviceType kPrivateUse1 = DeviceType::PrivateUse1;

inline phi::AllocationType DeviceTypeToPhi(DeviceType d) {
switch (d) {
case DeviceType::CPU:
Expand Down Expand Up @@ -106,32 +92,3 @@ inline std::ostream& operator<<(std::ostream& os, DeviceType d) {
}

} // namespace c10

namespace std {
template <>
struct hash<c10::DeviceType> {
std::size_t operator()(c10::DeviceType k) const noexcept {
return std::hash<int>()(static_cast<int>(k));
}
};
} // namespace std

namespace at {
using c10::DeviceType;
using c10::kCPU;
using c10::kCUDA;
using c10::kCUSTOM;
using c10::kIPU;
using c10::kPrivateUse1;
using c10::kXPU;
} // namespace at

namespace torch {
using c10::DeviceType;
using c10::kCPU;
using c10::kCUDA;
using c10::kCUSTOM;
using c10::kIPU;
using c10::kPrivateUse1;
using c10::kXPU;
} // namespace torch
Loading
Loading