Skip to content

Commit d8baca4

Browse files
committed
Remove ExecuTorch copy of Vectorized
Pull Request resolved: #7042

All uses of Vectorized are outside the ExecuTorch core, so we can use ATen's Vectorized directly and remove the ExecuTorch copy.

ghstack-source-id: 263918323
@exported-using-ghexport
Differential Revision: [D66396016](https://our.internmc.facebook.com/intern/diff/D66396016/)
1 parent b12e49e commit d8baca4

27 files changed

+131
-5150
lines changed

extension/llm/custom_ops/CMakeLists.txt

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -54,10 +54,11 @@ else()
5454
endif()
5555

5656
add_library(custom_ops ${_custom_ops__srcs})
57-
57+
find_package_torch_headers()
5858
target_include_directories(custom_ops PUBLIC "${_common_include_directories}")
5959
target_include_directories(
6060
custom_ops PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/../../../include"
61+
${TORCH_INCLUDE_DIRS}
6162
)
6263
target_link_libraries(custom_ops PUBLIC ${custom_ops_libs} executorch_core)
6364

@@ -68,8 +69,6 @@ target_compile_options(
6869
install(TARGETS custom_ops DESTINATION lib)
6970

7071
if(EXECUTORCH_BUILD_KERNELS_CUSTOM_AOT)
71-
# Add a AOT library
72-
find_package(Torch CONFIG REQUIRED)
7372
add_library(
7473
custom_ops_aot_lib SHARED
7574
${_custom_ops__srcs}
@@ -83,6 +82,7 @@ if(EXECUTORCH_BUILD_KERNELS_CUSTOM_AOT)
8382
)
8483
target_include_directories(
8584
custom_ops_aot_lib PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/../../../include"
85+
${TORCH_INCLUDE_DIRS}
8686
)
8787
# TODO: This only works if we install portable_lib.so to
8888
# <site-packages>/executorch/extension/pybindings/.

extension/llm/custom_ops/op_sdpa.cpp

Lines changed: 4 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,10 @@
88

99
#include <executorch/extension/llm/custom_ops/op_sdpa.h>
1010

11+
#include <ATen/cpu/vec/functional.h>
12+
#include <ATen/cpu/vec/vec.h>
1113
#include <executorch/kernels/optimized/blas/CPUBlas.h>
1214
#include <executorch/kernels/optimized/vec/functional.h>
13-
#include <executorch/kernels/optimized/vec/vec.h>
1415
#include <executorch/runtime/core/exec_aten/util/dim_order_util.h>
1516
// @lint-ignore CLANGTIDY facebook-unused-include-check
1617
#include <executorch/runtime/core/exec_aten/util/scalar_type_util.h>
@@ -34,18 +35,10 @@ namespace util {
3435
constexpr size_t kKVDim = 4;
3536

3637
template <typename T>
37-
inline void _store(T* dst, ::executorch::vec::Vectorized<T> src) {
38+
inline void _store(T* dst, ::at::vec::Vectorized<T> src) {
3839
src.store(dst);
3940
}
4041

41-
/*
42-
inline void _store(::Half* dst, at::vec::Vectorized<float> src) {
43-
//fp16_ieee_to_fp32_value
44-
auto res = at::vec::convert_float_half(src, src);
45-
res.store(dst, at::vec::Vectorized<float>::size());
46-
}
47-
*/
48-
4942
template <typename T>
5043
inline T data_index_init(T offset) {
5144
return offset;
@@ -78,7 +71,7 @@ inline double calculate_scale(const Tensor& query, optional<double> scale) {
7871
}
7972

8073
} // namespace util
81-
namespace vec = ::executorch::vec;
74+
namespace vec = ::at::vec;
8275
using Tensor = ::executorch::aten::Tensor;
8376

8477
namespace {

kernels/optimized/cpu/moments_utils.h

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
// for use in optimized ExecuTorch ops. Template specializations of BFloat16
1313
// are excluded.
1414

15-
#include <executorch/kernels/optimized/vec/vec.h>
15+
#include <ATen/cpu/vec/vec.h>
1616

1717
#include <executorch/kernels/optimized/utils/math_utils.h>
1818
#include <executorch/runtime/platform/compiler.h>
@@ -47,12 +47,12 @@ void AddMoments(
4747
template <typename T>
4848
ET_INLINE void AddMomentsVec(
4949
int64_t m0_add,
50-
const executorch::vec::Vectorized<T>& m1_add,
51-
const executorch::vec::Vectorized<T>& m2_add,
50+
const at::vec::Vectorized<T>& m1_add,
51+
const at::vec::Vectorized<T>& m2_add,
5252
int64_t& m0,
53-
executorch::vec::Vectorized<T>& m1,
54-
executorch::vec::Vectorized<T>& m2) {
55-
using Vec = executorch::vec::Vectorized<T>;
53+
at::vec::Vectorized<T>& m1,
54+
at::vec::Vectorized<T>& m2) {
55+
using Vec = at::vec::Vectorized<T>;
5656
const int64_t n = m0 + m0_add;
5757
const T c =
5858
n == 0 ? static_cast<T>(0) : static_cast<T>(m0_add) / static_cast<T>(n);
@@ -67,11 +67,11 @@ template <typename T>
6767
inline void UpdateMomentsVec(
6868
int64_t m0,
6969
const T* X_ptr,
70-
const std::array<executorch::vec::Vectorized<acc_t<T>>, kChunkSize>& c_vecs,
70+
const std::array<at::vec::Vectorized<acc_t<T>>, kChunkSize>& c_vecs,
7171
int64_t& m0_stk0,
72-
executorch::vec::Vectorized<acc_t<T>>& m1_stk0,
73-
executorch::vec::Vectorized<acc_t<T>>& m2_stk0) {
74-
using Vec = executorch::vec::Vectorized<acc_t<T>>;
72+
at::vec::Vectorized<acc_t<T>>& m1_stk0,
73+
at::vec::Vectorized<acc_t<T>>& m2_stk0) {
74+
using Vec = at::vec::Vectorized<acc_t<T>>;
7575
Vec m1_vec(0);
7676
Vec m2_vec(0);
7777
for (int64_t j = 0; j < m0; ++j) {
@@ -92,13 +92,13 @@ std::pair<acc_t<T>, acc_t<T>>
9292
RowwiseMomentsImpl(const T* X, int64_t N, int64_t ddof = 0) {
9393
using T_ACC = acc_t<T>;
9494

95-
constexpr int64_t kVecSize = executorch::vec::Vectorized<T>::size();
96-
constexpr int64_t kAccVecSize = executorch::vec::Vectorized<T_ACC>::size();
95+
constexpr int64_t kVecSize = at::vec::Vectorized<T>::size();
96+
constexpr int64_t kAccVecSize = at::vec::Vectorized<T_ACC>::size();
9797
const int64_t n = N / kVecSize;
9898
const int64_t m = executorch::utils::divup(n, kChunkSize);
9999
const int64_t depth = executorch::utils::CeilLog2(m);
100100

101-
using Vec = executorch::vec::Vectorized<T_ACC>;
101+
using Vec = at::vec::Vectorized<T_ACC>;
102102
const Vec kZeroVec(T_ACC(0));
103103
std::array<int64_t, kMaxDepth> m0_stk;
104104
std::array<Vec, kMaxDepth> m1_stk;
@@ -168,7 +168,7 @@ RowwiseMomentsImpl(const T* X, int64_t N, int64_t ddof = 0) {
168168
template <typename T>
169169
std::pair<acc_t<T>, acc_t<T>>
170170
RowwiseMoments(const T* X, int64_t N, int64_t ddof = 0) {
171-
using Vec = executorch::vec::Vectorized<T>;
171+
using Vec = at::vec::Vectorized<T>;
172172
constexpr int64_t kVecSize = Vec::size();
173173
const int64_t n = N / kVecSize;
174174
const int64_t m = executorch::utils::divup(n, kChunkSize);

kernels/optimized/cpu/op_add.cpp

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,10 @@
66
* LICENSE file in the root directory of this source tree.
77
*/
88

9+
#include <ATen/cpu/vec/functional.h>
10+
#include <ATen/cpu/vec/vec.h>
911
#include <executorch/kernels/optimized/cpu/binary_ops.h>
1012
#include <executorch/kernels/optimized/vec/functional.h>
11-
#include <executorch/kernels/optimized/vec/vec.h>
1213
#include <executorch/kernels/portable/cpu/scalar_utils.h>
1314
#include <executorch/kernels/portable/cpu/util/broadcast_util.h>
1415
#include <executorch/runtime/kernel/kernel_includes.h>
@@ -99,8 +100,8 @@ Tensor& opt_add_out(
99100
CTYPE_B b_val = *b.const_data_ptr<CTYPE_B>();
100101
CTYPE b_casted = static_cast<CTYPE>(b_val);
101102

102-
using Vec = executorch::vec::Vectorized<CTYPE>;
103-
executorch::vec::map<CTYPE>(
103+
using Vec = at::vec::Vectorized<CTYPE>;
104+
at::vec::map<CTYPE>(
104105
[alpha_val, b_casted](Vec x) {
105106
return x + Vec(alpha_val * b_casted);
106107
},
@@ -131,8 +132,8 @@ Tensor& opt_add_out(
131132
ET_KERNEL_CHECK(
132133
ctx, utils::extract_scalar(alpha, &alpha_val), InvalidArgument, );
133134

134-
using Vec = executorch::vec::Vectorized<CTYPE>;
135-
executorch::vec::map2<CTYPE>(
135+
using Vec = at::vec::Vectorized<CTYPE>;
136+
at::vec::map2<CTYPE>(
136137
[alpha_val](Vec x, Vec y) { return x + Vec(alpha_val) * y; },
137138
out.mutable_data_ptr<CTYPE>(),
138139
a.const_data_ptr<CTYPE>(),
@@ -166,7 +167,7 @@ Tensor& opt_add_out(
166167
ET_KERNEL_CHECK(
167168
ctx, utils::extract_scalar(alpha, &alpha_val), InvalidArgument, );
168169

169-
using Vec = executorch::vec::Vectorized<CTYPE>;
170+
using Vec = at::vec::Vectorized<CTYPE>;
170171
executorch::vec::broadcasting_map_2d_by_1d<CTYPE>(
171172
[alpha_val](Vec x, Vec y) { return x + Vec(alpha_val) * y; },
172173
out.mutable_data_ptr<CTYPE>(),
@@ -244,8 +245,8 @@ Tensor& opt_add_scalar_out(
244245
CTYPE alpha_val;
245246
ET_EXTRACT_SCALAR(alpha, alpha_val);
246247

247-
using Vec = executorch::vec::Vectorized<CTYPE>;
248-
executorch::vec::map<CTYPE>(
248+
using Vec = at::vec::Vectorized<CTYPE>;
249+
at::vec::map<CTYPE>(
249250
[alpha_val, b_casted](Vec x) {
250251
return x + Vec(alpha_val * b_casted);
251252
},

kernels/optimized/cpu/op_div.cpp

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,10 @@
66
* LICENSE file in the root directory of this source tree.
77
*/
88

9+
#include <ATen/cpu/vec/functional.h>
10+
#include <ATen/cpu/vec/vec.h>
911
#include <executorch/kernels/optimized/cpu/binary_ops.h>
1012
#include <executorch/kernels/optimized/vec/functional.h>
11-
#include <executorch/kernels/optimized/vec/vec.h>
1213
#include <executorch/kernels/portable/cpu/scalar_utils.h>
1314
#include <executorch/kernels/portable/cpu/util/broadcast_util.h>
1415
#include <executorch/runtime/kernel/kernel_includes.h>
@@ -76,16 +77,16 @@ Tensor& opt_div_out(
7677
CTYPE_SCALAR scalar_val = *scalar->const_data_ptr<CTYPE_SCALAR>();
7778
CTYPE scalar_casted = static_cast<CTYPE>(scalar_val);
7879

79-
using Vec = executorch::vec::Vectorized<CTYPE>;
80+
using Vec = at::vec::Vectorized<CTYPE>;
8081
if (a.numel() == 1) {
81-
executorch::vec::map<CTYPE>(
82+
at::vec::map<CTYPE>(
8283
[scalar_casted](Vec x) { return Vec(scalar_casted) / x; },
8384
out.mutable_data_ptr<CTYPE>(),
8485
tensor->const_data_ptr<CTYPE>(),
8586
out.numel());
8687
} else {
8788
Vec inv_scalar_casted_vec(CTYPE(1) / scalar_casted);
88-
executorch::vec::map<CTYPE>(
89+
at::vec::map<CTYPE>(
8990
[inv_scalar_casted_vec](Vec x) {
9091
return x * inv_scalar_casted_vec;
9192
},
@@ -111,8 +112,8 @@ Tensor& opt_div_out(
111112
"Failed to resize output tensor.");
112113

113114
ET_SWITCH_REAL_TYPES_AND(Bool, out_type, ctx, "div.out", CTYPE, [&]() {
114-
using Vec = executorch::vec::Vectorized<CTYPE>;
115-
executorch::vec::map2<CTYPE>(
115+
using Vec = at::vec::Vectorized<CTYPE>;
116+
at::vec::map2<CTYPE>(
116117
[](Vec x, Vec y) { return x / y; },
117118
out.mutable_data_ptr<CTYPE>(),
118119
a.const_data_ptr<CTYPE>(),
@@ -142,7 +143,7 @@ Tensor& opt_div_out(
142143
out,
143144
"Failed to resize output tensor.");
144145
ET_SWITCH_REALB_TYPES(out_type, ctx, "sub.out", CTYPE, [&]() {
145-
using Vec = executorch::vec::Vectorized<CTYPE>;
146+
using Vec = at::vec::Vectorized<CTYPE>;
146147
if (selected_optimized_path ==
147148
ElementwiseOptimizedPath::kBroadcast2dBy1dReverseArguments) {
148149
executorch::vec::broadcasting_map_2d_by_1d<CTYPE>(
@@ -222,9 +223,9 @@ Tensor& opt_div_scalar_out(
222223
ET_EXTRACT_SCALAR(b, b_val);
223224
CTYPE b_casted = static_cast<CTYPE>(b_val);
224225

225-
using Vec = executorch::vec::Vectorized<CTYPE>;
226+
using Vec = at::vec::Vectorized<CTYPE>;
226227
Vec inv_b_casted_vec(CTYPE(1) / b_casted);
227-
executorch::vec::map<CTYPE>(
228+
at::vec::map<CTYPE>(
228229
[inv_b_casted_vec](Vec x) { return x * inv_b_casted_vec; },
229230
out.mutable_data_ptr<CTYPE>(),
230231
a.const_data_ptr<CTYPE>(),

kernels/optimized/cpu/op_exp.cpp

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,9 @@
88

99
#include <cmath>
1010

11+
#include <ATen/cpu/vec/functional.h>
12+
#include <ATen/cpu/vec/vec.h>
1113
#include <executorch/kernels/optimized/vec/functional.h>
12-
#include <executorch/kernels/optimized/vec/vec.h>
1314
#include <executorch/runtime/kernel/kernel_includes.h>
1415

1516
namespace torch {
@@ -34,8 +35,8 @@ void exp_data(
3435
const CTYPE_IN* in_data,
3536
const size_t numel,
3637
CTYPE_OUT* out_data) {
37-
using Vec = executorch::vec::Vectorized<CTYPE_IN>;
38-
executorch::vec::map<CTYPE_IN>(
38+
using Vec = at::vec::Vectorized<CTYPE_IN>;
39+
at::vec::map<CTYPE_IN>(
3940
[](Vec x) { return x.exp(); }, out_data, in_data, numel);
4041
}
4142

kernels/optimized/cpu/op_le.cpp

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,9 @@
66
* LICENSE file in the root directory of this source tree.
77
*/
88

9+
#include <ATen/cpu/vec/functional.h>
10+
#include <ATen/cpu/vec/vec.h>
911
#include <executorch/kernels/optimized/vec/functional.h>
10-
#include <executorch/kernels/optimized/vec/vec.h>
1112
#include <executorch/kernels/portable/cpu/scalar_utils.h>
1213
#include <executorch/runtime/kernel/kernel_includes.h>
1314
#include <executorch/runtime/platform/assert.h>
@@ -44,8 +45,8 @@ Tensor& opt_le_tensor_out(
4445
if (a_type == b_type && a_type == out_type) {
4546
ET_SWITCH_REAL_TYPES_AND(
4647
Bool, out_type, ctx, "le.Tensor_out", CTYPE, [&]() {
47-
using Vec = executorch::vec::Vectorized<CTYPE>;
48-
executorch::vec::map2<CTYPE>(
48+
using Vec = at::vec::Vectorized<CTYPE>;
49+
at::vec::map2<CTYPE>(
4950
[](Vec x, Vec y) { return x.le(y); },
5051
out.mutable_data_ptr<CTYPE>(),
5152
a.const_data_ptr<CTYPE>(),
@@ -109,8 +110,8 @@ Tensor& opt_le_scalar_out(
109110
CTYPE_B b_val = 0;
110111
ET_EXTRACT_SCALAR(b, b_val);
111112
CTYPE b_casted = static_cast<CTYPE>(b_val);
112-
using Vec = executorch::vec::Vectorized<CTYPE>;
113-
executorch::vec::map<CTYPE>(
113+
using Vec = at::vec::Vectorized<CTYPE>;
114+
at::vec::map<CTYPE>(
114115
[b_casted](Vec x) { return x.le(Vec(b_casted)); },
115116
out.mutable_data_ptr<CTYPE>(),
116117
a.const_data_ptr<CTYPE>(),

kernels/optimized/cpu/op_mul.cpp

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,10 @@
66
* LICENSE file in the root directory of this source tree.
77
*/
88

9+
#include <ATen/cpu/vec/functional.h>
10+
#include <ATen/cpu/vec/vec.h>
911
#include <executorch/kernels/optimized/cpu/binary_ops.h>
1012
#include <executorch/kernels/optimized/vec/functional.h>
11-
#include <executorch/kernels/optimized/vec/vec.h>
1213
#include <executorch/kernels/portable/cpu/scalar_utils.h>
1314
#include <executorch/kernels/portable/cpu/util/broadcast_util.h>
1415
#include <executorch/runtime/core/exec_aten/util/tensor_util.h> // IWYU pragma: export
@@ -95,7 +96,7 @@ Tensor& handle_last_dim_broadcast(
9596
const size_t outer_size = getLeadingDims(out, out.dim() - 1);
9697
const auto broadcast_size = out.size(out.dim() - 1);
9798
ET_SWITCH_REALB_TYPES(out_type, ctx, "mul.out", CTYPE, [&]() {
98-
using Vec = executorch::vec::Vectorized<CTYPE>;
99+
using Vec = at::vec::Vectorized<CTYPE>;
99100
executorch::vec::broadcasting_map_broadcast_last_dim<CTYPE>(
100101
[](Vec x, Vec y) { return x * y; },
101102
out.mutable_data_ptr<CTYPE>(),
@@ -164,7 +165,7 @@ Tensor& handle_broadcast_mul(
164165
inner_size = lhs->sizes()[lhs->dim() - 1];
165166
}
166167
ET_SWITCH_REALB_TYPES(out_type, ctx, "mul.out", CTYPE, [&]() {
167-
using Vec = executorch::vec::Vectorized<CTYPE>;
168+
using Vec = at::vec::Vectorized<CTYPE>;
168169
executorch::vec::broadcasting_map_3d_and_unsqueezed_3d<CTYPE>(
169170
[](Vec x, Vec y) { return x * y; },
170171
out.mutable_data_ptr<CTYPE>(),
@@ -203,8 +204,8 @@ Tensor& opt_mul_out(
203204
CTYPE_B b_val = *b.const_data_ptr<CTYPE_B>();
204205
CTYPE b_casted = static_cast<CTYPE>(b_val);
205206

206-
using Vec = executorch::vec::Vectorized<CTYPE>;
207-
executorch::vec::map<CTYPE>(
207+
using Vec = at::vec::Vectorized<CTYPE>;
208+
at::vec::map<CTYPE>(
208209
[b_casted](Vec x) { return x * Vec(b_casted); },
209210
out.mutable_data_ptr<CTYPE>(),
210211
a.const_data_ptr<CTYPE>(),
@@ -229,8 +230,8 @@ Tensor& opt_mul_out(
229230
"Failed to resize output tensor.");
230231

231232
ET_SWITCH_REALB_TYPES(out_type, ctx, "mul.out", CTYPE, [&]() {
232-
using Vec = executorch::vec::Vectorized<CTYPE>;
233-
executorch::vec::map2<CTYPE>(
233+
using Vec = at::vec::Vectorized<CTYPE>;
234+
at::vec::map2<CTYPE>(
234235
[](Vec x, Vec y) { return x * y; },
235236
out.mutable_data_ptr<CTYPE>(),
236237
a.const_data_ptr<CTYPE>(),
@@ -306,8 +307,8 @@ Tensor& opt_mul_scalar_out(
306307
ET_EXTRACT_SCALAR(b, b_val);
307308
CTYPE b_casted = static_cast<CTYPE>(b_val);
308309

309-
using Vec = executorch::vec::Vectorized<CTYPE>;
310-
executorch::vec::map<CTYPE>(
310+
using Vec = at::vec::Vectorized<CTYPE>;
311+
at::vec::map<CTYPE>(
311312
[b_casted](Vec x) { return x * Vec(b_casted); },
312313
out.mutable_data_ptr<CTYPE>(),
313314
a.const_data_ptr<CTYPE>(),

0 commit comments

Comments
 (0)