Ginkgo 1.8.0, generated from a branch based on master.
A numerical linear algebra library targeting many-core architectures
executor.hpp
1// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
2//
3// SPDX-License-Identifier: BSD-3-Clause
4
5#ifndef GKO_PUBLIC_CORE_BASE_EXECUTOR_HPP_
6#define GKO_PUBLIC_CORE_BASE_EXECUTOR_HPP_
7
8
9#include <array>
10#include <atomic>
11#include <iostream>
12#include <memory>
13#include <mutex>
14#include <sstream>
15#include <string>
16#include <tuple>
17#include <type_traits>
18#include <vector>
19
20
21#include <ginkgo/core/base/device.hpp>
22#include <ginkgo/core/base/fwd_decls.hpp>
23#include <ginkgo/core/base/machine_topology.hpp>
24#include <ginkgo/core/base/memory.hpp>
25#include <ginkgo/core/base/scoped_device_id_guard.hpp>
26#include <ginkgo/core/base/types.hpp>
27#include <ginkgo/core/log/logger.hpp>
28#include <ginkgo/core/synthesizer/containers.hpp>
29
30
31namespace gko {
32
33
35enum class log_propagation_mode {
41 never,
47 automatic
48};
49
50
63enum class allocation_mode { device, unified_global, unified_host };
64
65
66#ifdef NDEBUG
67
68// When in release, prefer device allocations
69constexpr allocation_mode default_cuda_alloc_mode = allocation_mode::device;
70
71constexpr allocation_mode default_hip_alloc_mode = allocation_mode::device;
72
73#else
74
75// When in debug, always use UM allocations.
76constexpr allocation_mode default_cuda_alloc_mode =
77 allocation_mode::unified_global;
78
79#if (GINKGO_HIP_PLATFORM_HCC == 1)
80
81// HIP on AMD GPUs does not support UM, so always prefer device allocations.
82constexpr allocation_mode default_hip_alloc_mode = allocation_mode::device;
83
84#else
85
86// HIP on NVIDIA GPUs supports UM, so prefer UM allocations.
87constexpr allocation_mode default_hip_alloc_mode =
88 allocation_mode::unified_global;
89
90#endif
91
92#endif
93
94
95} // namespace gko
96
97
102enum class dpcpp_queue_property {
106 in_order = 1,
107
111 enable_profiling = 2
112};
113
114GKO_ATTRIBUTES GKO_INLINE dpcpp_queue_property operator|(dpcpp_queue_property a,
115 dpcpp_queue_property b)
116{
117 return static_cast<dpcpp_queue_property>(static_cast<int>(a) |
118 static_cast<int>(b));
119}
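// Usage sketch (editorial, not part of executor.hpp): the operator| above
// lets several queue properties be combined when creating a DpcppExecutor.
// The device id and executor names below are illustrative.
//
// auto exec = gko::DpcppExecutor::create(
//     0, gko::OmpExecutor::create(), "gpu",
//     dpcpp_queue_property::in_order | dpcpp_queue_property::enable_profiling);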
120
121
122namespace gko {
123
124
125#define GKO_FORWARD_DECLARE(_type, ...) class _type
126
127GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_FORWARD_DECLARE);
128
129#undef GKO_FORWARD_DECLARE
130
131
132class ReferenceExecutor;
133
134
135namespace detail {
136
137
138template <typename>
139class ExecutorBase;
140
141
142} // namespace detail
143
144
259class Operation {
260public:
261#define GKO_DECLARE_RUN_OVERLOAD(_type, ...) \
262 virtual void run(std::shared_ptr<const _type>) const
263
264 GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_DECLARE_RUN_OVERLOAD);
265
266#undef GKO_DECLARE_RUN_OVERLOAD
267
268 // ReferenceExecutor overload can be defaulted to OmpExecutor's
269 virtual void run(std::shared_ptr<const ReferenceExecutor> executor) const;
270
276 virtual const char* get_name() const noexcept;
277};
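// Usage sketch (editorial): a custom Operation can override just the run()
// overloads it supports; the remaining overloads keep their default behavior.
// MyOperation and its kernel bodies are hypothetical.
//
// class MyOperation : public gko::Operation {
// public:
//     void run(std::shared_ptr<const gko::OmpExecutor>) const override {
//         // OpenMP implementation
//     }
//     void run(std::shared_ptr<const gko::CudaExecutor>) const override {
//         // CUDA implementation
//     }
//     const char* get_name() const noexcept override { return "my_op"; }
// };
//
// exec->run(MyOperation{});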
278
279
280namespace detail {
281
282
292template <typename Closure>
293class RegisteredOperation : public Operation {
294public:
301 RegisteredOperation(const char* name, Closure op)
302 : name_(name), op_(std::move(op))
303 {}
304
305 const char* get_name() const noexcept override { return name_; }
306
307 void run(std::shared_ptr<const ReferenceExecutor> exec) const override
308 {
309 op_(exec);
310 }
311
312 void run(std::shared_ptr<const OmpExecutor> exec) const override
313 {
314 op_(exec);
315 }
316
317 void run(std::shared_ptr<const CudaExecutor> exec) const override
318 {
319 op_(exec);
320 }
321
322 void run(std::shared_ptr<const HipExecutor> exec) const override
323 {
324 op_(exec);
325 }
326
327 void run(std::shared_ptr<const DpcppExecutor> exec) const override
328 {
329 op_(exec);
330 }
331
332private:
333 const char* name_;
334 Closure op_;
335};
336
337
338template <typename Closure>
339RegisteredOperation<Closure> make_register_operation(const char* name,
340 Closure op)
341{
342 return RegisteredOperation<Closure>{name, std::move(op)};
343}
344
345
346} // namespace detail
347
348
420#define GKO_REGISTER_OPERATION(_name, _kernel) \
421 template <typename... Args> \
422 auto make_##_name(Args&&... args) \
423 { \
424 return ::gko::detail::make_register_operation( \
425 #_kernel, [&args...](auto exec) { \
426 using exec_type = decltype(exec); \
427 if (std::is_same< \
428 exec_type, \
429 std::shared_ptr<const ::gko::ReferenceExecutor>>:: \
430 value) { \
431 ::gko::kernels::reference::_kernel( \
432 std::dynamic_pointer_cast< \
433 const ::gko::ReferenceExecutor>(exec), \
434 std::forward<Args>(args)...); \
435 } else if (std::is_same< \
436 exec_type, \
437 std::shared_ptr<const ::gko::OmpExecutor>>:: \
438 value) { \
439 ::gko::kernels::omp::_kernel( \
440 std::dynamic_pointer_cast<const ::gko::OmpExecutor>( \
441 exec), \
442 std::forward<Args>(args)...); \
443 } else if (std::is_same< \
444 exec_type, \
445 std::shared_ptr<const ::gko::CudaExecutor>>:: \
446 value) { \
447 ::gko::kernels::cuda::_kernel( \
448 std::dynamic_pointer_cast<const ::gko::CudaExecutor>( \
449 exec), \
450 std::forward<Args>(args)...); \
451 } else if (std::is_same< \
452 exec_type, \
453 std::shared_ptr<const ::gko::HipExecutor>>:: \
454 value) { \
455 ::gko::kernels::hip::_kernel( \
456 std::dynamic_pointer_cast<const ::gko::HipExecutor>( \
457 exec), \
458 std::forward<Args>(args)...); \
459 } else if (std::is_same< \
460 exec_type, \
461 std::shared_ptr<const ::gko::DpcppExecutor>>:: \
462 value) { \
463 ::gko::kernels::dpcpp::_kernel( \
464 std::dynamic_pointer_cast<const ::gko::DpcppExecutor>( \
465 exec), \
466 std::forward<Args>(args)...); \
467 } else { \
468 GKO_NOT_IMPLEMENTED; \
469 } \
470 }); \
471 } \
472 static_assert(true, \
473 "This assert is used to counter the false positive extra " \
474 "semi-colon warnings")
475
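// Usage sketch (editorial; `my_kernel` is a hypothetical kernel that must be
// defined in every backend namespace the macro dispatches to). The macro
// generates a make_my_op() factory whose result is passed to Executor::run():
//
// namespace gko {
// namespace kernels {
// namespace reference {
// void my_kernel(std::shared_ptr<const ReferenceExecutor>, int x) { /* ... */ }
// }  // namespace reference
// namespace omp {
// void my_kernel(std::shared_ptr<const OmpExecutor>, int x) { /* ... */ }
// }  // namespace omp
// // ... analogous definitions in cuda, hip and dpcpp ...
// }  // namespace kernels
// }  // namespace gko
//
// GKO_REGISTER_OPERATION(my_op, my_kernel);
//
// auto exec = gko::OmpExecutor::create();
// exec->run(make_my_op(5));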
476
514#define GKO_REGISTER_HOST_OPERATION(_name, _kernel) \
515 template <typename... Args> \
516 auto make_##_name(Args&&... args) \
517 { \
518 return ::gko::detail::make_register_operation( \
519 #_kernel, \
520 [&args...](auto) { _kernel(std::forward<Args>(args)...); }); \
521 } \
522 static_assert(true, \
523 "This assert is used to counter the false positive extra " \
524 "semi-colon warnings")
525
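// Usage sketch (editorial; `compute_tolerance` is a hypothetical host
// function). The generated wrapper ignores the executor and runs the function
// on the host, so the call still shows up in operation logging:
//
// void compute_tolerance(int num_iters, double* tol) { /* host code */ }
// GKO_REGISTER_HOST_OPERATION(compute_tolerance, compute_tolerance);
//
// double tol{};
// exec->run(make_compute_tolerance(100, &tol));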
526
527#define GKO_DECLARE_EXECUTOR_FRIEND(_type, ...) friend class _type
528
616class Executor : public log::EnableLogging<Executor> {
617 template <typename T>
618 friend class detail::ExecutorBase;
619
620 GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_DECLARE_EXECUTOR_FRIEND);
621 friend class ReferenceExecutor;
622
623public:
624 virtual ~Executor() = default;
625
626 Executor() = default;
627 Executor(Executor&) = delete;
628 Executor(Executor&&) = delete;
629 Executor& operator=(Executor&) = delete;
630 Executor& operator=(Executor&&) = delete;
631
637 virtual void run(const Operation& op) const = 0;
638
653 template <typename ClosureOmp, typename ClosureCuda, typename ClosureHip,
654 typename ClosureDpcpp>
655 void run(const ClosureOmp& op_omp, const ClosureCuda& op_cuda,
656 const ClosureHip& op_hip, const ClosureDpcpp& op_dpcpp) const
657 {
658 LambdaOperation<ClosureOmp, ClosureCuda, ClosureHip, ClosureDpcpp> op(
659 op_omp, op_cuda, op_hip, op_dpcpp);
660 this->run(op);
661 }
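// Usage sketch (editorial; the lambdas are illustrative): the closure
// matching the executor's dynamic type is executed, and the OpenMP closure
// also serves ReferenceExecutor.
//
// exec->run(
//     [&] { /* OpenMP (and Reference) code */ },
//     [&] { /* CUDA code */ },
//     [&] { /* HIP code */ },
//     [&] { /* DPC++ code */ });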
662
674 template <typename T>
675 T* alloc(size_type num_elems) const
676 {
677 this->template log<log::Logger::allocation_started>(
678 this, num_elems * sizeof(T));
679 T* allocated = static_cast<T*>(this->raw_alloc(num_elems * sizeof(T)));
680 this->template log<log::Logger::allocation_completed>(
681 this, num_elems * sizeof(T), reinterpret_cast<uintptr>(allocated));
682 return allocated;
683 }
684
692 void free(void* ptr) const noexcept
693 {
694 this->template log<log::Logger::free_started>(
695 this, reinterpret_cast<uintptr>(ptr));
696 this->raw_free(ptr);
697 this->template log<log::Logger::free_completed>(
698 this, reinterpret_cast<uintptr>(ptr));
699 }
700
713 template <typename T>
714 void copy_from(ptr_param<const Executor> src_exec, size_type num_elems,
715 const T* src_ptr, T* dest_ptr) const
716 {
717 const auto src_loc = reinterpret_cast<uintptr>(src_ptr);
718 const auto dest_loc = reinterpret_cast<uintptr>(dest_ptr);
719 this->template log<log::Logger::copy_started>(
720 src_exec.get(), this, src_loc, dest_loc, num_elems * sizeof(T));
721 if (this != src_exec.get()) {
722 src_exec->template log<log::Logger::copy_started>(
723 src_exec.get(), this, src_loc, dest_loc, num_elems * sizeof(T));
724 }
725 try {
726 this->raw_copy_from(src_exec.get(), num_elems * sizeof(T), src_ptr,
727 dest_ptr);
728 } catch (NotSupported&) {
729#if (GKO_VERBOSE_LEVEL >= 1) && !defined(NDEBUG)
730 // Unoptimized copy. Try to go through the masters.
731 // output to log when verbose >= 1 and debug build
732 std::clog << "Not direct copy. Try to copy data from the masters."
733 << std::endl;
734#endif
735 auto src_master = src_exec->get_master().get();
736 if (num_elems > 0 && src_master != src_exec.get()) {
737 auto* master_ptr = src_exec->get_master()->alloc<T>(num_elems);
738 src_master->copy_from<T>(src_exec, num_elems, src_ptr,
739 master_ptr);
740 this->copy_from<T>(src_master, num_elems, master_ptr, dest_ptr);
741 src_master->free(master_ptr);
742 }
743 }
744 this->template log<log::Logger::copy_completed>(
745 src_exec.get(), this, src_loc, dest_loc, num_elems * sizeof(T));
746 if (this != src_exec.get()) {
747 src_exec->template log<log::Logger::copy_completed>(
748 src_exec.get(), this, src_loc, dest_loc, num_elems * sizeof(T));
749 }
750 }
751
763 template <typename T>
764 void copy(size_type num_elems, const T* src_ptr, T* dest_ptr) const
765 {
766 this->copy_from(this, num_elems, src_ptr, dest_ptr);
767 }
768
778 template <typename T>
779 T copy_val_to_host(const T* ptr) const
780 {
781 T out{};
782 this->get_master()->copy_from(this, 1, ptr, &out);
783 return out;
784 }
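// Usage sketch (editorial; assumes a CUDA build, `host_data` and `n` are
// illustrative) combining alloc(), copy_from(), copy_val_to_host() and
// free():
//
// auto host = gko::OmpExecutor::create();
// auto cuda = gko::CudaExecutor::create(0, host);
// double* dev_ptr = cuda->alloc<double>(n);
// cuda->copy_from(host, n, host_data, dev_ptr);    // host -> device
// double first = cuda->copy_val_to_host(dev_ptr);  // read back one element
// cuda->free(dev_ptr);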
785
790 virtual std::shared_ptr<Executor> get_master() noexcept = 0;
791
795 virtual std::shared_ptr<const Executor> get_master() const noexcept = 0;
796
800 virtual void synchronize() const = 0;
801
808 void add_logger(std::shared_ptr<const log::Logger> logger) override
809 {
810 this->propagating_logger_refcount_.fetch_add(
811 logger->needs_propagation() ? 1 : 0);
812 this->EnableLogging<Executor>::add_logger(logger);
813 }
814
821 void remove_logger(const log::Logger* logger) override
822 {
823 this->propagating_logger_refcount_.fetch_sub(
824 logger->needs_propagation() ? 1 : 0);
825 this->EnableLogging<Executor>::remove_logger(logger);
826 }
827
828 using EnableLogging<Executor>::remove_logger;
829
837 void set_log_propagation_mode(log_propagation_mode mode)
838 {
839 log_propagation_mode_ = mode;
840 }
841
849 bool should_propagate_log() const
850 {
851 return this->propagating_logger_refcount_.load() > 0 &&
852 log_propagation_mode_ == log_propagation_mode::automatic;
853 }
854
862 bool memory_accessible(const std::shared_ptr<const Executor>& other) const
863 {
864 return this->verify_memory_from(other.get());
865 }
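// Usage sketch (editorial): memory_accessible() reports whether two executors
// can access each other's memory, so a copy can be skipped.
//
// if (dst_exec->memory_accessible(src_exec)) {
//     // pointers allocated on src_exec may be dereferenced on dst_exec
// }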
866
867 virtual scoped_device_id_guard get_scoped_device_id_guard() const = 0;
868
869protected:
874 struct exec_info {
878 int device_id = -1;
879
883 std::string device_type;
884
888 int numa_node = -1;
889
898 int num_computing_units = -1;
899
911 int num_pu_per_cu = -1;
912
921 std::vector<int> subgroup_sizes{};
922
931 int max_subgroup_size = -1;
932
943 std::vector<int> max_workitem_sizes{};
944
954 int max_workgroup_size;
955
959 int major = -1;
960
964 int minor = -1;
965
971 std::string pci_bus_id = std::string(13, 'x');
972
983 std::vector<int> closest_pu_ids{};
984 };
985
991 const exec_info& get_exec_info() const { return this->exec_info_; }
992
1002 virtual void* raw_alloc(size_type size) const = 0;
1003
1011 virtual void raw_free(void* ptr) const noexcept = 0;
1012
1023 virtual void raw_copy_from(const Executor* src_exec, size_type n_bytes,
1024 const void* src_ptr, void* dest_ptr) const = 0;
1025
1035#define GKO_ENABLE_RAW_COPY_TO(_exec_type, ...) \
1036 virtual void raw_copy_to(const _exec_type* dest_exec, size_type n_bytes, \
1037 const void* src_ptr, void* dest_ptr) const = 0
1038
1039 GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_ENABLE_RAW_COPY_TO);
1040
1041#undef GKO_ENABLE_RAW_COPY_TO
1042
1050 virtual bool verify_memory_from(const Executor* src_exec) const = 0;
1051
1061#define GKO_ENABLE_VERIFY_MEMORY_TO(_exec_type, ...) \
1062 virtual bool verify_memory_to(const _exec_type* dest_exec) const = 0
1063
1064 GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_ENABLE_VERIFY_MEMORY_TO);
1065
1066 GKO_ENABLE_VERIFY_MEMORY_TO(ReferenceExecutor, ref);
1067
1068#undef GKO_ENABLE_VERIFY_MEMORY_TO
1069
1076 virtual void populate_exec_info(const machine_topology* mach_topo) = 0;
1077
1083 exec_info& get_exec_info() { return this->exec_info_; }
1084
1085 exec_info exec_info_;
1086
1087 log_propagation_mode log_propagation_mode_{log_propagation_mode::automatic};
1088
1089 std::atomic<int> propagating_logger_refcount_{};
1090
1091private:
1106 template <typename ClosureOmp, typename ClosureCuda, typename ClosureHip,
1107 typename ClosureDpcpp>
1108 class LambdaOperation : public Operation {
1109 public:
1120 LambdaOperation(const ClosureOmp& op_omp, const ClosureCuda& op_cuda,
1121 const ClosureHip& op_hip, const ClosureDpcpp& op_dpcpp)
1122 : op_omp_(op_omp),
1123 op_cuda_(op_cuda),
1124 op_hip_(op_hip),
1125 op_dpcpp_(op_dpcpp)
1126 {}
1127
1128 void run(std::shared_ptr<const OmpExecutor>) const override
1129 {
1130 op_omp_();
1131 }
1132
1133 void run(std::shared_ptr<const ReferenceExecutor>) const override
1134 {
1135 op_omp_();
1136 }
1137
1138 void run(std::shared_ptr<const CudaExecutor>) const override
1139 {
1140 op_cuda_();
1141 }
1142
1143 void run(std::shared_ptr<const HipExecutor>) const override
1144 {
1145 op_hip_();
1146 }
1147
1148 void run(std::shared_ptr<const DpcppExecutor>) const override
1149 {
1150 op_dpcpp_();
1151 }
1152
1153 private:
1154 ClosureOmp op_omp_;
1155 ClosureCuda op_cuda_;
1156 ClosureHip op_hip_;
1157 ClosureDpcpp op_dpcpp_;
1158 };
1159};
1160
1161
1170template <typename T>
1171class executor_deleter {
1172public:
1173 using pointer = T*;
1174
1180 explicit executor_deleter(std::shared_ptr<const Executor> exec)
1181 : exec_{exec}
1182 {}
1183
1189 void operator()(pointer ptr) const
1190 {
1191 if (exec_) {
1192 exec_->free(ptr);
1193 }
1194 }
1195
1196private:
1197 std::shared_ptr<const Executor> exec_;
1198};
1199
1200// a specialization for arrays
1201template <typename T>
1202class executor_deleter<T[]> {
1203public:
1204 using pointer = T[];
1205
1206 explicit executor_deleter(std::shared_ptr<const Executor> exec)
1207 : exec_{exec}
1208 {}
1209
1210 void operator()(pointer ptr) const
1211 {
1212 if (exec_) {
1213 exec_->free(ptr);
1214 }
1215 }
1216
1217private:
1218 std::shared_ptr<const Executor> exec_;
1219};
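// Usage sketch (editorial): executor_deleter lets executor-allocated memory
// be managed by std::unique_ptr.
//
// auto exec = gko::OmpExecutor::create();
// using deleter = gko::executor_deleter<double>;
// std::unique_ptr<double, deleter> value{exec->alloc<double>(1),
//                                        deleter{exec}};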
1220
1221
1222namespace detail {
1223
1224
1225template <typename ConcreteExecutor>
1226class ExecutorBase : public Executor {
1227 GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_DECLARE_EXECUTOR_FRIEND);
1228 friend class ReferenceExecutor;
1229
1230public:
1231 using Executor::run;
1232
1233 void run(const Operation& op) const override
1234 {
1235 this->template log<log::Logger::operation_launched>(this, &op);
1236 auto scope_guard = get_scoped_device_id_guard();
1237 op.run(self()->shared_from_this());
1238 this->template log<log::Logger::operation_completed>(this, &op);
1239 }
1240
1241protected:
1242 void raw_copy_from(const Executor* src_exec, size_type n_bytes,
1243 const void* src_ptr, void* dest_ptr) const override
1244 {
1245 src_exec->raw_copy_to(self(), n_bytes, src_ptr, dest_ptr);
1246 }
1247
1248 virtual bool verify_memory_from(const Executor* src_exec) const override
1249 {
1250 return src_exec->verify_memory_to(self());
1251 }
1252
1253private:
1254 ConcreteExecutor* self() noexcept
1255 {
1256 return static_cast<ConcreteExecutor*>(this);
1257 }
1258
1259 const ConcreteExecutor* self() const noexcept
1260 {
1261 return static_cast<const ConcreteExecutor*>(this);
1262 }
1263};
1264
1265#undef GKO_DECLARE_EXECUTOR_FRIEND
1266
1267
1275class EnableDeviceReset {
1276public:
1282 GKO_DEPRECATED(
1283 "device_reset is no longer supported, call "
1284 "cudaDeviceReset/hipDeviceReset manually")
1285 void set_device_reset(bool device_reset) {}
1286
1292 GKO_DEPRECATED(
1293 "device_reset is no longer supported, call "
1294 "cudaDeviceReset/hipDeviceReset manually")
1295 bool get_device_reset() { return false; }
1296
1297protected:
1303 EnableDeviceReset() {}
1304
1305 GKO_DEPRECATED(
1306 "device_reset is no longer supported, call "
1307 "cudaDeviceReset/hipDeviceReset manually")
1308 EnableDeviceReset(bool device_reset) {}
1309};
1310
1311
1312} // namespace detail
1313
1314
1315#define GKO_OVERRIDE_RAW_COPY_TO(_executor_type, ...) \
1316 void raw_copy_to(const _executor_type* dest_exec, size_type n_bytes, \
1317 const void* src_ptr, void* dest_ptr) const override
1318
1319
1320#define GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(dest_, bool_) \
1321 virtual bool verify_memory_to(const dest_* other) const override \
1322 { \
1323 return bool_; \
1324 } \
1325 static_assert(true, \
1326 "This assert is used to counter the false positive extra " \
1327 "semi-colon warnings")
1328
1329
1337class OmpExecutor : public detail::ExecutorBase<OmpExecutor>,
1338 public std::enable_shared_from_this<OmpExecutor> {
1339 friend class detail::ExecutorBase<OmpExecutor>;
1340
1341public:
1345 static std::shared_ptr<OmpExecutor> create(
1346 std::shared_ptr<CpuAllocatorBase> alloc =
1347 std::make_shared<CpuAllocator>())
1348 {
1349 return std::shared_ptr<OmpExecutor>(new OmpExecutor(std::move(alloc)));
1350 }
1351
1352 std::shared_ptr<Executor> get_master() noexcept override;
1353
1354 std::shared_ptr<const Executor> get_master() const noexcept override;
1355
1356 void synchronize() const override;
1357
1358 int get_num_cores() const
1359 {
1360 return this->get_exec_info().num_computing_units;
1361 }
1362
1363 int get_num_threads_per_core() const
1364 {
1365 return this->get_exec_info().num_pu_per_cu;
1366 }
1367
1368 static int get_num_omp_threads();
1369
1370 scoped_device_id_guard get_scoped_device_id_guard() const override;
1371
1372protected:
1373 OmpExecutor(std::shared_ptr<CpuAllocatorBase> alloc)
1374 : alloc_{std::move(alloc)}
1375 {
1376 this->OmpExecutor::populate_exec_info(machine_topology::get_instance());
1377 }
1378
1379 void populate_exec_info(const machine_topology* mach_topo) override;
1380
1381 void* raw_alloc(size_type size) const override;
1382
1383 void raw_free(void* ptr) const noexcept override;
1384
1385 GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_OVERRIDE_RAW_COPY_TO);
1386
1387 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(OmpExecutor, true);
1388
1389 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor, false);
1390
1391 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(HipExecutor, false);
1392
1393 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(CudaExecutor, false);
1394
1395 bool verify_memory_to(const DpcppExecutor* dest_exec) const override;
1396
1397 std::shared_ptr<CpuAllocatorBase> alloc_;
1398};
1399
1400
1401namespace kernels {
1402namespace omp {
1403using DefaultExecutor = OmpExecutor;
1404} // namespace omp
1405} // namespace kernels
1406
1407
1415class ReferenceExecutor : public OmpExecutor {
1416public:
1417 static std::shared_ptr<ReferenceExecutor> create(
1418 std::shared_ptr<CpuAllocatorBase> alloc =
1419 std::make_shared<CpuAllocator>())
1420 {
1421 return std::shared_ptr<ReferenceExecutor>(
1422 new ReferenceExecutor(std::move(alloc)));
1423 }
1424
1425 scoped_device_id_guard get_scoped_device_id_guard() const override
1426 {
1427 return {this, 0};
1428 }
1429
1430 void run(const Operation& op) const override
1431 {
1432 this->template log<log::Logger::operation_launched>(this, &op);
1433 op.run(std::static_pointer_cast<const ReferenceExecutor>(
1434 this->shared_from_this()));
1435 this->template log<log::Logger::operation_completed>(this, &op);
1436 }
1437
1438protected:
1439 ReferenceExecutor(std::shared_ptr<CpuAllocatorBase> alloc)
1440 : OmpExecutor{std::move(alloc)}
1441 {
1442 this->ReferenceExecutor::populate_exec_info(
1443 machine_topology::get_instance());
1444 }
1445
1446 void populate_exec_info(const machine_topology*) override
1447 {
1448 this->get_exec_info().device_id = -1;
1449 this->get_exec_info().num_computing_units = 1;
1450 this->get_exec_info().num_pu_per_cu = 1;
1451 }
1452
1453 bool verify_memory_from(const Executor* src_exec) const override
1454 {
1455 return src_exec->verify_memory_to(this);
1456 }
1457
1458 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor, true);
1459
1460 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(OmpExecutor, false);
1461
1462 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(DpcppExecutor, false);
1463
1464 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(CudaExecutor, false);
1465
1466 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(HipExecutor, false);
1467};
1468
1469
1470namespace kernels {
1471namespace reference {
1472using DefaultExecutor = ReferenceExecutor;
1473} // namespace reference
1474} // namespace kernels
1475
1476
1483class CudaExecutor : public detail::ExecutorBase<CudaExecutor>,
1484 public std::enable_shared_from_this<CudaExecutor>,
1485 public detail::EnableDeviceReset {
1486 friend class detail::ExecutorBase<CudaExecutor>;
1487
1488public:
1500 GKO_DEPRECATED(
1501 "calling this CudaExecutor::create method is deprecated, because"
1502 "device_reset no longer has an effect"
1503 "call CudaExecutor::create("
1504 " int device_id, std::shared_ptr<Executor> master,"
1505 " std::shared_ptr<CudaAllocatorBase> alloc,"
1506 " CUstream_st* stream);"
1507 "instead")
1508 static std::shared_ptr<CudaExecutor> create(
1509 int device_id, std::shared_ptr<Executor> master, bool device_reset,
1510 allocation_mode alloc_mode = default_cuda_alloc_mode,
1511 CUstream_st* stream = nullptr);
1512
1522 static std::shared_ptr<CudaExecutor> create(
1523 int device_id, std::shared_ptr<Executor> master,
1524 std::shared_ptr<CudaAllocatorBase> alloc =
1525 std::make_shared<CudaAllocator>(),
1526 CUstream_st* stream = nullptr);
1527
1528 std::shared_ptr<Executor> get_master() noexcept override;
1529
1530 std::shared_ptr<const Executor> get_master() const noexcept override;
1531
1532 void synchronize() const override;
1533
1534 scoped_device_id_guard get_scoped_device_id_guard() const override;
1535
1539 int get_device_id() const noexcept
1540 {
1541 return this->get_exec_info().device_id;
1542 }
1543
1547 static int get_num_devices();
1548
1552 int get_num_warps_per_sm() const noexcept
1553 {
1554 return this->get_exec_info().num_pu_per_cu;
1555 }
1556
1560 int get_num_multiprocessor() const noexcept
1561 {
1562 return this->get_exec_info().num_computing_units;
1563 }
1564
1568 int get_num_warps() const noexcept
1569 {
1570 return this->get_exec_info().num_computing_units *
1571 this->get_exec_info().num_pu_per_cu;
1572 }
1573
1577 int get_warp_size() const noexcept
1578 {
1579 return this->get_exec_info().max_subgroup_size;
1580 }
1581
1585 int get_major_version() const noexcept
1586 {
1587 return this->get_exec_info().major;
1588 }
1589
1593 int get_minor_version() const noexcept
1594 {
1595 return this->get_exec_info().minor;
1596 }
1597
1603 cublasContext* get_cublas_handle() const { return cublas_handle_.get(); }
1604
1610 cusparseContext* get_cusparse_handle() const
1611 {
1612 return cusparse_handle_.get();
1613 }
1614
1620 std::vector<int> get_closest_pus() const
1621 {
1622 return this->get_exec_info().closest_pu_ids;
1623 }
1624
1630 int get_closest_numa() const { return this->get_exec_info().numa_node; }
1631
1638 CUstream_st* get_stream() const { return stream_; }
1639
1640protected:
1641 void set_gpu_property();
1642
1643 void init_handles();
1644
1645 CudaExecutor(int device_id, std::shared_ptr<Executor> master,
1646 std::shared_ptr<CudaAllocatorBase> alloc, CUstream_st* stream)
1647 : alloc_{std::move(alloc)}, master_(master), stream_{stream}
1648 {
1649 this->get_exec_info().device_id = device_id;
1650 this->get_exec_info().num_computing_units = 0;
1651 this->get_exec_info().num_pu_per_cu = 0;
1652 this->CudaExecutor::populate_exec_info(
1653 machine_topology::get_instance());
1654 this->set_gpu_property();
1655 this->init_handles();
1656 }
1657
1658 void* raw_alloc(size_type size) const override;
1659
1660 void raw_free(void* ptr) const noexcept override;
1661
1662 GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_OVERRIDE_RAW_COPY_TO);
1663
1664 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(OmpExecutor, false);
1665
1666 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor, false);
1667
1668 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(DpcppExecutor, false);
1669
1670 bool verify_memory_to(const HipExecutor* dest_exec) const override;
1671
1672 bool verify_memory_to(const CudaExecutor* dest_exec) const override;
1673
1674 void populate_exec_info(const machine_topology* mach_topo) override;
1675
1676private:
1677 std::shared_ptr<Executor> master_;
1678
1679 template <typename T>
1680 using handle_manager = std::unique_ptr<T, std::function<void(T*)>>;
1681 handle_manager<cublasContext> cublas_handle_;
1682 handle_manager<cusparseContext> cusparse_handle_;
1683 std::shared_ptr<CudaAllocatorBase> alloc_;
1684 CUstream_st* stream_;
1685};
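// Usage sketch (editorial; assumes at least one CUDA device): a CudaExecutor
// is created on top of a host master executor, which provides the fallback
// path for copies that cannot be performed directly.
//
// if (gko::CudaExecutor::get_num_devices() > 0) {
//     auto host = gko::ReferenceExecutor::create();
//     auto cuda = gko::CudaExecutor::create(0, host);
//     std::cout << "warp size: " << cuda->get_warp_size() << '\n';
// }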
1686
1687
1688namespace kernels {
1689namespace cuda {
1690using DefaultExecutor = CudaExecutor;
1691} // namespace cuda
1692} // namespace kernels
1693
1694
1701class HipExecutor : public detail::ExecutorBase<HipExecutor>,
1702 public std::enable_shared_from_this<HipExecutor>,
1703 public detail::EnableDeviceReset {
1704 friend class detail::ExecutorBase<HipExecutor>;
1705
1706public:
1718 GKO_DEPRECATED(
1719 "device_reset is deprecated entirely, call hipDeviceReset directly. "
1720 "alloc_mode was replaced by the Allocator type "
1721 "hierarchy.")
1722 static std::shared_ptr<HipExecutor> create(
1723 int device_id, std::shared_ptr<Executor> master, bool device_reset,
1724 allocation_mode alloc_mode = default_hip_alloc_mode,
1725 GKO_HIP_STREAM_STRUCT* stream = nullptr);
1726
1727 static std::shared_ptr<HipExecutor> create(
1728 int device_id, std::shared_ptr<Executor> master,
1729 std::shared_ptr<HipAllocatorBase> alloc =
1730 std::make_shared<HipAllocator>(),
1731 GKO_HIP_STREAM_STRUCT* stream = nullptr);
1732
1733 std::shared_ptr<Executor> get_master() noexcept override;
1734
1735 std::shared_ptr<const Executor> get_master() const noexcept override;
1736
1737 void synchronize() const override;
1738
1739 scoped_device_id_guard get_scoped_device_id_guard() const override;
1740
1744 int get_device_id() const noexcept
1745 {
1746 return this->get_exec_info().device_id;
1747 }
1748
1752 static int get_num_devices();
1753
1757 int get_num_warps_per_sm() const noexcept
1758 {
1759 return this->get_exec_info().num_pu_per_cu;
1760 }
1761
1765 int get_num_multiprocessor() const noexcept
1766 {
1767 return this->get_exec_info().num_computing_units;
1768 }
1769
1773 int get_major_version() const noexcept
1774 {
1775 return this->get_exec_info().major;
1776 }
1777
1781 int get_minor_version() const noexcept
1782 {
1783 return this->get_exec_info().minor;
1784 }
1785
1789 int get_num_warps() const noexcept
1790 {
1791 return this->get_exec_info().num_computing_units *
1792 this->get_exec_info().num_pu_per_cu;
1793 }
1794
1798 int get_warp_size() const noexcept
1799 {
1800 return this->get_exec_info().max_subgroup_size;
1801 }
1802
1808 hipblasContext* get_hipblas_handle() const { return hipblas_handle_.get(); }
1809
1815 hipsparseContext* get_hipsparse_handle() const
1816 {
1817 return hipsparse_handle_.get();
1818 }
1819
1825 int get_closest_numa() const { return this->get_exec_info().numa_node; }
1826
1832 std::vector<int> get_closest_pus() const
1833 {
1834 return this->get_exec_info().closest_pu_ids;
1835 }
1836
1837 GKO_HIP_STREAM_STRUCT* get_stream() const { return stream_; }
1838
1839protected:
1840 void set_gpu_property();
1841
1842 void init_handles();
1843
1844 HipExecutor(int device_id, std::shared_ptr<Executor> master,
1845 std::shared_ptr<HipAllocatorBase> alloc,
1846 GKO_HIP_STREAM_STRUCT* stream)
1847 : master_{std::move(master)}, alloc_{std::move(alloc)}, stream_{stream}
1848 {
1849 this->get_exec_info().device_id = device_id;
1850 this->get_exec_info().num_computing_units = 0;
1851 this->get_exec_info().num_pu_per_cu = 0;
1852 this->HipExecutor::populate_exec_info(machine_topology::get_instance());
1853 this->set_gpu_property();
1854 this->init_handles();
1855 }
1856
1857 void* raw_alloc(size_type size) const override;
1858
1859 void raw_free(void* ptr) const noexcept override;
1860
1861 GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_OVERRIDE_RAW_COPY_TO);
1862
1863 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(OmpExecutor, false);
1864
1865 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor, false);
1866
1867 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(DpcppExecutor, false);
1868
1869 bool verify_memory_to(const CudaExecutor* dest_exec) const override;
1870
1871 bool verify_memory_to(const HipExecutor* dest_exec) const override;
1872
1873 void populate_exec_info(const machine_topology* mach_topo) override;
1874
1875private:
1876 std::shared_ptr<Executor> master_;
1877
1878 template <typename T>
1879 using handle_manager = std::unique_ptr<T, std::function<void(T*)>>;
1880 handle_manager<hipblasContext> hipblas_handle_;
1881 handle_manager<hipsparseContext> hipsparse_handle_;
1882 std::shared_ptr<HipAllocatorBase> alloc_;
1883 GKO_HIP_STREAM_STRUCT* stream_;
1884};
1885
1886
1887namespace kernels {
1888namespace hip {
1889using DefaultExecutor = HipExecutor;
1890} // namespace hip
1891} // namespace kernels
1892
1893
1900class DpcppExecutor : public detail::ExecutorBase<DpcppExecutor>,
1901 public std::enable_shared_from_this<DpcppExecutor> {
1902 friend class detail::ExecutorBase<DpcppExecutor>;
1903
1904public:
1914 static std::shared_ptr<DpcppExecutor> create(
1915 int device_id, std::shared_ptr<Executor> master,
1916 std::string device_type = "all",
1917 dpcpp_queue_property property = dpcpp_queue_property::in_order);
1918
1919 std::shared_ptr<Executor> get_master() noexcept override;
1920
1921 std::shared_ptr<const Executor> get_master() const noexcept override;
1922
1923 void synchronize() const override;
1924
1925 scoped_device_id_guard get_scoped_device_id_guard() const override;
1926
1932 int get_device_id() const noexcept
1933 {
1934 return this->get_exec_info().device_id;
1935 }
1936
1937 sycl::queue* get_queue() const { return queue_.get(); }
1938
1946 static int get_num_devices(std::string device_type);
1947
1953 const std::vector<int>& get_subgroup_sizes() const noexcept
1954 {
1955 return this->get_exec_info().subgroup_sizes;
1956 }
1957
1963 int get_num_computing_units() const noexcept
1964 {
1965 return this->get_exec_info().num_computing_units;
1966 }
1967
1971 int get_num_subgroups() const noexcept
1972 {
1973 return this->get_exec_info().num_computing_units *
1974 this->get_exec_info().num_pu_per_cu;
1975 }
1976
1982 const std::vector<int>& get_max_workitem_sizes() const noexcept
1983 {
1984 return this->get_exec_info().max_workitem_sizes;
1985 }
1986
1992 int get_max_workgroup_size() const noexcept
1993 {
1994 return this->get_exec_info().max_workgroup_size;
1995 }
1996
2002 int get_max_subgroup_size() const noexcept
2003 {
2004 return this->get_exec_info().max_subgroup_size;
2005 }
2006
2012 std::string get_device_type() const noexcept
2013 {
2014 return this->get_exec_info().device_type;
2015 }
2016
2017protected:
2018 void set_device_property(
2019 dpcpp_queue_property property = dpcpp_queue_property::in_order);
2020
2021 DpcppExecutor(
2022 int device_id, std::shared_ptr<Executor> master,
2023 std::string device_type = "all",
2024 dpcpp_queue_property property = dpcpp_queue_property::in_order)
2025 : master_(master)
2026 {
2027 std::for_each(device_type.begin(), device_type.end(),
2028 [](char& c) { c = std::tolower(c); });
2029 this->get_exec_info().device_type = std::string(device_type);
2030 this->get_exec_info().device_id = device_id;
2031 this->set_device_property(property);
2032 }
2033
2034 void populate_exec_info(const machine_topology* mach_topo) override;
2035
2036 void* raw_alloc(size_type size) const override;
2037
2038 void raw_free(void* ptr) const noexcept override;
2039
2040 GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_OVERRIDE_RAW_COPY_TO);
2041
2042 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(CudaExecutor, false);
2043
2044 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(HipExecutor, false);
2045
2046 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor, false);
2047
2048 bool verify_memory_to(const OmpExecutor* dest_exec) const override;
2049
2050 bool verify_memory_to(const DpcppExecutor* dest_exec) const override;
2051
2052private:
2053 std::shared_ptr<Executor> master_;
2054
2055 template <typename T>
2056 using queue_manager = std::unique_ptr<T, std::function<void(T*)>>;
2057 queue_manager<sycl::queue> queue_;
2058};
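// Usage sketch (editorial; requires a SYCL device of the requested type):
//
// if (gko::DpcppExecutor::get_num_devices("gpu") > 0) {
//     auto host  = gko::OmpExecutor::create();
//     auto dpcpp = gko::DpcppExecutor::create(0, host, "gpu");
//     std::cout << dpcpp->get_device_type() << " max workgroup size: "
//               << dpcpp->get_max_workgroup_size() << '\n';
// }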
2059
2060
2061namespace kernels {
2062namespace dpcpp {
2063using DefaultExecutor = DpcppExecutor;
2064} // namespace dpcpp
2065} // namespace kernels
2066
2067
2068#undef GKO_OVERRIDE_RAW_COPY_TO
2069
2070
2071} // namespace gko
2072
2073
2074#endif // GKO_PUBLIC_CORE_BASE_EXECUTOR_HPP_
Implement this interface to provide an allocator for CudaExecutor.
Definition memory.hpp:40
Allocator using cudaMalloc.
Definition memory.hpp:102
This is the Executor subclass which represents the CUDA device.
Definition executor.hpp:1485
std::vector< int > get_closest_pus() const
Get the closest PUs.
Definition executor.hpp:1620
int get_closest_numa() const
Get the closest NUMA node.
Definition executor.hpp:1630
int get_num_warps_per_sm() const noexcept
Get the number of warps per SM of this executor.
Definition executor.hpp:1552
int get_major_version() const noexcept
Get the major version of compute capability.
Definition executor.hpp:1585
int get_num_multiprocessor() const noexcept
Get the number of multiprocessor of this executor.
Definition executor.hpp:1560
int get_warp_size() const noexcept
Get the warp size of this executor.
Definition executor.hpp:1577
cusparseContext * get_cusparse_handle() const
Get the cusparse handle for this executor.
Definition executor.hpp:1610
CUstream_st * get_stream() const
Returns the CUDA stream used by this executor.
Definition executor.hpp:1638
int get_minor_version() const noexcept
Get the minor version of compute capability.
Definition executor.hpp:1593
int get_num_warps() const noexcept
Get the number of warps of this executor.
Definition executor.hpp:1568
cublasContext * get_cublas_handle() const
Get the cublas handle for this executor.
Definition executor.hpp:1603
static int get_num_devices()
Get the number of devices present on the system.
This is the Executor subclass which represents a DPC++ enhanced device.
Definition executor.hpp:1901
const std::vector< int > & get_subgroup_sizes() const noexcept
Get the available subgroup sizes for this device.
Definition executor.hpp:1953
int get_num_computing_units() const noexcept
Get the number of Computing Units of this executor.
Definition executor.hpp:1963
int get_max_workgroup_size() const noexcept
Get the maximum workgroup size.
Definition executor.hpp:1992
int get_num_subgroups() const noexcept
Get the number of subgroups of this executor.
Definition executor.hpp:1971
const std::vector< int > & get_max_workitem_sizes() const noexcept
Get the maximum work item sizes.
Definition executor.hpp:1982
int get_max_subgroup_size() const noexcept
Get the maximum subgroup size.
Definition executor.hpp:2002
std::shared_ptr< Executor > get_master() noexcept override
Returns the master OmpExecutor of this Executor.
std::string get_device_type() const noexcept
Get a string representing the device type.
Definition executor.hpp:2012
static int get_num_devices(std::string device_type)
Get the number of devices present on the system.
static std::shared_ptr< DpcppExecutor > create(int device_id, std::shared_ptr< Executor > master, std::string device_type="all", dpcpp_queue_property property=dpcpp_queue_property::in_order)
Creates a new DpcppExecutor.
The first step in using the Ginkgo library consists of creating an executor.
Definition executor.hpp:616
void free(void *ptr) const noexcept
Frees memory previously allocated with Executor::alloc().
Definition executor.hpp:692
virtual void run(const Operation &op) const =0
Runs the specified Operation using this Executor.
void run(const ClosureOmp &op_omp, const ClosureCuda &op_cuda, const ClosureHip &op_hip, const ClosureDpcpp &op_dpcpp) const
Runs one of the passed in functors, depending on the Executor type.
Definition executor.hpp:655
bool should_propagate_log() const
Returns true iff events occurring at an object created on this executor should be logged at propagating loggers attached to this executor.
Definition executor.hpp:849
bool memory_accessible(const std::shared_ptr< const Executor > &other) const
Verifies whether the executors share the same memory.
Definition executor.hpp:862
void copy(size_type num_elems, const T *src_ptr, T *dest_ptr) const
Copies data within this Executor.
Definition executor.hpp:764
void copy_from(ptr_param< const Executor > src_exec, size_type num_elems, const T *src_ptr, T *dest_ptr) const
Copies data from another Executor.
Definition executor.hpp:714
void set_log_propagation_mode(log_propagation_mode mode)
Sets the logger event propagation mode for the executor.
Definition executor.hpp:837
T * alloc(size_type num_elems) const
Allocates memory in this Executor.
Definition executor.hpp:675
virtual std::shared_ptr< Executor > get_master() noexcept=0
Returns the master OmpExecutor of this Executor.
T copy_val_to_host(const T *ptr) const
Retrieves a single element at the given location from executor memory.
Definition executor.hpp:779
void remove_logger(const log::Logger *logger) override
Removes a logger from this executor.
Definition executor.hpp:821
Implement this interface to provide an allocator for HipExecutor.
Definition memory.hpp:65
Allocator using hipMalloc.
Definition memory.hpp:172
This is the Executor subclass which represents the HIP enhanced device.
Definition executor.hpp:1703
int get_num_warps_per_sm() const noexcept
Get the number of warps per SM of this executor.
Definition executor.hpp:1757
int get_major_version() const noexcept
Get the major version of compute capability.
Definition executor.hpp:1773
std::vector< int > get_closest_pus() const
Get the closest PUs.
Definition executor.hpp:1832
int get_minor_version() const noexcept
Get the minor version of compute capability.
Definition executor.hpp:1781
static int get_num_devices()
Get the number of devices present on the system.
int get_num_multiprocessor() const noexcept
Get the number of multiprocessor of this executor.
Definition executor.hpp:1765
int get_num_warps() const noexcept
Get the number of warps of this executor.
Definition executor.hpp:1789
hipsparseContext * get_hipsparse_handle() const
Get the hipsparse handle for this executor.
Definition executor.hpp:1815
int get_closest_numa() const
Get the closest NUMA node.
Definition executor.hpp:1825
int get_warp_size() const noexcept
Get the warp size of this executor.
Definition executor.hpp:1798
hipblasContext * get_hipblas_handle() const
Get the hipblas handle for this executor.
Definition executor.hpp:1808
NotSupported is thrown in case it is not possible to perform the requested operation on the given object.
Definition exception.hpp:128
This is the Executor subclass which represents the OpenMP device (typically CPU).
Definition executor.hpp:1338
std::shared_ptr< Executor > get_master() noexcept override
Returns the master OmpExecutor of this Executor.
static std::shared_ptr< OmpExecutor > create(std::shared_ptr< CpuAllocatorBase > alloc=std::make_shared< CpuAllocator >())
Creates a new OmpExecutor.
Definition executor.hpp:1345
Operations can be used to define functionalities whose implementations differ among devices.
Definition executor.hpp:259
virtual const char * get_name() const noexcept
Returns the operation's name.
This is a specialization of the OmpExecutor, which runs the reference implementations of the kernels, used for debugging purposes.
Definition executor.hpp:1415
void run(const Operation &op) const override
Runs the specified Operation using this Executor.
Definition executor.hpp:1430
This is a deleter that uses an executor's free method to deallocate the data.
Definition executor.hpp:1171
executor_deleter(std::shared_ptr< const Executor > exec)
Creates a new deleter.
Definition executor.hpp:1180
void operator()(pointer ptr) const
Deletes the object.
Definition executor.hpp:1189
EnableLogging is a mixin which should be inherited by any class which wants to enable logging.
Definition logger.hpp:749
Definition logger.hpp:76
virtual bool needs_propagation() const
Returns true if this logger, when attached to an Executor, needs to be forwarded all events from objects on this executor.
Definition logger.hpp:643
static machine_topology * get_instance()
Returns an instance of the machine_topology object.
Definition machine_topology.hpp:183
This class is used for function parameters in the place of raw pointers.
Definition utils_helper.hpp:43
T * get() const
Definition utils_helper.hpp:77
This move-only class uses RAII to set the device id within a scoped block, if necessary.
Definition scoped_device_id_guard.hpp:76
The Ginkgo namespace.
Definition abstract_factory.hpp:20
std::uintptr_t uintptr
Unsigned integer type capable of holding a pointer to void.
Definition types.hpp:138
std::size_t size_type
Integral type used for allocation quantities.
Definition types.hpp:86
log_propagation_mode
How Logger events are propagated to their Executor.
Definition executor.hpp:35
@ automatic
Events get reported to loggers attached to the triggering object and to propagating loggers (Logger::needs_propagation() returns true) attached to its Executor.
@ never
Events only get reported at loggers attached to the triggering object.
allocation_mode
Specify the mode of allocation for CUDA/HIP GPUs.
Definition executor.hpp:63