#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
namespace Eigen {

namespace internal {

template <typename Index, typename InputDims, size_t NumKernelDims, int Layout>
class IndexMapper {
 public:
  IndexMapper(const InputDims& input_dims, const array<Index, NumKernelDims>& kernel_dims,
              const array<Index, NumKernelDims>& indices) {

    // Each convolved dimension shrinks by kernel_dim - 1.
    array<Index, NumDims> dimensions = input_dims;
    for (int i = 0; i < NumKernelDims; ++i) {
      const Index index = indices[i];
      const Index input_dim = input_dims[index];
      const Index kernel_dim = kernel_dims[i];
      const Index result_dim = input_dim - kernel_dim + 1;
      dimensions[index] = result_dim;
    }

    array<Index, NumDims> inputStrides;
    array<Index, NumDims> outputStrides;
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      inputStrides[0] = 1;
      outputStrides[0] = 1;
      for (int i = 1; i < NumDims; ++i) {
        inputStrides[i] = inputStrides[i - 1] * input_dims[i - 1];
        outputStrides[i] = outputStrides[i - 1] * dimensions[i - 1];
      }
    } else {
      inputStrides[NumDims - 1] = 1;
      outputStrides[NumDims - 1] = 1;
      for (int i = static_cast<int>(NumDims) - 2; i >= 0; --i) {
        inputStrides[i] = inputStrides[i + 1] * input_dims[i + 1];
        outputStrides[i] = outputStrides[i + 1] * dimensions[i + 1];
      }
    }

    // Reorder the dimensions so that the convolved dimensions come first
    // (ColMajor) or last (RowMajor) in the GPU indexing scheme.
    array<Index, NumDims> cudaInputDimensions;
    array<Index, NumDims> cudaOutputDimensions;
    array<Index, NumDims> tmp = dimensions;
    array<Index, NumDims> ordering;
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                              ? 0
                              : NumDims - NumKernelDims;
    for (int i = 0; i < NumKernelDims; ++i) {
      const Index index = i + offset;
      ordering[index] = indices[i];
      tmp[indices[i]] = -1;
      cudaInputDimensions[index] = input_dims[indices[i]];
      cudaOutputDimensions[index] = dimensions[indices[i]];
    }

    int written = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                      ? NumKernelDims
                      : 0;
    for (int i = 0; i < NumDims; ++i) {
      if (tmp[i] >= 0) {
        ordering[written] = i;
        cudaInputDimensions[written] = input_dims[i];
        cudaOutputDimensions[written] = dimensions[i];
        ++written;
      }
    }

    for (int i = 0; i < NumDims; ++i) {
      m_inputStrides[i] = inputStrides[ordering[i]];
      m_outputStrides[i] = outputStrides[ordering[i]];
    }

    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = 0; i < NumDims; ++i) {
        if (i > NumKernelDims) {
          m_cudaInputStrides[i] =
              m_cudaInputStrides[i - 1] * cudaInputDimensions[i - 1];
          m_cudaOutputStrides[i] =
              m_cudaOutputStrides[i - 1] * cudaOutputDimensions[i - 1];
        } else {
          m_cudaInputStrides[i] = 1;
          m_cudaOutputStrides[i] = 1;
        }
      }
    } else {
      for (int i = NumDims - 1; i >= 0; --i) {
        if (i + 1 < offset) {
          m_cudaInputStrides[i] =
              m_cudaInputStrides[i + 1] * cudaInputDimensions[i + 1];
          m_cudaOutputStrides[i] =
              m_cudaOutputStrides[i + 1] * cudaOutputDimensions[i + 1];
        } else {
          m_cudaInputStrides[i] = 1;
          m_cudaOutputStrides[i] = 1;
        }
      }
    }
  }
  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputPlaneToTensorInputOffset(Index p) const {
    Index inputIndex = 0;
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int d = NumDims - 1; d > NumKernelDims; --d) {
        const Index idx = p / m_cudaInputStrides[d];
        inputIndex += idx * m_inputStrides[d];
        p -= idx * m_cudaInputStrides[d];
      }
      inputIndex += p * m_inputStrides[NumKernelDims];
    } else {
      int limit = 0;
      if (NumKernelDims < NumDims) {
        limit = NumDims - NumKernelDims - 1;
      }
      for (int d = 0; d < limit; ++d) {
        const Index idx = p / m_cudaInputStrides[d];
        inputIndex += idx * m_inputStrides[d];
        p -= idx * m_cudaInputStrides[d];
      }
      inputIndex += p * m_inputStrides[limit];
    }
    return inputIndex;
  }
  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputPlaneToTensorOutputOffset(Index p) const {
    Index outputIndex = 0;
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int d = NumDims - 1; d > NumKernelDims; --d) {
        const Index idx = p / m_cudaOutputStrides[d];
        outputIndex += idx * m_outputStrides[d];
        p -= idx * m_cudaOutputStrides[d];
      }
      outputIndex += p * m_outputStrides[NumKernelDims];
    } else {
      int limit = 0;
      if (NumKernelDims < NumDims) {
        limit = NumDims - NumKernelDims - 1;
      }
      for (int d = 0; d < limit; ++d) {
        const Index idx = p / m_cudaOutputStrides[d];
        outputIndex += idx * m_outputStrides[d];
        p -= idx * m_cudaOutputStrides[d];
      }
      outputIndex += p * m_outputStrides[limit];
    }
    return outputIndex;
  }
  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i) const {
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                              ? 0
                              : NumDims - NumKernelDims;
    return i * m_inputStrides[offset];
  }

  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i) const {
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                              ? 0
                              : NumDims - NumKernelDims;
    return i * m_outputStrides[offset];
  }

  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i, Index j) const {
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                              ? 0
                              : NumDims - NumKernelDims;
    return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1];
  }

  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i, Index j) const {
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                              ? 0
                              : NumDims - NumKernelDims;
    return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1];
  }

  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i, Index j, Index k) const {
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                              ? 0
                              : NumDims - NumKernelDims;
    return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1] +
           k * m_inputStrides[offset + 2];
  }

  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const {
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                              ? 0
                              : NumDims - NumKernelDims;
    return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1] +
           k * m_outputStrides[offset + 2];
  }
 private:
  static const size_t NumDims = internal::array_size<InputDims>::value;
  array<Index, NumDims> m_inputStrides;
  array<Index, NumDims> m_outputStrides;
  array<Index, NumDims> m_cudaInputStrides;
  array<Index, NumDims> m_cudaOutputStrides;
};
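
// Illustrative sketch (not part of the original file): the CUDA kernels below
// combine the two IndexMapper queries to turn a (plane, position) pair into a
// linear tensor index. Assuming a 1D convolution and hypothetical plane index
// p and horizontal position x, the pattern is roughly:
//
//   internal::IndexMapper<Index, InputDims, 1, Layout> mapper(
//       input_dims, kernel_dims, indices);
//   const Index plane_offset  = mapper.mapCudaInputPlaneToTensorInputOffset(p);
//   const Index x_offset      = mapper.mapCudaInputKernelToTensorInputOffset(x);
//   const Index tensor_index  = plane_offset + x_offset;  // linear input index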
template<typename Dimensions, typename InputXprType, typename KernelXprType>
struct traits<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> >
{
  // Type promotion to handle the case where the types of the lhs and the rhs are different.
  typedef typename promote_storage_type<typename InputXprType::Scalar,
                                        typename KernelXprType::Scalar>::ret Scalar;
  typedef typename packet_traits<Scalar>::type Packet;
  typedef typename promote_storage_type<typename traits<InputXprType>::StorageKind,
                                        typename traits<KernelXprType>::StorageKind>::ret StorageKind;
  typedef typename promote_index_type<typename traits<InputXprType>::Index,
                                      typename traits<KernelXprType>::Index>::type Index;
  typedef typename InputXprType::Nested LhsNested;
  typedef typename KernelXprType::Nested RhsNested;
  typedef typename remove_reference<LhsNested>::type _LhsNested;
  typedef typename remove_reference<RhsNested>::type _RhsNested;
  static const int NumDimensions = traits<InputXprType>::NumDimensions;
  static const int Layout = traits<InputXprType>::Layout;
};

template<typename Dimensions, typename InputXprType, typename KernelXprType>
struct eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, Eigen::Dense>
{
  typedef const TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>& type;
};

template<typename Dimensions, typename InputXprType, typename KernelXprType>
struct nested<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, 1, typename eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> >::type>
{
  typedef TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> type;
};

}  // end namespace internal
template<typename Indices, typename InputXprType, typename KernelXprType>
class TensorConvolutionOp : public TensorBase<TensorConvolutionOp<Indices, InputXprType, KernelXprType> >
{
  public:
    typedef typename Eigen::internal::traits<TensorConvolutionOp>::Scalar Scalar;
    typedef typename Eigen::internal::traits<TensorConvolutionOp>::Packet Packet;
    typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
    typedef typename internal::promote_storage_type<typename InputXprType::CoeffReturnType,
                                                    typename KernelXprType::CoeffReturnType>::ret CoeffReturnType;
    typedef typename internal::promote_storage_type<typename InputXprType::PacketReturnType,
                                                    typename KernelXprType::PacketReturnType>::ret PacketReturnType;
    typedef typename Eigen::internal::nested<TensorConvolutionOp>::type Nested;
    typedef typename Eigen::internal::traits<TensorConvolutionOp>::StorageKind StorageKind;
    typedef typename Eigen::internal::traits<TensorConvolutionOp>::Index Index;

    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConvolutionOp(const InputXprType& input, const KernelXprType& kernel, const Indices& dims)
      : m_input_xpr(input), m_kernel_xpr(kernel), m_indices(dims) {}

    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    const Indices& indices() const { return m_indices; }

    /** \returns the nested expressions */
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    const typename internal::remove_all<typename InputXprType::Nested>::type&
    inputExpression() const { return m_input_xpr; }

    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    const typename internal::remove_all<typename KernelXprType::Nested>::type&
    kernelExpression() const { return m_kernel_xpr; }

  protected:
    typename InputXprType::Nested m_input_xpr;
    typename KernelXprType::Nested m_kernel_xpr;
    const Indices m_indices;
};
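
// Usage sketch (added for illustration, not part of the original header): a
// TensorConvolutionOp is normally created through TensorBase::convolve(),
// which takes the kernel and the list of dimensions to convolve over. Every
// convolved dimension shrinks by kernel_size - 1, as computed by the
// evaluators below.
//
//   Eigen::Tensor<float, 3> input(20, 30, 40);
//   Eigen::Tensor<float, 2> kernel(3, 3);
//   Eigen::array<ptrdiff_t, 2> dims({0, 1});      // convolve dims 0 and 1
//   Eigen::Tensor<float, 3> result(18, 28, 40);   // 20-3+1, 30-3+1, 40
//   result = input.convolve(kernel, dims);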
template<typename Indices, typename InputArgType, typename KernelArgType, typename Device>
struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Device>
{
  typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;

  static const int NumDims = internal::array_size<typename TensorEvaluator<InputArgType, Device>::Dimensions>::value;
  static const int NumKernelDims = internal::array_size<Indices>::value;
  typedef typename XprType::Index Index;
  typedef DSizes<Index, NumDims> Dimensions;

  enum {
    IsAligned = TensorEvaluator<InputArgType, Device>::IsAligned & TensorEvaluator<KernelArgType, Device>::IsAligned,
    PacketAccess = TensorEvaluator<InputArgType, Device>::PacketAccess & TensorEvaluator<KernelArgType, Device>::PacketAccess,
    Layout = TensorEvaluator<InputArgType, Device>::Layout
  };

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
      : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_kernel(NULL), m_local_kernel(false), m_device(device)
  {
    EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);

    const typename TensorEvaluator<InputArgType, Device>::Dimensions& input_dims = m_inputImpl.dimensions();
    const typename TensorEvaluator<KernelArgType, Device>::Dimensions& kernel_dims = m_kernelImpl.dimensions();

    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      m_inputStride[0] = 1;
      for (int i = 1; i < NumDims; ++i) {
        m_inputStride[i] = m_inputStride[i - 1] * input_dims[i - 1];
      }
    } else {
      m_inputStride[NumDims - 1] = 1;
      for (int i = NumDims - 2; i >= 0; --i) {
        m_inputStride[i] = m_inputStride[i + 1] * input_dims[i + 1];
      }
    }

    m_dimensions = m_inputImpl.dimensions();
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = 0; i < NumKernelDims; ++i) {
        const Index index = op.indices()[i];
        const Index input_dim = input_dims[index];
        const Index kernel_dim = kernel_dims[i];
        const Index result_dim = input_dim - kernel_dim + 1;
        m_dimensions[index] = result_dim;
        if (i > 0) {
          m_kernelStride[i] = m_kernelStride[i - 1] * kernel_dims[i - 1];
        } else {
          m_kernelStride[0] = 1;
        }
        m_indexStride[i] = m_inputStride[index];
      }

      m_outputStride[0] = 1;
      for (int i = 1; i < NumDims; ++i) {
        m_outputStride[i] = m_outputStride[i - 1] * m_dimensions[i - 1];
      }
    } else {
      for (int i = NumKernelDims - 1; i >= 0; --i) {
        const Index index = op.indices()[i];
        const Index input_dim = input_dims[index];
        const Index kernel_dim = kernel_dims[i];
        const Index result_dim = input_dim - kernel_dim + 1;
        m_dimensions[index] = result_dim;
        if (i < NumKernelDims - 1) {
          m_kernelStride[i] = m_kernelStride[i + 1] * kernel_dims[i + 1];
        } else {
          m_kernelStride[NumKernelDims - 1] = 1;
        }
        m_indexStride[i] = m_inputStride[index];
      }

      m_outputStride[NumDims - 1] = 1;
      for (int i = NumDims - 2; i >= 0; --i) {
        m_outputStride[i] = m_outputStride[i + 1] * m_dimensions[i + 1];
      }
    }
  }
  typedef typename XprType::Scalar Scalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename XprType::PacketReturnType PacketReturnType;

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
    m_inputImpl.evalSubExprsIfNeeded(NULL);
    preloadKernel();
    return true;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
    m_inputImpl.cleanup();
    if (m_local_kernel) {
      m_device.deallocate((void*)m_kernel);
      m_local_kernel = false;
    }
    m_kernel = NULL;
  }

  void evalTo(typename XprType::Scalar* buffer) {
    evalSubExprsIfNeeded(NULL);
    for (int i = 0; i < dimensions().TotalSize(); ++i) {
      buffer[i] += coeff(i);
    }
    cleanup();
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
  {
    CoeffReturnType result = CoeffReturnType(0);
    convolve(firstInput(index), 0, NumKernelDims-1, result);
    return result;
  }
  template<int LoadMode>
  EIGEN_DEVICE_FUNC PacketReturnType packet(const Index index) const
  {
    const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
    Index indices[2] = {index, index+PacketSize-1};
    Index startInputs[2] = {0, 0};
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = NumDims - 1; i > 0; --i) {
        const Index idx0 = indices[0] / m_outputStride[i];
        const Index idx1 = indices[1] / m_outputStride[i];
        startInputs[0] += idx0 * m_inputStride[i];
        startInputs[1] += idx1 * m_inputStride[i];
        indices[0] -= idx0 * m_outputStride[i];
        indices[1] -= idx1 * m_outputStride[i];
      }
    } else {
      for (int i = 0; i < NumDims - 1; ++i) {
        const Index idx0 = indices[0] / m_outputStride[i];
        const Index idx1 = indices[1] / m_outputStride[i];
        startInputs[0] += idx0 * m_inputStride[i];
        startInputs[1] += idx1 * m_inputStride[i];
        indices[0] -= idx0 * m_outputStride[i];
        indices[1] -= idx1 * m_outputStride[i];
      }
    }
    startInputs[0] += indices[0];
    startInputs[1] += indices[1];

    if (startInputs[1]-startInputs[0] == PacketSize-1) {
      // The packet maps to a contiguous range of inputs: convolve it in one go.
      PacketReturnType result = internal::pset1<PacketReturnType>(0);
      convolvePacket(startInputs[0], 0, NumKernelDims-1, result);
      return result;
    } else {
      // Otherwise compute each coefficient separately and assemble the packet.
      EIGEN_ALIGN_MAX Scalar data[PacketSize];
      data[0] = Scalar(0);
      convolve(startInputs[0], 0, NumKernelDims-1, data[0]);
      for (int i = 1; i < PacketSize-1; ++i) {
        data[i] = Scalar(0);
        convolve(firstInput(index+i), 0, NumKernelDims-1, data[i]);
      }
      data[PacketSize-1] = Scalar(0);
      convolve(startInputs[1], 0, NumKernelDims-1, data[PacketSize-1]);
      return internal::pload<PacketReturnType>(data);
    }
  }

  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
 private:
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
    Index startInput = 0;
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = NumDims - 1; i > 0; --i) {
        const Index idx = index / m_outputStride[i];
        startInput += idx * m_inputStride[i];
        index -= idx * m_outputStride[i];
      }
    } else {
      for (int i = 0; i < NumDims - 1; ++i) {
        const Index idx = index / m_outputStride[i];
        startInput += idx * m_inputStride[i];
        index -= idx * m_outputStride[i];
      }
    }
    startInput += index;
    return startInput;
  }
  // Recursively accumulate the convolution over the kernel dimensions.
  EIGEN_DEVICE_FUNC void convolve(Index firstIndex, Index firstKernel, int DimIndex, CoeffReturnType& accum) const {
    for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
      const Index input = firstIndex + j * m_indexStride[DimIndex];
      const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
      if (DimIndex > 0) {
        convolve(input, kernel, DimIndex-1, accum);
      } else {
        accum += m_inputImpl.coeff(input) * m_kernel[kernel];
      }
    }
  }

  template <typename Packet>
  EIGEN_DEVICE_FUNC void convolvePacket(Index firstIndex, Index firstKernel, int DimIndex, Packet& accum) const {
    for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
      const Index input = firstIndex + j * m_indexStride[DimIndex];
      const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
      if (DimIndex > 0) {
        convolvePacket(input, kernel, DimIndex-1, accum);
      } else {
        accum = internal::pmadd<Packet>(m_inputImpl.template packet<Unaligned>(input), internal::pset1<Packet>(m_kernel[kernel]), accum);
      }
    }
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() {
    // Don't make a local copy of the kernel unless we have to (i.e. it's an
    // expression that needs to be evaluated).
    const Scalar* in_place = m_kernelImpl.data();
    if (in_place) {
      m_kernel = in_place;
      m_local_kernel = false;
    } else {
      size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
      Scalar* local = (Scalar*)m_device.allocate(kernel_sz);
      typedef TensorEvalToOp<const KernelArgType> EvalTo;
      EvalTo evalToTmp(local, m_kernelArg);
      const bool PacketAccess = internal::IsVectorizable<Device, KernelArgType>::value;
      internal::TensorExecutor<const EvalTo, Device, PacketAccess>::run(evalToTmp, m_device);

      m_kernel = local;
      m_local_kernel = true;
    }
  }

  array<Index, NumDims> m_inputStride;
  array<Index, NumDims> m_outputStride;

  array<Index, NumKernelDims> m_indexStride;
  array<Index, NumKernelDims> m_kernelStride;
  TensorEvaluator<InputArgType, Device> m_inputImpl;
  TensorEvaluator<KernelArgType, Device> m_kernelImpl;
  Dimensions m_dimensions;

  KernelArgType m_kernelArg;
  const Scalar* m_kernel;
  bool m_local_kernel;
  const Device& m_device;
};
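
// Worked example (illustration only, not in the original source): for a 2D
// kernel, coeff(index) above expands the recursion in convolve() into two
// nested loops over the kernel extents, roughly equivalent to:
//
//   CoeffReturnType accum = CoeffReturnType(0);
//   const Index start = firstInput(index);
//   for (int j1 = 0; j1 < kernel_dims[1]; ++j1) {
//     for (int j0 = 0; j0 < kernel_dims[0]; ++j0) {
//       accum += m_inputImpl.coeff(start + j0 * m_indexStride[0]
//                                        + j1 * m_indexStride[1])
//                * m_kernel[j0 * m_kernelStride[0] + j1 * m_kernelStride[1]];
//     }
//   }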
// Use an optimized implementation of the evaluation code for GPUs whenever possible.
#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)

template <int StaticKernelSize>
struct GetKernelSize {
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int /*kernelSize*/) const {
    return StaticKernelSize;
  }
};
template <>
struct GetKernelSize<Dynamic> {
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int kernelSize) const {
    return kernelSize;
  }
};

template <typename InputEvaluator, typename Index, typename InputDims,
          int StaticKernelSize>
__global__ void EigenConvolutionKernel1D(
    InputEvaluator eval,
    const internal::IndexMapper<Index, InputDims, 1, InputEvaluator::Layout>
        indexMapper,
    const float* __restrict kernel, const int numPlanes, const int numX,
    const int maxX, const int kernelSize, float* buffer) {
  extern __shared__ float s[];

  const int first_x = blockIdx.x * maxX;
  const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
  const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSize>()(kernelSize);
  const int num_x_output = last_x - first_x + 1;

  const int first_plane = blockIdx.y * blockDim.y;
  const int plane_stride = blockDim.y * gridDim.y;

  for (int p = first_plane + threadIdx.y; p < numPlanes; p += plane_stride) {
    // Load the input tile into shared memory.
    const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p);
    const int plane_kernel_offset = threadIdx.y * num_x_input;
    for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
      const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x);
      s[i + plane_kernel_offset] = eval.coeff(tensor_index);
    }

    __syncthreads();

    // Compute the convolution from shared memory.
    const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p);

    for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
      const int kernel_offset = plane_kernel_offset + i;
      float result = 0.0f;
      for (int k = 0; k < GetKernelSize<StaticKernelSize>()(kernelSize); ++k) {
        result += s[k + kernel_offset] * kernel[k];
      }
      const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x);
      buffer[tensor_index] = result;
    }

    __syncthreads();
  }
}
template <typename InputEvaluator, typename Index, typename InputDims,
          int StaticKernelSizeX, int StaticKernelSizeY>
__global__ void EigenConvolutionKernel2D(
    InputEvaluator eval,
    const internal::IndexMapper<Index, InputDims, 2, InputEvaluator::Layout>
        indexMapper,
    const float* __restrict kernel, const int numPlanes, const int numX,
    const int maxX, const int numY, const int maxY, const int kernelSizeX,
    const int kernelSizeY, float* buffer) {
  extern __shared__ float s[];

  const int first_x = blockIdx.x * maxX;
  const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
  const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSizeX>()(kernelSizeX);
  const int num_x_output = last_x - first_x + 1;

  const int first_y = blockIdx.y * maxY;
  const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
  const int num_y_input = last_y - first_y + GetKernelSize<StaticKernelSizeY>()(kernelSizeY);
  const int num_y_output = last_y - first_y + 1;

  const int first_plane = blockIdx.z * blockDim.z;
  const int plane_stride = blockDim.z * gridDim.z;

  for (int p = first_plane + threadIdx.z; p < numPlanes; p += plane_stride) {

    const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p);
    const int plane_kernel_offset = threadIdx.z * num_y_input;

    // Load the input tile into shared memory.
    for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
      const int input_offset = num_x_input * (j + plane_kernel_offset);
      for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
        const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x, j+first_y);
        s[i + input_offset] = eval.coeff(tensor_index);
      }
    }

    __syncthreads();

    // Compute the convolution from shared memory.
    const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p);

    for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
      for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
        float result = 0.0f;
        for (int l = 0; l < GetKernelSize<StaticKernelSizeY>()(kernelSizeY); ++l) {
          const int kernel_offset = kernelSizeX * l;
          const int input_offset = i + num_x_input * (j + l + plane_kernel_offset);
          for (int k = 0; k < GetKernelSize<StaticKernelSizeX>()(kernelSizeX); ++k) {
            result += s[k + input_offset] * kernel[k + kernel_offset];
          }
        }
        const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x, j+first_y);
        buffer[tensor_index] = result;
      }
    }

    __syncthreads();
  }
}
template <typename InputEvaluator, typename Index, typename InputDims>
__global__ void EigenConvolutionKernel3D(
    InputEvaluator eval,
    const internal::IndexMapper<Index, InputDims, 3, InputEvaluator::Layout>
        indexMapper,
    const float* __restrict kernel, const size_t numPlanes, const size_t numX,
    const size_t maxX, const size_t numY, const size_t maxY, const size_t numZ,
    const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY,
    const size_t kernelSizeZ, float* buffer) {
  extern __shared__ float s[];

  // Load the input tile into shared memory.
  const int first_x = blockIdx.x * maxX;
  const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
  const int num_x_input = last_x - first_x + kernelSizeX;

  const int first_y = blockIdx.y * maxY;
  const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
  const int num_y_input = last_y - first_y + kernelSizeY;

  const int first_z = blockIdx.z * maxZ;
  const int last_z = (first_z + maxZ < numZ ? first_z + maxZ : numZ) - 1;
  const int num_z_input = last_z - first_z + kernelSizeZ;

  for (int p = 0; p < numPlanes; ++p) {

    const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p);
    const int plane_kernel_offset = 0;

    for (int k = threadIdx.z; k < num_z_input; k += blockDim.z) {
      for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
        for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
          const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x, j+first_y, k+first_z);
          s[i + num_x_input * (j + num_y_input * (k + plane_kernel_offset))] = eval.coeff(tensor_index);
        }
      }
    }

    __syncthreads();

    // Compute the convolution from shared memory.
    const int num_z_output = last_z - first_z + 1;
    const int num_y_output = last_y - first_y + 1;
    const int num_x_output = last_x - first_x + 1;
    const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p);

    for (int k = threadIdx.z; k < num_z_output; k += blockDim.z) {
      for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
        for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
          float result = 0.0f;
          for (int n = 0; n < kernelSizeZ; ++n) {
            for (int m = 0; m < kernelSizeY; ++m) {
              for (int l = 0; l < kernelSizeX; ++l) {
                result += s[i + l + num_x_input * (j + m + num_y_input * (k + n + plane_kernel_offset))] * kernel[l + kernelSizeX * (m + kernelSizeY * n)];
              }
            }
          }
          const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x, j+first_y, k+first_z);
          buffer[tensor_index] = result;
        }
      }
    }
    __syncthreads();
  }
}
template<typename Indices, typename InputArgType, typename KernelArgType>
struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, GpuDevice>
{
  typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;

  static const int NumDims = internal::array_size<typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions>::value;
  static const int NumKernelDims = internal::array_size<Indices>::value;
  typedef typename XprType::Index Index;
  typedef DSizes<Index, NumDims> Dimensions;
  typedef typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions KernelDimensions;

  enum {
    IsAligned = TensorEvaluator<InputArgType, GpuDevice>::IsAligned & TensorEvaluator<KernelArgType, GpuDevice>::IsAligned,
    PacketAccess = false,
    Layout = TensorEvaluator<InputArgType, GpuDevice>::Layout
  };

  EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const GpuDevice& device)
      : m_inputImpl(op.inputExpression(), device), m_kernelArg(op.kernelExpression()), m_kernelImpl(op.kernelExpression(), device), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL), m_local_kernel(false), m_device(device)
  {
    EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, GpuDevice>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, GpuDevice>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);

    const typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions& input_dims = m_inputImpl.dimensions();
    const typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions& kernel_dims = m_kernelImpl.dimensions();

    m_dimensions = m_inputImpl.dimensions();
    for (int i = 0; i < NumKernelDims; ++i) {
      const Index index = op.indices()[i];
      const Index input_dim = input_dims[index];
      const Index kernel_dim = kernel_dims[i];
      const Index result_dim = input_dim - kernel_dim + 1;
      m_dimensions[index] = result_dim;
    }
  }

  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename XprType::PacketReturnType PacketReturnType;
  typedef typename InputArgType::Scalar Scalar;

  EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dimensions; }
  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
    preloadKernel();
    m_inputImpl.evalSubExprsIfNeeded(NULL);
    if (data) {
      executeEval(data);
      return false;
    } else {
      m_buf = (Scalar*)m_device.allocate(dimensions().TotalSize() * sizeof(Scalar));
      executeEval(m_buf);
      return true;
    }
  }

  EIGEN_STRONG_INLINE void cleanup() {
    m_inputImpl.cleanup();
    if (m_buf) {
      m_device.deallocate(m_buf);
      m_buf = NULL;
    }
    if (m_local_kernel) {
      m_device.deallocate((void*)m_kernel);
      m_local_kernel = false;
    }
    m_kernel = NULL;
  }

  EIGEN_STRONG_INLINE void preloadKernel() {
    // Don't make a local copy of the kernel unless we have to (i.e. it's an
    // expression that needs to be evaluated).
    const Scalar* in_place = m_kernelImpl.data();
    if (in_place) {
      m_kernel = in_place;
      m_local_kernel = false;
    } else {
      size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
      Scalar* local = (Scalar*)m_device.allocate(kernel_sz);
      typedef TensorEvalToOp<const KernelArgType> EvalTo;
      EvalTo evalToTmp(local, m_kernelArg);
      const bool PacketAccess = internal::IsVectorizable<GpuDevice, KernelArgType>::value;
      internal::TensorExecutor<const EvalTo, GpuDevice, PacketAccess>::run(evalToTmp, m_device);

      m_kernel = local;
      m_local_kernel = true;
    }
  }
  static unsigned int ceil(unsigned int num, unsigned int denom) {
    const unsigned int rounded_toward_zero = num / denom;
    if (num > rounded_toward_zero * denom) {
      return rounded_toward_zero + 1;
    }
    return rounded_toward_zero;
  }
  void executeEval(Scalar* data) const {
    typedef typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions InputDims;

    const int maxSharedMem = m_device.sharedMemPerBlock();
    const int maxThreadsPerBlock = m_device.maxCudaThreadsPerBlock();
    const int maxBlocksPerProcessor = m_device.maxCudaThreadsPerMultiProcessor() / maxThreadsPerBlock;
    const int numMultiProcessors = m_device.getNumCudaMultiProcessors();
    const int warpSize = 32;

    switch (NumKernelDims) {
      case 1: {
        const int kernel_size = m_kernelImpl.dimensions().TotalSize();

        const int numX = dimensions()[m_indices[0]];
        const int numP = dimensions().TotalSize() / numX;
        int maxX;
        dim3 block_size;

        const int single_stride_dim =
            static_cast<int>(Layout) == static_cast<int>(ColMajor)
                ? 0
                : m_inputImpl.dimensions().rank() - 1;
        if (m_indices[0] == single_stride_dim) {
          // Maximize the reuse of the data loaded into shared memory.
          const int inner_dim = ((maxSharedMem / (sizeof(Scalar)) - kernel_size + 1 + 31) / 32) * 32;
          maxX = numext::mini<int>(inner_dim, numX);
          const int maxP = numext::mini<int>(maxSharedMem / ((kernel_size - 1 + maxX) * sizeof(Scalar)), numP);
          block_size.x = numext::mini(maxThreadsPerBlock, maxX);
          block_size.y = numext::mini<int>(maxThreadsPerBlock / block_size.x, maxP);
        } else {
          // Read as much as possible along the innermost dimension, i.e. the plane.
          const int inner_dim = maxSharedMem / ((warpSize + kernel_size) * sizeof(Scalar));
          const int maxP = numext::mini<int>(inner_dim, numP);
          maxX = numext::mini<int>(maxSharedMem / (inner_dim * sizeof(Scalar)) - kernel_size + 1, numX);

          block_size.x = numext::mini(warpSize, maxX);
          block_size.y = numext::mini<int>(maxThreadsPerBlock/block_size.x, maxP);
        }

        const int shared_mem = block_size.y * (maxX + kernel_size - 1) * sizeof(Scalar);
        assert(shared_mem <= maxSharedMem);

        const int num_x_blocks = ceil(numX, maxX);
        const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
        const int num_y_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks);

        dim3 num_blocks(num_x_blocks, numext::mini<int>(num_y_blocks, ceil(numP, block_size.y)));

        const array<Index, 1> indices(m_indices[0]);
        const array<Index, 1> kernel_dims(m_kernelImpl.dimensions()[0]);
        internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper(
            m_inputImpl.dimensions(), kernel_dims, indices);
        switch (kernel_size) {
          case 4: {
            LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 4, data);
            break;
          }
          case 7: {
            LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 7, data);
            break;
          }
          default: {
            LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, kernel_size, data);
          }
        }
        break;
      }
      case 2: {
        const int idxX =
            static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 1;
        const int idxY =
            static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 0;
        const int kernel_size_x = m_kernelImpl.dimensions()[idxX];
        const int kernel_size_y = m_kernelImpl.dimensions()[idxY];

        const int numX = dimensions()[m_indices[idxX]];
        const int numY = dimensions()[m_indices[idxY]];
        const int numP = dimensions().TotalSize() / (numX*numY);

        const float scaling_factor = sqrtf(static_cast<float>(maxSharedMem) / (sizeof(Scalar) * kernel_size_y * kernel_size_x));

        // Snap the X tile size to a multiple of the warp size (32).
        int inner_dim = ((static_cast<int>(scaling_factor * kernel_size_x) - kernel_size_x + 1 + 32) / 32) * 32;
        const int maxX = numext::mini<int>(inner_dim, numX);
        const int maxY = numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1)) - kernel_size_y + 1, numY);
        const int maxP = numext::mini<int>(maxSharedMem / ((kernel_size_x - 1 + maxX) * (kernel_size_y - 1 + maxY) * sizeof(Scalar)), numP);

        dim3 block_size;
        block_size.x = numext::mini(1024, maxX);
        block_size.y = numext::mini<int>(1024/block_size.x, maxY);
        block_size.z = numext::mini<int>(1024/(block_size.x*block_size.y), maxP);

        const int shared_mem = block_size.z * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * sizeof(Scalar);
        assert(shared_mem <= maxSharedMem);

        const int num_x_blocks = ceil(numX, maxX);
        const int num_y_blocks = ceil(numY, maxY);
        const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
        const int num_z_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks * num_y_blocks);

        dim3 num_blocks(num_x_blocks, num_y_blocks, numext::mini<int>(num_z_blocks, ceil(numP, block_size.z)));

        const array<Index, 2> indices(m_indices[idxX], m_indices[idxY]);
        const array<Index, 2> kernel_dims(m_kernelImpl.dimensions()[idxX],
                                          m_kernelImpl.dimensions()[idxY]);
        internal::IndexMapper<Index, InputDims, 2, Layout> indexMapper(
            m_inputImpl.dimensions(), kernel_dims, indices);
        switch (kernel_size_x) {
          case 4: {
            switch (kernel_size_y) {
              case 7: {
                LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, 7, data);
                break;
              }
              default: {
                LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, kernel_size_y, data);
                break;
              }
            }
            break;
          }
          case 7: {
            switch (kernel_size_y) {
              case 4: {
                LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, 4, data);
                break;
              }
              default: {
                LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, kernel_size_y, data);
                break;
              }
            }
            break;
          }
          default: {
            LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data);
            break;
          }
        }
        break;
      }
      case 3: {
        const int idxX =
            static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 2;
        const int idxY =
            static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 1;
        const int idxZ =
            static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 0;

        const int kernel_size_x = m_kernelImpl.dimensions()[idxX];
        const int kernel_size_y = m_kernelImpl.dimensions()[idxY];
        const int kernel_size_z = m_kernelImpl.dimensions()[idxZ];

        const int numX = dimensions()[m_indices[idxX]];
        const int numY = dimensions()[m_indices[idxY]];
        const int numZ = dimensions()[m_indices[idxZ]];
        const int numP = dimensions().TotalSize() / (numX*numY*numZ);

        const int maxX = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * kernel_size_y * kernel_size_z) - kernel_size_x + 1, numX));
        const int maxY = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * kernel_size_z) - kernel_size_y + 1, numY));
        const int maxZ = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1)) - kernel_size_z + 1, numZ));

        dim3 block_size;
        block_size.x = numext::mini(32, maxX);
        block_size.y = numext::mini(32, maxY);
        block_size.z = numext::mini<int>(1024/(block_size.x*block_size.y), maxZ);
        dim3 num_blocks(ceil(numX, maxX), ceil(numY, maxY), ceil(numZ, maxZ));

        const int shared_mem = (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * (maxZ + kernel_size_z - 1) * sizeof(Scalar);
        assert(shared_mem <= maxSharedMem);

        const array<Index, 3> indices(m_indices[idxX], m_indices[idxY],
                                      m_indices[idxZ]);
        const array<Index, 3> kernel_dims(m_kernelImpl.dimensions()[idxX],
                                          m_kernelImpl.dimensions()[idxY],
                                          m_kernelImpl.dimensions()[idxZ]);
        internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(
            m_inputImpl.dimensions(), kernel_dims, indices);

        LAUNCH_CUDA_KERNEL((EigenConvolutionKernel3D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data);
        break;
      }

      default: {
        EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3), THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE);
      }
    }
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
  {
    eigen_assert(m_buf);
    eigen_assert(index < m_dimensions.TotalSize());
    return m_buf[index];
  }

  template<int LoadMode>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const
  {
    eigen_assert(m_buf);
    eigen_assert(index < m_dimensions.TotalSize());
    return internal::ploadt<PacketReturnType, LoadMode>(m_buf+index);
  }

 private:
  // No assignment (copies are needed by the kernels).
  TensorEvaluator& operator = (const TensorEvaluator&);

  TensorEvaluator<InputArgType, GpuDevice> m_inputImpl;
  TensorEvaluator<KernelArgType, GpuDevice> m_kernelImpl;
  KernelArgType m_kernelArg;
  Indices m_indices;
  Dimensions m_dimensions;
  Scalar* m_buf;
  const Scalar* m_kernel;
  bool m_local_kernel;

  const GpuDevice& m_device;
};

#endif  // defined(EIGEN_USE_GPU) && defined(__CUDACC__)

} // end namespace Eigen

#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H