#ifndef THC_TENSORSORT_CUH
#define THC_TENSORSORT_CUH

#include <THC/THCTensorMath.h>
#include <THC/THCGeneral.h>
#include <THC/THCReduceApplyUtils.cuh>
#include <THC/THCSortUtils.cuh>
#include <THC/THCTensorCopy.h>
#include <THC/THCTensorTypeUtils.cuh>
#include <THC/THCThrustAllocator.cuh>

#include <thrust/device_ptr.h>
#include <thrust/sort.h>
#if CUDA_VERSION >= 7000 || defined(__HIP_PLATFORM_HCC__)
#include <thrust/system/cuda/execution_policy.h>
#endif

template <typename T, bool handleNaN = false>
struct ThrustGTOp {
  __device__ bool operator()(const T& lhs, const T& rhs) const {
    // Descending comparator; with handleNaN, NaNs compare as greater than
    // every non-NaN value and therefore sort to the front.
    return (handleNaN && THCNumerics<T>::isnan(lhs) && !THCNumerics<T>::isnan(rhs)) ||
           THCNumerics<T>::gt(lhs, rhs);
  }
};

template <typename T, bool handleNaN = false>
struct ThrustLTOp {
  __device__ bool operator()(const T& lhs, const T& rhs) const {
    // Ascending comparator; with handleNaN, NaNs compare as greater than
    // every non-NaN value and therefore sort to the back.
    return (handleNaN && THCNumerics<T>::isnan(rhs) && !THCNumerics<T>::isnan(lhs)) ||
           THCNumerics<T>::lt(lhs, rhs);
  }
};
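
// Illustrative sketch, not part of the original header: these comparators plug
// directly into thrust::sort. The helper below (its name, and the use of the
// default stream rather than a par(...).on(stream) execution policy, are
// assumptions made for brevity) sorts a raw device buffer of `n` keys in
// ascending order; with handleNaN = true, NaNs collect at the end.
template <typename T>
void exampleSortKeysAscending(T* keysDevice, int64_t n) {
  thrust::device_ptr<T> keys(keysDevice);
  thrust::sort(keys, keys + n, ThrustLTOp<T, true>());
}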

template <typename T, typename IndT, bool handleNaN = true>
struct ThrustSliceGTOp {
  ThrustSliceGTOp(int64_t size) : sliceSize(size) {}

  __device__ bool operator()(const thrust::tuple<int64_t, T>& lhs,
                             const thrust::tuple<int64_t, T>& rhs) const {
    IndT segA = (IndT)thrust::get<0>(lhs) / sliceSize;
    IndT segB = (IndT)thrust::get<0>(rhs) / sliceSize;
    if (segA != segB)
      return segA < segB;
    else
      return (handleNaN && THCNumerics<T>::isnan(thrust::get<1>(lhs)) &&
              !THCNumerics<T>::isnan(thrust::get<1>(rhs))) ||
             THCNumerics<T>::gt(thrust::get<1>(lhs), thrust::get<1>(rhs));
  }

  const IndT sliceSize;
};

template <typename T, typename IndT, bool handleNaN = true>
struct ThrustSliceLTOp {
  ThrustSliceLTOp(int64_t size) : sliceSize(size) {}

  __device__ bool operator()(const thrust::tuple<int64_t, T>& lhs,
                             const thrust::tuple<int64_t, T>& rhs) const {
    IndT segA = (IndT)thrust::get<0>(lhs) / sliceSize;
    IndT segB = (IndT)thrust::get<0>(rhs) / sliceSize;
    if (segA != segB)
      return segA < segB;
    else
      return (handleNaN && THCNumerics<T>::isnan(thrust::get<1>(rhs)) &&
              !THCNumerics<T>::isnan(thrust::get<1>(lhs))) ||
             THCNumerics<T>::lt(thrust::get<1>(lhs), thrust::get<1>(rhs));
  }

  const IndT sliceSize;
};
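
// Illustrative sketch, not part of the original header: the slice-aware
// comparators implement a segmented sort over a flattened [numSlices x
// sliceSize] buffer. Zipping each value with its global linear index keeps
// slices separate, because indices from different slices disagree on
// index / sliceSize and the comparator orders such pairs by slice first.
// The function name, the contiguous innermost-dimension layout, and the extra
// thrust includes are assumptions made for this sketch; IndT should be chosen
// the same way as for the comparators (32- or 64-bit index math).
#include <thrust/sequence.h>
#include <thrust/tuple.h>
#include <thrust/iterator/zip_iterator.h>

template <typename T, typename IndT>
void exampleSegmentedSortAscending(T* valuesDevice,
                                   int64_t* indicesDevice,
                                   int64_t numSlices,
                                   int64_t sliceSize) {
  const int64_t total = numSlices * sliceSize;
  thrust::device_ptr<T> values(valuesDevice);
  thrust::device_ptr<int64_t> indices(indicesDevice);

  // Tag every element with its global linear index 0..total-1.
  thrust::sequence(indices, indices + total);

  // Sort (index, value) pairs; ties on the slice id fall through to the value
  // comparison, so each slice is sorted ascending independently.
  auto zipBegin =
      thrust::make_zip_iterator(thrust::make_tuple(indices, values));
  thrust::sort(zipBegin, zipBegin + total,
               ThrustSliceLTOp<T, IndT, true>(sliceSize));
}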

// For each slice of `out` (its elements sit at offsets 0, sliceStride, ...,
// (sliceSize - 1) * sliceStride from the slice's base address `base`), fill
// the slice with the index values 0 to sliceSize - 1.
template <typename IndexType, int Dim>
__global__ void
fillSliceWithIndex(TensorInfo<int64_t, IndexType> out,
                   IndexType totalSlices,
                   IndexType sliceSize,
                   IndexType sliceStride) {
  IndexType slice = getLinearBlockId<IndexType>();

  if (slice >= totalSlices) {
    return;
  }

  const uint64_t offset =
    IndexToOffset<int64_t, IndexType, Dim>::get(slice, out);
  int64_t* base = &out.data[offset];

  for (int64_t i = threadIdx.x; i < sliceSize; i += blockDim.x) {
    // Fill the slice with 0-based indices; threads of the block stride over it
    base[i * sliceStride] = i;
  }
}
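
// Illustrative sketch, not part of the original header: a typical launch of
// fillSliceWithIndex uses one block per slice (getLinearBlockId recovers the
// slice id from the possibly multi-dimensional grid) and lets the block's
// threads stride over the slice. The caller is assumed to have already built
// the TensorInfo; the block size of 128, the fully general Dim = -1
// instantiation, and the use of THC_getGridFromTiles (the grid-shaping helper
// from THCReduceApplyUtils.cuh) are choices made for this sketch.
inline void exampleLaunchFillSliceWithIndex(TensorInfo<int64_t, uint64_t> info,
                                            uint64_t numSlices,
                                            uint64_t sliceSize,
                                            uint64_t sliceStride,
                                            cudaStream_t stream) {
  dim3 grid;
  THC_getGridFromTiles(numSlices, grid);  // one block per slice
  dim3 block((unsigned int) (sliceSize < 128 ? sliceSize : 128));
  fillSliceWithIndex<uint64_t, -1>
      <<<grid, block, 0, stream>>>(info, numSlices, sliceSize, sliceStride);
}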

// For sorting in Thrust; extracts a within-slice index from a global linear index
struct GlobalIndexToPerSliceIndex {
  GlobalIndexToPerSliceIndex(int64_t size) : sliceSize(size) {}

  __device__ inline void operator()(int64_t& v) const {
    v = v % sliceSize;
  }

  const int64_t sliceSize;
};
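
// Illustrative sketch, not part of the original header: after a segmented sort
// such as the one sketched above, the index buffer holds global linear
// indices; applying GlobalIndexToPerSliceIndex in place turns them into
// 0-based positions within each slice. The thrust/for_each include and the
// helper's name are assumptions made for this sketch.
#include <thrust/for_each.h>

template <typename IndexIter>
void exampleToPerSliceIndices(IndexIter first, IndexIter last,
                              int64_t sliceSize) {
  thrust::for_each(first, last, GlobalIndexToPerSliceIndex(sliceSize));
}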

void THCudaLongTensor_fillSliceWithIndex(THCState* state,
                                         THCudaLongTensor* t,
                                         int dim);

#endif // THC_TENSORSORT_CUH