From 52ab4df94c7a279e115ceb11f93478fe8c90ba98 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jean-Fran=C3=A7ois=20Reboud?=
Date: Sun, 12 May 2024 22:17:05 +0200
Subject: [PATCH] =?UTF-8?q?=F0=9F=94=A8=20refactor:=20handle=20float16=20a?=
 =?UTF-8?q?long=20float=20on=20GPU=20(#120)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 CHANGELOG.md                                  |    1 +
 Package.swift                                 |    2 +-
 .../GrAIdient/Core/Function/Activation.swift  |   20 +-
 Sources/GrAIdient/Core/Layer/LayerInput.swift |   60 +-
 .../Core/Layer/LayerNormalization.swift       |  210 +-
 .../GrAIdient/Core/Layer/LayerUpdate.swift    |  291 +-
 Sources/GrAIdient/Core/Model/Model.swift      |    2 +-
 .../Core/Optimizer/OptimizerAlgorithm.swift   |   46 +-
 .../Core/Optimizer/OptimizerImpl.swift        |   20 +-
 Sources/GrAIdient/Core/State/Weights.swift    |   16 +-
 Sources/GrAIdient/GrAI.swift                  |   76 +
 Sources/GrAIdient/Layer1D/Activation1D.swift  |    2 +-
 Sources/GrAIdient/Layer1D/BCE1D.swift         |    7 +-
 Sources/GrAIdient/Layer1D/BCESigmoid1D.swift  |    7 +-
 Sources/GrAIdient/Layer1D/Base/Layer1D.swift  |   18 +-
 .../GrAIdient/Layer1D/Base/LayerInput1D.swift |   12 +-
 .../Layer1D/Base/LayerOutput1D.swift          |   23 +-
 Sources/GrAIdient/Layer1D/Concat1D.swift      |    5 +-
 Sources/GrAIdient/Layer1D/Constant1D.swift    |   35 +-
 Sources/GrAIdient/Layer1D/DotProduct1D.swift  |    9 +-
 .../GrAIdient/Layer1D/FullyConnected.swift    |   70 +-
 Sources/GrAIdient/Layer1D/Input1D.swift       |    8 +-
 Sources/GrAIdient/Layer1D/LinearError1D.swift |    5 +-
 Sources/GrAIdient/Layer1D/MSE1D.swift         |    7 +-
 Sources/GrAIdient/Layer1D/Sum1D.swift         |    6 +-
 Sources/GrAIdient/Layer2D/Activation2D.swift  |    2 +-
 Sources/GrAIdient/Layer2D/AdaIN.swift         |    9 +-
 Sources/GrAIdient/Layer2D/BCE2D.swift         |    7 +-
 Sources/GrAIdient/Layer2D/BCESigmoid2D.swift  |    7 +-
 Sources/GrAIdient/Layer2D/BN2D.swift          |    5 +-
 Sources/GrAIdient/Layer2D/Base/Layer2D.swift  |   20 +-
 .../GrAIdient/Layer2D/Base/LayerInput2D.swift |   17 +-
 .../Layer2D/Base/LayerOutput2D.swift          |   28 +-
 Sources/GrAIdient/Layer2D/Concat2D.swift      |    5 +-
 Sources/GrAIdient/Layer2D/Constant2D.swift    |   35 +-
 Sources/GrAIdient/Layer2D/Convolution2D.swift |   63 +-
 .../GrAIdient/Layer2D/Deconvolution2D.swift   |    9 +-
 Sources/GrAIdient/Layer2D/Input2D.swift       |    6 +-
 .../GrAIdient/Layer2D/InstanceNorm2D.swift    |    5 +-
 Sources/GrAIdient/Layer2D/MSE2D.swift         |    7 +-
 Sources/GrAIdient/Layer2D/Multiply2D.swift    |   47 +-
 Sources/GrAIdient/Layer2D/Normalize2D.swift   |    8 +-
 .../Layer2D/SimilarityBatchError2D.swift      |   12 +-
 .../GrAIdient/Layer2D/SimilarityError2D.swift |   17 +-
 Sources/GrAIdient/Layer2D/Sum2D.swift         |    6 +-
 Sources/GrAIdient/Layer2D/VQ2D.swift          |   33 +-
 .../GrAIdient/LayerSeq/ActivationSeq.swift    |    2 +-
 .../GrAIdient/LayerSeq/Base/LayerSeq.swift    |   14 +-
 Sources/GrAIdient/LayerSeq/ConcatSeq.swift    |   10 +-
 Sources/GrAIdient/LayerSeq/ConstantSeq.swift  |   59 +-
 .../LayerSeq/FullyConnectedPatch.swift        |   61 +-
 .../LayerSeq/FullyConnectedSeq.swift          |   62 +-
 Sources/GrAIdient/LayerSeq/LayerNormSeq.swift |    3 +-
 Sources/GrAIdient/LayerSeq/QuerySeq.swift     |   11 +-
 Sources/GrAIdient/LayerSeq/SumSeq.swift       |    6 +-
 Sources/GrAIdient/LayerSeq/VQSeq.swift        |   37 +-
 Sources/GrAIdient/LayerSeq/ValueSeq.swift     |   22 +-
 ...Activation.metal => ActivationFloat.metal} |   24 +-
 .../Metal/Kernel/ActivationHalf.metal         |  403 ++
 .../{BatchNorm.metal => BatchNormFloat.metal} |   14 +-
 .../Metal/Kernel/BatchNormHalf.metal          |  415 ++
 .../{Biases.metal => BiasesFloat.metal}       |    2 +-
 .../GrAIdient/Metal/Kernel/BiasesHalf.metal   |   53 +
 ...nvolution.metal => ConvolutionFloat.metal} |   20 +-
 .../Metal/Kernel/ConvolutionHalf.metal        | 1049 +++++
 ...olution.metal => DeconvolutionFloat.metal} |    8 +-
 .../Metal/Kernel/DeconvolutionHalf.metal      |  419 ++
 ...nected.metal => FullyConnectedFloat.metal} |   14 +-
 .../Metal/Kernel/FullyConnectedHalf.metal     |  347 ++
 ...h.metal => FullyConnectedPatchFloat.metal} |   16 +-
 .../Kernel/FullyConnectedPatchHalf.metal      |  529 +++
 ...Seq.metal => FullyConnectedSeqFloat.metal} |   20 +-
 .../Metal/Kernel/FullyConnectedSeqHalf.metal  |  609 +++
 ...anceNorm.metal => InstanceNormFloat.metal} |   16 +-
 .../Metal/Kernel/InstanceNormHalf.metal       |  467 +++
 .../{Layer1D.metal => Layer1DFloat.metal}     |   38 +-
 .../GrAIdient/Metal/Kernel/Layer1DHalf.metal  |  915 +++++
 .../{Layer2D.metal => Layer2DFloat.metal}     |  110 +-
 .../GrAIdient/Metal/Kernel/Layer2DHalf.metal  | 3570 +++++++++++++++++
 ...LayerMerge.metal => LayerMergeFloat.metal} |   12 +-
 .../Metal/Kernel/LayerMergeHalf.metal         |  161 +
 .../{LayerNorm.metal => LayerNormFloat.metal} |   24 +-
 .../Metal/Kernel/LayerNormHalf.metal          |  583 +++
 .../{LayerSeq.metal => LayerSeqFloat.metal}   |   90 +-
 .../GrAIdient/Metal/Kernel/LayerSeqHalf.metal | 2745 +++++++++++++
 .../{Optimizer.metal => OptimizerFloat.metal} |   18 +-
 .../Metal/Kernel/OptimizerHalf.metal          |  438 ++
 .../{Reduce.metal => ReduceFloat.metal}       |    8 +-
 .../GrAIdient/Metal/Kernel/ReduceHalf.metal   |  184 +
 .../Kernel/{Reset.metal => ResetFloat.metal}  |    2 +-
 .../GrAIdient/Metal/Kernel/ResetHalf.metal    |   77 +
 .../Kernel/{VQ2D.metal => VQ2DFloat.metal}    |   16 +-
 Sources/GrAIdient/Metal/Kernel/VQ2DHalf.metal |  544 +++
 .../Kernel/{VQSeq.metal => VQSeqFloat.metal}  |   14 +-
 .../GrAIdient/Metal/Kernel/VQSeqHalf.metal    |  472 +++
 Sources/GrAIdient/Metal/MetalBuffer.swift     |  236 ++
 Sources/GrAIdient/Metal/MetalConfig.swift     |  815 ++--
 Sources/GrAIdient/Metal/MetalKernel.swift     |   24 +-
 Sources/GrAIdient/Utils/Buffer.swift          |  159 +-
 Sources/GrAIdient/Utils/Image.swift           |   24 +-
 Tests/GrAIExamples/AutoEncoderExample.swift   |    2 +
 Tests/GrAIExamples/AutoEncoderTests.swift     |    2 +
 Tests/GrAIExamples/Base/setup.py              |    2 +-
 Tests/GrAIExamples/TransformerBenchmark.swift |   38 +-
 Tests/GrAIExamples/TransformerExample.swift   |   14 +-
 Tests/GrAIExamples/VGGBenchmark.swift         |   42 +-
 Tests/GrAIExamples/VGGExample.swift           |   14 +-
 .../Base/Input1D/Input1DBCE1DCase.swift       |    2 +
 .../Input1D/Input1DBCESigmoid1DCase.swift     |    2 +
 .../Input1D/Input1DLinearError1DCase.swift    |    2 +
 .../Base/Input1D/Input1DMSE1DCase.swift       |    2 +
 .../Base/Input2D/Input2DBCE2DCase.swift       |    2 +
 .../Input2D/Input2DBCESigmoid2DCase.swift     |    2 +
 .../Base/Input2D/Input2DMSE1DCase.swift       |    2 +
 .../Base/Input2D/Input2DMSE2DCase.swift       |    2 +
 .../Input2DSimilarityBatchError2DCase.swift   |    2 +
 .../Input2DSimilarityError2DCase.swift        |    2 +
 .../Base/Input2D/Input2DVQ2DCase.swift        |    2 +
 .../Base/Input2D/Input2DVQSeqCase.swift       |    2 +
 Tests/GrAITests/ImageTests.swift              |    7 +-
 Tests/GrAITests/Layer2DTests.swift            |   12 +-
 Tests/GrAITests/OptimizerTests.swift          |    4 +
 Tests/GrAITests/ReduceTests.swift             |  155 +-
 Tests/GrAITests/UpdateManagementTests.swift   |   18 +-
 Tests/GrAITorchTests/Base/setup.py            |    2 +-
 Tests/GrAITorchTests/GrAITorchTests.swift     |    2 +
 126 files changed, 16078 insertions(+), 1557 deletions(-)
 rename Sources/GrAIdient/Metal/Kernel/{Activation.metal => ActivationFloat.metal} (94%)
 create mode 100644 Sources/GrAIdient/Metal/Kernel/ActivationHalf.metal
 rename Sources/GrAIdient/Metal/Kernel/{BatchNorm.metal => BatchNormFloat.metal} (97%)
 create mode 100644 Sources/GrAIdient/Metal/Kernel/BatchNormHalf.metal
 rename Sources/GrAIdient/Metal/Kernel/{Biases.metal => BiasesFloat.metal} (96%)
 create mode 100644 Sources/GrAIdient/Metal/Kernel/BiasesHalf.metal
 rename Sources/GrAIdient/Metal/Kernel/{Convolution.metal => ConvolutionFloat.metal} (98%)
 create mode 100644 Sources/GrAIdient/Metal/Kernel/ConvolutionHalf.metal
 rename Sources/GrAIdient/Metal/Kernel/{Deconvolution.metal => DeconvolutionFloat.metal} (98%)
 create mode 100644 Sources/GrAIdient/Metal/Kernel/DeconvolutionHalf.metal
 rename Sources/GrAIdient/Metal/Kernel/{FullyConnected.metal => FullyConnectedFloat.metal} (97%)
 create mode 100644 Sources/GrAIdient/Metal/Kernel/FullyConnectedHalf.metal
 rename Sources/GrAIdient/Metal/Kernel/{FullyConnectedPatch.metal => FullyConnectedPatchFloat.metal} (97%)
 create mode 100644 Sources/GrAIdient/Metal/Kernel/FullyConnectedPatchHalf.metal
 rename Sources/GrAIdient/Metal/Kernel/{FullyConnectedSeq.metal => FullyConnectedSeqFloat.metal} (97%)
 create mode 100644 Sources/GrAIdient/Metal/Kernel/FullyConnectedSeqHalf.metal
 rename Sources/GrAIdient/Metal/Kernel/{InstanceNorm.metal => InstanceNormFloat.metal} (97%)
 create mode 100644 Sources/GrAIdient/Metal/Kernel/InstanceNormHalf.metal
 rename Sources/GrAIdient/Metal/Kernel/{Layer1D.metal => Layer1DFloat.metal} (96%)
 create mode 100644 Sources/GrAIdient/Metal/Kernel/Layer1DHalf.metal
 rename Sources/GrAIdient/Metal/Kernel/{Layer2D.metal => Layer2DFloat.metal} (97%)
 create mode 100644 Sources/GrAIdient/Metal/Kernel/Layer2DHalf.metal
 rename Sources/GrAIdient/Metal/Kernel/{LayerMerge.metal => LayerMergeFloat.metal} (93%)
 create mode 100644 Sources/GrAIdient/Metal/Kernel/LayerMergeHalf.metal
 rename Sources/GrAIdient/Metal/Kernel/{LayerNorm.metal => LayerNormFloat.metal} (96%)
 create mode 100644 Sources/GrAIdient/Metal/Kernel/LayerNormHalf.metal
 rename Sources/GrAIdient/Metal/Kernel/{LayerSeq.metal => LayerSeqFloat.metal} (97%)
 create mode 100644 Sources/GrAIdient/Metal/Kernel/LayerSeqHalf.metal
 rename Sources/GrAIdient/Metal/Kernel/{Optimizer.metal => OptimizerFloat.metal} (96%)
 create mode 100644 Sources/GrAIdient/Metal/Kernel/OptimizerHalf.metal
 rename Sources/GrAIdient/Metal/Kernel/{Reduce.metal => ReduceFloat.metal} (97%)
 create mode 100644 Sources/GrAIdient/Metal/Kernel/ReduceHalf.metal
 rename Sources/GrAIdient/Metal/Kernel/{Reset.metal => ResetFloat.metal} (94%)
 create mode 100644 Sources/GrAIdient/Metal/Kernel/ResetHalf.metal
 rename Sources/GrAIdient/Metal/Kernel/{VQ2D.metal => VQ2DFloat.metal} (98%)
 create mode 100644 Sources/GrAIdient/Metal/Kernel/VQ2DHalf.metal
 rename Sources/GrAIdient/Metal/Kernel/{VQSeq.metal => VQSeqFloat.metal} (98%)
 create mode 100644 Sources/GrAIdient/Metal/Kernel/VQSeqHalf.metal

diff --git a/CHANGELOG.md b/CHANGELOG.md
index df809de1..0fe68551 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,7 @@ All notable changes to this project will be documented in this file.
## [unreleased] +🔨 **refactor:** handle float16 along float on GPU ([#120](https://github.com/owkin/GrAIdient/pull/120))\ 🚀 **perf:** copy & generate weights faster ([119](https://github.com/owkin/GrAIdient/pull/119))\ 🚀 **perf:** Convolution2D ([118](https://github.com/owkin/GrAIdient/pull/118))\ 🪜 **feat:** LayerCAM2D -> VQGrad2D, LayerCAMSeq -> VQGradSeq ([#117](https://github.com/owkin/GrAIdient/pull/117))\ diff --git a/Package.swift b/Package.swift index 8cc64efb..a386a0a9 100644 --- a/Package.swift +++ b/Package.swift @@ -7,7 +7,7 @@ import PackageDescription let package = Package( name: "GrAIdient", platforms: [ - .macOS(.v10_15) + .macOS(.v13) ], products: [ .library( diff --git a/Sources/GrAIdient/Core/Function/Activation.swift b/Sources/GrAIdient/Core/Function/Activation.swift index edb79edd..0e6bc93e 100644 --- a/Sources/GrAIdient/Core/Function/Activation.swift +++ b/Sources/GrAIdient/Core/Function/Activation.swift @@ -307,8 +307,8 @@ open class ActivationFunction: Codable /// - deviceID: GPU device where to execute the operation. /// private func _forwardGPU( - tmp: MetalBuffer, - outs: MetalBuffer, + tmp: FloatBuffer, + outs: FloatBuffer, deviceID: Int) { let nbElems = outs.nbElems @@ -335,8 +335,9 @@ open class ActivationFunction: Codable let nbElems = layer.outs.nbElems if layer._tmp == nil { - layer._tmp = MetalPrivateBuffer( - nbElems, deviceID: layer.deviceID) + layer._tmp = FloatBuffer( + nbElems: nbElems, deviceID: layer.deviceID + ) } _forwardGPU( tmp: layer._tmp, @@ -355,7 +356,7 @@ open class ActivationFunction: Codable let nbElems = layer.outs.nbElems if layer._tmp == nil { - layer._tmp = MetalPrivateBuffer( + layer._tmp = FloatBuffer(nbElems: nbElems, deviceID: layer.deviceID) } _forwardGPU( @@ -375,8 +376,9 @@ open class ActivationFunction: Codable let nbElems = layer.outs.nbElems if layer._tmp == nil { - layer._tmp = MetalPrivateBuffer( - nbElems, deviceID: layer.deviceID) + layer._tmp = FloatBuffer( + nbElems: nbElems, deviceID: layer.deviceID + ) } _forwardGPU( tmp: layer._tmp, @@ -394,8 +396,8 @@ open class ActivationFunction: Codable /// - deviceID: GPU device where to execute the operation. /// private func _backwardGPU( - tmp: MetalBuffer, - delta: MetalBuffer, + tmp: FloatBuffer, + delta: FloatBuffer, deviceID: Int) { let nbElems = delta.nbElems diff --git a/Sources/GrAIdient/Core/Layer/LayerInput.swift b/Sources/GrAIdient/Core/Layer/LayerInput.swift index c3cf7e81..d9ba95b5 100644 --- a/Sources/GrAIdient/Core/Layer/LayerInput.swift +++ b/Sources/GrAIdient/Core/Layer/LayerInput.swift @@ -105,14 +105,13 @@ class InputBuffers { /// The link to the layer. unowned let _layer: T - /// Number of elements in the different buffers. - let nbElems: Int - /// GPU device where the buffers are sent. - let deviceID: Int - var _m: MetalBuffer! = nil - var _v: MetalBuffer! = nil - var _vHat: MetalBuffer! = nil + /// Momentum buffer. + public let m: FloatBuffer + /// Velocity buffer. + public let v: FloatBuffer + /// Velocity normalized buffer. + public let vHat: FloatBuffer /// /// Create a container of buffers. @@ -127,51 +126,16 @@ class InputBuffers deviceID: Int) { _layer = layer - self.nbElems = nbElems - self.deviceID = deviceID - } - - /// Momentum buffer. - var m: MetalBuffer - { - get { - if _m == nil - { - _m = MetalPrivateBuffer(nbElems, deviceID: deviceID) - } - return _m - } - } - - /// Velocity buffer. 
- var v: MetalBuffer - { - get { - if _v == nil - { - _v = MetalPrivateBuffer(nbElems, deviceID: deviceID) - } - return _v - } - } - - /// Velocity normalized buffer. - var vHat: MetalBuffer - { - get { - if _vHat == nil - { - _vHat = MetalPrivateBuffer(nbElems, deviceID: deviceID) - } - return _vHat - } + m = FloatBuffer(nbElems: nbElems, deviceID: deviceID) + v = FloatBuffer(nbElems: nbElems, deviceID: deviceID) + vHat = FloatBuffer(nbElems: nbElems, deviceID: deviceID) } /// Clean the momentum..., preserving the weights. func reset() { - _m = nil - _v = nil - _vHat = nil + m.reset() + v.reset() + vHat.reset() } } diff --git a/Sources/GrAIdient/Core/Layer/LayerNormalization.swift b/Sources/GrAIdient/Core/Layer/LayerNormalization.swift index c572ff77..2ac13f33 100644 --- a/Sources/GrAIdient/Core/Layer/LayerNormalization.swift +++ b/Sources/GrAIdient/Core/Layer/LayerNormalization.swift @@ -620,7 +620,7 @@ public class BatchNormalization: LayerWeightsStatsNormalization } /// Get the weights in the CPU execution context. - func collectWeights() -> [IWeightArrays] + func collectWeights() -> [WeightArrays] { return [_Ɣ, _β] } @@ -633,50 +633,50 @@ class BatchNormalizationGPU: LayerWeightsStatsNormalization /// Buffer of weights to scale the normalization result. /// Shape ~ (nbNeurons,). /// - var _Ɣ: IWeightBuffers! = nil + var _Ɣ: WeightBuffers! = nil /// /// Buffer of biases to add to the normalization result. /// Shape ~ (nbNeurons,). /// - var _β: IWeightBuffers! = nil + var _β: WeightBuffers! = nil /// /// Buffer of averages of data for the different independent batch normalization units. /// Shape ~ (nbNeurons,). /// - var _μ: MetalBuffer! = nil + var _μ: FloatBuffer! = nil /// /// Buffer of global averages of data for the different independent batch normalization units. /// Shape ~ (nbNeurons,). /// - var _Eμ: MetalPrivateBuffer! = nil + var _Eμ: FloatBuffer! = nil /// /// Buffer of deviations of data for the different independent batch normalization units. /// Shape ~ (nbNeurons,). /// - var _σ2: MetalBuffer! = nil + var _σ2: FloatBuffer! = nil /// /// Buffer of global deviations of data for the different independent batch normalization units. /// Shape ~ (nbNeurons,). /// - var _Eσ2: MetalPrivateBuffer! = nil + var _Eσ2: FloatBuffer! = nil /// /// Buffer of data normalized without taking into account the biases and the weights. /// Shape ~ (batch, nbNeurons, height, width). /// - var _xHat: MetalBuffer! = nil + var _xHat: FloatBuffer! = nil /// /// Buffer used to compute backward pass. /// Shape ~ (nbNeurons,). /// - var _sum1: MetalBuffer! = nil + var _sum1: FloatBuffer! = nil /// /// Buffer used to compute backward pass. /// Shape ~ (nbNeurons,). /// - var _sum2: MetalBuffer! = nil + var _sum2: FloatBuffer! = nil /// GPU device on which model is executed. 
var _deviceID = 0 @@ -690,11 +690,8 @@ class BatchNormalizationGPU: LayerWeightsStatsNormalization return super.weights } - MetalKernel.get.download([_β.w_p!, _Ɣ.w_p!]) - - var weightsTmp = [Float]() - weightsTmp += _Ɣ.w_p!.shared.array - weightsTmp += _β.w_p!.shared.array + var weightsTmp = _Ɣ!.w.download() + weightsTmp += _β!.w.download() return weightsTmp } set { @@ -717,11 +714,8 @@ class BatchNormalizationGPU: LayerWeightsStatsNormalization return super.stats } - MetalKernel.get.download([_Eμ, _Eσ2]) - - var statsTmp = [Float]() - statsTmp += _Eμ.shared.array - statsTmp += _Eσ2.shared.array + var statsTmp = _Eμ.download() + statsTmp += _Eσ2.download() return statsTmp } set { @@ -781,58 +775,38 @@ class BatchNormalizationGPU: LayerWeightsStatsNormalization _β = WeightBuffers(nbElems: _nbNeurons, deviceID: _deviceID) _Ɣ = WeightBuffers(nbElems: _nbNeurons, deviceID: _deviceID) - let βPtr = _β.w_p!.shared.buffer - let ƔPtr = _Ɣ.w_p!.shared.buffer - if _weightsList.count == 0 { + _weightsList = [Float](repeating: 0.0, count: 2 * _nbNeurons) for depth in 0..<_nbNeurons { - ƔPtr[depth] = 1.0 - βPtr[depth] = 0.0 - } - } - else - { - for depth in 0..<_nbNeurons - { - ƔPtr[depth] = _weightsList[depth] - βPtr[depth] = _weightsList[_nbNeurons + depth] + _weightsList[depth] = 1.0 } - _weightsList = [] } - MetalKernel.get.upload([_β.w_p!, _Ɣ.w_p!]) + _Ɣ.w.initialize(array: &_weightsList) + _β.w.initialize(array: &_weightsList, start: _nbNeurons) + + _weightsList = [] } /// Initialize stats in the GPU execution context. func initStats() { - _Eμ = MetalPrivateBuffer(_nbNeurons, deviceID: _deviceID) - _Eσ2 = MetalPrivateBuffer(_nbNeurons, deviceID: _deviceID) - - let EμPtr = _Eμ.shared.buffer - let Eσ2Ptr = _Eσ2.shared.buffer + _Eμ = FloatBuffer(nbElems: _nbNeurons, deviceID: _deviceID) + _Eσ2 = FloatBuffer(nbElems: _nbNeurons, deviceID: _deviceID) - if _statsList.count == 0 + if _statsList.count != 0 { - for depth in 0..<_nbNeurons - { - EμPtr[depth] = 0.0 - Eσ2Ptr[depth] = 0.0 - } + _Eμ.initialize(array: &_statsList) + _Eσ2.initialize(array: &_statsList, start: _nbNeurons) } else { - for depth in 0..<_nbNeurons - { - EμPtr[depth] = _statsList[depth] - Eσ2Ptr[depth] = _statsList[_nbNeurons + depth] - } - _statsList = [] + _Eμ.initialize() + _Eσ2.initialize() } - - MetalKernel.get.upload([_Eμ, _Eσ2]) + _statsList = [] } /// @@ -880,7 +854,7 @@ class BatchNormalizationGPU: LayerWeightsStatsNormalization if _μ == nil { - _μ = MetalPrivateBuffer(_nbNeurons, deviceID: _deviceID) + _μ = FloatBuffer(nbElems: _nbNeurons, deviceID: _deviceID) } let command = MetalKernel.get.createCommand( @@ -913,7 +887,7 @@ class BatchNormalizationGPU: LayerWeightsStatsNormalization if _σ2 == nil { - _σ2 = MetalPrivateBuffer(_nbNeurons, deviceID: _deviceID) + _σ2 = FloatBuffer(nbElems: _nbNeurons, deviceID: _deviceID) } let command = MetalKernel.get.createCommand( @@ -948,7 +922,7 @@ class BatchNormalizationGPU: LayerWeightsStatsNormalization if _xHat == nil { - _xHat = MetalPrivateBuffer( + _xHat = FloatBuffer(nbElems: batchSize * _nbNeurons * width * height, deviceID: _deviceID ) @@ -1039,8 +1013,8 @@ class BatchNormalizationGPU: LayerWeightsStatsNormalization if _sum1 == nil { - _sum1 = MetalPrivateBuffer(_nbNeurons, deviceID: _deviceID) - _sum2 = MetalPrivateBuffer(_nbNeurons, deviceID: _deviceID) + _sum1 = FloatBuffer(nbElems: _nbNeurons, deviceID: _deviceID) + _sum2 = FloatBuffer(nbElems: _nbNeurons, deviceID: _deviceID) } let command = MetalKernel.get.createCommand( @@ -1126,7 +1100,7 @@ class BatchNormalizationGPU: 
LayerWeightsStatsNormalization } /// Get the weights in the GPU execution context. - func collectWeights() -> [IWeightBuffers] + func collectWeights() -> [WeightBuffers] { return [_Ɣ, _β] } @@ -1475,7 +1449,7 @@ public class InstanceNormalization: LayerWeightsNormalization } /// Get the weights in the CPU execution context. - func collectWeights() -> [IWeightArrays] + func collectWeights() -> [WeightArrays] { return [_Ɣ, _β] } @@ -1488,40 +1462,40 @@ class InstanceNormalizationGPU: LayerWeightsNormalization /// Buffer of weights to scale the normalization result. /// Shape ~ (nbNeurons,). /// - var _Ɣ: IWeightBuffers! = nil + var _Ɣ: WeightBuffers! = nil /// /// Buffer of biases to add to the normalization result. /// Shape ~ (nbNeurons,). /// - var _β: IWeightBuffers! = nil + var _β: WeightBuffers! = nil /// /// Buffer of averages of data for the different independent batch normalization units. /// Shape ~ (batch, nbNeurons). /// - var _μ: MetalBuffer! = nil + var _μ: FloatBuffer! = nil /// /// Buffer of deviations of data for the different independent batch normalization units. /// Shape ~ (batch, nbNeurons). /// - var _σ2: MetalBuffer! = nil + var _σ2: FloatBuffer! = nil /// /// Buffer of data normalized without taking into account the biases and the weights. /// Shape ~ (batch, nbNeurons, height, width). /// - var _xHat: MetalBuffer! = nil + var _xHat: FloatBuffer! = nil /// /// Buffer used to compute backward pass. /// Shape ~ (nbNeurons,). /// - var _sum1: MetalBuffer! = nil + var _sum1: FloatBuffer! = nil /// /// Buffer used to compute backward pass. /// Shape ~ (nbNeurons,). /// - var _sum2: MetalBuffer! = nil + var _sum2: FloatBuffer! = nil /// GPU device on which model is executed. var _deviceID = 0 @@ -1535,11 +1509,8 @@ class InstanceNormalizationGPU: LayerWeightsNormalization return super.weights } - MetalKernel.get.download([_β.w_p!, _Ɣ.w_p!]) - - var weightsTmp = [Float]() - weightsTmp += _Ɣ.w_p!.shared.array - weightsTmp += _β.w_p!.shared.array + var weightsTmp = _Ɣ!.w.download() + weightsTmp += _β!.w.download() return weightsTmp } set { @@ -1597,28 +1568,19 @@ class InstanceNormalizationGPU: LayerWeightsNormalization _β = WeightBuffers(nbElems: _nbNeurons, deviceID: _deviceID) _Ɣ = WeightBuffers(nbElems: _nbNeurons, deviceID: _deviceID) - let βPtr = _β.w_p!.shared.buffer - let ƔPtr = _Ɣ.w_p!.shared.buffer - if _weightsList.count == 0 { + _weightsList = [Float](repeating: 0.0, count: 2 * _nbNeurons) for depth in 0..<_nbNeurons { - ƔPtr[depth] = 1.0 - βPtr[depth] = 0.0 - } - } - else - { - for depth in 0..<_nbNeurons - { - ƔPtr[depth] = _weightsList[depth] - βPtr[depth] = _weightsList[_nbNeurons + depth] + _weightsList[depth] = 1.0 } - _weightsList = [] } - MetalKernel.get.upload([_β.w_p!, _Ɣ.w_p!]) + _Ɣ.w.initialize(array: &_weightsList) + _β.w.initialize(array: &_weightsList, start: _nbNeurons) + + _weightsList = [] } /// @@ -1654,7 +1616,7 @@ class InstanceNormalizationGPU: LayerWeightsNormalization if _xHat == nil { - _xHat = MetalPrivateBuffer( + _xHat = FloatBuffer(nbElems: batchSize * _nbNeurons * width * height, deviceID: _deviceID ) @@ -1698,7 +1660,7 @@ class InstanceNormalizationGPU: LayerWeightsNormalization if _xHat == nil { - _xHat = MetalPrivateBuffer( + _xHat = FloatBuffer(nbElems: batchSize * _nbNeurons * width * height, deviceID: _deviceID ) @@ -1738,7 +1700,7 @@ class InstanceNormalizationGPU: LayerWeightsNormalization if _μ == nil { - _μ = MetalPrivateBuffer( + _μ = FloatBuffer(nbElems: batchSize * _nbNeurons, deviceID: _deviceID ) } @@ -1771,7 +1733,7 @@ 
class InstanceNormalizationGPU: LayerWeightsNormalization if _μ == nil { - _μ = MetalPrivateBuffer( + _μ = FloatBuffer(nbElems: batchSize * _nbNeurons, deviceID: _deviceID ) } @@ -1803,7 +1765,7 @@ class InstanceNormalizationGPU: LayerWeightsNormalization if _σ2 == nil { - _σ2 = MetalPrivateBuffer( + _σ2 = FloatBuffer(nbElems: batchSize * _nbNeurons, deviceID: _deviceID ) } @@ -1837,7 +1799,7 @@ class InstanceNormalizationGPU: LayerWeightsNormalization if _σ2 == nil { - _σ2 = MetalPrivateBuffer( + _σ2 = FloatBuffer(nbElems: batchSize * _nbNeurons, deviceID: _deviceID ) } @@ -1941,10 +1903,10 @@ class InstanceNormalizationGPU: LayerWeightsNormalization if _sum1 == nil { - _sum1 = MetalPrivateBuffer( + _sum1 = FloatBuffer(nbElems: batchSize * _nbNeurons, deviceID: _deviceID ) - _sum2 = MetalPrivateBuffer( + _sum2 = FloatBuffer(nbElems: batchSize * _nbNeurons, deviceID: _deviceID ) } @@ -1983,10 +1945,10 @@ class InstanceNormalizationGPU: LayerWeightsNormalization if _sum1 == nil { - _sum1 = MetalPrivateBuffer( + _sum1 = FloatBuffer(nbElems: batchSize * _nbNeurons, deviceID: _deviceID ) - _sum2 = MetalPrivateBuffer( + _sum2 = FloatBuffer(nbElems: batchSize * _nbNeurons, deviceID: _deviceID ) } @@ -2359,40 +2321,40 @@ class LayerNormalizationGPU: LayerWeightsNormalization /// Buffer of weights to scale the normalization result. /// Shape ~ (nbNeurons,). /// - var _Ɣ: IWeightBuffers! = nil + var _Ɣ: WeightBuffers! = nil /// /// Buffer of biases to add to the normalization result. /// Shape ~ (nbNeurons,). /// - var _β: IWeightBuffers! = nil + var _β: WeightBuffers! = nil /// /// Buffer of averages of data for the different independent batch normalization units. /// Shape ~ (batch, sequence). /// - var _μ: MetalBuffer! = nil + var _μ: FloatBuffer! = nil /// /// Buffer of deviations of data for the different independent batch normalization units. /// Shape ~ (batch, sequence). /// - var _σ2: MetalBuffer! = nil + var _σ2: FloatBuffer! = nil /// /// Buffer of data normalized without taking into account the biases and the weights. /// Shape ~ (batch, sequence, nbNeurons). /// - var _xHat: MetalBuffer! = nil + var _xHat: FloatBuffer! = nil /// /// Buffer used to compute backward pass. /// Shape ~ (batch, sequence). /// - var _sum1: MetalBuffer! = nil + var _sum1: FloatBuffer! = nil /// /// Buffer used to compute backward pass. /// Shape ~ (batch, sequence). /// - var _sum2: MetalBuffer! = nil + var _sum2: FloatBuffer! = nil /// GPU device on which model is executed. 
var _deviceID = 0 @@ -2406,11 +2368,8 @@ class LayerNormalizationGPU: LayerWeightsNormalization return super.weights } - MetalKernel.get.download([_β.w_p!, _Ɣ.w_p!]) - - var weightsTmp = [Float]() - weightsTmp += _Ɣ.w_p!.shared.array - weightsTmp += _β.w_p!.shared.array + var weightsTmp = _Ɣ!.w.download() + weightsTmp += _β!.w.download() return weightsTmp } set { @@ -2468,28 +2427,19 @@ class LayerNormalizationGPU: LayerWeightsNormalization _β = WeightBuffers(nbElems: _nbNeurons, deviceID: _deviceID) _Ɣ = WeightBuffers(nbElems: _nbNeurons, deviceID: _deviceID) - let βPtr = _β.w_p!.shared.buffer - let ƔPtr = _Ɣ.w_p!.shared.buffer - if _weightsList.count == 0 { + _weightsList = [Float](repeating: 0.0, count: 2 * _nbNeurons) for depth in 0..<_nbNeurons { - ƔPtr[depth] = 1.0 - βPtr[depth] = 0.0 + _weightsList[depth] = 1.0 } } - else - { - for depth in 0..<_nbNeurons - { - ƔPtr[depth] = _weightsList[depth] - βPtr[depth] = _weightsList[_nbNeurons + depth] - } - _weightsList = [] - } - MetalKernel.get.upload([_β.w_p!, _Ɣ.w_p!]) + _Ɣ.w.initialize(array: &_weightsList) + _β.w.initialize(array: &_weightsList, start: _nbNeurons) + + _weightsList = [] } /// @@ -2524,7 +2474,7 @@ class LayerNormalizationGPU: LayerWeightsNormalization if _xHat == nil { - _xHat = MetalPrivateBuffer( + _xHat = FloatBuffer(nbElems: batchSize * sequence * _nbNeurons, deviceID: _deviceID ) @@ -2565,7 +2515,7 @@ class LayerNormalizationGPU: LayerWeightsNormalization if _μ == nil { - _μ = MetalPrivateBuffer( + _μ = FloatBuffer(nbElems: batchSize * sequence, deviceID: _deviceID ) } @@ -2597,7 +2547,7 @@ class LayerNormalizationGPU: LayerWeightsNormalization if _σ2 == nil { - _σ2 = MetalPrivateBuffer( + _σ2 = FloatBuffer(nbElems: batchSize * sequence, deviceID: _deviceID ) } @@ -2666,10 +2616,10 @@ class LayerNormalizationGPU: LayerWeightsNormalization if _sum1 == nil { - _sum1 = MetalPrivateBuffer( + _sum1 = FloatBuffer(nbElems: batchSize * sequence, deviceID: _deviceID ) - _sum2 = MetalPrivateBuffer( + _sum2 = FloatBuffer(nbElems: batchSize * sequence, deviceID: _deviceID ) } diff --git a/Sources/GrAIdient/Core/Layer/LayerUpdate.swift b/Sources/GrAIdient/Core/Layer/LayerUpdate.swift index 92adb1fa..0a94648c 100644 --- a/Sources/GrAIdient/Core/Layer/LayerUpdate.swift +++ b/Sources/GrAIdient/Core/Layer/LayerUpdate.swift @@ -74,15 +74,15 @@ public protocol IWeightBuffers var nbElems: Int { get } /// Weights buffer: the buffer to be update. - var w: MetalBuffer { get } + var w: FloatBuffer { get } /// Gradients buffer. - var g: MetalBuffer { get } + var g: FloatBuffer { get } /// Momentum buffer. - var m: MetalBuffer { get } + var m: FloatBuffer { get } /// Velocity buffer. - var v: MetalBuffer { get } + var v: FloatBuffer { get } /// Velocity normalized buffer. - var vHat: MetalBuffer { get } + var vHat: FloatBuffer { get } /// Clean the momentum..., preserving the weights. func reset() @@ -90,50 +90,35 @@ public protocol IWeightBuffers extension IWeightBuffers { - /// Get the weights as a private buffer. - var w_p: MetalPrivateBuffer? - { - get { - return w as? MetalPrivateBuffer - } - } - /// Get the weights as a shared buffer. - var w_s: MetalSharedBuffer? - { - get { - return w as? MetalSharedBuffer - } - } - - /// Get the gradient buffer as a private buffer. - var g_p: MetalPrivateBuffer? + /// GPU device where the buffers are sent. + public var deviceID: Int { get { - return g as? MetalPrivateBuffer + return w.deviceID } } - /// Get the gradient buffer as a shared buffer. - var g_s: MetalSharedBuffer? 
+ /// Number of elements in the different buffers. + public var nbElems: Int { get { - return g as? MetalSharedBuffer + return w.nbElems } } } /// GPU buffers needed to update the weights. -class WeightBuffers: IWeightBuffers +public class WeightBuffers: IWeightBuffers { - /// Number of elements in the different buffers. - let nbElems: Int - /// GPU device where the buffers are sent. - let deviceID: Int - - var _w: MetalBuffer! = nil - var _g: MetalBuffer! = nil - var _m: MetalBuffer! = nil - var _v: MetalBuffer! = nil - var _vHat: MetalBuffer! = nil + /// Weights buffer: the buffer to be update. + public let w: FloatBuffer + /// Gradients buffer. + public let g: FloatBuffer + /// Momentum buffer. + public let m: FloatBuffer + /// Velocity buffer. + public let v: FloatBuffer + /// Velocity normalized buffer. + public let vHat: FloatBuffer /// /// Create a container of buffers. @@ -144,78 +129,21 @@ class WeightBuffers: IWeightBuffers /// init(nbElems: Int, deviceID: Int) { - self.nbElems = nbElems - self.deviceID = deviceID - } - - /// Weights buffer: the buffer to be update. - var w: MetalBuffer - { - get { - if _w == nil - { - _w = MetalPrivateBuffer(nbElems, deviceID: deviceID) - } - return _w - } - } - - /// Gradients buffer. - var g: MetalBuffer - { - get { - if _g == nil - { - _g = MetalPrivateBuffer(nbElems, deviceID: deviceID) - } - return _g - } - } - - /// Momentum buffer. - var m: MetalBuffer - { - get { - if _m == nil - { - _m = MetalPrivateBuffer(nbElems, deviceID: deviceID) - } - return _m - } - } - - /// Velocity buffer. - var v: MetalBuffer - { - get { - if _v == nil - { - _v = MetalPrivateBuffer(nbElems, deviceID: deviceID) - } - return _v - } + w = FloatBuffer(nbElems: nbElems, deviceID: deviceID) + g = FloatBuffer(nbElems: nbElems, deviceID: deviceID) + m = FloatBuffer(nbElems: nbElems, deviceID: deviceID) + v = FloatBuffer(nbElems: nbElems, deviceID: deviceID) + vHat = FloatBuffer(nbElems: nbElems, deviceID: deviceID) } - /// Velocity normalized buffer. - var vHat: MetalBuffer + /// Clean the buffers. + public func reset() { - get { - if _vHat == nil - { - _vHat = MetalPrivateBuffer(nbElems, deviceID: deviceID) - } - return _vHat - } - } - - /// Clean the momentum..., preserving the weights. - func reset() - { - // do not touch _w - _g = nil - _m = nil - _v = nil - _vHat = nil + // do not touch w + g.reset() + m.reset() + v.reset() + vHat.reset() } } @@ -257,7 +185,11 @@ extension LayerWeightInit } } + /// /// Generate list of weights values. + /// + /// - Returns: The generated list of values. + /// public func generateWeightsList() -> [Float] { let nbElems = weightListSize @@ -289,8 +221,16 @@ extension LayerWeightInit return weightsList } + /// + /// Generate weights values. + /// + /// - Parameters: + /// - out: The output buffer. + /// - deviceID: GPU device. 
+ /// public func generateWeightsList( - buffer: UnsafeMutableBufferPointer) + out: FloatBuffer, + deviceID: Int) { let nbElems = weightListSize switch weightInitClass { @@ -298,27 +238,31 @@ extension LayerWeightInit Self.XavierUniform( nbElems: nbElems, connectivityIO: connectivityIO, - buffer: buffer + out: out, + deviceID: deviceID ) case .XavierNormal: Self.XavierNormal( nbElems: nbElems, connectivityIO: connectivityIO, - buffer: buffer + out: out, + deviceID: deviceID ) case .KaimingUniform: Self.KaimingUniform( nbElems: nbElems, coeff: coeffInitWeights, connectivityIO: connectivityIO, - buffer: buffer + out: out, + deviceID: deviceID ) case .KaimingNormal: Self.KaimingNormal( nbElems: nbElems, coeff: coeffInitWeights, connectivityIO: connectivityIO, - buffer: buffer + out: out, + deviceID: deviceID ) } } @@ -350,23 +294,28 @@ extension LayerWeightInit /// - Parameters: /// - nbElems: Number of weights to initialize. /// - connectivityIO: Number of input and output connections. - /// - buffer: The buffer of values. + /// - out: The output buffer. + /// - deviceID: GPU device. /// static func XavierUniform( nbElems: Int, connectivityIO: (Int, Int), - buffer: UnsafeMutableBufferPointer) + out: FloatBuffer, + deviceID: Int) { - let bound = sqrt(6) / sqrt(Float(connectivityIO.0 + connectivityIO.1)) - if #available(macOS 13.0, *) + var array = [Float](repeating: 0.0, count: nbElems) + array.withUnsafeMutableBufferPointer { - guard - var arrayDescriptor = BNNSNDArrayDescriptor( - data: buffer, - shape: .vector(nbElems)), - let randomNumberGenerator = BNNSCreateRandomGenerator( - BNNSRandomGeneratorMethodAES_CTR, - nil) else + ptr in + + let bound = + sqrt(6) / sqrt(Float(connectivityIO.0 + connectivityIO.1)) + guard var arrayDescriptor = BNNSNDArrayDescriptor( + data: ptr, + shape: .vector(nbElems)), + let randomNumberGenerator = BNNSCreateRandomGenerator( + BNNSRandomGeneratorMethodAES_CTR, + nil) else { fatalError() } @@ -379,11 +328,8 @@ extension LayerWeightInit ) BNNSDestroyRandomGenerator(randomNumberGenerator) - } - else - { - fatalError() } + out.initialize(array: &array) } /// @@ -413,23 +359,27 @@ extension LayerWeightInit /// - Parameters: /// - nbElems: Number of weights to initialize. /// - connectivityIO: Number of input and output connections. - /// - buffer: The buffer of values. + /// - out: The output buffer. + /// - deviceID: GPU device. /// static func XavierNormal( nbElems: Int, connectivityIO: (Int, Int), - buffer: UnsafeMutableBufferPointer) + out: FloatBuffer, + deviceID: Int) { - let std = sqrt(2) / sqrt(Float(connectivityIO.0 + connectivityIO.1)) - if #available(macOS 13.0, *) + var array = [Float](repeating: 0.0, count: nbElems) + array.withUnsafeMutableBufferPointer { - guard - var arrayDescriptor = BNNSNDArrayDescriptor( - data: buffer, - shape: .vector(nbElems)), - let randomNumberGenerator = BNNSCreateRandomGenerator( - BNNSRandomGeneratorMethodAES_CTR, - nil) else + ptr in + + let std = sqrt(2) / sqrt(Float(connectivityIO.0 + connectivityIO.1)) + guard var arrayDescriptor = BNNSNDArrayDescriptor( + data: ptr, + shape: .vector(nbElems)), + let randomNumberGenerator = BNNSCreateRandomGenerator( + BNNSRandomGeneratorMethodAES_CTR, + nil) else { fatalError() } @@ -443,10 +393,7 @@ extension LayerWeightInit BNNSDestroyRandomGenerator(randomNumberGenerator) } - else - { - fatalError() - } + out.initialize(array: &array) } /// @@ -479,24 +426,28 @@ extension LayerWeightInit /// - nbElems: Number of weights to initialize. /// - coeff: Multiplicative coefficient. 
/// - connectivityIO: Number of input and output connections. - /// - buffer: The buffer of values. + /// - out: The output buffer. + /// - deviceID: GPU device. /// static func KaimingUniform( nbElems: Int, coeff: Float, connectivityIO: (Int, Int), - buffer: UnsafeMutableBufferPointer) + out: FloatBuffer, + deviceID: Int) { - let bound = sqrt(3) * coeff / sqrt(Float(connectivityIO.0)) - if #available(macOS 13.0, *) + var array = [Float](repeating: 0.0, count: nbElems) + array.withUnsafeMutableBufferPointer { - guard - var arrayDescriptor = BNNSNDArrayDescriptor( - data: buffer, - shape: .vector(nbElems)), - let randomNumberGenerator = BNNSCreateRandomGenerator( - BNNSRandomGeneratorMethodAES_CTR, - nil) else + ptr in + + let bound = sqrt(3) * coeff / sqrt(Float(connectivityIO.0)) + guard var arrayDescriptor = BNNSNDArrayDescriptor( + data: ptr, + shape: .vector(nbElems)), + let randomNumberGenerator = BNNSCreateRandomGenerator( + BNNSRandomGeneratorMethodAES_CTR, + nil) else { fatalError() } @@ -510,10 +461,7 @@ extension LayerWeightInit BNNSDestroyRandomGenerator(randomNumberGenerator) } - else - { - fatalError() - } + out.initialize(array: &array) } /// @@ -546,24 +494,28 @@ extension LayerWeightInit /// - nbElems: Number of weights to initialize. /// - coeff: Multiplicative coefficient. /// - connectivityIO: Number of input and output connections. - /// - buffer: The buffer of values. + /// - out: The output buffer. + /// - deviceID: GPU device. /// static func KaimingNormal( nbElems: Int, coeff: Float, connectivityIO: (Int, Int), - buffer: UnsafeMutableBufferPointer) + out: FloatBuffer, + deviceID: Int) { - let std = coeff / sqrt(Float(connectivityIO.0)) - if #available(macOS 13.0, *) + var array = [Float](repeating: 0.0, count: nbElems) + array.withUnsafeMutableBufferPointer { - guard - var arrayDescriptor = BNNSNDArrayDescriptor( - data: buffer, - shape: .vector(nbElems)), - let randomNumberGenerator = BNNSCreateRandomGenerator( - BNNSRandomGeneratorMethodAES_CTR, - nil) else + ptr in + + let std = coeff / sqrt(Float(connectivityIO.0)) + guard var arrayDescriptor = BNNSNDArrayDescriptor( + data: ptr, + shape: .vector(nbElems)), + let randomNumberGenerator = BNNSCreateRandomGenerator( + BNNSRandomGeneratorMethodAES_CTR, + nil) else { fatalError() } @@ -577,10 +529,7 @@ extension LayerWeightInit BNNSDestroyRandomGenerator(randomNumberGenerator) } - else - { - fatalError() - } + out.initialize(array: &array) } } diff --git a/Sources/GrAIdient/Core/Model/Model.swift b/Sources/GrAIdient/Core/Model/Model.swift index 5828020a..583c0a8b 100644 --- a/Sources/GrAIdient/Core/Model/Model.swift +++ b/Sources/GrAIdient/Core/Model/Model.swift @@ -948,7 +948,7 @@ public class Model: BaseModel if GrAI.Opti.GPU { let gNorm: Float? = gradientNorm != nil ? - Float(gradientNorm!) : nil + Float(gradientNorm!) 
: nil try _kernel.algo.udpateGPU(layers: myLayers, gradientNorm: gNorm) } diff --git a/Sources/GrAIdient/Core/Optimizer/OptimizerAlgorithm.swift b/Sources/GrAIdient/Core/Optimizer/OptimizerAlgorithm.swift index 31f11259..e85cf693 100644 --- a/Sources/GrAIdient/Core/Optimizer/OptimizerAlgorithm.swift +++ b/Sources/GrAIdient/Core/Optimizer/OptimizerAlgorithm.swift @@ -170,7 +170,7 @@ public class OptimizerAlgorithm try clipGradientGPU( layers: layers, gradientNorm: gNorm, - normThreshold: _optimizer.params.normThreshold + normThreshold: Float(_optimizer.params.normThreshold) ) } @@ -233,7 +233,7 @@ public class OptimizerAlgorithm let nbElems = buffers.g.nbElems let pNbElems: [UInt32] = [UInt32(nbElems)] - let pFactor: [Float] = [Float(factor)] + let pFactor: [Float] = [factor] let command = MetalKernel.get.createCommand( "multiplyGradients", deviceID: layer.deviceID @@ -303,22 +303,7 @@ public class OptimizerAlgorithm for buffers in layerUpdate.collectWeightsGPU() { - let buffer: UnsafeMutableBufferPointer - if let g_p = buffers.g_p - { - MetalKernel.get.download([g_p]) - buffer = g_p.shared.buffer - } - else if let g_s = buffers.g_s - { - MetalKernel.get.download([g_s]) - buffer = g_s.buffer - } - else - { - fatalError("Unreachable.") - } - + let buffer = buffers.g.download() for i in 0.. - if let g_p = buffers.g_p - { - MetalKernel.get.download([g_p]) - buffer = g_p.shared.buffer - } - else if let g_s = buffers.g_s - { - MetalKernel.get.download([g_s]) - buffer = g_s.buffer - } - else - { - fatalError("Unreachable.") - } - + let buffer = buffers.g.download() for i in 0.. Float(normThreshold) { + if gradientNorm > normThreshold { for layer in layers { if let layerUpdate = layer as? LayerUpdate, @@ -486,8 +456,8 @@ public class OptimizerAlgorithm let nbElems = buffers.g.nbElems let pNbElems: [UInt32] = [UInt32(nbElems)] - let pGradientNorm: [Float] = [Float(gradientNorm)] - let pNormThreshold: [Float] = [Float(normThreshold)] + let pGradientNorm: [Float] = [gradientNorm] + let pNormThreshold: [Float] = [normThreshold] let command = MetalKernel.get.createCommand( "clipGradients", deviceID: layer.deviceID diff --git a/Sources/GrAIdient/Core/Optimizer/OptimizerImpl.swift b/Sources/GrAIdient/Core/Optimizer/OptimizerImpl.swift index 1a9899d9..5e237d3c 100644 --- a/Sources/GrAIdient/Core/Optimizer/OptimizerImpl.swift +++ b/Sources/GrAIdient/Core/Optimizer/OptimizerImpl.swift @@ -294,12 +294,12 @@ class AdamOptimizer: OptimizerImpl override func stepGPU(_ weights: IWeightBuffers) { let nbElems = weights.nbElems - let t = Double(_kernel.params.t) + let t = Float(_kernel.params.t) let pNbElems: [UInt32] = [UInt32(nbElems)] let pAlpha: [Float] = [Float(alpha)] let pLambda: [Float] = [lambda != nil ? Float(lambda!) : 0.0] - let pT: [Float] = [Float(t)] + let pT: [Float] = [t] let command = MetalKernel.get.createCommand( "weightsAdam", deviceID: weights.deviceID @@ -366,12 +366,12 @@ class AMSGradOptimizer: OptimizerImpl override func stepGPU(_ weights: IWeightBuffers) { let nbElems = weights.nbElems - let t = Double(_kernel.params.t) + let t = Float(_kernel.params.t) let pNbElems: [UInt32] = [UInt32(nbElems)] let pAlpha: [Float] = [Float(alpha)] let pLambda: [Float] = [lambda != nil ? Float(lambda!) 
: 0.0] - let pT: [Float] = [Float(t)] + let pT: [Float] = [t] let command = MetalKernel.get.createCommand( "weightsAMSGrad", deviceID: weights.deviceID @@ -449,12 +449,12 @@ class AdamRectifiedOptimizer: OptimizerImpl override func stepGPU(_ weights: IWeightBuffers) { let nbElems = weights.nbElems - let t = Double(_kernel.params.t) + let t = Float(_kernel.params.t) let pNbElems: [UInt32] = [UInt32(nbElems)] let pAlpha: [Float] = [Float(alpha)] let pLambda: [Float] = [lambda != nil ? Float(lambda!) : 0.0] - let pT: [Float] = [Float(t)] + let pT: [Float] = [t] let command = MetalKernel.get.createCommand( "weightsAdamRectified", deviceID: weights.deviceID @@ -583,12 +583,12 @@ class AdaBoundOptimizer: BoundOptimizer override func stepGPU(_ weights: IWeightBuffers) { let nbElems = weights.nbElems - let t = Double(_kernel.params.t) + let t = Float(_kernel.params.t) let pNbElems: [UInt32] = [UInt32(nbElems)] let pAlpha: [Float] = [Float(alpha)] let pLambda: [Float] = [lambda != nil ? Float(lambda!) : 0.0] - let pT: [Float] = [Float(t)] + let pT: [Float] = [t] let pLowerBound: [Float] = [Float(lowerBound!)] let pUpperBound: [Float] = [Float(upperBound!)] @@ -667,12 +667,12 @@ class AMSBoundOptimizer: BoundOptimizer override func stepGPU(_ weights: IWeightBuffers) { let nbElems = weights.nbElems - let t = Double(_kernel.params.t) + let t = Float(_kernel.params.t) let pNbElems: [UInt32] = [UInt32(nbElems)] let pAlpha: [Float] = [Float(alpha)] let pLambda: [Float] = [lambda != nil ? Float(lambda!) : 0.0] - let pT: [Float] = [Float(t)] + let pT: [Float] = [t] let pLowerBound: [Float] = [Float(lowerBound!)] let pUpperBound: [Float] = [Float(upperBound!)] diff --git a/Sources/GrAIdient/Core/State/Weights.swift b/Sources/GrAIdient/Core/State/Weights.swift index 03e2b610..a45053dc 100644 --- a/Sources/GrAIdient/Core/State/Weights.swift +++ b/Sources/GrAIdient/Core/State/Weights.swift @@ -27,10 +27,10 @@ public protocol IWeightArrays } /// Arrays needed to update the weights. -class WeightArrays: IWeightArrays +public class WeightArrays: IWeightArrays { /// Number of elements in the different arrays. - let nbElems: Int + public let nbElems: Int var _w: [Double] = [] var _g: [Double] = [] @@ -49,7 +49,7 @@ class WeightArrays: IWeightArrays } /// Weights array: the array to update. - var w: [Double] + public var w: [Double] { get { if _w.count == 0 @@ -69,7 +69,7 @@ class WeightArrays: IWeightArrays } } /// Gradients array. - var g: [Double] + public var g: [Double] { get { if _g.count == 0 @@ -89,7 +89,7 @@ class WeightArrays: IWeightArrays } } /// Momentum array. - var m: [Double] + public var m: [Double] { get { if _m.count == 0 @@ -109,7 +109,7 @@ class WeightArrays: IWeightArrays } } /// Velocity array. - var v: [Double] + public var v: [Double] { get { if _v.count == 0 @@ -129,7 +129,7 @@ class WeightArrays: IWeightArrays } } /// Veclocity normalized array. - var vHat: [Double] + public var vHat: [Double] { get { if _vHat.count == 0 @@ -150,7 +150,7 @@ class WeightArrays: IWeightArrays } /// Clean the momentum..., preserving the weights. - func reset() + public func reset() { _g = [] _m = [] diff --git a/Sources/GrAIdient/GrAI.swift b/Sources/GrAIdient/GrAI.swift index ae370274..7ead7164 100644 --- a/Sources/GrAIdient/GrAI.swift +++ b/Sources/GrAIdient/GrAI.swift @@ -70,6 +70,68 @@ public class GrAI } } + /// Namespace for precision settings. + public class Precision + { + /// Get/Set precision. 
+        public static var double: Bool
+        {
+            get {
+                return getCtx.precision == PrecisionMode.Double
+            }
+            set {
+                if newValue && GrAI.Opti.CPU
+                {
+                    getCtx.precision = PrecisionMode.Double
+                }
+                else if newValue
+                {
+                    fatalError(
+                        "Cannot set double precision with GPU optimization."
+                    )
+                }
+            }
+        }
+        /// Get/Set precision.
+        public static var float: Bool
+        {
+            get {
+                return getCtx.precision == PrecisionMode.Float
+            }
+            set {
+                if newValue && GrAI.Opti.GPU
+                {
+                    getCtx.precision = PrecisionMode.Float
+                }
+                else if newValue
+                {
+                    fatalError(
+                        "Cannot set float precision with CPU optimization."
+                    )
+                }
+            }
+        }
+        /// Get/Set precision.
+        public static var float16: Bool
+        {
+            get {
+                return getCtx.precision == PrecisionMode.Float16
+            }
+            set {
+                if newValue && GrAI.Opti.GPU
+                {
+                    getCtx.precision = PrecisionMode.Float16
+                }
+                else if newValue
+                {
+                    fatalError(
+                        "Cannot set float16 precision with CPU optimization."
+                    )
+                }
+            }
+        }
+    }
+
     /// Namespace for gradient settings.
     public class Gradient
     {
@@ -346,6 +408,14 @@ public class GrAI
     }
 }
 
+/// Precision mode.
+public enum PrecisionMode
+{
+    case Double
+    case Float
+    case Float16
+}
+
 /// A global context with stored variables.
 fileprivate class GrAIContext
 {
@@ -370,6 +440,12 @@ fileprivate class GrAIContext
         case GPU
     }
 
+    //--------------------------------------------------------------------------
+    // PRECISION
+    //--------------------------------------------------------------------------
+    /// Precision variable.
+    var precision = PrecisionMode.Float
+
     /// Used to select GPU device.
     var gpuNamedPriority = [String]()
 
diff --git a/Sources/GrAIdient/Layer1D/Activation1D.swift b/Sources/GrAIdient/Layer1D/Activation1D.swift
index 1afffaae..79fccd50 100644
--- a/Sources/GrAIdient/Layer1D/Activation1D.swift
+++ b/Sources/GrAIdient/Layer1D/Activation1D.swift
@@ -16,7 +16,7 @@ public class Activation1D: Layer1D
     /// used in the GPU execution context.
     /// Shape ~ (batch, nbNeurons).
     ///
-    var _tmp: MetalPrivateBuffer<Float>! = nil
+    var _tmp: FloatBuffer! = nil
 
     /// Get coefficient (depending on activation function) to apply during the weights initialization.
     public var coeffInitWeights: Float
diff --git a/Sources/GrAIdient/Layer1D/BCE1D.swift b/Sources/GrAIdient/Layer1D/BCE1D.swift
index da842382..8e3bdedc 100644
--- a/Sources/GrAIdient/Layer1D/BCE1D.swift
+++ b/Sources/GrAIdient/Layer1D/BCE1D.swift
@@ -207,7 +207,7 @@ public class BCE1D: LayerOutput1D
     /// - Returns: The loss value.
     ///
     public func getLossGPU(
-        _ groundTruth: MetalBuffer<Float>,
+        _ groundTruth: FloatBuffer,
         batchSize: Int,
         nbNeurons: Int) throws -> Float
     {
@@ -233,9 +233,8 @@ public class BCE1D: LayerOutput1D
         command.dispatchThreads(batchSize)
         command.enqueue()
 
-        MetalKernel.get.download([loss])
         var loss: Float = 0.0
-        let lossPtr = self.loss.buffer
+        let lossPtr = self.loss.download()
         for i in 0..,
+        _ groundTruth: FloatBuffer,
         batchSize: Int,
         nbNeurons: Int) throws
     {
diff --git a/Sources/GrAIdient/Layer1D/BCESigmoid1D.swift b/Sources/GrAIdient/Layer1D/BCESigmoid1D.swift
index 237d3da3..79ff2e9d 100644
--- a/Sources/GrAIdient/Layer1D/BCESigmoid1D.swift
+++ b/Sources/GrAIdient/Layer1D/BCESigmoid1D.swift
@@ -230,7 +230,7 @@ public class BCESigmoid1D: LayerOutput1D
     /// - Returns: The loss value.
/// public func getLossGPU( - _ groundTruth: MetalBuffer, + _ groundTruth: FloatBuffer, batchSize: Int, nbNeurons: Int) throws -> Float { @@ -256,9 +256,8 @@ public class BCESigmoid1D: LayerOutput1D command.dispatchThreads(batchSize) command.enqueue() - MetalKernel.get.download([loss]) var loss: Float = 0.0 - let lossPtr = self.loss.buffer + let lossPtr = self.loss.download() for i in 0.., + _ groundTruth: FloatBuffer, batchSize: Int, nbNeurons: Int) throws { diff --git a/Sources/GrAIdient/Layer1D/Base/Layer1D.swift b/Sources/GrAIdient/Layer1D/Base/Layer1D.swift index 5e45c37f..ce2ab089 100644 --- a/Sources/GrAIdient/Layer1D/Base/Layer1D.swift +++ b/Sources/GrAIdient/Layer1D/Base/Layer1D.swift @@ -15,12 +15,12 @@ open class Layer1D: Layer /// Output buffer (result of the forward pass) used in the GPU execution context. /// Shape ~ (batch, nbNeurons). /// - public var outs: MetalPrivateBuffer! = nil + public var outs: FloatBuffer! = nil /// /// Gradient buffer (result of the backward pass) used in the GPU execution context. /// Shape ~ (batch, nbNeurons). /// - public var delta: MetalPrivateBuffer! = nil + public var delta: FloatBuffer! = nil /// Number of neurons. public let nbNeurons: Int @@ -138,8 +138,8 @@ open class Layer1D: Layer { if outs == nil { - outs = MetalPrivateBuffer( - batchSize * nbNeurons, deviceID: deviceID + outs = FloatBuffer( + nbElems: batchSize * nbNeurons, deviceID: deviceID ) } else if batchSize <= 0 || batchSize > outs.nbElems / nbNeurons @@ -159,8 +159,8 @@ open class Layer1D: Layer { if delta == nil { - delta = MetalPrivateBuffer( - batchSize * nbNeurons, deviceID: deviceID + delta = FloatBuffer( + nbElems: batchSize * nbNeurons, deviceID: deviceID ) } else if batchSize <= 0 || batchSize > delta.nbElems / nbNeurons @@ -194,9 +194,8 @@ open class Layer1D: Layer public func getOutsGPU(elem: Int) -> [T] { var outs = [T]() - MetalKernel.get.download([self.outs]) + let outsPtr = self.outs.download() - let outsPtr = self.outs.shared.buffer for depth in 0.., + _ data: FloatBuffer, batchSize: Int, nbNeurons: Int) throws { diff --git a/Sources/GrAIdient/Layer1D/Base/LayerOutput1D.swift b/Sources/GrAIdient/Layer1D/Base/LayerOutput1D.swift index 66ef7969..2479d066 100644 --- a/Sources/GrAIdient/Layer1D/Base/LayerOutput1D.swift +++ b/Sources/GrAIdient/Layer1D/Base/LayerOutput1D.swift @@ -15,13 +15,13 @@ open class LayerOutput1D: Layer1D /// Ground truth buffer in the GPU execution context. /// Shape ~ (batch, nbNeurons). /// - public internal(set) var groundTruth: MetalSharedBuffer! = nil + public internal(set) var groundTruth: FloatBuffer! = nil /// /// Loss buffer in the GPU execution context. /// Shape ~ (batch,). /// - public internal(set) var loss: MetalSharedBuffer! = nil + public internal(set) var loss: FloatBuffer! 
= nil private enum Keys: String, CodingKey { @@ -147,9 +147,10 @@ open class LayerOutput1D: Layer1D if self.groundTruth == nil { - self.groundTruth = MetalSharedBuffer( - batchSize * nbNeurons, - deviceID: deviceID + self.groundTruth = FloatBuffer( + nbElems: batchSize * nbNeurons, + deviceID: deviceID, + shared: true ) } else if batchSize <= 0 || @@ -158,7 +159,7 @@ open class LayerOutput1D: Layer1D throw LayerError.BatchSize } - let bufferPtr = self.groundTruth.buffer + var buffer = [Float](repeating: 0.0, count: batchSize * nbNeurons) for (i, dataI) in groundTruth.enumerated() { if dataI.count != nbNeurons @@ -167,10 +168,10 @@ open class LayerOutput1D: Layer1D } for (j, dataIJ) in dataI.enumerated() { - bufferPtr[j + i * nbNeurons] = Float(dataIJ) + buffer[j + i * nbNeurons] = Float(dataIJ) } } - MetalKernel.get.upload([self.groundTruth]) + self.groundTruth.initialize(array: &buffer) } /// @@ -184,7 +185,7 @@ open class LayerOutput1D: Layer1D /// - nbNeurons: Number of neurons. /// public func checkGroundTruthGPU( - _ groundTruth: MetalBuffer, + _ groundTruth: FloatBuffer, batchSize: Int, nbNeurons: Int) throws { @@ -211,7 +212,9 @@ open class LayerOutput1D: Layer1D { if loss == nil { - loss = MetalSharedBuffer(batchSize, deviceID: deviceID) + loss = FloatBuffer( + nbElems: batchSize, deviceID: deviceID, shared: true + ) } else if batchSize > loss.nbElems { diff --git a/Sources/GrAIdient/Layer1D/Concat1D.swift b/Sources/GrAIdient/Layer1D/Concat1D.swift index f163a8d5..afa46c15 100644 --- a/Sources/GrAIdient/Layer1D/Concat1D.swift +++ b/Sources/GrAIdient/Layer1D/Concat1D.swift @@ -146,9 +146,10 @@ public class Concat1D: LayerMerge1D { try checkStateCPU(batchSize: batchSize) + var buffersPrev = [[Float]]() for num in 0..<_layersPrev.count { - MetalKernel.get.download([(_layersPrev[num] as! Layer1D).outs]) + buffersPrev.append((_layersPrev[num] as! Layer1D).outs.download()) } let (nbSameElems, layersIndex, nbElems) = getMergedGraph() @@ -190,7 +191,7 @@ public class Concat1D: LayerMerge1D var curElem = 0 for num in 0..<_layersPrev.count { - let outsPrevPtr = (_layersPrev[num] as! Layer1D).outs.shared.buffer + let outsPrevPtr = buffersPrev[num] let neuronsPrev = (_layersPrev[num] as! Layer1D).neurons let nbNeurons = neuronsPrev.nbElems diff --git a/Sources/GrAIdient/Layer1D/Constant1D.swift b/Sources/GrAIdient/Layer1D/Constant1D.swift index 0c5f4bae..8976a21f 100644 --- a/Sources/GrAIdient/Layer1D/Constant1D.swift +++ b/Sources/GrAIdient/Layer1D/Constant1D.swift @@ -24,7 +24,7 @@ public class Constant1D: Layer1D, LayerUpdate /// Buffer of gradients per sample for biases. /// Shape ~ (batch, nbNeurons). /// - var _wDeltaWeights: MetalPrivateBuffer! = nil + var _wDeltaWeights: FloatBuffer! = nil /// Whether to compute weights' gradients or not. 
public var computeDeltaWeights: Bool = true @@ -64,12 +64,7 @@ public class Constant1D: Layer1D, LayerUpdate { return _weightsList } - - var weightsTmp = [Float]() - MetalKernel.get.download([_wBuffers.w_p!]) - weightsTmp += _wBuffers.w_p!.shared.array - - return weightsTmp + return _wBuffers.w.download() } set { _weightsList = newValue @@ -258,19 +253,16 @@ public class Constant1D: Layer1D, LayerUpdate deviceID: deviceID ) - let weightsPtr = _wBuffers.w_p!.shared.buffer if _weightsList.count != 0 { - copyFloatArrayToBuffer( - array: &_weightsList, - buffer: weightsPtr, - start: 0, - nbElems: nbNeurons - ) + _wBuffers.w.initialize(array: &_weightsList) + } + else + { + _wBuffers.w.initialize() } - _weightsList = [] - MetalKernel.get.upload([_wBuffers.w_p!]) + _weightsList = [] _wDeltaWeights = nil } @@ -287,7 +279,7 @@ public class Constant1D: Layer1D, LayerUpdate if computeDeltaWeights && GrAI.Gradient.sample && _wDeltaWeights == nil { - _wDeltaWeights = MetalPrivateBuffer( + _wDeltaWeights = FloatBuffer(nbElems: batchSize * nbNeurons, deviceID: deviceID ) } @@ -348,8 +340,7 @@ public class Constant1D: Layer1D, LayerUpdate neurons.get(depth)!.initGC(batchSize: batchSize, nbGC: newGC) } - MetalKernel.get.download([_wBuffers.w_p!]) - let weightsPtr = _wBuffers.w_p!.shared.buffer + let weightsPtr = _wBuffers.w.download() for batch in 0..! = nil + var _wDeltaWeights: FloatBuffer! = nil /// /// Buffer of gradients per sample for biases. /// Shape ~ (batch, nbNeurons). /// - var _bDeltaWeights: MetalPrivateBuffer! = nil + var _bDeltaWeights: FloatBuffer! = nil /// Whether to compute weights' gradients or not. public var computeDeltaWeights: Bool = true @@ -105,7 +105,7 @@ public class FullyConnected: Activation1D, LayerWithActivation, LayerWeightInit } /// Output buffer of previous layer. - var outsPrev: MetalPrivateBuffer + var outsPrev: FloatBuffer { get { if let layerPrev = self.layerPrev as? Layer1D @@ -124,7 +124,7 @@ public class FullyConnected: Activation1D, LayerWithActivation, LayerWeightInit } /// Gradient buffer of previous layer. - var deltaPrev: MetalPrivateBuffer? + var deltaPrev: FloatBuffer? { get { if let layerPrev = self.layerPrev as? 
Layer1D @@ -199,14 +199,10 @@ public class FullyConnected: Activation1D, LayerWithActivation, LayerWeightInit return _weightsList } - var weightsTmp = [Float]() - MetalKernel.get.download([_wBuffers.w_p!]) - weightsTmp += _wBuffers.w_p!.shared.array - + var weightsTmp = _wBuffers.w.download() if _updateBiases { - MetalKernel.get.download([_bBuffers.w_p!]) - weightsTmp += _bBuffers.w_p!.shared.array + weightsTmp += _bBuffers.w.download() } return weightsTmp } @@ -576,35 +572,24 @@ public class FullyConnected: Activation1D, LayerWithActivation, LayerWeightInit deviceID: deviceID ) - let weightsPtr = _wBuffers.w_p!.shared.buffer - let biasesPtr = _bBuffers.w_p!.shared.buffer - + _bBuffers.w.initialize() if _weightsList.count == 0 { - generateWeightsList(buffer: weightsPtr) + generateWeightsList(out: _wBuffers.w, deviceID: deviceID) } else { - copyFloatArrayToBuffer( - array: &_weightsList, - buffer: weightsPtr, - start: 0, - nbElems: weightHeight * weightWidth - ) + _wBuffers.w.initialize(array: &_weightsList) if _updateBiases { - copyFloatArrayToBuffer( + _bBuffers.w.initialize( array: &_weightsList, - buffer: biasesPtr, - start: weightHeight * weightWidth, - nbElems: weightHeight + start: weightHeight * weightWidth ) } } - _weightsList = [] - - MetalKernel.get.upload([_wBuffers.w_p!, _bBuffers.w_p!]) + _weightsList = [] _wDeltaWeights = nil _bDeltaWeights = nil } @@ -622,13 +607,13 @@ public class FullyConnected: Activation1D, LayerWithActivation, LayerWeightInit if computeDeltaWeights && GrAI.Gradient.sample && _wDeltaWeights == nil { - _wDeltaWeights = MetalPrivateBuffer( + _wDeltaWeights = FloatBuffer(nbElems: batchSize * nbNeurons * weightWidth, deviceID: deviceID ) if _updateBiases { - _bDeltaWeights = MetalPrivateBuffer( + _bDeltaWeights = FloatBuffer(nbElems: batchSize * nbNeurons, deviceID: deviceID ) } @@ -771,11 +756,8 @@ public class FullyConnected: Activation1D, LayerWithActivation, LayerWeightInit neurons.get(depth)!.initGC(batchSize: batchSize, nbGC: newGC) } - MetalKernel.get.download([_wBuffers.w_p!, _bBuffers.w_p!]) - MetalKernel.get.download([outsPrev]) - - let weightsPtr = _wBuffers.w_p!.shared.buffer - let biasesPtr = _bBuffers.w_p!.shared.buffer + let weightsPtr = _wBuffers.w.download() + let biasesPtr = _bBuffers.w.download() let neuronsPrev = self.neuronsPrev for batch in 0.. [IWeightArrays] { - var weights = [IWeightArrays]() + var weights = [WeightArrays]() weights.append(_wArrays) if _updateBiases { @@ -1248,8 +1230,7 @@ public class FullyConnected: Activation1D, LayerWithActivation, LayerWeightInit } var deltaWeights = [T]() - MetalKernel.get.download([_wDeltaWeights]) - var deltaWeightsPtr = _wDeltaWeights.shared.buffer + var deltaWeightsPtr = _wDeltaWeights.download() let offsetStart = elem * nbNeurons * weightWidth for depth in 0.., IWeightArrays /// GPU buffers needed to update the inputs of a layer. class InputBuffers1D: InputBuffers, IWeightBuffers -{ +{ /// Inputs buffer: the buffer to be update. - var w: MetalBuffer + var w: FloatBuffer { get { return _layer.outs @@ -71,7 +71,7 @@ class InputBuffers1D: InputBuffers, IWeightBuffers } /// Gradients buffer. - var g: MetalBuffer + var g: FloatBuffer { get { return _layer.delta @@ -304,7 +304,7 @@ public class Input1D: LayerInput1D, LayerUpdate /// - nbNeurons: Number of neurons. 
/// public func setDataGPU( - _ data: MetalPrivateBuffer, + _ data: FloatBuffer, batchSize: Int, nbNeurons: Int) throws { diff --git a/Sources/GrAIdient/Layer1D/LinearError1D.swift b/Sources/GrAIdient/Layer1D/LinearError1D.swift index 6549eeea..3ce12e28 100644 --- a/Sources/GrAIdient/Layer1D/LinearError1D.swift +++ b/Sources/GrAIdient/Layer1D/LinearError1D.swift @@ -201,7 +201,7 @@ public class LinearError1D: LayerOutput1D /// - Returns: The loss value. /// public func getLossGPU( - _ groundTruth: MetalBuffer, + _ groundTruth: FloatBuffer, batchSize: Int) throws -> Float { try checkLossGPU(batchSize: batchSize) @@ -225,9 +225,8 @@ public class LinearError1D: LayerOutput1D command.dispatchThreads(batchSize) command.enqueue() - MetalKernel.get.download([loss]) var loss: Float = 0.0 - let lossPtr = self.loss.buffer + let lossPtr = self.loss.download() for i in 0.., + _ groundTruth: FloatBuffer, batchSize: Int, nbNeurons: Int) throws -> Float { @@ -229,9 +229,8 @@ public class MSE1D: LayerOutput1D command.dispatchThreads(batchSize) command.enqueue() - MetalKernel.get.download([loss]) var loss: Float = 0.0 - let lossPtr = self.loss.buffer + let lossPtr = self.loss.download() for i in 0.., + _ groundTruth: FloatBuffer, batchSize: Int, nbNeurons: Int) throws { diff --git a/Sources/GrAIdient/Layer1D/Sum1D.swift b/Sources/GrAIdient/Layer1D/Sum1D.swift index 685b8416..01c66d44 100644 --- a/Sources/GrAIdient/Layer1D/Sum1D.swift +++ b/Sources/GrAIdient/Layer1D/Sum1D.swift @@ -155,9 +155,10 @@ public class Sum1D: LayerMerge1D { try checkStateCPU(batchSize: batchSize) + var buffersPrev = [[Float]]() for num in 0..<_layersPrev.count { - MetalKernel.get.download([(_layersPrev[num] as! Layer1D).outs]) + buffersPrev.append((_layersPrev[num] as! Layer1D).outs.download()) } let (nbSameElems, layersIndex, nbElems) = getMergedGraph() @@ -197,8 +198,7 @@ public class Sum1D: LayerMerge1D var sum = 0.0 for num in 0..<_layersPrev.count { - let outsPrevPtr = - (_layersPrev[num] as! Layer1D).outs.shared.buffer + let outsPrevPtr = buffersPrev[num] let neuronsPrev = (_layersPrev[num] as! Layer1D).neurons diff --git a/Sources/GrAIdient/Layer2D/Activation2D.swift b/Sources/GrAIdient/Layer2D/Activation2D.swift index fb57db0c..8b210d42 100644 --- a/Sources/GrAIdient/Layer2D/Activation2D.swift +++ b/Sources/GrAIdient/Layer2D/Activation2D.swift @@ -16,7 +16,7 @@ public class Activation2D: Layer2D /// used in the GPU execution context. /// Shape ~ (batch, nbChannels, height, width). /// - var _tmp: MetalPrivateBuffer! = nil + var _tmp: FloatBuffer! = nil /// Get coefficient (depending on activation function) to apply during the weights initialization. public var coeffInitWeights: Float diff --git a/Sources/GrAIdient/Layer2D/AdaIN.swift b/Sources/GrAIdient/Layer2D/AdaIN.swift index 2fd50d6c..c1f6beb6 100644 --- a/Sources/GrAIdient/Layer2D/AdaIN.swift +++ b/Sources/GrAIdient/Layer2D/AdaIN.swift @@ -362,10 +362,9 @@ public class AdaIN: LayerMerge2D let layerFirst = _layersPrev.first as! Layer2D let layerLast = _layersPrev.last as! Layer1D - MetalKernel.get.download([layerFirst.outs, layerLast.outs]) - let bufferOuts = layerFirst.outs.shared.buffer - let bufferStyles = layerLast.outs.shared.buffer + let bufferOuts = layerFirst.outs.download() + let bufferStyles = layerLast.outs.download() let (nbSameElems, layersIndex, nbElems) = getMergedGraph() @@ -663,7 +662,7 @@ public class AdaIN: LayerMerge2D /// - Returns: The outputs. 
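// The loss read-back follows one pattern across the output layers above: the
// kernel writes one partial loss per batch element, and the host reduces the
// downloaded [Float] on the CPU.
command.dispatchThreads(batchSize)
command.enqueue()

var loss: Float = 0.0
let lossPtr = self.loss.download()            // shape ~ (batch,)
for i in 0..<batchSize
{
    loss += lossPtr[i]
}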
/// func getOutsPrev( - buffer: UnsafeMutableBufferPointer, + buffer: [Float], depth: Int, batch: Int) -> [Double] { @@ -692,7 +691,7 @@ public class AdaIN: LayerMerge2D /// - Returns: The output. /// func getOutStyle( - buffer: UnsafeMutableBufferPointer, + buffer: [Float], depth: Int, batch: Int) -> Double { diff --git a/Sources/GrAIdient/Layer2D/BCE2D.swift b/Sources/GrAIdient/Layer2D/BCE2D.swift index 8b2b8010..cfcd5bc6 100644 --- a/Sources/GrAIdient/Layer2D/BCE2D.swift +++ b/Sources/GrAIdient/Layer2D/BCE2D.swift @@ -272,7 +272,7 @@ public class BCE2D: LayerOutput2D /// - Returns: The loss value. /// public func getLossGPU( - _ groundTruth: MetalBuffer, + _ groundTruth: FloatBuffer, batchSize: Int, nbChannels: Int, height: Int, width: Int) throws -> Float { @@ -300,9 +300,8 @@ public class BCE2D: LayerOutput2D command.dispatchThreads(batchSize) command.enqueue() - MetalKernel.get.download([loss]) var loss: Float = 0.0 - let lossPtr = self.loss.buffer + let lossPtr = self.loss.download() for i in 0.., + _ groundTruth: FloatBuffer, batchSize: Int, nbChannels: Int, height: Int, width: Int) throws { diff --git a/Sources/GrAIdient/Layer2D/BCESigmoid2D.swift b/Sources/GrAIdient/Layer2D/BCESigmoid2D.swift index d1104542..6c5396c0 100644 --- a/Sources/GrAIdient/Layer2D/BCESigmoid2D.swift +++ b/Sources/GrAIdient/Layer2D/BCESigmoid2D.swift @@ -315,7 +315,7 @@ public class BCESigmoid2D: LayerOutput2D /// - Returns: The loss value. /// public func getLossGPU( - _ groundTruth: MetalBuffer, + _ groundTruth: FloatBuffer, batchSize: Int, nbChannels: Int, height: Int, width: Int) throws -> Float { @@ -343,9 +343,8 @@ public class BCESigmoid2D: LayerOutput2D command.dispatchThreads(batchSize) command.enqueue() - MetalKernel.get.download([loss]) var loss: Float = 0.0 - let lossPtr = self.loss.buffer + let lossPtr = self.loss.download() for i in 0.., + _ groundTruth: FloatBuffer, batchSize: Int, nbChannels: Int, height: Int, width: Int) throws { diff --git a/Sources/GrAIdient/Layer2D/BN2D.swift b/Sources/GrAIdient/Layer2D/BN2D.swift index f154a2c9..5847ccb7 100644 --- a/Sources/GrAIdient/Layer2D/BN2D.swift +++ b/Sources/GrAIdient/Layer2D/BN2D.swift @@ -533,8 +533,7 @@ public class BN2D: Activation2D, LayerUpdate, LayerWithActivation }}} }} - MetalKernel.get.download([layerPrev.outs]) - let outsPrevPtr = layerPrev.outs.shared.buffer + let outsPrevPtr = layerPrev.outs.download() // Prepare GC for norm weights: Ɣ and β. for batch in 0.. [IWeightArrays] { - var weights = [IWeightArrays]() + var weights = [WeightArrays]() if let norm = self.norm { weights += norm.collectWeights() diff --git a/Sources/GrAIdient/Layer2D/Base/Layer2D.swift b/Sources/GrAIdient/Layer2D/Base/Layer2D.swift index fc95d9a3..e4af2a0b 100644 --- a/Sources/GrAIdient/Layer2D/Base/Layer2D.swift +++ b/Sources/GrAIdient/Layer2D/Base/Layer2D.swift @@ -15,12 +15,12 @@ open class Layer2D: Layer /// Output buffer (result of the forward pass) used in the GPU execution context. /// Shape ~ (batch, nbChannels, height, width). /// - public var outs: MetalPrivateBuffer! = nil + public var outs: FloatBuffer! = nil /// /// Gradient buffer (result of the backward pass) used in the GPU execution context. /// Shape ~ (batch, nbChannels, height, width). /// - public var delta: MetalPrivateBuffer! = nil + public var delta: FloatBuffer! = nil /// Number of channels. 
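// Helper signatures change in step: where AdaIN's getOutsPrev and getOutStyle
// used to take an UnsafeMutableBufferPointer<Float> into a shared buffer,
// they now take the downloaded [Float] directly. A sketch of the call side,
// assuming it mirrors the hunk above:
let bufferOuts = layerFirst.outs.download()
let bufferStyles = layerLast.outs.download()
let outsPrev = getOutsPrev(buffer: bufferOuts, depth: depth, batch: batch)
let outStyle = getOutStyle(buffer: bufferStyles, depth: depth, batch: batch)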
public let nbChannels: Int @@ -192,8 +192,9 @@ open class Layer2D: Layer { if outs == nil { - outs = MetalPrivateBuffer( - batchSize * nbChannels * width * height, deviceID: deviceID + outs = FloatBuffer( + nbElems: batchSize * nbChannels * width * height, + deviceID: deviceID ) } else if batchSize <= 0 || @@ -214,8 +215,9 @@ open class Layer2D: Layer { if delta == nil { - delta = MetalPrivateBuffer( - batchSize * nbChannels * width * height, deviceID: deviceID + delta = FloatBuffer( + nbElems: batchSize * nbChannels * width * height, + deviceID: deviceID ) } else if batchSize <= 0 || @@ -251,9 +253,8 @@ open class Layer2D: Layer public func getOutsGPU(elem: Int) -> [T] { var outs = [T]() - MetalKernel.get.download([self.outs]) + let outsPtr = self.outs.download() - let outsPtr = self.outs.shared.buffer for depth in 0.., + _ data: FloatBuffer, batchSize: Int, nbChannels: Int, height: Int, width: Int) throws { diff --git a/Sources/GrAIdient/Layer2D/Base/LayerOutput2D.swift b/Sources/GrAIdient/Layer2D/Base/LayerOutput2D.swift index c6d9fbd9..fcd11e8e 100644 --- a/Sources/GrAIdient/Layer2D/Base/LayerOutput2D.swift +++ b/Sources/GrAIdient/Layer2D/Base/LayerOutput2D.swift @@ -15,13 +15,13 @@ open class LayerOutput2D: Layer2D /// Ground truth buffer in the GPU execution context. /// Shape ~ (batch, nbChannels, height, width). /// - public internal(set) var groundTruth: MetalSharedBuffer! = nil + public internal(set) var groundTruth: FloatBuffer! = nil /// /// Loss buffer in the GPU execution context. /// Shape ~ (batch,). /// - public internal(set) var loss: MetalSharedBuffer! = nil + public internal(set) var loss: FloatBuffer! = nil private enum Keys: String, CodingKey { @@ -157,9 +157,10 @@ open class LayerOutput2D: Layer2D if self.groundTruth == nil { - self.groundTruth = MetalSharedBuffer( - batchSize * nbChannels * height * width, - deviceID: deviceID + self.groundTruth = FloatBuffer( + nbElems: batchSize * nbChannels * height * width, + deviceID: deviceID, + shared: true ) } else if batchSize <= 0 || @@ -168,7 +169,10 @@ open class LayerOutput2D: Layer2D throw LayerError.BatchSize } - let bufferPtr = self.groundTruth.buffer + var buffer = [Float]( + repeating: 0.0, count: batchSize * nbChannels * height * width + ) + switch format { case .RGB: @@ -184,7 +188,7 @@ open class LayerOutput2D: Layer2D let offsetSet = j + (offsetStart + i) * width let gt = groundTruth[nbChannels * offsetGet + depth] - bufferPtr[offsetSet] = Float(gt) + buffer[offsetSet] = Float(gt) }} }} case .Neuron: @@ -199,11 +203,11 @@ open class LayerOutput2D: Layer2D let offset = j + (offsetStart + i) * width let gt = groundTruth[offset] - bufferPtr[offset] = Float(gt) + buffer[offset] = Float(gt) }} }} } - MetalKernel.get.upload([self.groundTruth]) + self.groundTruth.initialize(array: &buffer) } /// @@ -219,7 +223,7 @@ open class LayerOutput2D: Layer2D /// - width: Width of each channel. 
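// Ground truth is now staged in a plain host array and committed with one
// initialize(array:) call, instead of being written through a shared buffer's
// raw pointer and uploaded afterwards (from the LayerOutput2D hunk above; the
// .RGB / .Neuron layout logic is unchanged):
var buffer = [Float](
    repeating: 0.0, count: batchSize * nbChannels * height * width
)
// ... fill `buffer` according to the data format ...
self.groundTruth.initialize(array: &buffer)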
/// public func checkGroundTruthGPU( - _ groundTruth: MetalBuffer, + _ groundTruth: FloatBuffer, batchSize: Int, nbChannels: Int, height: Int, width: Int) throws { @@ -248,7 +252,9 @@ open class LayerOutput2D: Layer2D { if loss == nil { - loss = MetalSharedBuffer(batchSize, deviceID: deviceID) + loss = FloatBuffer( + nbElems: batchSize, deviceID: deviceID, shared: true + ) } else if batchSize <= 0 || batchSize > loss.nbElems { diff --git a/Sources/GrAIdient/Layer2D/Concat2D.swift b/Sources/GrAIdient/Layer2D/Concat2D.swift index 4a9a0e6c..17fdfd1a 100644 --- a/Sources/GrAIdient/Layer2D/Concat2D.swift +++ b/Sources/GrAIdient/Layer2D/Concat2D.swift @@ -168,9 +168,10 @@ public class Concat2D: LayerMerge2D { try checkStateCPU(batchSize: batchSize) + var buffersPrev = [[Float]]() for num in 0..<_layersPrev.count { - MetalKernel.get.download([(_layersPrev[num] as! Layer2D).outs]) + buffersPrev.append((_layersPrev[num] as! Layer2D).outs.download()) } let (nbSameElems, layersIndex, nbElems) = getMergedGraph() @@ -221,7 +222,7 @@ public class Concat2D: LayerMerge2D var curElem = 0 for num in 0..<_layersPrev.count { - let outsPrevPtr = (_layersPrev[num] as! Layer2D).outs.shared.buffer + let outsPrevPtr = buffersPrev[num] let neuronsPrev = (_layersPrev[num] as! Layer2D).neurons let nbChannels = neuronsPrev.count diff --git a/Sources/GrAIdient/Layer2D/Constant2D.swift b/Sources/GrAIdient/Layer2D/Constant2D.swift index 0b65cf86..96d80aee 100644 --- a/Sources/GrAIdient/Layer2D/Constant2D.swift +++ b/Sources/GrAIdient/Layer2D/Constant2D.swift @@ -24,7 +24,7 @@ public class Constant2D: Layer2D, LayerResize, LayerUpdate /// Buffer of gradients per sample for biases. /// Shape ~ (batch, nbChannels). /// - var _wDeltaWeights: MetalPrivateBuffer! = nil + var _wDeltaWeights: FloatBuffer! = nil /// Whether to compute weights' gradients or not. public var computeDeltaWeights: Bool = true @@ -64,12 +64,7 @@ public class Constant2D: Layer2D, LayerResize, LayerUpdate { return _weightsList } - - var weightsTmp = [Float]() - MetalKernel.get.download([_wBuffers.w_p!]) - weightsTmp += _wBuffers.w_p!.shared.array - - return weightsTmp + return _wBuffers.w.download() } set { _weightsList = newValue @@ -315,19 +310,16 @@ public class Constant2D: Layer2D, LayerResize, LayerUpdate deviceID: deviceID ) - let weightsPtr = _wBuffers.w_p!.shared.buffer if _weightsList.count != 0 { - copyFloatArrayToBuffer( - array: &_weightsList, - buffer: weightsPtr, - start: 0, - nbElems: nbChannels - ) + _wBuffers.w.initialize(array: &_weightsList) + } + else + { + _wBuffers.w.initialize() } - _weightsList = [] - MetalKernel.get.upload([_wBuffers.w_p!]) + _weightsList = [] _wDeltaWeights = nil } @@ -344,7 +336,7 @@ public class Constant2D: Layer2D, LayerResize, LayerUpdate if computeDeltaWeights && GrAI.Gradient.sample && _wDeltaWeights == nil { - _wDeltaWeights = MetalPrivateBuffer( + _wDeltaWeights = FloatBuffer(nbElems: batchSize * nbChannels, deviceID: deviceID ) } @@ -411,8 +403,7 @@ public class Constant2D: Layer2D, LayerResize, LayerUpdate neurons[depth].get(i, j)!.initGC(batchSize: batchSize, nbGC: newGC) }}} - MetalKernel.get.download([_wBuffers.w_p!]) - let weightsPtr = _wBuffers.w_p!.shared.buffer + let weightsPtr = _wBuffers.w.download() for batch in 0..! = nil + var _wDeltaWeights: FloatBuffer! = nil /// /// Buffer of gradients per sample for biases. /// Shape ~ (batch, nbChannels). /// - var _bDeltaWeights: MetalPrivateBuffer! = nil + var _bDeltaWeights: FloatBuffer! = nil /// Number of weight kernels. 
public let nbWeights: Int @@ -184,14 +184,10 @@ public class Convolution2D: BN2D, LayerWeightInit return _weightsList } - var weightsTmp = [Float]() - MetalKernel.get.download([_wBuffers.w_p!]) - weightsTmp += _wBuffers.w_p!.shared.array - + var weightsTmp = _wBuffers.w.download() if _updateBiases { - MetalKernel.get.download([_bBuffers.w_p!]) - weightsTmp += _bBuffers.w_p!.shared.array + weightsTmp += _bBuffers.w.download() } return weightsTmp } @@ -782,35 +778,24 @@ public class Convolution2D: BN2D, LayerWeightInit deviceID: deviceID ) - let weightsPtr = _wBuffers.w_p!.shared.buffer - let biasesPtr = _bBuffers.w_p!.shared.buffer - + _bBuffers.w.initialize() if _weightsList.count == 0 { - generateWeightsList(buffer: weightsPtr) + generateWeightsList(out: _wBuffers.w, deviceID: deviceID) } else { - copyFloatArrayToBuffer( - array: &_weightsList, - buffer: weightsPtr, - start: 0, - nbElems: nbWeights * weightHeight * weightWidth - ) + _wBuffers.w.initialize(array: &_weightsList) if _updateBiases { - copyFloatArrayToBuffer( + _bBuffers.w.initialize( array: &_weightsList, - buffer: biasesPtr, - start: nbWeights * weightHeight * weightWidth, - nbElems: nbChannels + start: nbWeights * weightHeight * weightWidth ) } } - _weightsList = [] - - MetalKernel.get.upload([_wBuffers.w_p!, _bBuffers.w_p!]) + _weightsList = [] _wDeltaWeights = nil _bDeltaWeights = nil } @@ -828,14 +813,14 @@ public class Convolution2D: BN2D, LayerWeightInit if computeDeltaWeights && GrAI.Gradient.sample && _wDeltaWeights == nil { - _wDeltaWeights = MetalPrivateBuffer( + _wDeltaWeights = FloatBuffer(nbElems: batchSize * nbWeights * weightWidth * weightHeight, deviceID: deviceID ) if _updateBiases { - _bDeltaWeights = MetalPrivateBuffer( + _bDeltaWeights = FloatBuffer(nbElems: batchSize * nbChannels, deviceID: deviceID ) } @@ -1071,11 +1056,8 @@ public class Convolution2D: BN2D, LayerWeightInit }} } - MetalKernel.get.download([_wBuffers.w_p!, _bBuffers.w_p!]) - MetalKernel.get.download([layerPrev.outs]) - - let weightsPtr = _wBuffers.w_p!.shared.buffer - let biasesPtr = _bBuffers.w_p!.shared.buffer + let weightsPtr = _wBuffers.w.download() + let biasesPtr = _bBuffers.w.download() let neuronsPrev = layerPrev.neurons let widthPrev = layerPrev.width @@ -1115,7 +1097,7 @@ public class Convolution2D: BN2D, LayerWeightInit }} }}} - let outsPrevPtr = layerPrev.outs.shared.buffer + let outsPrevPtr = layerPrev.outs.download() for batch in 0.. [IWeightArrays] { - var weights = [IWeightArrays]() + var weights = [WeightArrays]() weights += _wArrays if _updateBiases { @@ -1826,8 +1808,7 @@ public class Convolution2D: BN2D, LayerWeightInit } var deltaWeights = [T]() - MetalKernel.get.download([_wDeltaWeights]) - var deltaWeightsPtr = _wDeltaWeights.shared.buffer + var deltaWeightsPtr = _wDeltaWeights.download() let nbChannelsPrev = (self.layerPrev as! Layer2D).nbChannels let offsetStartGrid = @@ -1853,8 +1834,7 @@ public class Convolution2D: BN2D, LayerWeightInit if _updateBiases { - MetalKernel.get.download([_bDeltaWeights]) - deltaWeightsPtr = _bDeltaWeights.shared.buffer + deltaWeightsPtr = _bDeltaWeights.download() for depth in 0.., IWeightArrays class InputBuffers2D: InputBuffers, IWeightBuffers { /// Inputs buffer: the buffer to be update. - var w: MetalBuffer + var w: FloatBuffer { get { return _layer.outs @@ -90,7 +90,7 @@ class InputBuffers2D: InputBuffers, IWeightBuffers } /// Gradients buffer. 
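// Sketch of the new weight-initialization flow for layers holding weights and
// biases, assembled from the Convolution2D hunk above. One assumption here:
// initialize() with no arguments resets the bias buffer to its default (zero)
// contents.
_bBuffers.w.initialize()
if _weightsList.count == 0
{
    // No serialized weights: generate fresh ones straight into the buffer.
    generateWeightsList(out: _wBuffers.w, deviceID: deviceID)
}
else
{
    _wBuffers.w.initialize(array: &_weightsList)
    if _updateBiases
    {
        // Biases are serialized right after the weights.
        _bBuffers.w.initialize(
            array: &_weightsList,
            start: nbWeights * weightHeight * weightWidth
        )
    }
}
_weightsList = []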
- var g: MetalBuffer + var g: FloatBuffer { get { return _layer.delta @@ -397,7 +397,7 @@ public class Input2D: LayerInput2D, LayerResize, LayerUpdate /// - width: Width of each channel. /// public func setDataGPU( - _ data: MetalPrivateBuffer, + _ data: FloatBuffer, batchSize: Int, nbChannels: Int, height: Int, width: Int) throws { diff --git a/Sources/GrAIdient/Layer2D/InstanceNorm2D.swift b/Sources/GrAIdient/Layer2D/InstanceNorm2D.swift index 17ccbc4e..1585cdb6 100644 --- a/Sources/GrAIdient/Layer2D/InstanceNorm2D.swift +++ b/Sources/GrAIdient/Layer2D/InstanceNorm2D.swift @@ -457,8 +457,7 @@ public class InstanceNorm2D: Activation2D, LayerUpdate, LayerWithActivation }}} }} - MetalKernel.get.download([layerPrev.outs]) - let outsPrevPtr = layerPrev.outs.shared.buffer + let outsPrevPtr = layerPrev.outs.download() // Prepare GC for norm weights: Ɣ and β. for batch in 0.. [IWeightArrays] { - var weights = [IWeightArrays]() + var weights = [WeightArrays]() if let norm = self.norm { weights += norm.collectWeights() diff --git a/Sources/GrAIdient/Layer2D/MSE2D.swift b/Sources/GrAIdient/Layer2D/MSE2D.swift index 1cdf404f..75775063 100644 --- a/Sources/GrAIdient/Layer2D/MSE2D.swift +++ b/Sources/GrAIdient/Layer2D/MSE2D.swift @@ -268,7 +268,7 @@ public class MSE2D: LayerOutput2D /// - Returns: The loss value. /// public func getLossGPU( - _ groundTruth: MetalBuffer, + _ groundTruth: FloatBuffer, batchSize: Int, nbChannels: Int, height: Int, width: Int) throws -> Float { @@ -296,9 +296,8 @@ public class MSE2D: LayerOutput2D command.dispatchThreads(batchSize) command.enqueue() - MetalKernel.get.download([loss]) var loss: Float = 0.0 - let lossPtr = self.loss.buffer + let lossPtr = self.loss.download() for i in 0.., + _ groundTruth: FloatBuffer, batchSize: Int, nbChannels: Int, height: Int, width: Int) throws { diff --git a/Sources/GrAIdient/Layer2D/Multiply2D.swift b/Sources/GrAIdient/Layer2D/Multiply2D.swift index d5d879ec..677bf228 100644 --- a/Sources/GrAIdient/Layer2D/Multiply2D.swift +++ b/Sources/GrAIdient/Layer2D/Multiply2D.swift @@ -14,10 +14,15 @@ public class Multiply2D: LayerMerge2D { /// - /// List of output buffers. + /// List of output buffers for CPU usage. /// Shape ~ (batch, nbChannels, height, width). /// - var _otherOuts: [MetalBuffer] = [] + var _otherOuts1: [[Double]] = [] + /// + /// List of output buffers for GPU usage. + /// Shape ~ (batch, nbChannels, height, width). + /// + var _otherOuts2: [FloatBuffer] = [] /// /// Create a layer with a 2D shape neural structure. 
@@ -97,7 +102,7 @@ public class Multiply2D: LayerMerge2D public override func resetKernelCPU() { super.resetKernelCPU() - _otherOuts = [] + _otherOuts1 = [] } /// @@ -108,7 +113,7 @@ public class Multiply2D: LayerMerge2D public override func resetKernelGPU() { super.resetKernelGPU() - _otherOuts = [] + _otherOuts2 = [] } /// @@ -120,15 +125,14 @@ public class Multiply2D: LayerMerge2D { try super.checkStateCPU(batchSize: batchSize) - if _otherOuts.count == 0 + if _otherOuts1.count == 0 { for _ in 0..<_layersPrev.count { - let buffer = MetalSharedBuffer( - batchSize * nbChannels * height * width, - deviceID: deviceID - ) - _otherOuts.append(buffer) + _otherOuts1.append([Double]( + repeating: 0.0, + count: batchSize * nbChannels * height * width + )) } } } @@ -142,15 +146,15 @@ public class Multiply2D: LayerMerge2D { try super.checkStateForwardGPU(batchSize: batchSize) - if _otherOuts.count == 0 + if _otherOuts2.count == 0 { for _ in 0..<_layersPrev.count { - let buffer = MetalPrivateBuffer( + let buffer = FloatBuffer(nbElems: batchSize * nbChannels * height * width, deviceID: deviceID ) - _otherOuts.append(buffer) + _otherOuts2.append(buffer) } } } @@ -248,9 +252,10 @@ public class Multiply2D: LayerMerge2D { try checkStateCPU(batchSize: batchSize) + var buffersPrev = [[Float]]() for num in 0..<_layersPrev.count { - MetalKernel.get.download([(_layersPrev[num] as! Layer2D).outs]) + buffersPrev.append((_layersPrev[num] as! Layer2D).outs.download()) } let (nbSameElems, layersIndex, nbElems) = getMergedGraph() @@ -305,8 +310,7 @@ public class Multiply2D: LayerMerge2D var mult = 1.0 for num in 0..<_layersPrev.count { - let outsPrevPtr = - (_layersPrev[num] as! Layer2D).outs.shared.buffer + let outsPrevPtr = buffersPrev[num] let neuronsPrev = (_layersPrev[num] as! Layer2D).neurons @@ -363,8 +367,6 @@ public class Multiply2D: LayerMerge2D for num1 in 0..<_layersPrev.count { - let buffer = (_otherOuts[num1] as! MetalSharedBuffer).buffer - mult = 1.0 for num2 in 0..<_layersPrev.count { if num2 != num1 @@ -373,8 +375,7 @@ public class Multiply2D: LayerMerge2D (_layersPrev[num2] as! Layer2D).neurons mult *= neuronsPrev[depth].get(i, j)!.v[elem].out }} - - buffer[offset] = Float(mult) + _otherOuts1[num1][offset] = mult } }} }} @@ -441,7 +442,7 @@ public class Multiply2D: LayerMerge2D (_layersPrev[num2] as! Layer2D).outs.metal, atIndex: 0 ) command.setBytes(pNbElems, atIndex: 1) - command.setBuffer(_otherOuts[num1].metal, atIndex: 2) + command.setBuffer(_otherOuts2[num1].metal, atIndex: 2) command.dispatchThreads(nbElems) command.enqueue() @@ -465,7 +466,7 @@ public class Multiply2D: LayerMerge2D } let neuronsPrev = (_layersPrev[num] as! Layer2D).neurons - let buffer = (_otherOuts[num] as! MetalSharedBuffer).buffer + let buffer = _otherOuts1[num] for elem in 0..! = nil + private var _squaredNorm: FloatBuffer! = nil /// /// Temporary delta buffer used in the GPU execution context. /// Shape ~ (batch, nbThreadgroups). /// - private var _deltaTmp: MetalPrivateBuffer! = nil + private var _deltaTmp: FloatBuffer! = nil /// Number of thread groups in the GPU execution context. 
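// Multiply2D shows why some layers now split their temporaries: the CPU path
// keeps [[Double]] host arrays (_otherOuts1) while the GPU path keeps
// [FloatBuffer] (_otherOuts2), where a single [MetalBuffer<Float>] previously
// served both contexts. The CPU path then assigns plain array elements
// instead of writing through a shared buffer's pointer:
_otherOuts1[num1][offset] = mult                           // CPU context
command.setBuffer(_otherOuts2[num1].metal, atIndex: 2)     // GPU context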
var nbThreadgroups: Int @@ -404,7 +404,7 @@ public class Normalize122D: Layer2D { if _squaredNorm == nil { - _squaredNorm = MetalPrivateBuffer( + _squaredNorm = FloatBuffer(nbElems: batchSize * nbThreadgroups, deviceID: deviceID ) } @@ -422,7 +422,7 @@ public class Normalize122D: Layer2D { if _deltaTmp == nil { - _deltaTmp = MetalPrivateBuffer( + _deltaTmp = FloatBuffer(nbElems: batchSize * nbThreadgroups, deviceID: deviceID ) } diff --git a/Sources/GrAIdient/Layer2D/SimilarityBatchError2D.swift b/Sources/GrAIdient/Layer2D/SimilarityBatchError2D.swift index f341e429..a93b2c9e 100644 --- a/Sources/GrAIdient/Layer2D/SimilarityBatchError2D.swift +++ b/Sources/GrAIdient/Layer2D/SimilarityBatchError2D.swift @@ -126,7 +126,7 @@ public class SimilarityBatchError2D: LayerOutput2D /// - width: Width of each channel. /// public override func checkGroundTruthGPU( - _ groundTruth: MetalBuffer, + _ groundTruth: FloatBuffer, batchSize: Int, nbChannels: Int, height: Int, width: Int) throws { @@ -144,9 +144,10 @@ public class SimilarityBatchError2D: LayerOutput2D { if loss == nil { - loss = MetalSharedBuffer( - batchSize * batchSize, - deviceID: deviceID + loss = FloatBuffer( + nbElems: batchSize * batchSize, + deviceID: deviceID, + shared: true ) } else if batchSize <= 0 || batchSize * batchSize > loss.nbElems @@ -259,9 +260,8 @@ public class SimilarityBatchError2D: LayerOutput2D command.dispatchThreads(width: batchSize, height: batchSize) command.enqueue() - MetalKernel.get.download([loss]) var loss: Float = 0.0 - let lossPtr = self.loss.buffer + let lossPtr = self.loss.download() for elem1 in 0..! = nil + public internal(set) var loss: FloatBuffer! = nil /// Batch size sum in the previous layers. public var mergedBatchSize: Int @@ -151,9 +151,10 @@ public class SimilarityError2D: LayerMerge2D { if loss == nil { - loss = MetalSharedBuffer( - batchSize * batchSize, - deviceID: deviceID + loss = FloatBuffer( + nbElems: batchSize * batchSize, + deviceID: deviceID, + shared: true ) } else if batchSize <= 0 || batchSize * batchSize > loss.nbElems @@ -255,9 +256,10 @@ public class SimilarityError2D: LayerMerge2D { try checkStateCPU(batchSize: mergedBatchSize) + var buffersPrev = [[Float]]() for num in 0..<_layersPrev.count { - MetalKernel.get.download([(_layersPrev[num] as! Layer2D).outs]) + buffersPrev.append((_layersPrev[num] as! Layer2D).outs.download()) } let (nbSameElems, layersIndex, nbElems) = getMergedGraph() @@ -300,7 +302,7 @@ public class SimilarityError2D: LayerMerge2D for num in 0..<_layersPrev.count { let batchSize = _layersPrev[num].batchSize - let outsPrevPtr = (_layersPrev[num] as! Layer2D).outs.shared.buffer + let outsPrevPtr = buffersPrev[num] let neuronsPrev = (_layersPrev[num] as! Layer2D).neurons for batch in 0..! = nil + var _wDeltaWeights: FloatBuffer! = nil /// Whether to compute weights' gradients or not. 
public var computeDeltaWeights: Bool = true @@ -103,12 +103,7 @@ public class VQ2D: LayerOutput2D, LayerWeightInit { return _weightsList } - - var weightsTmp = [Float]() - MetalKernel.get.download([_wBuffers.w_p!]) - weightsTmp += _wBuffers.w_p!.shared.array - - return weightsTmp + return _wBuffers.w.download() } set { _weightsList = newValue @@ -314,23 +309,16 @@ public class VQ2D: LayerOutput2D, LayerWeightInit deviceID: deviceID ) - let weightsPtr = _wBuffers.w_p!.shared.buffer if _weightsList.count == 0 { - generateWeightsList(buffer: weightsPtr) + generateWeightsList(out: _wBuffers.w, deviceID: deviceID) } else { - copyFloatArrayToBuffer( - array: &_weightsList, - buffer: weightsPtr, - start: 0, - nbElems: K * nbChannels - ) + _wBuffers.w.initialize(array: &_weightsList) } - _weightsList = [] - MetalKernel.get.upload([_wBuffers.w_p!]) + _weightsList = [] _wDeltaWeights = nil } @@ -365,7 +353,7 @@ public class VQ2D: LayerOutput2D, LayerWeightInit if computeDeltaWeights && GrAI.Gradient.sample && _wDeltaWeights == nil { - _wDeltaWeights = MetalPrivateBuffer( + _wDeltaWeights = FloatBuffer(nbElems: batchSize * K * nbChannels, deviceID: deviceID ) } @@ -434,7 +422,7 @@ public class VQ2D: LayerOutput2D, LayerWeightInit /// - width: Width of each channel. /// public override func checkGroundTruthGPU( - _ groundTruth: MetalBuffer, + _ groundTruth: FloatBuffer, batchSize: Int, nbChannels: Int, height: Int, width: Int) throws { @@ -859,9 +847,8 @@ public class VQ2D: LayerOutput2D, LayerWeightInit command.dispatchThreads(batchSize) command.enqueue() - MetalKernel.get.download([loss]) var loss: Float = 0.0 - let lossPtr = self.loss.buffer + let lossPtr = self.loss.download() for i in 0..! = nil + private var _camMax: FloatBuffer! = nil /// Number of thread groups in the GPU execution context. var nbThreadgroups: Int @@ -1169,7 +1156,7 @@ public class VQGrad2D: VQ2D if _camMax == nil { - _camMax = MetalPrivateBuffer( + _camMax = FloatBuffer(nbElems: batchSize * nbThreadgroups, deviceID: deviceID ) diff --git a/Sources/GrAIdient/LayerSeq/ActivationSeq.swift b/Sources/GrAIdient/LayerSeq/ActivationSeq.swift index 484431cc..39521636 100644 --- a/Sources/GrAIdient/LayerSeq/ActivationSeq.swift +++ b/Sources/GrAIdient/LayerSeq/ActivationSeq.swift @@ -16,7 +16,7 @@ public class ActivationSeq: LayerSeq /// used in the GPU execution context. /// Shape ~ (batch, nbNeurons). /// - var _tmp: MetalPrivateBuffer! = nil + var _tmp: FloatBuffer! = nil /// Get coefficient (depending on activation function) to apply during the weights initialization. public var coeffInitWeights: Float diff --git a/Sources/GrAIdient/LayerSeq/Base/LayerSeq.swift b/Sources/GrAIdient/LayerSeq/Base/LayerSeq.swift index 960ae791..857057f1 100644 --- a/Sources/GrAIdient/LayerSeq/Base/LayerSeq.swift +++ b/Sources/GrAIdient/LayerSeq/Base/LayerSeq.swift @@ -15,12 +15,12 @@ open class LayerSeq: Layer /// Output buffer (result of the forward pass) used in the GPU execution context. /// Shape ~ (batch, seq, nbNeurons). /// - public var outs: MetalPrivateBuffer! = nil + public var outs: FloatBuffer! = nil /// /// Gradient buffer (result of the backward pass) used in the GPU execution context. /// Shape ~ (batch, seq, nbNeurons). /// - public var delta: MetalPrivateBuffer! = nil + public var delta: FloatBuffer! = nil /// Length of the sequence. 
public let sequence: Int @@ -148,8 +148,9 @@ open class LayerSeq: Layer { if outs == nil { - outs = MetalPrivateBuffer( - batchSize * sequence * nbNeurons, deviceID: deviceID + outs = FloatBuffer( + nbElems: batchSize * sequence * nbNeurons, + deviceID: deviceID ) } else if batchSize <= 0 || batchSize > outs.nbElems / nbNeurons @@ -169,8 +170,9 @@ open class LayerSeq: Layer { if delta == nil { - delta = MetalPrivateBuffer( - batchSize * sequence * nbNeurons, deviceID: deviceID + delta = FloatBuffer( + nbElems: batchSize * sequence * nbNeurons, + deviceID: deviceID ) } else if batchSize <= 0 || diff --git a/Sources/GrAIdient/LayerSeq/ConcatSeq.swift b/Sources/GrAIdient/LayerSeq/ConcatSeq.swift index b205a439..059ad9ef 100644 --- a/Sources/GrAIdient/LayerSeq/ConcatSeq.swift +++ b/Sources/GrAIdient/LayerSeq/ConcatSeq.swift @@ -164,9 +164,10 @@ public class Concat1Seq: LayerMergeSeq { try checkStateCPU(batchSize: batchSize) + var buffersPrev = [[Float]]() for num in 0..<_layersPrev.count { - MetalKernel.get.download([(_layersPrev[num] as! LayerSeq).outs]) + buffersPrev.append((_layersPrev[num] as! LayerSeq).outs.download()) } let (nbSameElems, layersIndex, nbElems) = getMergedGraph() @@ -213,7 +214,7 @@ public class Concat1Seq: LayerMergeSeq for num in 0..<_layersPrev.count { let layerPrev = _layersPrev[num] as! LayerSeq - let outsPrevPtr = layerPrev.outs.shared.buffer + let outsPrevPtr = buffersPrev[num] let neuronsPrev = layerPrev.neurons! let sequence = layerPrev.sequence @@ -595,9 +596,10 @@ public class Concat2Seq: LayerMergeSeq { try checkStateCPU(batchSize: batchSize) + var buffersPrev = [[Float]]() for num in 0..<_layersPrev.count { - MetalKernel.get.download([(_layersPrev[num] as! LayerSeq).outs]) + buffersPrev.append((_layersPrev[num] as! LayerSeq).outs.download()) } let (nbSameElems, layersIndex, nbElems) = getMergedGraph() @@ -644,7 +646,7 @@ public class Concat2Seq: LayerMergeSeq for num in 0..<_layersPrev.count { let layerPrev = _layersPrev[num] as! LayerSeq - let outsPrevPtr = layerPrev.outs.shared.buffer + let outsPrevPtr = buffersPrev[num] let neuronsPrev = layerPrev.neurons! let nbNeurons = layerPrev.nbNeurons diff --git a/Sources/GrAIdient/LayerSeq/ConstantSeq.swift b/Sources/GrAIdient/LayerSeq/ConstantSeq.swift index 3156765e..f8796ecb 100644 --- a/Sources/GrAIdient/LayerSeq/ConstantSeq.swift +++ b/Sources/GrAIdient/LayerSeq/ConstantSeq.swift @@ -63,12 +63,7 @@ public class Constant12Seq: LayerSeq, LayerUpdate { return _weightsList } - - var weightsTmp = [Float]() - MetalKernel.get.download([_wBuffers.w_p!]) - weightsTmp += _wBuffers.w_p!.shared.array - - return weightsTmp + return _wBuffers.w.download() } set { _weightsList = newValue @@ -261,19 +256,15 @@ public class Constant12Seq: LayerSeq, LayerUpdate deviceID: deviceID ) - let weightsPtr = _wBuffers.w_p!.shared.buffer if _weightsList.count != 0 { - copyFloatArrayToBuffer( - array: &_weightsList, - buffer: weightsPtr, - start: 0, - nbElems: sequence * nbNeurons - ) + _wBuffers.w.initialize(array: &_weightsList) + } + else + { + _wBuffers.w.initialize() } _weightsList = [] - - MetalKernel.get.upload([_wBuffers.w_p!]) } /// @@ -339,8 +330,7 @@ public class Constant12Seq: LayerSeq, LayerUpdate ) }} - MetalKernel.get.download([_wBuffers.w_p!]) - let weightsPtr = _wBuffers.w_p!.shared.buffer + let weightsPtr = _wBuffers.w.download() for batch in 0..! = nil + var _wDeltaWeights: FloatBuffer! = nil /// Whether to compute weights' gradients or not. 
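// Merge layers share one CPU-path pattern (as in the Concat and Sum hunks):
// previous layers' GPU outputs are snapshotted once into [Float] arrays up
// front, replacing per-layer downloads into shared buffers.
var buffersPrev = [[Float]]()
for num in 0..<_layersPrev.count
{
    buffersPrev.append((_layersPrev[num] as! LayerSeq).outs.download())
}
// ... later, when accumulating the merged value, per previous layer:
let outsPrevPtr = buffersPrev[num]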
public var computeDeltaWeights: Bool = true @@ -558,12 +548,7 @@ public class Constant2Seq: LayerSeq, LayerUpdate { return _weightsList } - - var weightsTmp = [Float]() - MetalKernel.get.download([_wBuffers.w_p!]) - weightsTmp += _wBuffers.w_p!.shared.array - - return weightsTmp + return _wBuffers.w.download() } set { _weightsList = newValue @@ -755,19 +740,16 @@ public class Constant2Seq: LayerSeq, LayerUpdate deviceID: deviceID ) - let weightsPtr = _wBuffers.w_p!.shared.buffer if _weightsList.count != 0 { - copyFloatArrayToBuffer( - array: &_weightsList, - buffer: weightsPtr, - start: 0, - nbElems: nbNeurons - ) + _wBuffers.w.initialize(array: &_weightsList) + } + else + { + _wBuffers.w.initialize() } - _weightsList = [] - MetalKernel.get.upload([_wBuffers.w_p!]) + _weightsList = [] _wDeltaWeights = nil } @@ -784,7 +766,7 @@ public class Constant2Seq: LayerSeq, LayerUpdate if computeDeltaWeights && GrAI.Gradient.sample && _wDeltaWeights == nil { - _wDeltaWeights = MetalPrivateBuffer( + _wDeltaWeights = FloatBuffer(nbElems: batchSize * sequence * nbNeurons, deviceID: deviceID ) } @@ -852,8 +834,7 @@ public class Constant2Seq: LayerSeq, LayerUpdate ) }} - MetalKernel.get.download([_wBuffers.w_p!]) - let weightsPtr = _wBuffers.w_p!.shared.buffer + let weightsPtr = _wBuffers.w.download() for batch in 0..! = nil + var _wDeltaWeights: FloatBuffer! = nil /// /// Buffer of gradients per sample for biases. /// Shape ~ (batch, nbNeurons). /// - var _bDeltaWeights: MetalPrivateBuffer! = nil + var _bDeltaWeights: FloatBuffer! = nil /// Whether to compute weights' gradients or not. public var computeDeltaWeights: Bool = true @@ -106,14 +106,10 @@ public class FullyConnectedPatch: ActivationSeq, return _weightsList } - var weightsTmp = [Float]() - MetalKernel.get.download([_wBuffers.w_p!]) - weightsTmp += _wBuffers.w_p!.shared.array - + var weightsTmp = _wBuffers.w.download() if _updateBiases { - MetalKernel.get.download([_bBuffers.w_p!]) - weightsTmp += _bBuffers.w_p!.shared.array + weightsTmp += _bBuffers.w.download() } return weightsTmp } @@ -467,34 +463,24 @@ public class FullyConnectedPatch: ActivationSeq, deviceID: deviceID ) - let weightsPtr = _wBuffers.w_p!.shared.buffer - let biasesPtr = _bBuffers.w_p!.shared.buffer - + _bBuffers.w.initialize() if _weightsList.count == 0 { - generateWeightsList(buffer: weightsPtr) + generateWeightsList(out: _wBuffers.w, deviceID: deviceID) } else { - copyFloatArrayToBuffer( - array: &_weightsList, - buffer: weightsPtr, - start: 0, - nbElems: weightHeight * weightWidth - ) + _wBuffers.w.initialize(array: &_weightsList) if _updateBiases { - copyFloatArrayToBuffer( + _bBuffers.w.initialize( array: &_weightsList, - buffer: biasesPtr, - start: weightHeight * weightWidth, - nbElems: weightHeight + start: weightHeight * weightWidth ) } } - _weightsList = [] - MetalKernel.get.upload([_wBuffers.w_p!, _bBuffers.w_p!]) + _weightsList = [] _wDeltaWeights = nil _bDeltaWeights = nil @@ -513,14 +499,14 @@ public class FullyConnectedPatch: ActivationSeq, if computeDeltaWeights && GrAI.Gradient.sample && _wDeltaWeights == nil { - _wDeltaWeights = MetalPrivateBuffer( + _wDeltaWeights = FloatBuffer(nbElems: batchSize * sequence * nbNeurons * weightWidth, deviceID: deviceID ) if _updateBiases { - _bDeltaWeights = MetalPrivateBuffer( + _bDeltaWeights = FloatBuffer(nbElems: batchSize * sequence * nbNeurons, deviceID: deviceID ) } @@ -715,11 +701,8 @@ public class FullyConnectedPatch: ActivationSeq, ) }} - MetalKernel.get.download([_wBuffers.w_p!, _bBuffers.w_p!]) - 
MetalKernel.get.download([layerPrev.outs]) - - let weightsPtr = _wBuffers.w_p!.shared.buffer - let biasesPtr = _bBuffers.w_p!.shared.buffer + let weightsPtr = _wBuffers.w.download() + let biasesPtr = _bBuffers.w.download() let nbSeqPerCol = layerPrev.width / _patch let neuronsPrev = layerPrev.neurons @@ -757,7 +740,7 @@ public class FullyConnectedPatch: ActivationSeq, } }}} - let outsPrevPtr = layerPrev.outs.shared.buffer + let outsPrevPtr = layerPrev.outs.download() for batch in 0.. [IWeightArrays] { - var weights = [IWeightArrays]() + var weights = [WeightArrays]() weights.append(_wArrays) if _updateBiases { @@ -1325,8 +1308,7 @@ public class FullyConnectedPatch: ActivationSeq, } var deltaWeights = [T]() - MetalKernel.get.download([_wDeltaWeights]) - var deltaWeightsPtr = _wDeltaWeights.shared.buffer + var deltaWeightsPtr = _wDeltaWeights.download() let offsetStart = elem * nbNeurons * weightWidth for depth in 0..! = nil + var _wDeltaWeights: FloatBuffer! = nil /// /// Buffer of gradients per sample for biases. /// Shape ~ (batch, nbNeurons). /// - var _bDeltaWeights: MetalPrivateBuffer! = nil + var _bDeltaWeights: FloatBuffer! = nil /// Whether to compute weights' gradients or not. public var computeDeltaWeights: Bool = true @@ -98,14 +98,10 @@ public class FullyConnectedSeq: ActivationSeq, return _weightsList } - var weightsTmp = [Float]() - MetalKernel.get.download([_wBuffers.w_p!]) - weightsTmp += _wBuffers.w_p!.shared.array - + var weightsTmp = _wBuffers.w.download() if _updateBiases { - MetalKernel.get.download([_bBuffers.w_p!]) - weightsTmp += _bBuffers.w_p!.shared.array + weightsTmp += _bBuffers.w.download() } return weightsTmp } @@ -442,35 +438,24 @@ public class FullyConnectedSeq: ActivationSeq, deviceID: deviceID ) - let weightsPtr = _wBuffers.w_p!.shared.buffer - let biasesPtr = _bBuffers.w_p!.shared.buffer - + _bBuffers.w.initialize() if _weightsList.count == 0 { - generateWeightsList(buffer: weightsPtr) + generateWeightsList(out: _wBuffers.w, deviceID: deviceID) } else { - copyFloatArrayToBuffer( - array: &_weightsList, - buffer: weightsPtr, - start: 0, - nbElems: weightHeight * weightWidth - ) + _wBuffers.w.initialize(array: &_weightsList) if _updateBiases { - copyFloatArrayToBuffer( + _bBuffers.w.initialize( array: &_weightsList, - buffer: biasesPtr, - start: weightHeight * weightWidth, - nbElems: weightHeight + start: weightHeight * weightWidth ) } } - _weightsList = [] - - MetalKernel.get.upload([_wBuffers.w_p!, _bBuffers.w_p!]) + _weightsList = [] _wDeltaWeights = nil _bDeltaWeights = nil } @@ -488,14 +473,14 @@ public class FullyConnectedSeq: ActivationSeq, if computeDeltaWeights && GrAI.Gradient.sample && _wDeltaWeights == nil { - _wDeltaWeights = MetalPrivateBuffer( + _wDeltaWeights = FloatBuffer(nbElems: batchSize * sequence * nbNeurons * weightWidth, deviceID: deviceID ) if _updateBiases { - _bDeltaWeights = MetalPrivateBuffer( + _bDeltaWeights = FloatBuffer(nbElems: batchSize * sequence * nbNeurons, deviceID: deviceID ) } @@ -656,11 +641,8 @@ public class FullyConnectedSeq: ActivationSeq, ) }} - MetalKernel.get.download([_wBuffers.w_p!, _bBuffers.w_p!]) - MetalKernel.get.download([layerPrev.outs]) - - let weightsPtr = _wBuffers.w_p!.shared.buffer - let biasesPtr = _bBuffers.w_p!.shared.buffer + let weightsPtr = _wBuffers.w.download() + let biasesPtr = _bBuffers.w.download() let neuronsPrev = layerPrev.neurons! 
let nbNeuronsPrev = layerPrev.nbNeurons @@ -685,7 +667,7 @@ public class FullyConnectedSeq: ActivationSeq, } }}} - let outsPrevPtr = layerPrev.outs.shared.buffer + let outsPrevPtr = layerPrev.outs.download() for batch in 0.. [IWeightArrays] { - var weights = [IWeightArrays]() + var weights = [WeightArrays]() weights.append(_wArrays) if _updateBiases { @@ -1210,8 +1192,7 @@ public class FullyConnectedSeq: ActivationSeq, } var deltaWeights = [T]() - MetalKernel.get.download([_wDeltaWeights]) - var deltaWeightsPtr = _wDeltaWeights.shared.buffer + var deltaWeightsPtr = _wDeltaWeights.download() let offsetStart = elem * nbNeurons * weightWidth for depth in 0..! = nil + public internal(set) var loss: FloatBuffer! = nil /// /// Indices of maximal elements. /// Shape ~ (batch, seq). @@ -46,7 +46,7 @@ public class VQSeq: LayerSeq, LayerWeightInit /// Buffer of gradients per sample for biases. /// Shape ~ (batch, K, nbNeurons). /// - var _wDeltaWeights: MetalPrivateBuffer! = nil + var _wDeltaWeights: FloatBuffer! = nil /// Whether to compute weights' gradients or not. public var computeDeltaWeights: Bool = true @@ -87,12 +87,7 @@ public class VQSeq: LayerSeq, LayerWeightInit { return _weightsList } - - var weightsTmp = [Float]() - MetalKernel.get.download([_wBuffers.w_p!]) - weightsTmp += _wBuffers.w_p!.shared.array - - return weightsTmp + return _wBuffers.w.download() } set { _weightsList = newValue @@ -304,23 +299,16 @@ public class VQSeq: LayerSeq, LayerWeightInit deviceID: deviceID ) - let weightsPtr = _wBuffers.w_p!.shared.buffer if _weightsList.count == 0 { - generateWeightsList(buffer: weightsPtr) + generateWeightsList(out: _wBuffers.w, deviceID: deviceID) } else { - copyFloatArrayToBuffer( - array: &_weightsList, - buffer: weightsPtr, - start: 0, - nbElems: K * nbNeurons - ) + _wBuffers.w.initialize(array: &_weightsList) } - _weightsList = [] - MetalKernel.get.upload([_wBuffers.w_p!]) + _weightsList = [] _wDeltaWeights = nil } @@ -355,7 +343,7 @@ public class VQSeq: LayerSeq, LayerWeightInit if computeDeltaWeights && GrAI.Gradient.sample && _wDeltaWeights == nil { - _wDeltaWeights = MetalPrivateBuffer( + _wDeltaWeights = FloatBuffer(nbElems: batchSize * K * nbNeurons, deviceID: deviceID ) } @@ -380,7 +368,9 @@ public class VQSeq: LayerSeq, LayerWeightInit { if loss == nil { - loss = MetalSharedBuffer(batchSize, deviceID: deviceID) + loss = FloatBuffer( + nbElems: batchSize, deviceID: deviceID, shared: true + ) } else if batchSize <= 0 || batchSize > loss.nbElems { @@ -778,9 +768,8 @@ public class VQSeq: LayerSeq, LayerWeightInit command.dispatchThreads(batchSize) command.enqueue() - MetalKernel.get.download([loss]) var loss: Float = 0.0 - let lossPtr = self.loss.buffer + let lossPtr = self.loss.download() for i in 0..! = nil + private var _camMax: FloatBuffer! = nil /// Number of thread groups in the GPU execution context. var nbThreadgroups: Int @@ -1087,7 +1076,7 @@ public class VQGradSeq: VQSeq if _camMax == nil { - _camMax = MetalPrivateBuffer( + _camMax = FloatBuffer(nbElems: batchSize * nbThreadgroups, deviceID: deviceID ) diff --git a/Sources/GrAIdient/LayerSeq/ValueSeq.swift b/Sources/GrAIdient/LayerSeq/ValueSeq.swift index 09d6b70a..2507e484 100644 --- a/Sources/GrAIdient/LayerSeq/ValueSeq.swift +++ b/Sources/GrAIdient/LayerSeq/ValueSeq.swift @@ -223,11 +223,6 @@ public class ValueSeq: LayerMergeSeq { try checkStateCPU(batchSize: batchSize) - for num in 0..<_layersPrev.count - { - MetalKernel.get.download([(_layersPrev[num] as! 
LayerSeq).outs]) - } - let (nbSameElems, layersIndex, nbElems) = getMergedGraph() var nbGC = nbSameElems @@ -268,10 +263,8 @@ public class ValueSeq: LayerMergeSeq neurons.get(seqQ, depth)!.gc[batch][elem].out = sum }}}}} - let valueBuffer = - (_layersPrev[0] as! LayerSeq).outs.shared.buffer - let scoreBuffer = - (_layersPrev[1] as! LayerSeq).outs.shared.buffer + let valueBuffer = (_layersPrev[0] as! LayerSeq).outs.download() + let scoreBuffer = (_layersPrev[1] as! LayerSeq).outs.download() for batch in 0.. using namespace metal; -kernel void forwardReLU( +kernel void forwardReLUFloat( constant uint * pNbElems, device float * tmps, device float * outs, @@ -39,7 +39,7 @@ kernel void forwardReLU( } } -kernel void backwardReLU( +kernel void backwardReLUFloat( const device float * tmps, constant uint * pNbElems, device float * delta, @@ -65,7 +65,7 @@ kernel void backwardReLU( } } -kernel void forwardLeakyReLU( +kernel void forwardLeakyReLUFloat( constant uint * pNbElems, device float * tmps, device float * outs, @@ -97,7 +97,7 @@ kernel void forwardLeakyReLU( } } -kernel void backwardLeakyReLU( +kernel void backwardLeakyReLUFloat( const device float * tmps, constant uint * pNbElems, device float * delta, @@ -124,7 +124,7 @@ kernel void backwardLeakyReLU( } } -kernel void forwardSoftReLU( +kernel void forwardSoftReLUFloat( constant uint * pNbElems, device float * tmps, device float * outs, @@ -149,7 +149,7 @@ kernel void forwardSoftReLU( outs[id] = Ɛ * tmps[id] + (1 - Ɛ) * log(1 + exp(tmps[id])); } -kernel void backwardSoftReLU( +kernel void backwardSoftReLUFloat( const device float * tmps, constant uint * pNbElems, device float * delta, @@ -174,7 +174,7 @@ kernel void backwardSoftReLU( delta[id] = delta[id] * derivative; } -kernel void forwardSigmoid( +kernel void forwardSigmoidFloat( constant uint * pNbElems, device float * tmps, device float * outs, @@ -205,7 +205,7 @@ kernel void forwardSigmoid( } } -kernel void backwardSigmoid( +kernel void backwardSigmoidFloat( const device float * tmps, constant uint * pNbElems, device float * delta, @@ -239,7 +239,7 @@ kernel void backwardSigmoid( delta[id] = delta[id] * derivative; } -kernel void forwardGELUApprox( +kernel void forwardGELUApproxFloat( constant uint * pNbElems, device float * tmps, device float * outs, @@ -275,7 +275,7 @@ kernel void forwardGELUApprox( outs[id] = 0.5 * x * (1 + tmp2); } -kernel void backwardGELUApprox( +kernel void backwardGELUApproxFloat( const device float * tmps, constant uint * pNbElems, device float * delta, @@ -350,7 +350,7 @@ float erf(float a) return r; } -kernel void forwardGELU( +kernel void forwardGELUFloat( constant uint * pNbElems, device float * tmps, device float * outs, @@ -375,7 +375,7 @@ kernel void forwardGELU( outs[id] = 0.5 * x * (1 + erf(x / sqrt(2.0))); } -kernel void backwardGELU( +kernel void backwardGELUFloat( const device float * tmps, constant uint * pNbElems, device float * delta, diff --git a/Sources/GrAIdient/Metal/Kernel/ActivationHalf.metal b/Sources/GrAIdient/Metal/Kernel/ActivationHalf.metal new file mode 100644 index 00000000..a3e089f5 --- /dev/null +++ b/Sources/GrAIdient/Metal/Kernel/ActivationHalf.metal @@ -0,0 +1,403 @@ +// +// Activation.metal +// GrAIdient +// +// Created by Jean-François Reboud on 14/10/2022. 
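// The Half kernels below mirror the Float ones one for one: buffers are
// declared `device half *` for 16-bit storage, while intermediates (Ɛ,
// derivatives, exp/log results) are largely kept in `float`, relying on
// implicit half<->float conversion on load and store. Every kernel opens with
// the same guard:
//
//     uint nbElems;
//     if (pNbElems) { nbElems = pNbElems[0]; }
//     else          { return; }
//     if (id >= nbElems) { return; }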
+// + +#include +using namespace metal; + +kernel void forwardReLUHalf( + constant uint * pNbElems, + device half * tmps, + device half * outs, + uint id [[ thread_position_in_grid ]]) +{ + uint nbElems; + + if (pNbElems) + { + nbElems = pNbElems[0]; + } + else + return ; + + if (id >= nbElems) + { + return ; + } + + tmps[id] = outs[id]; + if (tmps[id] < 0) + { + outs[id] = 0.0; + } + else + { + outs[id] = tmps[id]; + } +} + +kernel void backwardReLUHalf( + const device half * tmps, + constant uint * pNbElems, + device half * delta, + uint id [[ thread_position_in_grid ]]) +{ + uint nbElems; + + if (pNbElems) + { + nbElems = pNbElems[0]; + } + else + return ; + + if (id >= nbElems) + { + return ; + } + + if (tmps[id] < 0) + { + delta[id] = 0.0; + } +} + +kernel void forwardLeakyReLUHalf( + constant uint * pNbElems, + device half * tmps, + device half * outs, + uint id [[ thread_position_in_grid ]]) +{ + uint nbElems; + float Ɛ = 0.01; + + if (pNbElems) + { + nbElems = pNbElems[0]; + } + else + return ; + + if (id >= nbElems) + { + return ; + } + + tmps[id] = outs[id]; + if (tmps[id] < 0) + { + outs[id] = Ɛ * tmps[id]; + } + else + { + outs[id] = tmps[id]; + } +} + +kernel void backwardLeakyReLUHalf( + const device half * tmps, + constant uint * pNbElems, + device half * delta, + uint id [[ thread_position_in_grid ]]) +{ + uint nbElems; + float Ɛ = 0.01; + + if (pNbElems) + { + nbElems = pNbElems[0]; + } + else + return ; + + if (id >= nbElems) + { + return ; + } + + if (tmps[id] < 0) + { + delta[id] = Ɛ * delta[id]; + } +} + +kernel void forwardSoftReLUHalf( + constant uint * pNbElems, + device half * tmps, + device half * outs, + uint id [[ thread_position_in_grid ]]) +{ + uint nbElems; + float Ɛ = 0.01; + + if (pNbElems) + { + nbElems = pNbElems[0]; + } + else + return ; + + if (id >= nbElems) + { + return ; + } + + tmps[id] = outs[id]; + outs[id] = Ɛ * tmps[id] + (1 - Ɛ) * log(1 + exp(tmps[id])); +} + +kernel void backwardSoftReLUHalf( + const device half * tmps, + constant uint * pNbElems, + device half * delta, + uint id [[ thread_position_in_grid ]]) +{ + uint nbElems; + float Ɛ = 0.01; + + if (pNbElems) + { + nbElems = pNbElems[0]; + } + else + return ; + + if (id >= nbElems) + { + return ; + } + + float derivative = Ɛ + (1 - Ɛ) / (1 + exp(-tmps[id])); + delta[id] = delta[id] * derivative; +} + +kernel void forwardSigmoidHalf( + constant uint * pNbElems, + device half * tmps, + device half * outs, + uint id [[ thread_position_in_grid ]]) +{ + uint nbElems; + + if (pNbElems) + { + nbElems = pNbElems[0]; + } + else + return ; + + if (id >= nbElems) + { + return ; + } + + tmps[id] = outs[id]; + if (tmps[id] >= 0) + { + outs[id] = 1.0 / (1.0 + exp(-tmps[id])); + } + else + { + outs[id] = exp(tmps[id]) / (1.0 + exp(tmps[id])); + } +} + +kernel void backwardSigmoidHalf( + const device half * tmps, + constant uint * pNbElems, + device half * delta, + uint id [[ thread_position_in_grid ]]) +{ + uint nbElems; + + if (pNbElems) + { + nbElems = pNbElems[0]; + } + else + return ; + + if (id >= nbElems) + { + return ; + } + + float tmp; + if (tmps[id] >= 0) + { + tmp = 1.0 / (1.0 + exp(-tmps[id])); + } + else + { + tmp = exp(tmps[id]) / (1.0 + exp(tmps[id])); + } + + float derivative = tmp * (1 - tmp); + delta[id] = delta[id] * derivative; +} + +kernel void forwardGELUApproxHalf( + constant uint * pNbElems, + device half * tmps, + device half * outs, + uint id [[ thread_position_in_grid ]]) +{ + uint nbElems; + + if (pNbElems) + { + nbElems = pNbElems[0]; + } + else + return ; + + if (id >= 
nbElems) + { + return ; + } + + float cst = sqrt(2.0 / 3.14159); + float x = outs[id]; + float tmp1 = cst * (x + 0.044715 * pow(x, 3)); + float tmp2; + if (tmp1 >= 0) + { + tmp2 = (1.0 - exp(-2.0 * tmp1)) / (1.0 + exp(-2.0 * tmp1)); + } + else + { + tmp2 = (exp(2.0 * tmp1) - 1.0) / (exp(2.0 * tmp1) + 1.0); + } + tmps[id] = x; + outs[id] = 0.5 * x * (1 + tmp2); +} + +kernel void backwardGELUApproxHalf( + const device half * tmps, + constant uint * pNbElems, + device half * delta, + uint id [[ thread_position_in_grid ]]) +{ + uint nbElems; + + if (pNbElems) + { + nbElems = pNbElems[0]; + } + else + return ; + + if (id >= nbElems) + { + return ; + } + + float cst = sqrt(2.0 / 3.14159); + float x = tmps[id]; + float tmp1 = cst * (x + 0.044715 * pow(x, 3)); + float tmp2; + if (tmp1 >= 0) + { + tmp2 = (1.0 - exp(-2.0 * tmp1)) / (1.0 + exp(-2.0 * tmp1)); + } + else + { + tmp2 = (exp(2.0 * tmp1) - 1.0) / (exp(2.0 * tmp1) + 1.0); + } + float tmp3 = cst * (1 + 3 * 0.044715 * x * x) * (1 - tmp2 * tmp2); + float derivative = 0.5 * (1 + tmp2 + x * tmp3); + delta[id] = delta[id] * derivative; +} + +/* + * Approximation to the error function. + * Based on code from: + * https://stackoverflow.com/questions/35148198/efficient-faithfully-rounded-implementation-of-error-function-erff#answer-35148199 + */ +float erf(float a) +{ + float r, s, t, u; + t = metal::abs(a); + s = a * a; + if (t > 0.927734375f) + { + // maximum error 0.99527 ulp + r = metal::fma(-1.72853470e-5f, t, 3.83197126e-4f); // -0x1.220000p-16,0x1.91cfb2p-12 + u = metal::fma(-3.88396438e-3f, t, 2.42546219e-2f); // -0x1.fd1438p-9, 0x1.8d6342p-6 + r = metal::fma(r, s, u); + r = metal::fma(r, t, -1.06777877e-1f); // -0x1.b55cb8p-4 + r = metal::fma(r, t, -6.34846687e-1f); // -0x1.450aa0p-1 + r = metal::fma(r, t, -1.28717512e-1f); // -0x1.079d0cp-3 + r = metal::fma(r, t, -t); + // TODO, replace with expm1 when implemented + r = 1.0f - metal::exp(r); + r = metal::copysign(r, a); + } + else + { + // maximum error 0.98929 ulp + r = -5.96761703e-4f; // -0x1.38e000p-11 + r = metal::fma(r, s, 4.99119423e-3f); // 0x1.471a58p-8 + r = metal::fma(r, s, -2.67681349e-2f); // -0x1.b691b2p-6 + r = metal::fma(r, s, 1.12819925e-1f); // 0x1.ce1c44p-4 + r = metal::fma(r, s, -3.76125336e-1f); // -0x1.812700p-2 + r = metal::fma(r, s, 1.28379166e-1f); // 0x1.06eba8p-3 + r = metal::fma(r, a, a); + } + return r; +} + +kernel void forwardGELUHalf( + constant uint * pNbElems, + device half * tmps, + device half * outs, + uint id [[ thread_position_in_grid ]]) +{ + uint nbElems; + + if (pNbElems) + { + nbElems = pNbElems[0]; + } + else + return ; + + if (id >= nbElems) + { + return ; + } + + float x = outs[id]; + tmps[id] = x; + outs[id] = 0.5 * x * (1 + erf(x / sqrt(2.0))); +} + +kernel void backwardGELUHalf( + const device half * tmps, + constant uint * pNbElems, + device half * delta, + uint id [[ thread_position_in_grid ]]) +{ + uint nbElems; + + if (pNbElems) + { + nbElems = pNbElems[0]; + } + else + return ; + + if (id >= nbElems) + { + return ; + } + + float x = tmps[id]; + float tmp1 = 0.5 * (1.0 + erf(x / sqrt(2.0))); + float tmp2 = x / sqrt(2.0 * M_PI_F) * exp(-x * x / 2.0); + float derivative = tmp1 + tmp2; + delta[id] = delta[id] * derivative; +} diff --git a/Sources/GrAIdient/Metal/Kernel/BatchNorm.metal b/Sources/GrAIdient/Metal/Kernel/BatchNormFloat.metal similarity index 97% rename from Sources/GrAIdient/Metal/Kernel/BatchNorm.metal rename to Sources/GrAIdient/Metal/Kernel/BatchNormFloat.metal index 413ab070..355a3ff8 100644 --- 
a/Sources/GrAIdient/Metal/Kernel/BatchNorm.metal +++ b/Sources/GrAIdient/Metal/Kernel/BatchNormFloat.metal @@ -8,7 +8,7 @@ #include using namespace metal; -kernel void computeBNConvμ( +kernel void computeBNConvμFloat( const device float * tmps, constant uint * pNbChannels, constant uint * pNbBatch, @@ -67,7 +67,7 @@ kernel void computeBNConvμ( } } -kernel void computeBNConvσ2( +kernel void computeBNConvσ2Float( const device float * tmps, const device float * μ, constant uint * pNbChannels, @@ -128,7 +128,7 @@ kernel void computeBNConvσ2( } } -kernel void forwardBNConvTraining( +kernel void forwardBNConvTrainingFloat( const device float * β, const device float * Ɣ, const device float * μ, @@ -178,7 +178,7 @@ kernel void forwardBNConvTraining( tmps[offset] = Ɣ[depth] * xhat + β[depth]; } -kernel void forwardBNConvInference( +kernel void forwardBNConvInferenceFloat( const device float * β, const device float * Ɣ, const device float * Eμ, @@ -234,7 +234,7 @@ kernel void forwardBNConvInference( tmps[offset] = Ɣ[depth] * xhat + β[depth]; } -kernel void backwardWeightsBNConv( +kernel void backwardWeightsBNConvFloat( const device float * delta, const device float * xHat, const device float * Ɣ, @@ -308,7 +308,7 @@ kernel void backwardWeightsBNConv( } } -kernel void backwardBNConvTraining( +kernel void backwardBNConvTrainingFloat( const device float * σ2, const device float * xHat, const device float * Ɣ, @@ -361,7 +361,7 @@ kernel void backwardBNConvTraining( delta[offset] = mult * (tmp1 - tmp2 - tmp3); } -kernel void backwardBNConvInference( +kernel void backwardBNConvInferenceFloat( const device float * Ɣ, const device float * Eσ2, constant uint * pNbChannels, diff --git a/Sources/GrAIdient/Metal/Kernel/BatchNormHalf.metal b/Sources/GrAIdient/Metal/Kernel/BatchNormHalf.metal new file mode 100644 index 00000000..4872c749 --- /dev/null +++ b/Sources/GrAIdient/Metal/Kernel/BatchNormHalf.metal @@ -0,0 +1,415 @@ +// +// BatchNorm.metal +// GrAIdient +// +// Created by Jean-François Reboud on 14/10/2022. 
+// + +#include +using namespace metal; + +kernel void computeBNConvμHalf( + const device half * tmps, + constant uint * pNbChannels, + constant uint * pNbBatch, + constant uint * pDimensions, + constant uint * pFirstCall, + device half * μ, + device half * Eμ, + uint id [[ thread_position_in_grid ]]) +{ + uint nbChannels; + uint nbBatch; + uint width; + uint height; + uint firstCall; + + if (pNbChannels && pNbBatch && pDimensions && pFirstCall && tmps && + μ && Eμ) + { + nbChannels = *pNbChannels; + nbBatch = *pNbBatch; + width = pDimensions[0]; + height = pDimensions[1]; + firstCall = *pFirstCall; + } + else + return ; + + uint depth = id; + if (depth >= nbChannels) + { + return ; + } + + uint nbElems = nbBatch * width * height; + float sum = 0.0; + for (uint elem=0; elem= nbChannels) + { + return ; + } + + uint nbElems = nbBatch * width * height; + float sum = 0.0; + for (uint elem=0; elem= height * nbBatch || + j * depth >= width * nbChannels) + { + return ; + } + + uint offsetStart = (depth + nbChannels * elem) * height; + uint offset = j + (offsetStart + i) * width; + + float tmp1 = tmps[offset] - μ[depth]; + float tmp2 = sqrt(σ2[depth] + Ɛ); + float xhat = tmp1 / tmp2; + xHat[offset] = xhat; + tmps[offset] = Ɣ[depth] * xhat + β[depth]; +} + +kernel void forwardBNConvInferenceHalf( + const device half * β, + const device half * Ɣ, + const device half * Eμ, + const device half * Eσ2, + constant uint * pNbChannels, + constant uint * pNbBatch, + constant uint * pM, + constant uint * pDimensions, + device half * tmps, + uint2 id [[ thread_position_in_grid ]]) +{ + uint nbChannels; + uint nbBatch; + uint m; + uint width; + uint height; + float Ɛ = 1e-5; + + if (pNbChannels && pNbBatch && pM && pDimensions && β && Ɣ && + tmps && Eμ && Eσ2) + { + nbChannels = *pNbChannels; + nbBatch = *pNbBatch; + m = *pM; + width = pDimensions[0]; + height = pDimensions[1]; + } + else + return ; + + uint depth = id[0] / width; + uint elem = id[1] / height; + uint i = id[1] % height; + uint j = id[0] % width; + + if (i * elem >= height * nbBatch || + j * depth >= width * nbChannels) + { + return ; + } + + uint offsetStart = (depth + nbChannels * elem) * height; + uint offset = j + (offsetStart + i) * width; + + float Var = Eσ2[depth]; + if (m > 1) + { + Var *= (float)m / ((float)m - 1); + } + float tmp1 = tmps[offset] - Eμ[depth]; + float tmp2 = sqrt(Var + Ɛ); + float xhat = tmp1 / tmp2; + tmps[offset] = Ɣ[depth] * xhat + β[depth]; +} + +kernel void backwardWeightsBNConvHalf( + const device half * delta, + const device half * xHat, + const device half * Ɣ, + constant uint * pNbChannels, + constant uint * pNbBatch, + constant uint * pDimensions, + constant uint * pAccumulate, + device half * sum1, + device half * sum2, + device half * dƔ, + device half * dβ, + uint id [[ thread_position_in_grid ]]) +{ + uint nbChannels; + uint nbBatch; + uint width; + uint height; + uint accumulate; + + if (pNbChannels && pNbBatch && pDimensions && pAccumulate && + delta && xHat && Ɣ && + sum1 && sum2 && dƔ && dβ) + { + nbChannels = *pNbChannels; + nbBatch = *pNbBatch; + width = pDimensions[0]; + height = pDimensions[1]; + accumulate = *pAccumulate; + } + else + return ; + + uint depth = id; + if (depth >= nbChannels) + { + return ; + } + + float tmp1 = 0.0, tmp2 = 0.0; + float tmp3 = 0.0, tmp4 = 0.0; + for (uint elem=0; elem= height * nbBatch || + j * depth >= width * nbChannels) + { + return ; + } + + uint offsetStart = (depth + nbChannels * elem) * height; + uint offset = j + (offsetStart + i) * width; + + float mult = 
1.0 / ((float)nbElems * sqrt(σ2[depth] + Ɛ)); + float dxHat = Ɣ[depth] * delta[offset]; + float tmp1 = nbElems * dxHat; + float tmp2 = sum1[depth]; + float tmp3 = xHat[offset] * sum2[depth]; + + delta[offset] = mult * (tmp1 - tmp2 - tmp3); +} + +kernel void backwardBNConvInferenceHalf( + const device half * Ɣ, + const device half * Eσ2, + constant uint * pNbChannels, + constant uint * pNbBatch, + constant uint * pM, + constant uint * pDimensions, + device half * delta, + uint2 id [[ thread_position_in_grid ]]) +{ + uint nbChannels; + uint nbBatch; + uint m; + uint width; + uint height; + float Ɛ = 1e-5; + + if (pNbChannels && pNbBatch && pM && pDimensions && Ɣ && Eσ2 && delta) + { + nbChannels = *pNbChannels; + nbBatch = *pNbBatch; + m = *pM; + width = pDimensions[0]; + height = pDimensions[1]; + } + else + return ; + + uint depth = id[0] / width; + uint elem = id[1] / height; + uint i = id[1] % height; + uint j = id[0] % width; + + if (i * elem >= height * nbBatch || + j * depth >= width * nbChannels) + { + return ; + } + + uint offsetStart = (depth + nbChannels * elem) * height; + uint offset = j + (offsetStart + i) * width; + + float Var = Eσ2[depth]; + if (m > 1) + { + Var *= (float)m / ((float)m - 1); + } + float tmp1 = delta[offset]; + float tmp2 = sqrt(Var + Ɛ); + float xhat = tmp1 / tmp2; + delta[offset] = Ɣ[depth] * xhat; +} diff --git a/Sources/GrAIdient/Metal/Kernel/Biases.metal b/Sources/GrAIdient/Metal/Kernel/BiasesFloat.metal similarity index 96% rename from Sources/GrAIdient/Metal/Kernel/Biases.metal rename to Sources/GrAIdient/Metal/Kernel/BiasesFloat.metal index 31546305..fefd2da2 100644 --- a/Sources/GrAIdient/Metal/Kernel/Biases.metal +++ b/Sources/GrAIdient/Metal/Kernel/BiasesFloat.metal @@ -8,7 +8,7 @@ #include using namespace metal; -kernel void reduceBiases( +kernel void reduceBiasesFloat( const device float * deltaWeights, constant uint * pNbNeurons, constant uint * pNbBatch, diff --git a/Sources/GrAIdient/Metal/Kernel/BiasesHalf.metal b/Sources/GrAIdient/Metal/Kernel/BiasesHalf.metal new file mode 100644 index 00000000..ba24365b --- /dev/null +++ b/Sources/GrAIdient/Metal/Kernel/BiasesHalf.metal @@ -0,0 +1,53 @@ +// +// Biases.metal +// GrAIdient +// +// Created by Jean-François Reboud on 14/10/2022. 
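+//
+// Half-precision variant of the bias-gradient reduction: per-batch
+// gradients stored as half are summed in a float accumulator, then
+// written (or added, when pAccumulate is set) to the grads buffer.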
+// + +#include +using namespace metal; + +kernel void reduceBiasesHalf( + const device half * deltaWeights, + constant uint * pNbNeurons, + constant uint * pNbBatch, + constant uint * pAccumulate, + device half * grads, + uint id [[ thread_position_in_grid ]]) +{ + uint nbNeurons; + uint nbBatch; + uint accumulate; + + if (pNbNeurons && pNbBatch && pAccumulate && deltaWeights && grads) + { + nbNeurons = *pNbNeurons; + nbBatch = *pNbBatch; + accumulate = *pAccumulate; + } + else + return ; + + uint depth = id; + if (depth >= nbNeurons) + { + return ; + } + + float tmp = 0.0; + for (uint elem=0; elem using namespace metal; -kernel void convForward( +kernel void convForwardFloat( const device float * outsPrev, const device float * weights, const device float * biases, @@ -104,7 +104,7 @@ kernel void convForward( outs[offset] = tmp; } -kernel void conv16Forward( +kernel void conv16ForwardFloat( const device float * outsPrev, const device float * weights, const device float * biases, @@ -206,7 +206,7 @@ kernel void conv16Forward( } } -kernel void convBackward( +kernel void convBackwardFloat( const device float * delta, const device float * weights, constant int * pStart, @@ -313,7 +313,7 @@ kernel void convBackward( } } -kernel void conv16Backward( +kernel void conv16BackwardFloat( const device float * delta, const device float * weights, constant int * pStart, @@ -428,7 +428,7 @@ kernel void conv16Backward( } } -kernel void convBatchDerWeights( +kernel void convBatchDerWeightsFloat( const device float * outsPrev, const device float * delta, constant int * pStart, @@ -538,7 +538,7 @@ kernel void convBatchDerWeights( } } -kernel void conv34BatchDerWeights( +kernel void conv34BatchDerWeightsFloat( const device float4 * outsPrev, const device float4 * delta, constant uint * pNbChannels, @@ -783,7 +783,7 @@ kernel void conv34BatchDerWeights( } } -kernel void convBatchDerBiases( +kernel void convBatchDerBiasesFloat( const device float * delta, constant uint * pNbChannels, constant uint * pDimensions, @@ -838,7 +838,7 @@ kernel void convBatchDerBiases( } } -kernel void convDerWeights( +kernel void convDerWeightsFloat( const device float * outsPrev, const device float * delta, constant int * pStart, @@ -938,7 +938,7 @@ kernel void convDerWeights( deltaWeights[offsetWeights] = tmp; } -kernel void convDerBiases( +kernel void convDerBiasesFloat( const device float * delta, constant uint * pNbChannels, constant uint * pDimensions, @@ -982,7 +982,7 @@ kernel void convDerBiases( deltaWeights[offsetWeights] = tmp; } -kernel void convReduceWeights( +kernel void convReduceWeightsFloat( const device float * deltaWeights, constant uint * pNbChannels, constant uint * pNbChannelsPrev, diff --git a/Sources/GrAIdient/Metal/Kernel/ConvolutionHalf.metal b/Sources/GrAIdient/Metal/Kernel/ConvolutionHalf.metal new file mode 100644 index 00000000..95d03a60 --- /dev/null +++ b/Sources/GrAIdient/Metal/Kernel/ConvolutionHalf.metal @@ -0,0 +1,1049 @@ +// +// Convolution.metal +// GrAIdient +// +// Created by Jean-François Reboud on 14/10/2022. 
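+//
+// Half-precision variants of the convolution kernels. Indexing and
+// tiling match the Float versions; only the buffer element type
+// changes, and the accumulators are kept in float.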
+// + +#include +using namespace metal; + +kernel void convForwardHalf( + const device half * outsPrev, + const device half * weights, + const device half * biases, + constant int * pStart, + constant uint * pStride, + constant uint * pNbChannels, + constant uint * pNbChannelsPrev, + constant uint * pDimensions, + constant uint * pDimensionsPrev, + constant uint * pDimWeights, + constant uint * pNbBatch, + device half * outs, + uint2 id [[ thread_position_in_grid ]]) +{ + uint height, width; + uint heightPrev, widthPrev; + uint weightHeight, weightWidth; + uint nbChannels; + uint nbChannelsPrev; + int startI, startJ; + int endI, endJ; + int offI, offJ; + uint stride; + uint nbBatch; + + if (pStart && pStride && pNbChannels && pNbChannelsPrev && + pDimensions && pDimensionsPrev && pDimWeights && pNbBatch && + outsPrev && weights && biases && outs) + { + width = pDimensions[0]; + height = pDimensions[1]; + widthPrev = pDimensionsPrev[0]; + heightPrev = pDimensionsPrev[1]; + weightWidth = pDimWeights[0]; + weightHeight = pDimWeights[1]; + nbChannels = *pNbChannels; + nbChannelsPrev = *pNbChannelsPrev; + nbBatch = *pNbBatch; + startI = pStart[0]; + endI = pStart[1]; + startJ = pStart[2]; + endJ = pStart[3]; + offI = pStart[4]; + offJ = pStart[5]; + stride = pStride[0]; + } + else + return ; + + uint depth = id[0] / width; + uint elem = id[1] / height; + uint i = id[1] % height; + uint j = id[0] % width; + + if (i * elem >= height * nbBatch || + j * depth >= width * nbChannels) + { + return ; + } + + uint offsetStart = (depth+nbChannels*elem)*height; + + float tmp = biases[depth]; + for (uint depthPrev=0; depthPrev= 0 && + (int)(stride*j)+l-offJ < (int)widthPrev && + (int)(stride*i)+k-offI >= 0 && + (int)(stride*i)+k-offI < (int)heightPrev) + { + uint offsetPrev = (int)(stride*j)+l-offJ + + (offsetStartPrev + (int)(stride*i)+k-offI)*widthPrev; + float outPrev = outsPrev[offsetPrev]; + + uint offsetWeights = l-startJ + + (offsetStartWeights + k-startI) * weightWidth; + float w = weights[offsetWeights]; + + tmp += outPrev * w; + } + }} + } + + uint offset = j + (offsetStart + i)*width; + outs[offset] = tmp; +} + +kernel void conv16ForwardHalf( + const device half * outsPrev, + const device half * weights, + const device half * biases, + constant int * pStart, + constant uint * pStride, + constant uint * pNbChannels, + constant uint * pNbChannelsPrev, + constant uint * pDimensions, + constant uint * pDimensionsPrev, + constant uint * pDimWeights, + constant uint * pNbBatch, + device half * outs, + uint2 id [[ thread_position_in_grid ]]) +{ + uint height, width; + uint heightPrev, widthPrev; + uint weightHeight, weightWidth; + uint nbChannels; + uint nbChannelsPrev; + int startI, startJ; + int endI, endJ; + int offI, offJ; + uint stride; + uint nbBatch; + + if (pStart && pStride && pNbChannels && pNbChannelsPrev && + pDimensions && pDimensionsPrev && pDimWeights && pNbBatch && + outsPrev && weights && biases && outs) + { + width = pDimensions[0]; + height = pDimensions[1]; + widthPrev = pDimensionsPrev[0]; + heightPrev = pDimensionsPrev[1]; + weightWidth = pDimWeights[0]; + weightHeight = pDimWeights[1]; + nbChannels = *pNbChannels; + nbChannelsPrev = *pNbChannelsPrev; + nbBatch = *pNbBatch; + startI = pStart[0]; + endI = pStart[1]; + startJ = pStart[2]; + endJ = pStart[3]; + offI = pStart[4]; + offJ = pStart[5]; + stride = pStride[0]; + } + else + return ; + + uint coeff = 16; + uint depth = id[0] / width; + uint elem = id[1] / height; + uint i = id[1] % height; + uint j = id[0] % width; + + if (i 
* elem >= height * nbBatch || + j * depth * coeff >= width * nbChannels) + { + return ; + } + + float tmp[16] = {0}; + for (uint depthPrev=0; depthPrev= 0 && + (int)(stride*j)+l-offJ < (int)widthPrev && + (int)(stride*i)+k-offI >= 0 && + (int)(stride*i)+k-offI < (int)heightPrev) + { + uint offsetPrev = (int)(stride*j)+l-offJ + + (offsetStartPrev + (int)(stride*i)+k-offI)*widthPrev; + float outPrev = outsPrev[offsetPrev]; + + for (uint c=0; c= heightPrev * nbBatch || + j * depthPrev >= widthPrev * nbChannelsPrev) + { + return ; + } + + uint offsetStartPrev = (depthPrev + nbChannelsPrev * elem) * heightPrev; + + float tmp = 0.0; + for (uint depth=0; depth= 0 && j1 < (int)width && + i1 >= 0 && i1 < (int)height) + { + uint offset = j1 + (offsetStart + i1) * width; + float deltaCur = delta[offset]; + + uint offsetWeights = l-startJ + + (offsetStartWeights + k-startI) * weightWidth; + float w = weights[offsetWeights]; + + tmp += deltaCur * w; + } + } + }} + } + + uint offsetPrev = j + (offsetStartPrev + i) * widthPrev; + if (dirty) + { + deltaPrev[offsetPrev] = tmp; + } + else + { + deltaPrev[offsetPrev] += tmp; + } +} + +kernel void conv16BackwardHalf( + const device half * delta, + const device half * weights, + constant int * pStart, + constant uint * pStride, + constant uint * pNbChannels, + constant uint * pNbChannelsPrev, + constant uint * pDimensions, + constant uint * pDimensionsPrev, + constant uint * pDimWeights, + constant uint * pNbBatch, + constant uint * pDirty, + device half * deltaPrev, + uint2 id [[ thread_position_in_grid ]]) +{ + uint height, width; + uint heightPrev, widthPrev; + uint weightHeight, weightWidth; + uint nbChannels; + uint nbChannelsPrev; + int startI, startJ; + int endI, endJ; + int offI, offJ; + uint stride; + uint nbBatch; + uint dirty; + + if (pStart && pStride && pNbChannels && pNbChannelsPrev && + pDimensions && pDimensionsPrev && pDimWeights && pNbBatch && pDirty && + delta && weights && deltaPrev) + { + width = pDimensions[0]; + height = pDimensions[1]; + widthPrev = pDimensionsPrev[0]; + heightPrev = pDimensionsPrev[1]; + weightWidth = pDimWeights[0]; + weightHeight = pDimWeights[1]; + nbChannels = *pNbChannels; + nbChannelsPrev = *pNbChannelsPrev; + nbBatch = *pNbBatch; + startI = pStart[0]; + endI = pStart[1]; + startJ = pStart[2]; + endJ = pStart[3]; + offI = pStart[4]; + offJ = pStart[5]; + stride = pStride[0]; + dirty = *pDirty; + } + else + return ; + + uint coeff = 16; + uint depthPrev = id[0] / widthPrev; + uint elem = id[1] / heightPrev; + uint i = id[1] % heightPrev; + uint j = id[0] % widthPrev; + + if (i * elem >= heightPrev * nbBatch || + j * depthPrev * coeff >= widthPrev * nbChannelsPrev) + { + return ; + } + + float tmp[16] = {0}; + for (uint depth=0; depth= 0 && j1 < (int)width && + i1 >= 0 && i1 < (int)height) + { + uint offset = j1 + (offsetStart + i1) * width; + float deltaCur = delta[offset]; + + for (uint c=0; c= nbChannels * weightWidth || + id[1] >= nbChannelsPrev * weightHeight || + weightsI + startI > endI || weightsJ + startJ > endJ) + { + return ; + } + + int i = weightsI + startI; + int j = weightsJ + startJ; + + float tmp = 0.0; + for (uint elem=0; elem= 0 && + (int)(stride*l)+j-offJ < (int)widthPrev && + (int)(stride*k)+i-offI >= 0 && + (int)(stride*k)+i-offI < (int)heightPrev) + { + uint offset = l + (offsetStart + k) * width; + float deltaCur = delta[offset]; + + uint offsetPrev = (int)(stride*l)+j-offJ + + (offsetStartPrev + (int)(stride*k)+i-offI)*widthPrev; + float outPrev = outsPrev[offsetPrev]; + + tmp += deltaCur * 
outPrev; + } + }} + } + + uint offsetStartWeights = + (depthPrev + nbChannelsPrev * depth) * weightHeight; + uint offsetWeights = j-startJ + + (offsetStartWeights + i-startI) * weightWidth; + + if (accumulate) + { + grads[offsetWeights] += tmp; + } + else + { + grads[offsetWeights] = tmp; + } +} + +kernel void conv34BatchDerWeightsHalf( + const device half4 * outsPrev, + const device half4 * delta, + constant uint * pNbChannels, + constant uint * pNbChannelsPrev, + constant uint * pDimensions, + constant uint * pDimensionsPrev, + constant uint * pNbBatch, + constant uint * pAccumulate, + device half * grads, + uint2 id [[ thread_position_in_grid ]]) +{ + uint height, width; + uint heightPrev, widthPrev; + uint nbChannels; + uint nbChannelsPrev; + uint nbBatch; + uint accumulate; + + if (pNbChannels && pNbChannelsPrev && pDimensions && + pDimensionsPrev && pNbBatch && pAccumulate && + outsPrev && delta && grads) + { + width = pDimensions[0]; + height = pDimensions[1]; + widthPrev = pDimensionsPrev[0]; + heightPrev = pDimensionsPrev[1]; + nbChannels = *pNbChannels; + nbChannelsPrev = *pNbChannelsPrev; + nbBatch = *pNbBatch; + accumulate = *pAccumulate; + } + else + return ; + + uint depth = id[0]; + uint depthPrev = id[1]; + + if (id[0] >= nbChannels || + id[1] >= nbChannelsPrev) + { + return ; + } + + float tmp[9] = {0.0}; + for (uint elem=0; elem 0 && l > 0) + { + uint offsetPrev0 = + ((l-1)*4 + (offsetStartPrev + k*2-1) * widthPrev) / 4; + float outPrev0 = outsPrev[offsetPrev0][3]; + + tmp[0] += outPrev0 * delta4[0]; + } + if (k > 0) + { + uint offsetPrev1 = + (l*4 + (offsetStartPrev + k*2-1) * widthPrev) / 4; + half4 outPrev1 = outsPrev[offsetPrev1]; + + tmp[0] += outPrev1[0] * delta4[1]; + tmp[0] += outPrev1[1] * delta4[2]; + tmp[0] += outPrev1[2] * delta4[3]; + + half4 sum = outPrev1 * delta4; + tmp[1] += sum[0] + sum[1] + sum[2] + sum[3]; + + tmp[2] += outPrev1[1] * delta4[0]; + tmp[2] += outPrev1[2] * delta4[1]; + tmp[2] += outPrev1[3] * delta4[2]; + } + if (k > 0 && (l+1)*4 < width) + { + uint offsetPrev2 = + ((l+1)*4 + (offsetStartPrev + k*2-1) * widthPrev) / 4; + float outPrev2 = outsPrev[offsetPrev2][0]; + + tmp[2] += outPrev2 * delta4[3]; + } + + if (l > 0) + { + uint offsetPrev3 = + ((l-1)*4 + (offsetStartPrev + k*2) * widthPrev) / 4; + uint offsetPrev6 = + ((l-1)*4 + (offsetStartPrev + k*2+1) * widthPrev) / 4; + float outPrev3 = outsPrev[offsetPrev3][3]; + float outPrev6 = outsPrev[offsetPrev6][3]; + + tmp[0] += outPrev3 * delta7[0]; + tmp[3] += outPrev3 * delta4[0]; + tmp[3] += outPrev6 * delta7[0]; + tmp[6] += outPrev6 * delta4[0]; + } + + uint offsetPrev4 = + (l*4 + (offsetStartPrev + k*2) * widthPrev) / 4; + uint offsetPrev7 = + (l*4 + (offsetStartPrev + k*2+1) * widthPrev) / 4; + half4 outPrev4 = outsPrev[offsetPrev4]; + half4 outPrev7 = outsPrev[offsetPrev7]; + + tmp[0] += outPrev4[0] * delta7[1]; + tmp[0] += outPrev4[1] * delta7[2]; + tmp[0] += outPrev4[2] * delta7[3]; + + half4 sum = outPrev4 * delta7; + tmp[1] += sum[0] + sum[1] + sum[2] + sum[3]; + + tmp[2] += outPrev4[1] * delta7[0]; + tmp[2] += outPrev4[2] * delta7[1]; + tmp[2] += outPrev4[3] * delta7[2]; + + tmp[3] += outPrev4[0] * delta4[1]; + tmp[3] += outPrev4[1] * delta4[2]; + tmp[3] += outPrev4[2] * delta4[3]; + tmp[3] += outPrev7[0] * delta7[1]; + tmp[3] += outPrev7[1] * delta7[2]; + tmp[3] += outPrev7[2] * delta7[3]; + + sum = outPrev4 * delta4; + tmp[4] += sum[0] + sum[1] + sum[2] + sum[3]; + sum = outPrev7 * delta7; + tmp[4] += sum[0] + sum[1] + sum[2] + sum[3]; + + tmp[5] += outPrev4[1] * delta4[0]; + 
tmp[5] += outPrev4[2] * delta4[1]; + tmp[5] += outPrev4[3] * delta4[2]; + tmp[5] += outPrev7[1] * delta7[0]; + tmp[5] += outPrev7[2] * delta7[1]; + tmp[5] += outPrev7[3] * delta7[2]; + + tmp[6] += outPrev7[0] * delta4[1]; + tmp[6] += outPrev7[1] * delta4[2]; + tmp[6] += outPrev7[2] * delta4[3]; + + sum = outPrev7 * delta4; + tmp[7] += sum[0] + sum[1] + sum[2] + sum[3]; + + tmp[8] += outPrev7[1] * delta4[0]; + tmp[8] += outPrev7[2] * delta4[1]; + tmp[8] += outPrev7[3] * delta4[2]; + + if ((l+1)*4 < width) + { + uint offsetPrev5 = + ((l+1)*4 + (offsetStartPrev + k*2) * widthPrev) / 4; + uint offsetPrev8 = + ((l+1)*4 + (offsetStartPrev + k*2+1) * widthPrev) / 4; + float outPrev5 = outsPrev[offsetPrev5][0]; + float outPrev8 = outsPrev[offsetPrev8][0]; + + tmp[2] += outPrev5 * delta7[3]; + tmp[5] += outPrev5 * delta4[3]; + tmp[5] += outPrev8 * delta7[3]; + tmp[8] += outPrev8 * delta4[3]; + } + + if ((k+1)*2 < height && l > 0) + { + uint offsetPrev9 = + ((l-1)*4 + (offsetStartPrev + (k+1)*2) * widthPrev) / 4; + float outPrev9 = outsPrev[offsetPrev9][3]; + + tmp[6] += outPrev9 * delta7[0]; + } + if ((k+1)*2 < height) + { + uint offsetPrev10 = + (l*4 + (offsetStartPrev + (k+1)*2) * widthPrev) / 4; + half4 outPrev10 = outsPrev[offsetPrev10]; + + tmp[6] += outPrev10[0] * delta7[1]; + tmp[6] += outPrev10[1] * delta7[2]; + tmp[6] += outPrev10[2] * delta7[3]; + + half4 sum = outPrev10 * delta7; + tmp[7] += sum[0] + sum[1] + sum[2] + sum[3]; + + tmp[8] += outPrev10[1] * delta7[0]; + tmp[8] += outPrev10[2] * delta7[1]; + tmp[8] += outPrev10[3] * delta7[2]; + } + if ((k+1)*2 < height && (l+1)*4 < width) + { + uint offsetPrev11 = + ((l+1)*4 + (offsetStartPrev + (k+1)*2) * widthPrev) / 4; + float outPrev11 = outsPrev[offsetPrev11][0]; + + tmp[8] += outPrev11 * delta7[3]; + } + }} + } + + uint offsetStartWeights = (depthPrev + nbChannelsPrev * depth) * 3; + uint offsetWeights0 = 0 + (offsetStartWeights + 0) * 3; + uint offsetWeights1 = 1 + (offsetStartWeights + 0) * 3; + uint offsetWeights2 = 2 + (offsetStartWeights + 0) * 3; + uint offsetWeights3 = 0 + (offsetStartWeights + 1) * 3; + uint offsetWeights4 = 1 + (offsetStartWeights + 1) * 3; + uint offsetWeights5 = 2 + (offsetStartWeights + 1) * 3; + uint offsetWeights6 = 0 + (offsetStartWeights + 2) * 3; + uint offsetWeights7 = 1 + (offsetStartWeights + 2) * 3; + uint offsetWeights8 = 2 + (offsetStartWeights + 2) * 3; + + if (accumulate) + { + grads[offsetWeights0] += tmp[0]; + grads[offsetWeights1] += tmp[1]; + grads[offsetWeights2] += tmp[2]; + grads[offsetWeights3] += tmp[3]; + grads[offsetWeights4] += tmp[4]; + grads[offsetWeights5] += tmp[5]; + grads[offsetWeights6] += tmp[6]; + grads[offsetWeights7] += tmp[7]; + grads[offsetWeights8] += tmp[8]; + } + else + { + grads[offsetWeights0] = tmp[0]; + grads[offsetWeights1] = tmp[1]; + grads[offsetWeights2] = tmp[2]; + grads[offsetWeights3] = tmp[3]; + grads[offsetWeights4] = tmp[4]; + grads[offsetWeights5] = tmp[5]; + grads[offsetWeights6] = tmp[6]; + grads[offsetWeights7] = tmp[7]; + grads[offsetWeights8] = tmp[8]; + } +} + +kernel void convBatchDerBiasesHalf( + const device half * delta, + constant uint * pNbChannels, + constant uint * pDimensions, + constant uint * pNbBatch, + constant uint * pAccumulate, + device half * grads, + uint id [[ thread_position_in_grid ]]) +{ + uint height, width; + uint nbChannels; + uint nbBatch; + uint accumulate; + + if (pNbChannels && pDimensions && pNbBatch && pAccumulate && + delta && grads) + { + width = pDimensions[0]; + height = pDimensions[1]; + nbChannels = 
*pNbChannels; + nbBatch = *pNbBatch; + accumulate = *pAccumulate; + } + else + return ; + + uint depth = id; + if (depth >= nbChannels) + { + return ; + } + + float tmp = 0.0; + for (uint elem=0; elem= nbBatch * nbChannels * weightWidth || + id[1] >= nbChannelsPrev * weightHeight || + weightsI + startI > endI || weightsJ + startJ > endJ) + { + return ; + } + + uint offsetStartGridWeights = + elem * nbChannels * nbChannelsPrev * weightHeight; + + int i = weightsI + startI; + int j = weightsJ + startJ; + + uint offsetStart = (depth + nbChannels * elem) * height; + uint offsetStartPrev = (depthPrev + nbChannelsPrev * elem) * heightPrev; + uint offsetStartWeights = + (depthPrev + nbChannelsPrev * depth) * weightHeight; + + float tmp = 0.0; + for (uint k=0; k= 0 && + (int)(stride*l)+j-offJ < (int)widthPrev && + (int)(stride*k)+i-offI >= 0 && + (int)(stride*k)+i-offI < (int)heightPrev) + { + uint offset = l + (offsetStart + k) * width; + float deltaCur = delta[offset]; + + uint offsetPrev = (int)(stride*l)+j-offJ + + (offsetStartPrev + (int)(stride*k)+i-offI)*widthPrev; + float outPrev = outsPrev[offsetPrev]; + + tmp += deltaCur * outPrev; + } + }} + + uint offsetWeights = j-startJ + + (offsetStartGridWeights+offsetStartWeights+i-startI)*weightWidth; + deltaWeights[offsetWeights] = tmp; +} + +kernel void convDerBiasesHalf( + const device half * delta, + constant uint * pNbChannels, + constant uint * pDimensions, + constant uint * pNbBatch, + device half * deltaWeights, + uint2 id [[ thread_position_in_grid ]]) +{ + uint height, width; + uint nbChannels; + uint nbBatch; + + if (pNbChannels && pDimensions && pNbBatch && delta && deltaWeights) + { + width = pDimensions[0]; + height = pDimensions[1]; + nbChannels = *pNbChannels; + nbBatch = *pNbBatch; + } + else + return ; + + uint depth = id[0]; + uint elem = id[1]; + + if (depth >= nbChannels || elem >= nbBatch) + { + return ; + } + + uint offsetStart = (depth + nbChannels * elem) * height; + + float tmp = 0.0; + for (uint i=0; i= nbChannels * weightWidth || + id[1] >= nbChannelsPrev * weightHeight) + { + return ; + } + + uint offsetStartWeights = + (depthPrev + nbChannelsPrev * depth) * weightHeight; + uint offsetWeights = weightsJ + + (offsetStartWeights + weightsI) * weightWidth; + + float tmp = 0.0; + for (uint elem=0; elem using namespace metal; -kernel void deconvForward( +kernel void deconvForwardFloat( const device float * outsPrev, const device float * weights, const device float * biases, @@ -105,7 +105,7 @@ kernel void deconvForward( outs[offset] = tmp; } -kernel void deconvBackward( +kernel void deconvBackwardFloat( const device float * delta, const device float * weights, constant int * pStart, @@ -206,7 +206,7 @@ kernel void deconvBackward( } } -kernel void deconvBatchDerWeights( +kernel void deconvBatchDerWeightsFloat( const device float * outsPrev, const device float * delta, constant int * pStart, @@ -317,7 +317,7 @@ kernel void deconvBatchDerWeights( } } -kernel void deconvDerWeights( +kernel void deconvDerWeightsFloat( const device float * outsPrev, const device float * delta, constant int * pStart, diff --git a/Sources/GrAIdient/Metal/Kernel/DeconvolutionHalf.metal b/Sources/GrAIdient/Metal/Kernel/DeconvolutionHalf.metal new file mode 100644 index 00000000..2708d252 --- /dev/null +++ b/Sources/GrAIdient/Metal/Kernel/DeconvolutionHalf.metal @@ -0,0 +1,419 @@ +// +// Deconvolution.metal +// GrAIdient +// +// Created by Jean-François Reboud on 28/12/2022. 
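+//
+// Half-precision variants of the deconvolution kernels, mirroring the
+// Float versions with half buffers and float accumulators.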
+// + +#include +using namespace metal; + +kernel void deconvForwardHalf( + const device half * outsPrev, + const device half * weights, + const device half * biases, + constant int * pStart, + constant uint * pStride, + constant uint * pNbChannels, + constant uint * pNbChannelsPrev, + constant uint * pDimensions, + constant uint * pDimensionsPrev, + constant uint * pDimWeights, + constant uint * pNbBatch, + device half * outs, + uint2 id [[ thread_position_in_grid ]]) +{ + uint height, width; + uint heightPrev, widthPrev; + uint weightHeight, weightWidth; + uint nbChannels; + uint nbChannelsPrev; + int startI, startJ; + int endI, endJ; + uint stride; + uint nbBatch; + + if (pStart && pStride && pNbChannels && pNbChannelsPrev && + pDimensions && pDimensionsPrev && pDimWeights && pNbBatch && + outsPrev && weights && biases && outs) + { + width = pDimensions[0]; + height = pDimensions[1]; + widthPrev = pDimensionsPrev[0]; + heightPrev = pDimensionsPrev[1]; + weightWidth = pDimWeights[0]; + weightHeight = pDimWeights[1]; + nbChannels = *pNbChannels; + nbChannelsPrev = *pNbChannelsPrev; + nbBatch = *pNbBatch; + startI = pStart[0]; + endI = pStart[1]; + startJ = pStart[2]; + endJ = pStart[3]; + stride = pStride[0]; + } + else + return ; + + uint depth = id[0] / width; + uint elem = id[1] / height; + uint i = id[1] % height; + uint j = id[0] % width; + + if (i * elem >= height * nbBatch || + j * depth >= width * nbChannels) + { + return ; + } + + uint offsetStart = (depth+nbChannels*elem)*height; + + float tmp = biases[depth]; + for (uint depthPrev=0; depthPrev= 0 && j1 < (int)widthPrev && + i1 >= 0 && i1 < (int)heightPrev) + { + uint offsetPrev = j1 + + (offsetStartPrev + i1) * widthPrev; + float outPrev = outsPrev[offsetPrev]; + + uint offsetWeights = l-startJ + + (offsetStartWeights + k-startI) * weightWidth; + float w = weights[offsetWeights]; + + tmp += outPrev * w; + } + } + }} + } + + uint offset = j + (offsetStart + i)*width; + outs[offset] = tmp; +} + +kernel void deconvBackwardHalf( + const device half * delta, + const device half * weights, + constant int * pStart, + constant uint * pStride, + constant uint * pNbChannels, + constant uint * pNbChannelsPrev, + constant uint * pDimensions, + constant uint * pDimensionsPrev, + constant uint * pDimWeights, + constant uint * pNbBatch, + constant uint * pDirty, + device half * deltaPrev, + uint2 id [[ thread_position_in_grid ]]) +{ + uint height, width; + uint heightPrev, widthPrev; + uint weightHeight, weightWidth; + uint nbChannels; + uint nbChannelsPrev; + int startI, startJ; + int endI, endJ; + uint stride; + uint nbBatch; + uint dirty; + + if (pStart && pStride && pNbChannels && pNbChannelsPrev && + pDimensions && pDimensionsPrev && pDimWeights && pNbBatch && pDirty && + delta && weights && deltaPrev) + { + width = pDimensions[0]; + height = pDimensions[1]; + widthPrev = pDimensionsPrev[0]; + heightPrev = pDimensionsPrev[1]; + weightWidth = pDimWeights[0]; + weightHeight = pDimWeights[1]; + nbChannels = *pNbChannels; + nbChannelsPrev = *pNbChannelsPrev; + nbBatch = *pNbBatch; + startI = pStart[0]; + endI = pStart[1]; + startJ = pStart[2]; + endJ = pStart[3]; + stride = pStride[0]; + dirty = *pDirty; + } + else + return ; + + uint depthPrev = id[0] / widthPrev; + uint elem = id[1] / heightPrev; + uint i = id[1] % heightPrev; + uint j = id[0] % widthPrev; + + if (i * elem >= heightPrev * nbBatch || + j * depthPrev >= widthPrev * nbChannelsPrev) + { + return ; + } + + uint offsetStartPrev = (depthPrev + nbChannelsPrev * elem) * heightPrev; 
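+    // Accumulate, in float, the weighted deltas from every output unit
+    // that this previous-layer position fed during the forward pass.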
+ + float tmp = 0.0; + for (uint depth=0; depth= 0 && + (int)(stride*j)+l-startJ < (int)width && + (int)(stride*i)+k-startI >= 0 && + (int)(stride*i)+k-startI < (int)height) + { + uint offset = (int)(stride*j)+l-startJ + + (offsetStart + (int)(stride*i)+k-startI) * width; + float deltaCur = delta[offset]; + + uint offsetWeights = l-startJ + + (offsetStartWeights + k-startI) * weightWidth; + float w = weights[offsetWeights]; + + tmp += deltaCur * w; + } + }} + } + + uint offsetPrev = j + (offsetStartPrev + i) * widthPrev; + if (dirty) + { + deltaPrev[offsetPrev] = tmp; + } + else + { + deltaPrev[offsetPrev] += tmp; + } +} + +kernel void deconvBatchDerWeightsHalf( + const device half * outsPrev, + const device half * delta, + constant int * pStart, + constant uint * pStride, + constant uint * pNbChannels, + constant uint * pNbChannelsPrev, + constant uint * pDimensions, + constant uint * pDimensionsPrev, + constant uint * pDimWeights, + constant uint * pNbBatch, + constant uint * pAccumulate, + device half * grads, + uint2 id [[ thread_position_in_grid ]]) +{ + uint height, width; + uint heightPrev, widthPrev; + uint weightHeight, weightWidth; + uint nbChannels; + uint nbChannelsPrev; + int startI, startJ; + int endI, endJ; + uint stride; + uint nbBatch; + uint accumulate; + + if (pStart && pStride && pNbChannels && pNbChannelsPrev && pDimensions && + pDimensionsPrev && pDimWeights && pNbBatch && pAccumulate && + outsPrev && delta && grads) + { + width = pDimensions[0]; + height = pDimensions[1]; + widthPrev = pDimensionsPrev[0]; + heightPrev = pDimensionsPrev[1]; + weightWidth = pDimWeights[0]; + weightHeight = pDimWeights[1]; + nbChannels = *pNbChannels; + nbChannelsPrev = *pNbChannelsPrev; + nbBatch = *pNbBatch; + startI = pStart[0]; + endI = pStart[1]; + startJ = pStart[2]; + endJ = pStart[3]; + stride = pStride[0]; + accumulate = *pAccumulate; + } + else + return ; + + int weightsI = id[1] / nbChannelsPrev; + int weightsJ = id[0] / nbChannels; + uint depth = id[0] % nbChannels; + uint depthPrev = id[1] % nbChannelsPrev; + + if (id[0] >= nbChannels * weightWidth || + id[1] >= nbChannelsPrev * weightHeight || + weightsI + startI > endI || weightsJ + startJ > endJ) + { + return ; + } + + int i = weightsI + startI; + int j = weightsJ + startJ; + + float tmp = 0.0; + for (uint elem=0; elem= 0 && j1 < (int)widthPrev && + i1 >= 0 && i1 < (int)heightPrev) + { + uint offset = l + (offsetStart + k) * width; + float deltaCur = delta[offset]; + + uint offsetPrev = j1 + + (offsetStartPrev + i1)*widthPrev; + float outPrev = outsPrev[offsetPrev]; + + tmp += deltaCur * outPrev; + } + } + }} + } + + uint offsetStartWeights = + (depthPrev + nbChannelsPrev * depth) * weightHeight; + uint offsetWeights = j-startJ + + (offsetStartWeights + i-startI) * weightWidth; + + if (accumulate) + { + grads[offsetWeights] += tmp; + } + else + { + grads[offsetWeights] = tmp; + } +} + +kernel void deconvDerWeightsHalf( + const device half * outsPrev, + const device half * delta, + constant int * pStart, + constant uint * pStride, + constant uint * pNbChannels, + constant uint * pNbChannelsPrev, + constant uint * pDimensions, + constant uint * pDimensionsPrev, + constant uint * pDimWeights, + constant uint * pNbBatch, + device half * deltaWeights, + uint2 id [[ thread_position_in_grid ]]) +{ + uint height, width; + uint heightPrev, widthPrev; + uint weightHeight, weightWidth; + uint nbChannels; + uint nbChannelsPrev; + int startI, startJ; + int endI, endJ; + uint stride; + uint nbBatch; + + if (pStart && pStride && 
pNbChannels && pNbChannelsPrev && pDimensions && + pDimensionsPrev && pDimWeights && pNbBatch && + outsPrev && delta && deltaWeights) + { + width = pDimensions[0]; + height = pDimensions[1]; + widthPrev = pDimensionsPrev[0]; + heightPrev = pDimensionsPrev[1]; + weightWidth = pDimWeights[0]; + weightHeight = pDimWeights[1]; + nbChannels = *pNbChannels; + nbChannelsPrev = *pNbChannelsPrev; + nbBatch = *pNbBatch; + startI = pStart[0]; + endI = pStart[1]; + startJ = pStart[2]; + endJ = pStart[3]; + stride = pStride[0]; + } + else + return ; + + uint remains = id[0]; + uint elem = remains / (weightWidth * nbChannels); + remains = remains % (weightWidth * nbChannels); + int weightsI = id[1] / nbChannelsPrev; + int weightsJ = remains / nbChannels; + uint depth = remains % nbChannels; + uint depthPrev = id[1] % nbChannelsPrev; + + if (id[0] >= nbBatch * nbChannels * weightWidth || + id[1] >= nbChannelsPrev * weightHeight || + weightsI + startI > endI || weightsJ + startJ > endJ) + { + return ; + } + + uint offsetStartGridWeights = + elem * nbChannels * nbChannelsPrev * weightHeight; + + int i = weightsI + startI; + int j = weightsJ + startJ; + + uint offsetStart = (depth + nbChannels * elem) * height; + uint offsetStartPrev = (depthPrev + nbChannelsPrev * elem) * heightPrev; + uint offsetStartWeights = + (depthPrev + nbChannelsPrev * depth) * weightHeight; + + float tmp = 0.0; + for (uint k=0; k= 0 && j1 < (int)widthPrev && + i1 >= 0 && i1 < (int)heightPrev) + { + uint offset = l + (offsetStart + k) * width; + float deltaCur = delta[offset]; + + uint offsetPrev = j1 + + (offsetStartPrev + i1)*widthPrev; + float outPrev = outsPrev[offsetPrev]; + + tmp += deltaCur * outPrev; + } + } + }} + + uint offsetWeights = j-startJ + + (offsetStartGridWeights+offsetStartWeights+i-startI)*weightWidth; + deltaWeights[offsetWeights] = tmp; +} diff --git a/Sources/GrAIdient/Metal/Kernel/FullyConnected.metal b/Sources/GrAIdient/Metal/Kernel/FullyConnectedFloat.metal similarity index 97% rename from Sources/GrAIdient/Metal/Kernel/FullyConnected.metal rename to Sources/GrAIdient/Metal/Kernel/FullyConnectedFloat.metal index 7f12744a..e7abeb06 100644 --- a/Sources/GrAIdient/Metal/Kernel/FullyConnected.metal +++ b/Sources/GrAIdient/Metal/Kernel/FullyConnectedFloat.metal @@ -8,7 +8,7 @@ #include using namespace metal; -kernel void flForward( +kernel void flForwardFloat( const device float * outsPrev, const device float * weights, const device float * biases, @@ -56,7 +56,7 @@ kernel void flForward( outs[offset] = tmp; } -kernel void flBackward( +kernel void flBackwardFloat( const device float * delta, const device float * weights, constant uint * pNbNeurons, @@ -113,7 +113,7 @@ kernel void flBackward( } } -kernel void flBatchDerWeights( +kernel void flBatchDerWeightsFloat( const device float * outsPrev, const device float * delta, constant uint * pNbNeurons, @@ -170,7 +170,7 @@ kernel void flBatchDerWeights( } } -kernel void flBatchDerBiases( +kernel void flBatchDerBiasesFloat( const device float * delta, constant uint * pNbNeurons, constant uint * pNbBatch, @@ -214,7 +214,7 @@ kernel void flBatchDerBiases( } } -kernel void flDerWeights( +kernel void flDerWeightsFloat( const device float * outsPrev, const device float * delta, constant uint * pNbNeurons, @@ -261,7 +261,7 @@ kernel void flDerWeights( deltaWeights[offsetWeights] = tmp; } -kernel void flDerBiases( +kernel void flDerBiasesFloat( const device float * delta, constant uint * pNbNeurons, constant uint * pNbBatch, @@ -294,7 +294,7 @@ kernel void flDerBiases( 
deltaWeights[offsetWeights] = deltaCur; } -kernel void flReduceWeights( +kernel void flReduceWeightsFloat( const device float * deltaWeights, constant uint * pNbNeurons, constant uint * pNbNeuronsPrev, diff --git a/Sources/GrAIdient/Metal/Kernel/FullyConnectedHalf.metal b/Sources/GrAIdient/Metal/Kernel/FullyConnectedHalf.metal new file mode 100644 index 00000000..63c717f9 --- /dev/null +++ b/Sources/GrAIdient/Metal/Kernel/FullyConnectedHalf.metal @@ -0,0 +1,347 @@ +// +// FullyConnected.metal +// GrAIdient +// +// Created by Jean-François Reboud on 14/10/2022. +// + +#include +using namespace metal; + +kernel void flForwardHalf( + const device half * outsPrev, + const device half * weights, + const device half * biases, + constant uint * pNbNeurons, + constant uint * pNbNeuronsPrev, + constant uint * pNbBatch, + device half * outs, + uint2 id [[ thread_position_in_grid ]]) +{ + uint nbNeurons; + uint nbNeuronsPrev; + uint nbBatch; + + if (pNbNeurons && pNbNeuronsPrev && pNbBatch && + outsPrev && weights && biases && outs) + { + nbNeurons = *pNbNeurons; + nbNeuronsPrev = *pNbNeuronsPrev; + nbBatch = *pNbBatch; + } + else + return ; + + uint depth = id[0]; + uint elem = id[1]; + + if (depth >= nbNeurons || elem >= nbBatch) + { + return ; + } + + float tmp = biases[depth]; + for (uint depthPrev=0; depthPrev= nbNeuronsPrev || elem >= nbBatch) + { + return ; + } + + float tmp = 0.0; + for (uint depth=0; depth= nbNeurons || depthPrev >= nbNeuronsPrev) + { + return ; + } + + float tmp = 0.0; + for (uint elem=0; elem= nbNeurons) + { + return ; + } + + float tmp = 0.0; + for (uint elem=0; elem= nbNeurons * nbBatch || + depthPrev >= nbNeuronsPrev) + { + return ; + } + + uint offset = depth + nbNeurons * elem; + float deltaCur = delta[offset]; + + uint offsetPrev = depthPrev + nbNeuronsPrev * elem; + float outPrev = outsPrev[offsetPrev]; + + float tmp = deltaCur * outPrev; + + uint offsetStartWeights = elem * nbNeurons * nbNeuronsPrev; + uint offsetWeights = offsetStartWeights + + depthPrev + nbNeuronsPrev * depth; + deltaWeights[offsetWeights] = tmp; +} + +kernel void flDerBiasesHalf( + const device half * delta, + constant uint * pNbNeurons, + constant uint * pNbBatch, + device half * deltaWeights, + uint2 id [[ thread_position_in_grid ]]) +{ + uint nbNeurons; + uint nbBatch; + + if (pNbNeurons && pNbBatch && delta && deltaWeights) + { + nbNeurons = *pNbNeurons; + nbBatch = *pNbBatch; + } + else + return ; + + uint depth = id[0]; + uint elem = id[1]; + + if (depth >= nbNeurons || elem >= nbBatch) + { + return ; + } + + uint offset = depth + nbNeurons * elem; + float deltaCur = delta[offset]; + + uint offsetWeights = elem * nbNeurons + depth; + deltaWeights[offsetWeights] = deltaCur; +} + +kernel void flReduceWeightsHalf( + const device half * deltaWeights, + constant uint * pNbNeurons, + constant uint * pNbNeuronsPrev, + constant uint * pNbBatch, + constant uint * pAccumulate, + device half * grads, + uint2 id [[ thread_position_in_grid ]]) +{ + uint nbNeurons; + uint nbNeuronsPrev; + uint nbBatch; + uint accumulate; + + if (pNbNeurons && pNbNeuronsPrev && pNbBatch && pAccumulate && + deltaWeights && grads) + { + nbNeurons = *pNbNeurons; + nbNeuronsPrev = *pNbNeuronsPrev; + nbBatch = *pNbBatch; + accumulate = *pAccumulate; + } + else + return ; + + uint depth = id[0]; + uint depthPrev = id[1]; + + if (depth >= nbNeurons || depthPrev >= nbNeuronsPrev) + { + return ; + } + + uint offsetWeights = depthPrev + nbNeuronsPrev * depth; + + float tmp = 0.0; + for (uint elem=0; elem using namespace metal; 
-kernel void flPatchForward( +kernel void flPatchForwardFloat( const device float * outsPrev, const device float * weights, const device float * biases, @@ -85,7 +85,7 @@ kernel void flPatchForward( outs[offset] = tmp; } -kernel void flPatchBackward( +kernel void flPatchBackwardFloat( const device float * delta, const device float * weights, constant uint * pNbNeurons, @@ -170,7 +170,7 @@ kernel void flPatchBackward( } } -kernel void flPatchBatchDerWeights( +kernel void flPatchBatchDerWeightsFloat( const device float * outsPrev, const device float * delta, constant uint * pNbNeurons, @@ -256,7 +256,7 @@ kernel void flPatchBatchDerWeights( } } -kernel void flPatchBatchDerBiases( +kernel void flPatchBatchDerBiasesFloat( const device float * delta, constant uint * pNbNeurons, constant uint * pNbBatch, @@ -304,7 +304,7 @@ kernel void flPatchBatchDerBiases( } } -kernel void flPatchBatch4DerBiases( +kernel void flPatchBatch4DerBiasesFloat( const device float4 * delta, constant uint * pNbNeurons, constant uint * pNbBatch, @@ -353,7 +353,7 @@ kernel void flPatchBatch4DerBiases( } } -kernel void flPatchDerWeights( +kernel void flPatchDerWeightsFloat( const device float * outsPrev, const device float * delta, constant uint * pNbNeurons, @@ -432,7 +432,7 @@ kernel void flPatchDerWeights( deltaWeights[offsetWeights] = tmp; } -kernel void flPatchDerBiases( +kernel void flPatchDerBiasesFloat( const device float * delta, constant uint * pNbNeurons, constant uint * pNbBatch, @@ -472,7 +472,7 @@ kernel void flPatchDerBiases( deltaWeights[offsetWeights] = tmp; } -kernel void flPatchReduceWeights( +kernel void flPatchReduceWeightsFloat( const device float * deltaWeights, constant uint * pNbNeurons, constant uint * pNbChannelsPrev, diff --git a/Sources/GrAIdient/Metal/Kernel/FullyConnectedPatchHalf.metal b/Sources/GrAIdient/Metal/Kernel/FullyConnectedPatchHalf.metal new file mode 100644 index 00000000..4a6c3e36 --- /dev/null +++ b/Sources/GrAIdient/Metal/Kernel/FullyConnectedPatchHalf.metal @@ -0,0 +1,529 @@ +// +// FullyConnectedPatch.metal +// GrAIdient +// +// Created by Jean-François Reboud on 25/02/2023. 
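+//
+// Half-precision variants of the fully-connected patch kernels, which
+// flatten patch x patch blocks of the previous 2D layer into sequence
+// positions; dot products are accumulated in float.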
+// + +#include +using namespace metal; + +kernel void flPatchForwardHalf( + const device half * outsPrev, + const device half * weights, + const device half * biases, + constant uint * pNbNeurons, + constant uint * pNbChannelsPrev, + constant uint * pDimensionsPrev, + constant uint * pPatch, + constant uint * pNbBatch, + constant uint * pSequence, + device half * outs, + uint2 id [[ thread_position_in_grid ]]) +{ + uint nbNeurons; + uint nbChannelsPrev; + uint heightPrev, widthPrev; + uint patch; + uint nbBatch; + uint sequence; + + if (pNbNeurons && pNbChannelsPrev && pDimensionsPrev && pPatch && + pNbBatch && pSequence && + outsPrev && weights && biases && outs) + { + nbNeurons = *pNbNeurons; + nbChannelsPrev = *pNbChannelsPrev; + widthPrev = pDimensionsPrev[0]; + heightPrev = pDimensionsPrev[1]; + patch = *pPatch; + nbBatch = *pNbBatch; + sequence = *pSequence; + } + else + return ; + + uint weightWidth = nbChannelsPrev * patch * patch; + uint depth = id[0]; + uint elem = id[1] / sequence; + uint seq = id[1] % sequence; + + if (depth >= nbNeurons || elem >= nbBatch || seq >= sequence) + { + return ; + } + + uint nbSeqPerCol = widthPrev / patch; + uint seqI = seq / nbSeqPerCol; + uint seqJ = seq % nbSeqPerCol; + + uint iStart = seqI * patch; + uint jStart = seqJ * patch; + + float tmp = biases[depth]; + for (uint depthPrev=0; depthPrev= weightWidth || elem >= nbBatch || seq >= sequence) + { + return ; + } + + uint nbSeqPerCol = widthPrev / patch; + uint seqI = seq / nbSeqPerCol; + uint seqJ = seq % nbSeqPerCol; + + uint iStart = seqI * patch; + uint jStart = seqJ * patch; + + uint res = offsetWeight; + uint depthPrev = res / (patch * patch); + res -= depthPrev * patch * patch; + uint i = res / patch; + res -= i * patch; + uint j = res; + + float tmp = 0.0; + for (uint depth=0; depth= nbNeurons || offsetWeight >= weightWidth) + { + return ; + } + + uint nbSeqPerCol = widthPrev / patch; + + uint res = offsetWeight; + uint depthPrev = res / (patch * patch); + res -= depthPrev * patch * patch; + uint i = res / patch; + res -= i * patch; + uint j = res; + + float tmp = 0.0; + for (uint elem=0; elem= nbNeurons) + { + return ; + } + + float tmp = 0.0; + for (uint elem=0; elem= nbNeurons) + { + return ; + } + + half4 tmp = 0.0; + for (uint elem=0; elem= nbNeurons * nbBatch || + offsetWeight >= weightWidth) + { + return ; + } + + uint nbSeqPerCol = widthPrev / patch; + + uint res = offsetWeight; + uint depthPrev = res / (patch * patch); + res -= depthPrev * patch * patch; + uint i = res / patch; + res -= i * patch; + uint j = res; + + float tmp = 0.0; + for (uint seq=0; seq= nbNeurons || elem >= nbBatch) + { + return ; + } + + float tmp = 0.0; + for (uint seq=0; seq= nbNeurons || offsetWeight >= weightWidth) + { + return ; + } + + uint offsetWeights = offsetWeight + weightWidth * depth; + + float tmp = 0.0; + for (uint elem=0; elem using namespace metal; -kernel void flSeqForward( +kernel void flSeqForwardFloat( const device float * outsPrev, const device float * weights, const device float * biases, @@ -61,7 +61,7 @@ kernel void flSeqForward( outs[offset] = tmp; } -kernel void flSeq48Forward( +kernel void flSeq48ForwardFloat( const device float4 * outsPrev, const device float4 * weights, const device float * biases, @@ -123,7 +123,7 @@ kernel void flSeq48Forward( } } -kernel void flSeq4Forward( +kernel void flSeq4ForwardFloat( const device float4 * outsPrev, const device float4 * weights, const device float * biases, @@ -176,7 +176,7 @@ kernel void flSeq4Forward( outs[offset] = tmp[0] + tmp[1] 
+ tmp[2] + tmp[3] + biases[depth]; } -kernel void flSeqBackward( +kernel void flSeqBackwardFloat( const device float * delta, const device float * weights, constant uint * pNbNeurons, @@ -239,7 +239,7 @@ kernel void flSeqBackward( } } -kernel void flSeq48Backward( +kernel void flSeq48BackwardFloat( const device float * delta, const device float4 * weights, constant uint * pNbNeurons, @@ -315,7 +315,7 @@ kernel void flSeq48Backward( } } -kernel void flSeq4Backward( +kernel void flSeq4BackwardFloat( const device float * delta, const device float4 * weights, constant uint * pNbNeurons, @@ -378,7 +378,7 @@ kernel void flSeq4Backward( } } -kernel void flSeqBatchDerWeights( +kernel void flSeqBatchDerWeightsFloat( const device float * outsPrev, const device float * delta, constant uint * pNbNeurons, @@ -440,7 +440,7 @@ kernel void flSeqBatchDerWeights( } } -kernel void flSeqBatch4DerWeights( +kernel void flSeqBatch4DerWeightsFloat( const device float4 * outsPrev, const device float * delta, constant uint * pNbNeurons, @@ -502,7 +502,7 @@ kernel void flSeqBatch4DerWeights( } } -kernel void flSeqDerWeights( +kernel void flSeqDerWeightsFloat( const device float * outsPrev, const device float * delta, constant uint * pNbNeurons, @@ -556,7 +556,7 @@ kernel void flSeqDerWeights( deltaWeights[offsetWeights] = tmp; } -kernel void flSeqReduceWeights( +kernel void flSeqReduceWeightsFloat( const device float * deltaWeights, constant uint * pNbNeurons, constant uint * pNbNeuronsPrev, diff --git a/Sources/GrAIdient/Metal/Kernel/FullyConnectedSeqHalf.metal b/Sources/GrAIdient/Metal/Kernel/FullyConnectedSeqHalf.metal new file mode 100644 index 00000000..658d30de --- /dev/null +++ b/Sources/GrAIdient/Metal/Kernel/FullyConnectedSeqHalf.metal @@ -0,0 +1,609 @@ +// +// FullyConnectedSeq.metal +// GrAIdient +// +// Created by Jean-François Reboud on 08/03/2023. 
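+//
+// Half-precision variants of the sequence fully-connected kernels,
+// including the half4 vectorized paths that mirror the float4 versions.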
+// + +#include +using namespace metal; + +kernel void flSeqForwardHalf( + const device half * outsPrev, + const device half * weights, + const device half * biases, + constant uint * pNbNeurons, + constant uint * pNbNeuronsPrev, + constant uint * pNbBatch, + constant uint * pSequence, + device half * outs, + uint2 id [[ thread_position_in_grid ]]) +{ + uint nbNeurons; + uint nbNeuronsPrev; + uint nbBatch; + uint sequence; + + if (pNbNeurons && pNbNeuronsPrev && pNbBatch && pSequence && + outsPrev && weights && biases && outs) + { + nbNeurons = *pNbNeurons; + nbNeuronsPrev = *pNbNeuronsPrev; + nbBatch = *pNbBatch; + sequence = *pSequence; + } + else + return ; + + uint depth = id[0]; + uint elem = id[1] / sequence; + uint seq = id[1] % sequence; + + if (depth >= nbNeurons || elem >= nbBatch || seq >= sequence) + { + return ; + } + + float tmp = biases[depth]; + for (uint depthPrev=0; depthPrev= nbNeurons || elem * coeff >= nbBatch || seq >= sequence) + { + return ; + } + + half4 tmp[8] = {0}; + for (uint depthPrev=0; depthPrev= nbNeurons || elem >= nbBatch || seq >= sequence) + { + return ; + } + + half4 tmp = 0; + for (uint depthPrev=0; depthPrev= nbNeuronsPrev || elem >= nbBatch || seq >= sequence) + { + return ; + } + + float tmp = 0.0; + for (uint depth=0; depth= nbNeuronsPrev || + elem * coeff >= nbBatch || seq >= sequence) + { + return ; + } + + half4 tmp[8] = {0}; + for (uint depth=0; depth= nbNeuronsPrev || elem >= nbBatch || seq >= sequence) + { + return ; + } + + half4 tmp = 0.0; + for (uint depth=0; depth= nbNeurons || depthPrev >= nbNeuronsPrev) + { + return ; + } + + float tmp = 0.0; + for (uint elem=0; elem= nbNeurons || depthPrev * 4 >= nbNeuronsPrev) + { + return ; + } + + half4 tmp = 0.0; + for (uint elem=0; elem= nbNeurons * nbBatch || + depthPrev >= nbNeuronsPrev) + { + return ; + } + + float tmp = 0.0; + for (uint seq=0; seq= nbNeurons || depthPrev >= nbNeuronsPrev) + { + return ; + } + + uint offsetWeights = depthPrev + nbNeuronsPrev * depth; + + float tmp = 0.0; + for (uint elem=0; elem using namespace metal; -kernel void computeInstanceNormConvμ( +kernel void computeInstanceNormConvμFloat( const device float * tmps, constant uint * pNbChannels, constant uint * pNbBatch, @@ -53,7 +53,7 @@ kernel void computeInstanceNormConvμ( μ[depth + nbChannels * elem] = sum / nbElems; } -kernel void computeInstanceNormConvσ2( +kernel void computeInstanceNormConvσ2Float( const device float * tmps, const device float * μ, constant uint * pNbChannels, @@ -100,7 +100,7 @@ kernel void computeInstanceNormConvσ2( σ2[depth + nbChannels * elem] = sum / nbElems; } -kernel void forwardInstanceNormConv( +kernel void forwardInstanceNormConvFloat( const device float * β, const device float * Ɣ, const device float * μ, @@ -150,7 +150,7 @@ kernel void forwardInstanceNormConv( tmps[offset] = Ɣ[depth] * xhat + β[depth]; } -kernel void forwardAdaIN( +kernel void forwardAdaINFloat( const device float * outsPrev, const device float * styles, const device float * μ, @@ -200,7 +200,7 @@ kernel void forwardAdaIN( outs[offset] = styles[depth] * xhat + styles[depth + nbChannels]; } -kernel void backwardWeightsInstanceNormConv( +kernel void backwardWeightsInstanceNormConvFloat( const device float * delta, const device float * xHat, const device float * Ɣ, @@ -274,7 +274,7 @@ kernel void backwardWeightsInstanceNormConv( } } -kernel void backward2AdaIN( +kernel void backward2AdaINFloat( const device float * delta, const device float * xHat, const device float * outStyles, @@ -347,7 +347,7 @@ kernel void 
backward2AdaIN( } } -kernel void backwardInstanceNormConv( +kernel void backwardInstanceNormConvFloat( const device float * σ2, const device float * xHat, const device float * Ɣ, @@ -401,7 +401,7 @@ kernel void backwardInstanceNormConv( delta[offset] = mult * (tmp1 - tmp2 - tmp3); } -kernel void backward1AdaIN( +kernel void backward1AdaINFloat( const device float * delta, const device float * σ2, const device float * xHat, diff --git a/Sources/GrAIdient/Metal/Kernel/InstanceNormHalf.metal b/Sources/GrAIdient/Metal/Kernel/InstanceNormHalf.metal new file mode 100644 index 00000000..6a797f7d --- /dev/null +++ b/Sources/GrAIdient/Metal/Kernel/InstanceNormHalf.metal @@ -0,0 +1,467 @@ +// +// InstanceNorm.metal +// GrAIdient +// +// Created by Jean-François Reboud on 17/02/2022. +// + +#include +using namespace metal; + +kernel void computeInstanceNormConvμHalf( + const device half * tmps, + constant uint * pNbChannels, + constant uint * pNbBatch, + constant uint * pDimensions, + device half * μ, + uint2 id [[ thread_position_in_grid ]]) +{ + uint nbChannels; + uint nbBatch; + uint width; + uint height; + + if (pNbChannels && pNbBatch && pDimensions && tmps && μ) + { + nbChannels = *pNbChannels; + nbBatch = *pNbBatch; + width = pDimensions[0]; + height = pDimensions[1]; + } + else + return ; + + uint depth = id[0]; + uint elem = id[1]; + if (depth >= nbChannels || elem >= nbBatch) + { + return ; + } + + uint nbElems = width * height; + float sum = 0.0; + + for (uint x=0; x= nbChannels || elem >= nbBatch) + { + return ; + } + + uint nbElems = width * height; + float sum = 0.0; + + for (uint x=0; x= height * nbBatch || + j * depth >= width * nbChannels) + { + return ; + } + + uint offsetStart = (depth + nbChannels * elem) * height; + uint offset = j + (offsetStart + i) * width; + + float tmp1 = tmps[offset] - μ[depth + nbChannels * elem]; + float tmp2 = sqrt(σ2[depth + nbChannels * elem] + Ɛ); + float xhat = tmp1 / tmp2; + xHat[offset] = xhat; + tmps[offset] = Ɣ[depth] * xhat + β[depth]; +} + +kernel void forwardAdaINHalf( + const device half * outsPrev, + const device half * styles, + const device half * μ, + const device half * σ2, + constant uint * pNbChannels, + constant uint * pNbBatch, + constant uint * pDimensions, + device half * outs, + device half * xHat, + uint2 id [[ thread_position_in_grid ]]) +{ + uint nbChannels; + uint nbBatch; + uint width; + uint height; + float Ɛ = 1e-5; + + if (pNbChannels && pNbBatch && pDimensions && outsPrev && styles && + outs && xHat && μ && σ2) + { + nbChannels = *pNbChannels; + nbBatch = *pNbBatch; + width = pDimensions[0]; + height = pDimensions[1]; + } + else + return ; + + uint depth = id[0] / width; + uint elem = id[1] / height; + uint i = id[1] % height; + uint j = id[0] % width; + + if (i * elem >= height * nbBatch || + j * depth >= width * nbChannels) + { + return ; + } + + uint offsetStart = (depth + nbChannels * elem) * height; + uint offset = j + (offsetStart + i) * width; + + float tmp1 = outsPrev[offset] - μ[depth + nbChannels * elem]; + float tmp2 = sqrt(σ2[depth + nbChannels * elem] + Ɛ); + float xhat = tmp1 / tmp2; + xHat[offset] = xhat; + outs[offset] = styles[depth] * xhat + styles[depth + nbChannels]; +} + +kernel void backwardWeightsInstanceNormConvHalf( + const device half * delta, + const device half * xHat, + const device half * Ɣ, + constant uint * pNbChannels, + constant uint * pNbBatch, + constant uint * pDimensions, + constant uint * pAccumulate, + device half * sum1, + device half * sum2, + device half * dƔ, + device half * dβ, 
+ uint id [[ thread_position_in_grid ]]) +{ + uint nbChannels; + uint nbBatch; + uint width; + uint height; + uint accumulate; + + if (pNbChannels && pNbBatch && pDimensions && pAccumulate && + delta && xHat && Ɣ && + sum1 && sum2 && dƔ && dβ) + { + nbChannels = *pNbChannels; + nbBatch = *pNbBatch; + width = pDimensions[0]; + height = pDimensions[1]; + accumulate = *pAccumulate; + } + else + return ; + + uint depth = id; + if (depth >= nbChannels) + { + return ; + } + + float tmp3 = 0.0, tmp4 = 0.0; + for (uint elem=0; elem= nbChannels || elem >= nbBatch) + { + return ; + } + + float tmp1 = 0.0, tmp2 = 0.0; + float tmp3 = 0.0, tmp4 = 0.0; + + for (uint x=0; x= height * nbBatch || + j * depth >= width * nbChannels) + { + return ; + } + + uint offsetStart = (depth + nbChannels * elem) * height; + uint offset = j + (offsetStart + i) * width; + + float mult = + 1.0 / ((float)nbElems * sqrt(σ2[depth + nbChannels * elem] + Ɛ)); + float dxHat = Ɣ[depth] * delta[offset]; + float tmp1 = nbElems * dxHat; + float tmp2 = sum1[depth + nbChannels * elem]; + float tmp3 = xHat[offset] * sum2[depth + nbChannels * elem]; + + delta[offset] = mult * (tmp1 - tmp2 - tmp3); +} + +kernel void backward1AdaINHalf( + const device half * delta, + const device half * σ2, + const device half * xHat, + const device half * styles, + const device half * sum1, + const device half * sum2, + constant uint * pNbChannels, + constant uint * pNbBatch, + constant uint * pDimensions, + constant uint * pDirty, + device half * deltaPrev, + uint2 id [[ thread_position_in_grid ]]) +{ + uint nbChannels; + uint nbBatch; + uint width; + uint height; + uint dirty; + float Ɛ = 1e-5; + + if (pNbChannels && pNbBatch && pDimensions && pDirty && + delta && σ2 && xHat && styles && sum1 && sum2 && deltaPrev) + { + nbChannels = *pNbChannels; + nbBatch = *pNbBatch; + width = pDimensions[0]; + height = pDimensions[1]; + dirty = *pDirty; + } + else + return ; + + uint depth = id[0] / width; + uint elem = id[1] / height; + uint i = id[1] % height; + uint j = id[0] % width; + uint nbElems = width * height; + + if (i * elem >= height * nbBatch || + j * depth >= width * nbChannels) + { + return ; + } + + uint offsetStart = (depth + nbChannels * elem) * height; + uint offset = j + (offsetStart + i) * width; + + float mult = + 1.0 / ((float)nbElems * sqrt(σ2[depth + nbChannels * elem] + Ɛ)); + float dxHat = styles[depth] * delta[offset]; + float tmp1 = nbElems * dxHat; + float tmp2 = sum1[depth + nbChannels * elem]; + float tmp3 = xHat[offset] * sum2[depth + nbChannels * elem]; + + if (dirty) + { + deltaPrev[offset] = mult * (tmp1 - tmp2 - tmp3); + } + else + { + deltaPrev[offset] += mult * (tmp1 - tmp2 - tmp3); + } +} diff --git a/Sources/GrAIdient/Metal/Kernel/Layer1D.metal b/Sources/GrAIdient/Metal/Kernel/Layer1DFloat.metal similarity index 96% rename from Sources/GrAIdient/Metal/Kernel/Layer1D.metal rename to Sources/GrAIdient/Metal/Kernel/Layer1DFloat.metal index e5137942..bac32006 100644 --- a/Sources/GrAIdient/Metal/Kernel/Layer1D.metal +++ b/Sources/GrAIdient/Metal/Kernel/Layer1DFloat.metal @@ -8,7 +8,7 @@ #include using namespace metal; -kernel void MSE1DLoss( +kernel void MSE1DLossFloat( const device float * outs, const device float * groundTruth, constant uint * pNbNeurons, @@ -48,7 +48,7 @@ kernel void MSE1DLoss( losses[elem] = tmp; } -kernel void MSE1DLossDerivative( +kernel void MSE1DLossDerivativeFloat( const device float * outs, const device float * groundTruth, constant uint * pNbNeurons, @@ -98,7 +98,7 @@ kernel void MSE1DLossDerivative( 
} } -kernel void linearErrorLoss( +kernel void linearErrorLossFloat( const device float * outs, const device float * groundTruth, constant uint * pNbNeurons, @@ -138,7 +138,7 @@ kernel void linearErrorLoss( losses[elem] = tmp; } -kernel void linearErrorLossDerivative( +kernel void linearErrorLossDerivativeFloat( const device float * outs, constant uint * pNbNeurons, constant float * pCoeff, @@ -182,7 +182,7 @@ kernel void linearErrorLossDerivative( } } -kernel void selectNeurons1DForward( +kernel void selectNeurons1DForwardFloat( const device float * outsPrev, constant uint * pNbNeurons, constant uint * pNbNeuronsPrev, @@ -219,7 +219,7 @@ kernel void selectNeurons1DForward( outs[offset] = pCoeffs[depth] * outsPrev[offsetPrev]; } -kernel void selectNeurons1DBackward( +kernel void selectNeurons1DBackwardFloat( const device float * delta, constant uint * pNbNeurons, constant uint * pNbNeuronsPrev, @@ -256,7 +256,7 @@ kernel void selectNeurons1DBackward( deltaPrev[offsetPrev] += pCoeffs[depth] * delta[offset]; } -kernel void concat1DForward( +kernel void concat1DForwardFloat( const device float * outsPrev, constant uint * pGlobalOffset, constant uint * pNbNeurons, @@ -295,7 +295,7 @@ kernel void concat1DForward( outs[offset] = outsPrev[offsetPrev]; } -kernel void concat1DBackward( +kernel void concat1DBackwardFloat( const device float * delta, constant uint * pGlobalOffset, constant uint * pNbNeurons, @@ -344,7 +344,7 @@ kernel void concat1DBackward( } } -kernel void softmax1DForward( +kernel void softmax1DForwardFloat( const device float * outsPrev, constant uint * pNbHeads, constant uint * pNbNeurons, @@ -401,7 +401,7 @@ kernel void softmax1DForward( outs[offset] = exp(outPrev - cMax) / sum1; } -kernel void softmax1DBackward( +kernel void softmax1DBackwardFloat( const device float * outs, const device float * delta, constant uint * pNbHeads, @@ -461,7 +461,7 @@ kernel void softmax1DBackward( } } -kernel void dotProduct1DForward( +kernel void dotProduct1DForwardFloat( const device float * outsPrev1, const device float * outsPrev2, constant int * pSize, @@ -508,7 +508,7 @@ kernel void dotProduct1DForward( outs[offset] = sum; } -kernel void dotProduct1DBackward( +kernel void dotProduct1DBackwardFloat( const device float * outsPrev, const device float * delta, constant int * pSize, @@ -563,7 +563,7 @@ kernel void dotProduct1DBackward( } } -kernel void constant1DForward( +kernel void constant1DForwardFloat( const device float * weights, constant uint * pNbNeurons, constant uint * pNbBatch, @@ -593,7 +593,7 @@ kernel void constant1DForward( outs[offset] = weights[depth]; } -kernel void BCE1DLoss( +kernel void BCE1DLossFloat( const device float * outs, const device float * groundTruth, constant uint * pNbNeurons, @@ -634,7 +634,7 @@ kernel void BCE1DLoss( losses[elem] = tmp; } -kernel void BCE1DLossDerivative( +kernel void BCE1DLossDerivativeFloat( const device float * outs, const device float * groundTruth, constant uint * pNbNeurons, @@ -693,7 +693,7 @@ kernel void BCE1DLossDerivative( } } -kernel void BCESigmoid1DLoss( +kernel void BCESigmoid1DLossFloat( const device float * outs, const device float * groundTruth, constant uint * pNbNeurons, @@ -744,7 +744,7 @@ kernel void BCESigmoid1DLoss( losses[elem] = tmp; } -kernel void BCESigmoid1DLossDerivative( +kernel void BCESigmoid1DLossDerivativeFloat( const device float * outs, const device float * groundTruth, constant uint * pNbNeurons, @@ -803,7 +803,7 @@ kernel void BCESigmoid1DLossDerivative( } } -kernel void dropout1DForward( +kernel void 
dropout1DForwardFloat(
     const device float * outsPrev,
     const device bool * dropout,
     constant uint * pNbNeurons,
@@ -852,7 +852,7 @@ kernel void dropout1DForward(
     }
 }
 
-kernel void dropout1DBackward(
+kernel void dropout1DBackwardFloat(
     const device float * delta,
     const device bool * dropout,
     constant uint * pNbNeurons,
diff --git a/Sources/GrAIdient/Metal/Kernel/Layer1DHalf.metal b/Sources/GrAIdient/Metal/Kernel/Layer1DHalf.metal
new file mode 100644
index 00000000..ce473260
--- /dev/null
+++ b/Sources/GrAIdient/Metal/Kernel/Layer1DHalf.metal
@@ -0,0 +1,915 @@
+//
+// Layer1D.metal
+// GrAIdient
+//
+// Created by Jean-François Reboud on 14/10/2022.
+//
+
+#include <metal_stdlib>
+using namespace metal;
+
+kernel void MSE1DLossHalf(
+    const device half * outs,
+    const device half * groundTruth,
+    constant uint * pNbNeurons,
+    constant uint * pNbBatch,
+    device half * losses,
+    uint id [[ thread_position_in_grid ]])
+{
+    uint nbNeurons;
+    uint nbBatch;
+
+    if (pNbNeurons && pNbBatch && outs && groundTruth && losses)
+    {
+        nbNeurons = *pNbNeurons;
+        nbBatch = *pNbBatch;
+    }
+    else
+        return ;
+
+    uint elem = id;
+    if (elem >= nbBatch)
+    {
+        return ;
+    }
+
+    float tmp = 0.0;
+    for (uint depth=0; depth<nbNeurons; depth++)
+    {
+        uint offset = depth + nbNeurons * elem;
+
+        float gt = groundTruth[offset];
+        float out = outs[offset];
+        float diff = out - gt;
+
+        tmp += diff * diff;
+    }
+    losses[elem] = tmp;
+}
+
+kernel void MSE1DLossDerivativeHalf(
+    const device half * outs,
+    const device half * groundTruth,
+    constant uint * pNbNeurons,
+    constant float * pCoeff,
+    constant uint * pNbBatch,
+    constant uint * pDirty,
+    device half * deltaPrev,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint nbNeurons;
+    float coeff;
+    uint nbBatch;
+    uint dirty;
+
+    if (pNbNeurons && pNbBatch && pCoeff && pDirty &&
+        outs && groundTruth && deltaPrev)
+    {
+        nbNeurons = *pNbNeurons;
+        coeff = *pCoeff;
+        nbBatch = *pNbBatch;
+        dirty = *pDirty;
+    }
+    else
+        return ;
+
+    uint depth = id[0];
+    uint elem = id[1];
+
+    if (depth >= nbNeurons || elem >= nbBatch)
+    {
+        return ;
+    }
+
+    uint offset = depth + nbNeurons * elem;
+
+    float gt = groundTruth[offset];
+    float out = outs[offset];
+    float diff = out - gt;
+
+    if (dirty)
+    {
+        deltaPrev[offset] = 2 * coeff * diff / float(nbNeurons * nbBatch);
+    }
+    else
+    {
+        deltaPrev[offset] += 2 * coeff * diff / float(nbNeurons * nbBatch);
+    }
+}
+
+kernel void linearErrorLossHalf(
+    const device half * outs,
+    const device half * groundTruth,
+    constant uint * pNbNeurons,
+    constant uint * pNbBatch,
+    device half * losses,
+    uint id [[ thread_position_in_grid ]])
+{
+    uint nbNeurons;
+    uint nbBatch;
+
+    if (pNbNeurons && pNbBatch && outs && groundTruth && losses)
+    {
+        nbNeurons = *pNbNeurons;
+        nbBatch = *pNbBatch;
+    }
+    else
+        return ;
+
+    uint elem = id;
+    if (elem >= nbBatch)
+    {
+        return ;
+    }
+
+    float tmp = 0.0;
+    for (uint depth=0; depth<nbNeurons; depth++)
+    {
+        uint offset = depth + nbNeurons * elem;
+
+        float gt = groundTruth[offset];
+        float out = outs[offset];
+        float diff = out - gt;
+
+        tmp += diff;
+    }
+    losses[elem] = tmp;
+}
+
+kernel void linearErrorLossDerivativeHalf(
+    const device half * outs,
+    constant uint * pNbNeurons,
+    constant float * pCoeff,
+    constant uint * pNbBatch,
+    constant uint * pDirty,
+    device half * deltaPrev,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint nbNeurons;
+    float coeff;
+    uint nbBatch;
+    uint dirty;
+
+    if (pNbNeurons && pNbBatch && pCoeff && pDirty &&
+        outs && deltaPrev)
+    {
+        nbNeurons = *pNbNeurons;
+        coeff = *pCoeff;
+        nbBatch = *pNbBatch;
+        dirty = *pDirty;
+    }
+    else
+        return ;
+
+    uint depth = id[0];
+    uint elem = id[1];
+
+    if (depth >= nbNeurons || elem >= nbBatch)
+    {
+        return ;
+    }
+
+    uint offset = depth + nbNeurons * elem;
+
+    if (dirty)
+    {
+        deltaPrev[offset] = coeff / float(nbNeurons * nbBatch);
+    }
+    else
+    {
+        deltaPrev[offset] += coeff / float(nbNeurons * nbBatch);
+    }
+}
+
+kernel void selectNeurons1DForwardHalf(
+    const device half * outsPrev,
+    constant uint * pNbNeurons,
+    constant uint * pNbNeuronsPrev,
+    constant uint * pNeurons,
+    constant float * pCoeffs,
+    constant uint * pNbBatch,
+    device half * outs,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint nbNeurons;
+    uint nbNeuronsPrev;
+    uint nbBatch;
+
+    if (pNbNeurons && pNbNeuronsPrev && pNeurons && pCoeffs && pNbBatch &&
+        outsPrev && outs)
+    {
+        nbNeurons = *pNbNeurons;
+        nbNeuronsPrev = *pNbNeuronsPrev;
+        nbBatch = *pNbBatch;
+    }
+    else
+        return ;
+
+    uint depth = id[0];
+    uint elem = id[1];
+
+    if (depth >= nbNeurons || elem >= nbBatch)
+    {
+        return ;
+    }
+
+    uint offset = depth + nbNeurons * elem;
+    uint offsetPrev = pNeurons[depth] + nbNeuronsPrev * elem;
+    outs[offset] = pCoeffs[depth] * outsPrev[offsetPrev];
+}
+
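+// Note: every Half kernel in this file mirrors its Float counterpart line
+// for line. A possible alternative (a sketch only, not what this commit
+// does) would be a single templated body with explicitly instantiated,
+// host-visible names, which MSL supports:
+//
+//     template <typename T>
+//     kernel void MSE1DLoss(
+//         const device T * outs, /* ... same parameter list ... */ );
+//
+//     template [[ host_name("MSE1DLossFloat") ]]
+//     kernel void MSE1DLoss<float>(const device float * outs /* ... */ );
+//     template [[ host_name("MSE1DLossHalf") ]]
+//     kernel void MSE1DLoss<half>(const device half * outs /* ... */ );
+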
+kernel void selectNeurons1DBackwardHalf(
+    const device half * delta,
+    constant uint * pNbNeurons,
+    constant uint * pNbNeuronsPrev,
+    constant uint * pNeurons,
+    constant float * pCoeffs,
+    constant uint * pNbBatch,
+    device half * deltaPrev,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint nbNeurons;
+    uint nbNeuronsPrev;
+    uint nbBatch;
+
+    if (pNbNeurons && pNbNeuronsPrev && pNeurons && pCoeffs && pNbBatch &&
+        deltaPrev && delta)
+    {
+        nbNeurons = *pNbNeurons;
+        nbNeuronsPrev = *pNbNeuronsPrev;
+        nbBatch = *pNbBatch;
+    }
+    else
+        return ;
+
+    uint depth = id[0];
+    uint elem = id[1];
+
+    if (depth >= nbNeurons || elem >= nbBatch)
+    {
+        return ;
+    }
+
+    uint offset = depth + nbNeurons * elem;
+    uint offsetPrev = pNeurons[depth] + nbNeuronsPrev * elem;
+    deltaPrev[offsetPrev] += pCoeffs[depth] * delta[offset];
+}
+
+kernel void concat1DForwardHalf(
+    const device half * outsPrev,
+    constant uint * pGlobalOffset,
+    constant uint * pNbNeurons,
+    constant uint * pNbNeuronsPrev,
+    constant uint * pNbBatch,
+    device half * outs,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint nbNeurons;
+    uint nbNeuronsPrev;
+    uint nbBatch;
+    uint globalOffset;
+
+    if (pGlobalOffset && pNbNeurons && pNbNeuronsPrev && pNbBatch &&
+        outsPrev && outs)
+    {
+        nbNeurons = *pNbNeurons;
+        nbNeuronsPrev = *pNbNeuronsPrev;
+        nbBatch = *pNbBatch;
+        globalOffset = *pGlobalOffset;
+    }
+    else
+        return ;
+
+    uint depth = id[0];
+    uint elem = id[1];
+
+    if (depth >= nbNeuronsPrev || elem >= nbBatch)
+    {
+        return ;
+    }
+
+    uint offsetPrev = depth + nbNeuronsPrev * elem;
+    uint offset = globalOffset+depth + nbNeurons * elem;
+
+    outs[offset] = outsPrev[offsetPrev];
+}
+
+kernel void concat1DBackwardHalf(
+    const device half * delta,
+    constant uint * pGlobalOffset,
+    constant uint * pNbNeurons,
+    constant uint * pNbNeuronsPrev,
+    constant uint * pNbBatch,
+    constant uint * pDirty,
+    device half * deltaPrev,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint nbNeurons;
+    uint nbNeuronsPrev;
+    uint nbBatch;
+    uint globalOffset;
+    uint dirty;
+
+    if (pGlobalOffset && pNbNeurons && pNbNeuronsPrev && pNbBatch && pDirty &&
+        deltaPrev && delta)
+    {
+        nbNeurons = *pNbNeurons;
+        nbNeuronsPrev = *pNbNeuronsPrev;
+        nbBatch = *pNbBatch;
+        globalOffset = *pGlobalOffset;
+        dirty = *pDirty;
+    }
+    else
+        return ;
+
+    uint depth = id[0];
+    uint elem = id[1];
+
+    if (depth >= nbNeuronsPrev || elem >= nbBatch)
+    {
+        return ;
+    }
+
+    uint offsetPrev = depth + nbNeuronsPrev * elem;
+    uint offset = globalOffset+depth + nbNeurons * elem;
+
+    if (dirty)
+    {
+        deltaPrev[offsetPrev] = delta[offset];
+    }
+    else
+    {
+        deltaPrev[offsetPrev] += delta[offset];
+    }
+}
+
+kernel void softmax1DForwardHalf(
+    const device half * outsPrev,
+    constant uint * pNbHeads,
+    constant uint * pNbNeurons,
+    constant uint * pNbBatch,
+    device half * outs,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint nbHeads;
+    uint size;
+    uint nbNeurons;
+    uint nbBatch;
+
+    if (pNbHeads && pNbNeurons && pNbBatch && outsPrev && outs)
+    {
+        nbHeads = *pNbHeads;
+        nbNeurons = *pNbNeurons;
+        nbBatch = *pNbBatch;
+        size = nbNeurons / nbHeads;
+    }
+    else
+        return ;
+
+    uint depth = id[0];
+    uint elem = id[1];
+    uint head = depth / size;
+
+    if (depth >= nbNeurons || elem >= nbBatch)
+    {
+        return ;
+    }
+
+    float cMax = outsPrev[0+head*size + nbNeurons * elem];
+    for (uint j=0; j<size; j++)
+    {
+        uint offset1 = j+head*size + nbNeurons * elem;
+
+        float outPrev = outsPrev[offset1];
+        if (outPrev > cMax)
+        {
+            cMax = outPrev;
+        }
+    }
+
+    float sum1 = 0.0;
+    for (uint j=0; j<size; j++)
+    {
+        uint offset1 = j+head*size + nbNeurons * elem;
+
+        float outPrev = outsPrev[offset1];
+        sum1 += exp(outPrev - cMax);
+    }
+
+    uint offset = depth + nbNeurons * elem;
+    float outPrev = outsPrev[offset];
+
+    outs[offset] = exp(outPrev - cMax) / sum1;
+}
+
+kernel void softmax1DBackwardHalf(
+    const device half * outs,
+    const device half * delta,
+    constant uint * pNbHeads,
+    constant uint * pNbNeurons,
+    constant uint * pNbBatch,
+    constant uint * pDirty,
+    device half * deltaPrev,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint nbHeads;
+    uint size;
+    uint nbNeurons;
+    uint nbBatch;
+    uint dirty;
+
+    if (pNbHeads && pNbNeurons && pNbBatch && pDirty &&
+        outs && delta && deltaPrev)
+    {
+        nbHeads = *pNbHeads;
+        nbNeurons = *pNbNeurons;
+        nbBatch = *pNbBatch;
+        dirty = *pDirty;
+        size = nbNeurons / nbHeads;
+    }
+    else
+        return ;
+
+    uint depth = id[0];
+    uint elem = id[1];
+    uint head = depth / size;
+
+    if (depth >= nbNeurons || elem >= nbBatch)
+    {
+        return ;
+    }
+
+    uint offset = depth + nbNeurons * elem;
+    float outCur = outs[offset];
+    float deltaCur = delta[offset];
+
+    float sum1 = 0.0;
+    for (uint j=0; j<size; j++)
+    {
+        uint offset1 = j+head*size + nbNeurons * elem;
+
+        float outCur1 = outs[offset1];
+        float deltaCur1 = delta[offset1];
+        sum1 += outCur1 * deltaCur1;
+    }
+
+    if (dirty)
+    {
+        deltaPrev[offset] = outCur * (deltaCur - sum1);
+    }
+    else
+    {
+        deltaPrev[offset] += outCur * (deltaCur - sum1);
+    }
+}
+
+kernel void dotProduct1DForwardHalf(
+    const device half * outsPrev1,
+    const device half * outsPrev2,
+    constant int * pSize,
+    constant uint * pNbNeurons,
+    constant uint * pNbBatch,
+    device half * outs,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint size;
+    uint nbNeurons;
+    uint nbBatch;
+
+    if (pSize && pNbNeurons && pNbBatch &&
+        outsPrev1 && outsPrev2 && outs)
+    {
+        size = *pSize;
+        nbNeurons = *pNbNeurons;
+        nbBatch = *pNbBatch;
+    }
+    else
+        return ;
+
+    uint depth = id[0];
+    uint elem = id[1];
+
+    if (depth >= nbNeurons || elem >= nbBatch)
+    {
+        return ;
+    }
+
+    float sum = 0.0;
+    for (uint j=0; j<size; j++)
+    {
+        uint offsetPrev = j + depth * size + nbNeurons * size * elem;
+
+        float out1 = outsPrev1[offsetPrev];
+        float out2 = outsPrev2[offsetPrev];
+        sum += out1 * out2;
+    }
+
+    uint offset = depth + nbNeurons * elem;
+    outs[offset] = sum;
+}
+
+kernel void dotProduct1DBackwardHalf(
+    const device half * outsPrev,
+    const device half * delta,
+    constant int * pSize,
+    constant uint * pNbNeurons,
+    constant uint * pNbBatch,
+    constant uint * pDirty,
+    device half * deltaPrev,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint size;
+    uint nbNeurons;
+    uint nbBatch;
+    uint dirty;
+
+    if (pSize && pNbNeurons && pNbBatch && pDirty &&
+        outsPrev && delta && deltaPrev)
+    {
+        size = *pSize;
+        nbNeurons = *pNbNeurons;
+        nbBatch = *pNbBatch;
+        dirty = *pDirty;
+    }
+    else
+        return ;
+
+    uint depth = id[0];
+    uint elem = id[1];
+
+    if (depth >= nbNeurons || elem >= nbBatch)
+    {
+        return ;
+    }
+
+    for (uint j=0; j<size; j++)
+    {
+        uint offsetPrev = j + depth * size + nbNeurons * size * elem;
+        uint offset = depth + nbNeurons * elem;
+
+        if (dirty)
+        {
+            deltaPrev[offsetPrev] = outsPrev[offsetPrev] * delta[offset];
+        }
+        else
+        {
+            deltaPrev[offsetPrev] += outsPrev[offsetPrev] * delta[offset];
+        }
+    }
+}
+
+kernel void constant1DForwardHalf(
+    const device half * weights,
+    constant uint * pNbNeurons,
+    constant uint * pNbBatch,
+    device half * outs,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint nbNeurons;
+    uint nbBatch;
+
+    if (pNbNeurons && pNbBatch && weights && outs)
+    {
+        nbNeurons = *pNbNeurons;
+        nbBatch = *pNbBatch;
+    }
+    else
+        return ;
+
+    uint depth = id[0];
+    uint elem = id[1];
+
+    if (depth >= nbNeurons || elem >= nbBatch)
+    {
+        return ;
+    }
+
+    uint offset = depth + nbNeurons * elem;
+    outs[offset] = weights[depth];
+}
+
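+// Precision note: buffers in this file are stored as half, while arithmetic
+// runs in float locals (loads widen, stores narrow). For the BCE kernels
+// below this matters: out is assumed to lie strictly in (0, 1), and
+// evaluating -1 / out or 1 / (1 - out) in half would overflow once the
+// magnitude passed half's maximum finite value of 65504.
+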
+kernel void BCE1DLossHalf(
+    const device half * outs,
+    const device half * groundTruth,
+    constant uint * pNbNeurons,
+    constant uint * pNbBatch,
+    device half * losses,
+    uint id [[ thread_position_in_grid ]])
+{
+    uint nbNeurons;
+    uint nbBatch;
+
+    if (pNbNeurons && pNbBatch && outs && groundTruth && losses)
+    {
+        nbNeurons = *pNbNeurons;
+        nbBatch = *pNbBatch;
+    }
+    else
+        return ;
+
+    uint elem = id;
+    if (elem >= nbBatch)
+    {
+        return ;
+    }
+
+    float tmp = 0.0;
+    for (uint depth=0; depth<nbNeurons; depth++)
+    {
+        uint offset = depth + nbNeurons * elem;
+
+        float gt = groundTruth[offset];
+        float out = outs[offset];
+        float tmp1 = log(out);
+        float tmp2 = log(1 - out);
+
+        tmp -= (gt * tmp1 + (1 - gt) * tmp2);
+    }
+    losses[elem] = tmp;
+}
+
+kernel void BCE1DLossDerivativeHalf(
+    const device half * outs,
+    const device half * groundTruth,
+    constant uint * pNbNeurons,
+    constant float * pCoeff,
+    constant uint * pNbBatch,
+    constant uint * pDirty,
+    device half * deltaPrev,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint nbNeurons;
+    float coeff;
+    uint nbBatch;
+    uint dirty;
+
+    if (pNbNeurons && pNbBatch && pCoeff && pDirty &&
+        outs && groundTruth && deltaPrev)
+    {
+        nbNeurons = *pNbNeurons;
+        coeff = *pCoeff;
+        nbBatch = *pNbBatch;
+        dirty = *pDirty;
+    }
+    else
+        return ;
+
+    uint depth = id[0];
+    uint elem = id[1];
+
+    if (depth >= nbNeurons || elem >= nbBatch)
+    {
+        return ;
+    }
+
+    uint offset = depth + nbNeurons * elem;
+
+    float gt = groundTruth[offset];
+    float out = outs[offset];
+    float derivative = 0.0;
+
+    if (gt == 1.0)
+    {
+        derivative = -1 / out;
+    }
+    else if (gt == 0.0)
+    {
+        derivative = 1 / (1 - out);
+    }
+
+    if (dirty)
+    {
+        deltaPrev[offset] = coeff * derivative / float(nbNeurons * nbBatch);
+    }
+    else
+    {
+        deltaPrev[offset] += coeff * derivative / float(nbNeurons * nbBatch);
+    }
+}
+
+kernel void BCESigmoid1DLossHalf(
+    const device half * outs,
+    const device half * groundTruth,
+    constant uint * pNbNeurons,
+    constant uint * pNbBatch,
+    device half * losses,
+    uint id [[ thread_position_in_grid ]])
+{
+    uint nbNeurons;
+    uint nbBatch;
+
+    if (pNbNeurons && pNbBatch && outs && groundTruth && losses)
+    {
+        nbNeurons = *pNbNeurons;
+        nbBatch = *pNbBatch;
+    }
+    else
+        return ;
+
+    uint elem = id;
+    if (elem >= nbBatch)
+    {
+        return ;
+    }
+
+    float tmp = 0.0;
+    for (uint depth=0; depth<nbNeurons; depth++)
+    {
+        uint offset = depth + nbNeurons * elem;
+
+        float gt = groundTruth[offset];
+        float out = outs[offset];
+        float value;
+
+        if (out > 0)
+        {
+            value = (1 - gt) * out;
+            value += log(1 + exp(-out));
+        }
+        else
+        {
+            value = -out * gt;
+            value += log(exp(out) + 1);
+        }
+
+        tmp += value;
+    }
+
+    losses[elem] = tmp;
+}
+
+kernel void BCESigmoid1DLossDerivativeHalf(
+    const device half * outs,
+    const device half * groundTruth,
+    constant uint * pNbNeurons,
+    constant float * pCoeff,
+    constant uint * pNbBatch,
+    constant uint * pDirty,
+    device half * deltaPrev,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint nbNeurons;
+    float coeff;
+    uint nbBatch;
+    uint dirty;
+
+    if (pNbNeurons && pNbBatch && pCoeff && pDirty &&
+        outs && groundTruth && deltaPrev)
+    {
+        nbNeurons = *pNbNeurons;
+        coeff = *pCoeff;
+        nbBatch = *pNbBatch;
+        dirty = *pDirty;
+    }
+    else
+        return ;
+
+    uint depth = id[0];
+    uint elem = id[1];
+
+    if (depth >= nbNeurons || elem >= nbBatch)
+    {
+        return ;
+    }
+
+    uint offset = depth + nbNeurons * elem;
+
+    float gt = groundTruth[offset];
+    float out = outs[offset];
+    float value;
+
+    if (out >= 0)
+    {
+        value = 1.0 / (1.0 + exp(-out));
+    }
+    else
+    {
+        value = exp(out) / (1.0 + exp(out));
+    }
+
+    if (dirty)
+    {
+        deltaPrev[offset] = coeff * (value - gt) / float(nbNeurons * nbBatch);
+    }
+    else
+    {
+        deltaPrev[offset] += coeff * (value - gt) / float(nbNeurons * nbBatch);
+    }
+}
+
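+// Inverted dropout: when applyDropout is set, kept activations are scaled by
+// 1 / (1 - coeff), coeff being the drop probability, so that the expected
+// activation matches inference, where applyDropout is false and values pass
+// through unchanged.
+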
+kernel void dropout1DForwardHalf(
+    const device half * outsPrev,
+    const device bool * dropout,
+    constant uint * pNbNeurons,
+    constant uint * pNbBatch,
+    constant bool * pApplyDropout,
+    constant float * pCoeff,
+    device half * outs,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint nbNeurons;
+    uint nbBatch;
+    bool applyDropout;
+    float coeff;
+
+    if (pNbNeurons && pNbBatch && pApplyDropout && pCoeff &&
+        dropout && outsPrev && outs)
+    {
+        nbNeurons = *pNbNeurons;
+        nbBatch = *pNbBatch;
+        applyDropout = *pApplyDropout;
+        coeff = *pCoeff;
+    }
+    else
+        return ;
+
+    uint depth = id[0];
+    uint elem = id[1];
+
+    if (depth >= nbNeurons || elem >= nbBatch)
+    {
+        return ;
+    }
+
+    uint offset = depth + nbNeurons * elem;
+    if (applyDropout && !dropout[offset])
+    {
+        outs[offset] = 1.0 / (1.0 - coeff) * outsPrev[offset];
+    }
+    else if (applyDropout)
+    {
+        outs[offset] = 0.0;
+    }
+    else
+    {
+        outs[offset] = outsPrev[offset];
+    }
+}
+
+kernel void dropout1DBackwardHalf(
+    const device half * delta,
+    const device bool * dropout,
+    constant uint * pNbNeurons,
+    constant uint * pNbBatch,
+    constant bool * pApplyDropout,
+    constant float * pCoeff,
+    constant uint * pDirty,
+    device half * deltaPrev,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint nbNeurons;
+    uint nbBatch;
+    bool applyDropout;
+    float coeff;
+    uint dirty;
+
+    if (pNbNeurons && pNbBatch && pApplyDropout && pCoeff &&
+        dropout && delta && deltaPrev)
+    {
+        nbNeurons = *pNbNeurons;
+        nbBatch = *pNbBatch;
+        applyDropout = *pApplyDropout;
+        coeff = *pCoeff;
+        dirty = *pDirty;
+    }
+    else
+        return ;
+
+    uint depth = id[0];
+    uint elem = id[1];
+
+    if (depth >= nbNeurons || elem >= nbBatch)
+    {
+        return ;
+    }
+
+    float newValue = 0.0;
+    uint offset = depth + nbNeurons * elem;
+    if (applyDropout && !dropout[offset])
+    {
+        newValue = 1.0 / (1.0 - coeff) * delta[offset];
+    }
+    else if (applyDropout)
+    {
+        newValue = 0.0;
+    }
+    else
+    {
+        newValue = delta[offset];
+    }
+
+    if (dirty)
+    {
+        deltaPrev[offset] = newValue;
+    }
+    else
+    {
+        deltaPrev[offset] += newValue;
+    }
+}
diff --git a/Sources/GrAIdient/Metal/Kernel/Layer2D.metal b/Sources/GrAIdient/Metal/Kernel/Layer2DFloat.metal
similarity index 97%
rename from Sources/GrAIdient/Metal/Kernel/Layer2D.metal
rename to Sources/GrAIdient/Metal/Kernel/Layer2DFloat.metal
index 818f528b..72ca39f1 100644
--- a/Sources/GrAIdient/Metal/Kernel/Layer2D.metal
+++ b/Sources/GrAIdient/Metal/Kernel/Layer2DFloat.metal
@@ -8,7 +8,7 @@
 #include <metal_stdlib>
 using namespace metal;
 
-kernel void avgPoolForward(
+kernel void avgPoolForwardFloat(
     const device float * outsPrev,
     constant uint * pNbNeurons,
     constant uint * pDimensionsPrev,
@@ -54,7 +54,7 @@
     outs[offset] = tmp;
 }
 
-kernel void avgPoolBackward(
+kernel void avgPoolBackwardFloat(
     const device float * delta,
     constant uint * pNbNeurons,
     constant uint * pDimensionsPrev,
@@ -107,7 +107,7 @@
     }
 }
 
-kernel void maxPoolForward(
+kernel void maxPoolForwardFloat(
     const device float * outsPrev,
     constant int * pStart,
     constant uint * pStride,
@@ -184,7 +184,7 @@
     indicesMax[offset] = indexMax;
 }
 
-kernel void maxPoolBackward(
+kernel void maxPoolBackwardFloat(
     const device float * delta,
     const device int * indicesMax,
     constant int * pStart,
@@ -291,7 +291,7 @@ uint _endIndex(uint index, uint smallSize, uint bigSize)
     return (uint)(ceil(float((index + 1) * bigSize) / smallSize));
 }
 
-kernel void adaptiveAvgPoolForward1(
+kernel void adaptiveAvgPoolForward1Float(
    const device float * outsPrev,
    constant uint * pNbChannels,
    constant uint * pDimensions,
@@ -353,7 +353,7 @@
     outs[offset] = tmp / (float)nbElems;
 }
 
-kernel void adaptiveAvgPoolForward2(
+kernel void adaptiveAvgPoolForward2Float(
     const device float * outsPrev,
     constant uint * pNbChannels,
     constant uint * pDimensions,
@@ -424,7 +424,7 @@
     }}
 }
 
-kernel void adaptiveAvgPoolBackward1(
+kernel void adaptiveAvgPoolBackward1Float(
     const device float * delta,
     constant uint * pNbChannels,
     constant uint * pDimensions,
@@ -487,7 +487,7 @@
     }}
 }
 
-kernel void adaptiveAvgPoolBackward2(
+kernel void 
adaptiveAvgPoolBackward2( }} } -kernel void selectNeurons2DForward( +kernel void selectNeurons2DForwardFloat( const device float * outsPrev, constant uint * pTarget, constant uint * pNbNeurons, @@ -591,7 +591,7 @@ kernel void selectNeurons2DForward( outs[offset] = outsPrev[offsetPrev]; } -kernel void selectNeurons2DBackward( +kernel void selectNeurons2DBackwardFloat( const device float * delta, constant uint * pTarget, constant uint * pNbNeurons, @@ -652,7 +652,7 @@ kernel void selectNeurons2DBackward( } } -kernel void IRDFT2RGBForward( +kernel void IRDFT2RGBForwardFloat( const device float * outsPrev, constant uint * pNbChannels, constant uint * pDimensions, @@ -707,7 +707,7 @@ kernel void IRDFT2RGBForward( outs[offset] = sum; } -kernel void IRDFT2RGBBackward( +kernel void IRDFT2RGBBackwardFloat( const device float * delta, constant uint * pNbChannels, constant uint * pDimensions, @@ -779,7 +779,7 @@ kernel void IRDFT2RGBBackward( } } -kernel void decorrelateRGBForward( +kernel void decorrelateRGBForwardFloat( const device float * outsPrev, constant float * correlation, constant uint * pNbChannels, @@ -831,7 +831,7 @@ kernel void decorrelateRGBForward( outs[offset] = sum; } -kernel void decorrelateRGBBackward( +kernel void decorrelateRGBBackwardFloat( const device float * delta, constant float * correlation, constant uint * pNbChannels, @@ -894,7 +894,7 @@ kernel void decorrelateRGBBackward( } } -kernel void linearScale2DForward( +kernel void linearScale2DForwardFloat( const device float * outsPrev, constant float * weights, constant uint * pNbChannels, @@ -935,7 +935,7 @@ kernel void linearScale2DForward( outs[offset] = weights[0] * outsPrev[offset] + weights[1]; } -kernel void linearScale2DBackward( +kernel void linearScale2DBackwardFloat( const device float * delta, constant float * weights, constant uint * pNbChannels, @@ -996,7 +996,7 @@ float _getScaleValue( return (1.0 / freq) * float(dimension); } -kernel void setDataFTFrequences2D( +kernel void setDataFTFrequences2DFloat( constant uint * pNbChannels, constant uint * pDimension, constant uint * pNbBatch, @@ -1063,7 +1063,7 @@ kernel void setDataFTFrequences2D( outs[offset] = _getScaleValue(iTmp, jTmp, dimension); } -kernel void pad2DForward( +kernel void pad2DForwardFloat( const device float * outsPrev, constant uint * pNbChannels, constant uint * pDimensions, @@ -1124,7 +1124,7 @@ kernel void pad2DForward( } } -kernel void pad2DBackward( +kernel void pad2DBackwardFloat( const device float * delta, constant uint * pNbChannels, constant uint * pDimensions, @@ -1184,7 +1184,7 @@ kernel void pad2DBackward( } } -kernel void crop2DForward( +kernel void crop2DForwardFloat( const device float * outsPrev, constant uint * pNbChannels, constant uint * pDimensions, @@ -1238,7 +1238,7 @@ kernel void crop2DForward( outs[offset] = outsPrev[offsetPrev]; } -kernel void crop2DBackward( +kernel void crop2DBackwardFloat( const device float * delta, constant uint * pNbChannels, constant uint * pDimensions, @@ -1312,7 +1312,7 @@ kernel void crop2DBackward( } } -kernel void resizeBilinearPadForward( +kernel void resizeBilinearPadForwardFloat( const device float * outsPrev, constant uint * pNbChannels, constant uint * pDimensions, @@ -1406,7 +1406,7 @@ kernel void resizeBilinearPadForward( } } -kernel void resizeBilinearPadBackward( +kernel void resizeBilinearPadBackwardFloat( const device float * delta, constant uint * pNbChannels, constant uint * pDimensions, @@ -1526,7 +1526,7 @@ kernel void resizeBilinearPadBackward( }} } -kernel void 
rotate2DForward( +kernel void rotate2DForwardFloat( const device float * outsPrev, constant uint * pNbChannels, constant uint * pDimensions, @@ -1590,7 +1590,7 @@ kernel void rotate2DForward( } } -kernel void rotate2DBackward( +kernel void rotate2DBackwardFloat( const device float * delta, constant uint * pNbChannels, constant uint * pDimensions, @@ -1659,7 +1659,7 @@ kernel void rotate2DBackward( }} } -kernel void resizeBilinearCropForward( +kernel void resizeBilinearCropForwardFloat( const device float * outsPrev, constant uint * pNbChannels, constant uint * pDimensions, @@ -1740,7 +1740,7 @@ kernel void resizeBilinearCropForward( outs[offset] = out; } -kernel void resizeBilinearCropBackward( +kernel void resizeBilinearCropBackwardFloat( const device float * delta, constant uint * pNbChannels, constant uint * pDimensions, @@ -1861,7 +1861,7 @@ kernel void resizeBilinearCropBackward( }} } -kernel void concat02DForward( +kernel void concat02DForwardFloat( const device float * outsPrev, constant uint * pGlobalOffset, constant uint * pNbChannels, @@ -1907,7 +1907,7 @@ kernel void concat02DForward( outs[offset] = outsPrev[offsetPrev]; } -kernel void concat02DBackward( +kernel void concat02DBackwardFloat( const device float * delta, constant uint * pGlobalOffset, constant uint * pNbChannels, @@ -1963,7 +1963,7 @@ kernel void concat02DBackward( } } -kernel void concat12DForward( +kernel void concat12DForwardFloat( const device float * outsPrev, constant uint * pGlobalOffset, constant uint * pNbChannels, @@ -2012,7 +2012,7 @@ kernel void concat12DForward( outs[offset] = outsPrev[offsetPrev]; } -kernel void concat12DBackward( +kernel void concat12DBackwardFloat( const device float * delta, constant uint * pGlobalOffset, constant uint * pNbChannels, @@ -2071,7 +2071,7 @@ kernel void concat12DBackward( } } -kernel void constant2DForward( +kernel void constant2DForwardFloat( const device float * weights, constant uint * pNbChannels, constant uint * pDimensions, @@ -2110,7 +2110,7 @@ kernel void constant2DForward( outs[offset] = weights[depth]; } -kernel void MSE2DLoss( +kernel void MSE2DLossFloat( const device float * outs, const device float * groundTruth, constant uint * pNbChannels, @@ -2160,7 +2160,7 @@ kernel void MSE2DLoss( losses[elem] = tmp; } -kernel void MSE2DLossDerivative( +kernel void MSE2DLossDerivativeFloat( const device float * outs, const device float * groundTruth, constant uint * pNbChannels, @@ -2220,7 +2220,7 @@ kernel void MSE2DLossDerivative( } } -kernel void selfCorrelate2DForward( +kernel void selfCorrelate2DForwardFloat( const device float * outsPrev, constant uint * pNbChannelsPrev, constant uint * pDimensionsPrev, @@ -2271,7 +2271,7 @@ kernel void selfCorrelate2DForward( outs[offset] = correlation; } -kernel void selfCorrelate2DBackward( +kernel void selfCorrelate2DBackwardFloat( const device float * delta, const device float * outsPrev, constant uint * pNbChannelsPrev, @@ -2342,7 +2342,7 @@ kernel void selfCorrelate2DBackward( } } -kernel void normalize12DForward( +kernel void normalize12DForwardFloat( const device float * outsPrev, constant uint * pNbChannels, constant uint * pDimensions, @@ -2394,7 +2394,7 @@ kernel void normalize12DForward( outs[offset] = outPrev / max(norm, 1e-12); } -kernel void normalize12DBackward( +kernel void normalize12DBackwardFloat( const device float * delta, const device float * outsPrev, constant uint * pNbChannels, @@ -2480,7 +2480,7 @@ kernel void normalize12DBackward( } } -kernel void computeSquaredNorm122D( +kernel void 
computeSquaredNorm122DFloat( const device float * outsPrev, constant uint * pNbChannels, constant uint * pDimensions, @@ -2549,7 +2549,7 @@ kernel void computeSquaredNorm122D( } } -kernel void normalize122DForward( +kernel void normalize122DForwardFloat( const device float * outsPrev, const device float * squaredNorms, constant uint * pNbChannels, @@ -2596,7 +2596,7 @@ kernel void normalize122DForward( outs[offset] = outPrev / max(norm, 1e-12); } -kernel void computeDeltaTmp122D( +kernel void computeDeltaTmp122DFloat( const device float * delta, const device float * outsPrev, const device float * squaredNorms, @@ -2673,7 +2673,7 @@ kernel void computeDeltaTmp122D( } } -kernel void normalize122DBackward( +kernel void normalize122DBackwardFloat( const device float * delta, const device float * outsPrev, const device float * squaredNorms, @@ -2746,7 +2746,7 @@ kernel void normalize122DBackward( } } -kernel void similarBatchError2DLoss( +kernel void similarBatchError2DLossFloat( const device float * outs, constant uint * pNbChannels, constant uint * pDimensions, @@ -2795,7 +2795,7 @@ kernel void similarBatchError2DLoss( } } -kernel void similarBatchError2DLossDerivative( +kernel void similarBatchError2DLossDerivativeFloat( const device float * outs, constant uint * pNbChannels, constant uint * pDimensions, @@ -2856,7 +2856,7 @@ kernel void similarBatchError2DLossDerivative( } } -kernel void similarError2DLossDerivative( +kernel void similarError2DLossDerivativeFloat( const device float * outs, constant uint * pGlobalOffset, constant uint * pNbChannels, @@ -2923,7 +2923,7 @@ kernel void similarError2DLossDerivative( } } -kernel void flipHorizontal2DForward( +kernel void flipHorizontal2DForwardFloat( const device float * outsPrev, constant uint * pDoFlip, constant uint * pNbChannels, @@ -2971,7 +2971,7 @@ kernel void flipHorizontal2DForward( outs[offset1] = outsPrev[offset2]; } -kernel void flipHorizontal2DBackward( +kernel void flipHorizontal2DBackwardFloat( const device float * delta, constant uint * pDoFlip, constant uint * pNbChannels, @@ -3029,7 +3029,7 @@ kernel void flipHorizontal2DBackward( } } -kernel void flipVertical2DForward( +kernel void flipVertical2DForwardFloat( const device float * outsPrev, constant uint * pDoFlip, constant uint * pNbChannels, @@ -3077,7 +3077,7 @@ kernel void flipVertical2DForward( outs[offset1] = outsPrev[offset2]; } -kernel void flipVertical2DBackward( +kernel void flipVertical2DBackwardFloat( const device float * delta, constant uint * pDoFlip, constant uint * pNbChannels, @@ -3135,7 +3135,7 @@ kernel void flipVertical2DBackward( } } -kernel void colorJitterHSVForward( +kernel void colorJitterHSVForwardFloat( const device float * outsPrev, constant float * pNoise, constant uint * pDimensions, @@ -3260,7 +3260,7 @@ kernel void colorJitterHSVForward( outs[offsetB] = b; } -kernel void BCE2DLoss( +kernel void BCE2DLossFloat( const device float * outs, const device float * groundTruth, constant uint * pNbChannels, @@ -3311,7 +3311,7 @@ kernel void BCE2DLoss( losses[elem] = tmp; } -kernel void BCE2DLossDerivative( +kernel void BCE2DLossDerivativeFloat( const device float * outs, const device float * groundTruth, constant uint * pNbChannels, @@ -3380,7 +3380,7 @@ kernel void BCE2DLossDerivative( } } -kernel void BCESigmoid2DLoss( +kernel void BCESigmoid2DLossFloat( const device float * outs, const device float * groundTruth, constant uint * pNbChannels, @@ -3441,7 +3441,7 @@ kernel void BCESigmoid2DLoss( losses[elem] = tmp; } -kernel void 
BCESigmoid2DLossDerivative(
+kernel void BCESigmoid2DLossDerivativeFloat(
     const device float * outs,
     const device float * groundTruth,
     constant uint * pNbChannels,
@@ -3510,7 +3510,7 @@
     }
 }
 
-kernel void layerCAM2DForward(
+kernel void layerCAM2DForwardFloat(
     const device float * outsPrev,
     const device float * deltaPrev,
     constant uint * pNbChannelsPrev,
diff --git a/Sources/GrAIdient/Metal/Kernel/Layer2DHalf.metal b/Sources/GrAIdient/Metal/Kernel/Layer2DHalf.metal
new file mode 100644
index 00000000..08fe23dc
--- /dev/null
+++ b/Sources/GrAIdient/Metal/Kernel/Layer2DHalf.metal
@@ -0,0 +1,3570 @@
+//
+// Layer2D.metal
+// GrAIdient
+//
+// Created by Jean-François Reboud on 14/10/2022.
+//
+
+#include <metal_stdlib>
+using namespace metal;
+
+kernel void avgPoolForwardHalf(
+    const device half * outsPrev,
+    constant uint * pNbNeurons,
+    constant uint * pDimensionsPrev,
+    constant uint * pNbBatch,
+    device half * outs,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint heightPrev, widthPrev;
+    uint nbNeurons;
+    uint nbBatch;
+
+    if (pNbNeurons && pDimensionsPrev && pNbBatch &&
+        outsPrev && outs)
+    {
+        widthPrev = pDimensionsPrev[0];
+        heightPrev = pDimensionsPrev[1];
+        nbNeurons = *pNbNeurons;
+        nbBatch = *pNbBatch;
+    }
+    else
+        return ;
+
+    uint depth = id[0];
+    uint elem = id[1];
+
+    if (depth >= nbNeurons || elem >= nbBatch)
+    {
+        return ;
+    }
+
+    uint offsetStartPrev = (depth + nbNeurons * elem) * heightPrev;
+
+    float tmp = 0.0;
+    for (uint i=0; i<heightPrev; i++){
+    for (uint j=0; j<widthPrev; j++)
+    {
+        uint offsetPrev = j + (offsetStartPrev + i) * widthPrev;
+        tmp += outsPrev[offsetPrev];
+    }}
+    tmp /= heightPrev * widthPrev;
+
+    uint offset = depth + nbNeurons * elem;
+    outs[offset] = tmp;
+}
+
+kernel void avgPoolBackwardHalf(
+    const device half * delta,
+    constant uint * pNbNeurons,
+    constant uint * pDimensionsPrev,
+    constant uint * pNbBatch,
+    constant uint * pDirty,
+    device half * deltaPrev,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint heightPrev, widthPrev;
+    uint nbNeurons;
+    uint nbBatch;
+    uint dirty;
+
+    if (pNbNeurons && pDimensionsPrev && pNbBatch && pDirty &&
+        delta && deltaPrev)
+    {
+        widthPrev = pDimensionsPrev[0];
+        heightPrev = pDimensionsPrev[1];
+        nbNeurons = *pNbNeurons;
+        nbBatch = *pNbBatch;
+        dirty = *pDirty;
+    }
+    else
+        return ;
+
+    uint depthPrev = id[0] / widthPrev;
+    uint elem = id[1] / heightPrev;
+    uint i = id[1] % heightPrev;
+    uint j = id[0] % widthPrev;
+
+    if (i * elem >= heightPrev * nbBatch ||
+        j * depthPrev >= widthPrev * nbNeurons)
+    {
+        return ;
+    }
+
+    uint offset = depthPrev + nbNeurons * elem;
+    float deltaCur = delta[offset];
+
+    uint offsetStartPrev = (depthPrev + nbNeurons * elem) * heightPrev;
+    uint offsetPrev = j + (offsetStartPrev + i) * widthPrev;
+
+    if (dirty)
+    {
+        deltaPrev[offsetPrev] = deltaCur / (heightPrev * widthPrev);
+    }
+    else
+    {
+        deltaPrev[offsetPrev] += deltaCur / (heightPrev * widthPrev);
+    }
+}
+
+kernel void maxPoolForwardHalf(
+    const device half * outsPrev,
+    constant int * pStart,
+    constant uint * pStride,
+    constant uint * pNbChannels,
+    constant uint * pDimensions,
+    constant uint * pDimensionsPrev,
+    constant uint * pNbBatch,
+    device half * outs,
+    device int * indicesMax,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    int start, end;
+    uint stride;
+    uint height, width;
+    uint heightPrev, widthPrev;
+    uint nbChannels;
+    uint nbBatch;
+
+    if (pStart && pStride && pNbChannels && pDimensions && pDimensionsPrev &&
+        pNbBatch && outsPrev && outs && indicesMax)
+    {
+        start = pStart[0];
+        end = pStart[1];
+        stride = pStride[0];
+        width = pDimensions[0];
+        height = pDimensions[1];
+        widthPrev = pDimensionsPrev[0];
+        heightPrev = pDimensionsPrev[1];
+        nbChannels = *pNbChannels;
+        nbBatch = *pNbBatch;
+    }
+    else
+        return ;
+
+    uint depth = id[0] / width;
+    uint elem = id[1] / height;
+    uint i = id[1] % height;
+    uint j = id[0] % width;
+
+    if (i * elem >= height * nbBatch ||
+        j * depth >= width * nbChannels)
+    {
+        return ;
+    }
+
+    uint offsetStartPrev = (depth + nbChannels * elem) * heightPrev;
+    uint offsetStart = (depth + nbChannels * elem) * height;
+    uint offset = j + (offsetStart + i) * width;
+
+    int indexMax = -1;
+    float maxVal = -10000.0;
+    for (int k=start; k<=end; k++){
+    for (int l=start; l<=end; l++)
+    {
+        if ((int)(stride*j)+l >= 0 &&
+            (int)(stride*j)+l < (int)widthPrev &&
+            (int)(stride*i)+k >= 0 &&
+            (int)(stride*i)+k < (int)heightPrev)
+        {
+            uint offsetPrev = (int)(stride*j)+l +
+                (offsetStartPrev + (int)(stride*i)+k)*widthPrev;
+            
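+            // Cache the flat input index of the running maximum so that
+            // maxPoolBackwardHalf below can route the gradient back to the
+            // argmax cell only.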
+ float outPrev = outsPrev[offsetPrev]; + if (outPrev > maxVal) + { + indexMax = offsetPrev; + indicesMax[offset] = offsetPrev; + maxVal = outPrev; + } + } + }} + + outs[offset] = maxVal; + indicesMax[offset] = indexMax; +} + +kernel void maxPoolBackwardHalf( + const device half * delta, + const device int * indicesMax, + constant int * pStart, + constant uint * pStride, + constant uint * pNbChannels, + constant uint * pDimensions, + constant uint * pDimensionsPrev, + constant uint * pNbBatch, + constant uint * pDirty, + device half * deltaPrev, + uint2 id [[ thread_position_in_grid ]]) +{ + int start, end; + uint stride; + uint height, width; + uint heightPrev, widthPrev; + uint nbChannels; + uint nbBatch; + uint dirty; + + if (pStart && pStride && pNbChannels && pDimensions && pDimensionsPrev && + pNbBatch && pDirty && delta && indicesMax && deltaPrev) + { + start = pStart[0]; + end = pStart[1]; + stride = pStride[0]; + width = pDimensions[0]; + height = pDimensions[1]; + widthPrev = pDimensionsPrev[0]; + heightPrev = pDimensionsPrev[1]; + nbChannels = *pNbChannels; + nbBatch = *pNbBatch; + dirty = *pDirty; + } + else + return ; + + uint depth = id[0] / widthPrev; + uint elem = id[1] / heightPrev; + uint i = id[1] % heightPrev; + uint j = id[0] % widthPrev; + + if (i * elem >= heightPrev * nbBatch || + j * depth >= widthPrev * nbChannels) + { + return ; + } + + uint offsetStartPrev = (depth + nbChannels * elem) * heightPrev; + uint offsetPrev = j + (offsetStartPrev + i) * widthPrev; + + float tmp = 0.0; + for (int k=start; k<=end; k++){ + for (int l=start; l<=end; l++) + { + int i1, j1; + // i-k rather than i+k to take into account non symetric kernels. + // Exemple: size of kernel 2 instead of 3. + if ((i-k) % stride != 0) + { + continue; + } + else if ((j-l) % stride != 0) + { + continue; + } + else + { + i1 = (i-k) / stride; + j1 = (j-l) / stride; + } + if (j1 >= 0 && j1 < (int)width && + i1 >= 0 && i1 < (int)height) + { + uint offsetStart = (depth + nbChannels * elem) * height; + uint offset = j1 + (offsetStart + i1) * width; + + if ((uint)indicesMax[offset] == offsetPrev) + { + tmp += delta[offset]; + } + } + }} + + if (dirty) + { + deltaPrev[offsetPrev] = tmp; + } + else + { + deltaPrev[offsetPrev] += tmp; + } +} + +uint _startIndex(uint index, uint smallSize, uint bigSize) +{ + float val = float(index * bigSize) / smallSize; + val = round(val * 1000) / 1000; + return (uint)(floor(val)); +} + +uint _endIndex(uint index, uint smallSize, uint bigSize) +{ + return (uint)(ceil(float((index + 1) * bigSize) / smallSize)); +} + +kernel void adaptiveAvgPoolForward1Half( + const device half * outsPrev, + constant uint * pNbChannels, + constant uint * pDimensions, + constant uint * pDimensionsPrev, + constant uint * pNbBatch, + device half * outs, + uint2 id [[ thread_position_in_grid ]]) +{ + uint height, width; + uint heightPrev, widthPrev; + uint nbChannels; + uint nbBatch; + + if (pNbChannels && pDimensions && pDimensionsPrev && pNbBatch && + outsPrev && outs) + { + width = pDimensions[0]; + height = pDimensions[1]; + widthPrev = pDimensionsPrev[0]; + heightPrev = pDimensionsPrev[1]; + nbChannels = *pNbChannels; + nbBatch = *pNbBatch; + } + else + return ; + + uint depth = id[0] / width; + uint elem = id[1] / height; + uint i = id[1] % height; + uint j = id[0] % width; + + if (i * elem >= height * nbBatch || + j * depth >= width * nbChannels) + { + return ; + } + + uint startI = _startIndex(i, height, heightPrev); + uint endI = _endIndex(i, height, heightPrev); + uint startJ = 
_startIndex(j, width, widthPrev);
+    uint endJ = _endIndex(j, width, widthPrev);
+
+    uint nbElemsI = endI - startI;
+    uint nbElemsJ = endJ - startJ;
+    uint nbElems = nbElemsI * nbElemsJ;
+
+    uint offsetStartPrev = (depth + nbChannels * elem) * heightPrev;
+    uint offsetStart = (depth + nbChannels * elem) * height;
+
+    float tmp = 0.0;
+    for (uint k=0; k<nbElemsI; k++){
+    for (uint l=0; l<nbElemsJ; l++)
+    {
+        uint offsetPrev = startJ+l + (offsetStartPrev + startI+k) * widthPrev;
+        tmp += outsPrev[offsetPrev];
+    }}
+
+    uint offset = j + (offsetStart + i) * width;
+    outs[offset] = tmp / (float)nbElems;
+}
+
+kernel void adaptiveAvgPoolForward2Half(
+    const device half * outsPrev,
+    constant uint * pNbChannels,
+    constant uint * pDimensions,
+    constant uint * pDimensionsPrev,
+    constant uint * pNbBatch,
+    device half * outs,
+    device int * nbElems,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint height, width;
+    uint heightPrev, widthPrev;
+    uint nbChannels;
+    uint nbBatch;
+
+    if (pNbChannels && pDimensions && pDimensionsPrev && pNbBatch &&
+        outsPrev && nbElems && outs)
+    {
+        width = pDimensions[0];
+        height = pDimensions[1];
+        widthPrev = pDimensionsPrev[0];
+        heightPrev = pDimensionsPrev[1];
+        nbChannels = *pNbChannels;
+        nbBatch = *pNbBatch;
+    }
+    else
+        return ;
+
+    uint depth = id[0];
+    uint elem = id[1];
+
+    if (depth >= nbChannels || elem >= nbBatch)
+    {
+        return ;
+    }
+
+    uint offsetStart = (depth + nbChannels * elem) * height;
+    uint offsetStartPrev = (depth + nbChannels * elem) * heightPrev;
+
+    for (uint i = 0; i < heightPrev; i++) {
+    for (uint j = 0; j < widthPrev; j++)
+    {
+        uint startI = _startIndex(i, heightPrev, height);
+        uint endI = _endIndex(i, heightPrev, height);
+        uint startJ = _startIndex(j, widthPrev, width);
+        uint endJ = _endIndex(j, widthPrev, width);
+
+        uint nbElemsI = endI - startI;
+        uint nbElemsJ = endJ - startJ;
+
+        uint offsetPrev = j + (offsetStartPrev + i) * widthPrev;
+        float outPrev = outsPrev[offsetPrev];
+
+        for (uint k = 0; k < nbElemsI; k++){
+        for (uint l = 0; l < nbElemsJ; l++)
+        {
+            uint offset = startJ+l + (offsetStart + startI+k) * width;
+
+            outs[offset] += outPrev;
+            nbElems[offset] += 1;
+        }}
+    }}
+
+    for (uint I = 0; I < height; I++){
+    for (uint J = 0; J < width; J++)
+    {
+        uint offset = J + (offsetStart + I) * width;
+        outs[offset] /= nbElems[offset];
+    }}
+}
+
+kernel void adaptiveAvgPoolBackward1Half(
+    const device half * delta,
+    constant uint * pNbChannels,
+    constant uint * pDimensions,
+    constant uint * pDimensionsPrev,
+    constant uint * pNbBatch,
+    device half * deltaPrev,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint height, width;
+    uint heightPrev, widthPrev;
+    uint nbChannels;
+    uint nbBatch;
+
+    if (pNbChannels && pDimensions && pDimensionsPrev && pNbBatch &&
+        delta && deltaPrev)
+    {
+        width = pDimensions[0];
+        height = pDimensions[1];
+        widthPrev = pDimensionsPrev[0];
+        heightPrev = pDimensionsPrev[1];
+        nbChannels = *pNbChannels;
+        nbBatch = *pNbBatch;
+    }
+    else
+        return ;
+
+    uint depth = id[0];
+    uint elem = id[1];
+
+    if (depth >= nbChannels || elem >= nbBatch)
+    {
+        return ;
+    }
+
+    uint offsetStart = (depth + nbChannels * elem) * height;
+    uint offsetStartPrev = (depth + nbChannels * elem) * heightPrev;
+
+    for (uint i = 0; i < height; i++) {
+    for (uint j = 0; j < width; j++)
+    {
+        uint startI = _startIndex(i, height, heightPrev);
+        uint endI = _endIndex(i, height, heightPrev);
+        uint startJ = _startIndex(j, width, widthPrev);
+        uint endJ = _endIndex(j, width, widthPrev);
+
+        uint nbElemsI = endI - startI;
+        uint nbElemsJ = endJ - startJ;
+        uint nbElems = nbElemsI * nbElemsJ;
+
+        uint offset = j + (offsetStart + i) * width;
+        float deltaCur = delta[offset] / (float)nbElems;
+
+        for (uint k = 0; k < nbElemsI; k++){
+        for (uint l = 0; l < nbElemsJ; l++)
+        {
+            uint offsetPrev = startJ+l +
+                (offsetStartPrev + startI+k) * widthPrev;
+            deltaPrev[offsetPrev] += deltaCur;
+        }}
+    }}
+}
+
+kernel void adaptiveAvgPoolBackward2Half(
+    const device half * delta,
+    const device int * nbElems,
+    constant uint * pNbChannels,
+    constant uint * pDimensions,
+    constant uint * pDimensionsPrev,
+    constant uint * pNbBatch,
+    device half * deltaPrev,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint height, width;
+    uint heightPrev, widthPrev;
+    uint nbChannels;
+    uint nbBatch;
+
+    if (pNbChannels && pDimensions && pDimensionsPrev && pNbBatch &&
+        delta && nbElems && deltaPrev)
+    {
+        width = pDimensions[0];
+        height = pDimensions[1];
+        widthPrev = pDimensionsPrev[0];
+        heightPrev = pDimensionsPrev[1];
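+        // nbElems was filled by adaptiveAvgPoolForward2Half above: for each
+        // output cell it holds how many input cells were averaged into it,
+        // and the loop below divides the incoming gradient by that count.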
+ nbChannels = *pNbChannels; + nbBatch = *pNbBatch; + } + else + return ; + + uint depth = id[0]; + uint elem = id[1]; + + if (depth >= nbChannels || elem >= nbBatch) + { + return ; + } + + uint offsetStart = (depth + nbChannels * elem) * height; + uint offsetStartPrev = (depth + nbChannels * elem) * heightPrev; + + for (uint i = 0; i < heightPrev; i++) { + for (uint j = 0; j < widthPrev; j++) + { + uint startI = _startIndex(i, heightPrev, height); + uint endI = _endIndex(i, heightPrev, height); + uint startJ = _startIndex(j, widthPrev, width); + uint endJ = _endIndex(j, widthPrev, width); + + uint nbElemsI = endI - startI; + uint nbElemsJ = endJ - startJ; + + uint offsetPrev = j + (offsetStartPrev + i) * widthPrev; + + for (uint k = 0; k < nbElemsI; k++){ + for (uint l = 0; l < nbElemsJ; l++) + { + uint offset = startJ+l + (offsetStart + startI+k) * width; + deltaPrev[offsetPrev] += delta[offset] / nbElems[offset]; + }} + }} +} + +kernel void selectNeurons2DForwardHalf( + const device half * outsPrev, + constant uint * pTarget, + constant uint * pNbNeurons, + constant uint * pDimensionsPrev, + constant uint * pNbBatch, + device half * outs, + uint2 id [[ thread_position_in_grid ]]) +{ + uint targetI, targetJ; + uint heightPrev, widthPrev; + uint nbNeurons; + uint nbBatch; + + if (pTarget && pNbNeurons && pDimensionsPrev && pNbBatch && + outsPrev && outs) + { + targetI = pTarget[0]; + targetJ = pTarget[1]; + widthPrev = pDimensionsPrev[0]; + heightPrev = pDimensionsPrev[1]; + nbNeurons = *pNbNeurons; + nbBatch = *pNbBatch; + } + else + return ; + + uint depth = id[0]; + uint elem = id[1]; + + if (depth >= nbNeurons || elem >= nbBatch) + { + return ; + } + + uint offsetStartPrev = (depth + nbNeurons * elem) * heightPrev; + uint offsetPrev = targetJ + + (offsetStartPrev + targetI) * widthPrev; + uint offset = depth + nbNeurons * elem; + + outs[offset] = outsPrev[offsetPrev]; +} + +kernel void selectNeurons2DBackwardHalf( + const device half * delta, + constant uint * pTarget, + constant uint * pNbNeurons, + constant uint * pDimensionsPrev, + constant uint * pNbBatch, + constant uint * pDirty, + device half * deltaPrev, + uint2 id [[ thread_position_in_grid ]]) +{ + uint targetI, targetJ; + uint heightPrev, widthPrev; + uint nbNeurons; + uint nbBatch; + uint dirty; + + if (pTarget && pNbNeurons && pDimensionsPrev && pNbBatch && pDirty && + delta && deltaPrev) + { + targetI = pTarget[0]; + targetJ = pTarget[1]; + widthPrev = pDimensionsPrev[0]; + heightPrev = pDimensionsPrev[1]; + nbNeurons = *pNbNeurons; + nbBatch = *pNbBatch; + dirty = *pDirty; + } + else + return ; + + uint depthPrev = id[0] / widthPrev; + uint elem = id[1] / heightPrev; + uint i = id[1] % heightPrev; + uint j = id[0] % widthPrev; + + if (i * elem >= heightPrev * nbBatch || + j * depthPrev >= widthPrev * nbNeurons) + { + return ; + } + + float deltaCur = 0.0; + if (i == targetI && j == targetJ) + { + uint offset = depthPrev + nbNeurons * elem; + deltaCur = delta[offset]; + } + + uint offsetStartPrev = (depthPrev + nbNeurons * elem) * heightPrev; + uint offsetPrev = j + (offsetStartPrev + i) * widthPrev; + + if (dirty) + { + deltaPrev[offsetPrev] = deltaCur; + } + else + { + deltaPrev[offsetPrev] += deltaCur; + } +} + +kernel void IRDFT2RGBForwardHalf( + const device half * outsPrev, + constant uint * pNbChannels, + constant uint * pDimensions, + constant uint * pNbBatch, + device half * outs, + uint2 id [[ thread_position_in_grid ]]) +{ + uint height, width; + uint nbChannels; + uint nbBatch; + + if (pNbChannels && 
pDimensions && pNbBatch && outsPrev && outs) + { + width = pDimensions[0]; + height = pDimensions[1]; + nbChannels = *pNbChannels; + nbBatch = *pNbBatch; + } + else + return ; + + uint depth = id[0] / width; + uint elem = id[1] / height; + uint i = id[1] % height; + uint j = id[0] % width; + + if (i * elem >= height * nbBatch || + j * depth >= width * nbChannels) + { + return ; + } + + uint offsetStartRealPrev = (2 * depth + 2 * nbChannels * elem) * height; + uint offsetStartImPrev = (2 * depth + 1 + 2 * nbChannels * elem) * height; + uint offsetStart = (depth + nbChannels * elem) * height; + uint offset = j + (offsetStart + i) * width; + + float sum = 0.0; + for (uint k=0; k= height * nbBatch || + j * depth >= width * nbChannels) + { + return ; + } + + uint offsetStartRealPrev = (2 * depth + 2 * nbChannels * elem) * height; + uint offsetStartImPrev = (2 * depth + 1 + 2 * nbChannels * elem) * height; + uint offsetStart = (depth + nbChannels * elem) * height; + uint offsetRealPrev = j + (offsetStartRealPrev + i) * width; + uint offsetImPrev = j + (offsetStartImPrev + i) * width; + + float sum1 = 0.0; + float sum2 = 0.0; + for (uint k=0; k= height * nbBatch || + j * depth >= width * nbChannels) + { + return ; + } + + uint block = depth / 3; + uint res = depth % 3; + + uint offsetStart = (depth + nbChannels * elem) * height; + uint offset = j + (offsetStart + i) * width; + + float sum = 0.0; + for (uint k=0; k<3; k++) + { + uint offsetStartPrev = (block * 3 + k + nbChannels * elem) * height; + uint offsetPrev = j + (offsetStartPrev + i) * width; + + sum += outsPrev[offsetPrev] * correlation[res * 3 + k]; + } + outs[offset] = sum; +} + +kernel void decorrelateRGBBackwardHalf( + const device half * delta, + constant float * correlation, + constant uint * pNbChannels, + constant uint * pDimensions, + constant uint * pNbBatch, + constant uint * pDirty, + device half * deltaPrev, + uint2 id [[ thread_position_in_grid ]]) +{ + uint height, width; + uint nbChannels; + uint nbBatch; + uint dirty; + + if (pNbChannels && pDimensions && pNbBatch && pDirty && + delta && correlation && deltaPrev) + { + width = pDimensions[0]; + height = pDimensions[1]; + nbChannels = *pNbChannels; + nbBatch = *pNbBatch; + dirty = *pDirty; + } + else + return ; + + uint depth = id[0] / width; + uint elem = id[1] / height; + uint i = id[1] % height; + uint j = id[0] % width; + + if (i * elem >= height * nbBatch || + j * depth >= width * nbChannels) + { + return ; + } + + uint block = depth / 3; + uint res = depth % 3; + + uint offsetStartPrev = (depth + nbChannels * elem) * height; + uint offsetPrev = j + (offsetStartPrev + i) * width; + + float sum = 0.0; + for (uint k=0; k<3; k++) + { + uint offsetStart = (block * 3 + k + nbChannels * elem) * height; + uint offset = j + (offsetStart + i) * width; + + sum += delta[offset] * correlation[k * 3 + res]; + } + + if (dirty) + { + deltaPrev[offsetPrev] = sum; + } + else + { + deltaPrev[offsetPrev] += sum; + } +} + +kernel void linearScale2DForwardHalf( + const device half * outsPrev, + constant float * weights, + constant uint * pNbChannels, + constant uint * pDimensions, + constant uint * pNbBatch, + device half * outs, + uint2 id [[ thread_position_in_grid ]]) +{ + uint height, width; + uint nbChannels; + uint nbBatch; + + if (pNbChannels && pDimensions && pNbBatch && + outsPrev && weights && outs) + { + width = pDimensions[0]; + height = pDimensions[1]; + nbChannels = *pNbChannels; + nbBatch = *pNbBatch; + } + else + return ; + + uint depth = id[0] / width; + uint elem = id[1] 
/ height; + uint i = id[1] % height; + uint j = id[0] % width; + + if (i * elem >= height * nbBatch || + j * depth >= width * nbChannels) + { + return ; + } + + uint offsetStart = (depth + nbChannels * elem) * height; + uint offset = j + (offsetStart + i) * width; + + outs[offset] = weights[0] * outsPrev[offset] + weights[1]; +} + +kernel void linearScale2DBackwardHalf( + const device half * delta, + constant float * weights, + constant uint * pNbChannels, + constant uint * pDimensions, + constant uint * pNbBatch, + constant uint * pDirty, + device half * deltaPrev, + uint2 id [[ thread_position_in_grid ]]) +{ + uint height, width; + uint nbChannels; + uint nbBatch; + uint dirty; + + if (pNbChannels && pDimensions && pNbBatch && pDirty && + delta && weights && deltaPrev) + { + width = pDimensions[0]; + height = pDimensions[1]; + nbChannels = *pNbChannels; + nbBatch = *pNbBatch; + dirty = *pDirty; + } + else + return ; + + uint depth = id[0] / width; + uint elem = id[1] / height; + uint i = id[1] % height; + uint j = id[0] % width; + + if (i * elem >= height * nbBatch || + j * depth >= width * nbChannels) + { + return ; + } + + uint offsetStartPrev = (depth + nbChannels * elem) * height; + uint offsetPrev = j + (offsetStartPrev + i) * width; + + if (dirty) + { + deltaPrev[offsetPrev] = delta[offsetPrev] * weights[0]; + } + else + { + deltaPrev[offsetPrev] += delta[offsetPrev] * weights[0]; + } +} + +float _getScaleValue( + const uint i, + const uint j, + const uint dimension) +{ + float freq = sqrt(float(i * i + j * j)) / float(dimension); + freq = max(freq, 1.0 / float(dimension)); + return (1.0 / freq) * float(dimension); +} + +kernel void setDataFTFrequences2DHalf( + constant uint * pNbChannels, + constant uint * pDimension, + constant uint * pNbBatch, + device half * outs, + uint2 id [[ thread_position_in_grid ]]) +{ + uint dimension; + uint nbChannels; + uint nbBatch; + + if (pNbChannels && pDimension && pNbBatch && outs) + { + dimension = *pDimension; + nbChannels = *pNbChannels; + nbBatch = *pNbBatch; + } + else + return ; + + uint depth = id[0] / dimension; + uint elem = id[1] / dimension; + uint i = id[1] % dimension; + uint j = id[0] % dimension; + + if (i * elem >= dimension * nbBatch || + j * depth >= dimension * nbChannels) + { + return ; + } + + uint end = dimension % 2 == 0 ? 
dimension / 2 : (dimension - 1) / 2; + uint jTmp = j; + uint iTmp = i; + if (dimension % 2 == 0) + { + if (jTmp >= end) + { + jTmp = jTmp - end + 1; + jTmp = end + 1 - jTmp; + } + if (iTmp >= end) + { + iTmp = iTmp - end + 1; + iTmp = end + 1 - iTmp; + } + } + else + { + if (jTmp > end) + { + jTmp = jTmp - end; + jTmp = end + 1 - jTmp; + } + if (iTmp > end) + { + iTmp = iTmp - end; + iTmp = end + 1 - iTmp; + } + } + + uint offsetStart = (depth + nbChannels * elem) * dimension; + uint offset = j + (offsetStart + i) * dimension; + + outs[offset] = _getScaleValue(iTmp, jTmp, dimension); +} + +kernel void pad2DForwardHalf( + const device half * outsPrev, + constant uint * pNbChannels, + constant uint * pDimensions, + constant uint * pPadDimension, + constant float * pPadValue, + constant uint * pNbBatch, + device half * outs, + uint2 id [[ thread_position_in_grid ]]) +{ + uint height, width; + uint heightPrev, widthPrev; + uint nbChannels; + uint padDimension; + float padValue; + uint nbBatch; + + if (pNbChannels && pDimensions && pNbBatch && + pPadDimension && pPadValue && outsPrev && outs) + { + width = pDimensions[0]; + height = pDimensions[1]; + nbChannels = *pNbChannels; + padDimension = *pPadDimension; + padValue = *pPadValue; + widthPrev = width - 2 * padDimension; + heightPrev = height - 2 * padDimension; + nbBatch = *pNbBatch; + } + else + return ; + + uint depth = id[0] / width; + uint elem = id[1] / height; + uint i = id[1] % height; + uint j = id[0] % width; + + if (i * elem >= height * nbBatch || + j * depth >= width * nbChannels) + { + return ; + } + + uint offsetStart = (depth + nbChannels * elem) * height; + uint offset = j + (offsetStart + i) * width; + + if (i < padDimension || i >= height - padDimension || + j < padDimension || j >= width - padDimension) + { + outs[offset] = padValue; + } + else + { + uint offsetStartPrev = (depth + nbChannels * elem) * heightPrev; + uint offsetPrev = j-padDimension + + (offsetStartPrev + i-padDimension) * widthPrev; + + outs[offset] = outsPrev[offsetPrev]; + } +} + +kernel void pad2DBackwardHalf( + const device half * delta, + constant uint * pNbChannels, + constant uint * pDimensions, + constant uint * pPadDimension, + constant uint * pNbBatch, + constant uint * pDirty, + device half * deltaPrev, + uint2 id [[ thread_position_in_grid ]]) +{ + uint height, width; + uint heightPrev, widthPrev; + uint nbChannels; + uint padDimension; + uint nbBatch; + uint dirty; + + if (pNbChannels && pDimensions && pPadDimension && pNbBatch && pDirty && + delta && deltaPrev) + { + width = pDimensions[0]; + height = pDimensions[1]; + nbChannels = *pNbChannels; + padDimension = *pPadDimension; + widthPrev = width - 2 * padDimension; + heightPrev = height - 2 * padDimension; + nbBatch = *pNbBatch; + dirty = *pDirty; + } + else + return ; + + uint depth = id[0] / widthPrev; + uint elem = id[1] / heightPrev; + uint i = id[1] % heightPrev; + uint j = id[0] % widthPrev; + + if (i * elem >= heightPrev * nbBatch || + j * depth >= widthPrev * nbChannels) + { + return ; + } + + uint offsetStartPrev = (depth + nbChannels * elem) * heightPrev; + uint offsetPrev = j + (offsetStartPrev + i) * widthPrev; + + uint offsetStart = (depth + nbChannels * elem) * height; + uint offset = j+padDimension + + (offsetStart + i+padDimension) * width; + + if (dirty) + { + deltaPrev[offsetPrev] = delta[offset]; + } + else + { + deltaPrev[offsetPrev] += delta[offset]; + } +} + +kernel void crop2DForwardHalf( + const device half * outsPrev, + constant uint * pNbChannels, + constant uint * 
pDimensions, + constant uint * pCropDimension, + constant uint * pCropOffsets, + constant uint * pNbBatch, + device half * outs, + uint2 id [[ thread_position_in_grid ]]) +{ + uint height, width; + uint heightPrev, widthPrev; + uint nbChannels; + uint cropDimension; + uint offsetI, offsetJ; + uint nbBatch; + + if (pNbChannels && pDimensions && pNbBatch && + pCropDimension && pCropOffsets && outsPrev && outs) + { + width = pDimensions[0]; + height = pDimensions[1]; + nbChannels = *pNbChannels; + cropDimension = *pCropDimension; + offsetJ = pCropOffsets[0]; + offsetI = pCropOffsets[1]; + widthPrev = width + cropDimension; + heightPrev = height + cropDimension; + nbBatch = *pNbBatch; + } + else + return ; + + uint depth = id[0] / width; + uint elem = id[1] / height; + uint i = id[1] % height; + uint j = id[0] % width; + + if (i * elem >= height * nbBatch || + j * depth >= width * nbChannels) + { + return ; + } + + uint offsetStartPrev = (depth + nbChannels * elem) * heightPrev; + uint offsetPrev = j+offsetJ + + (offsetStartPrev + i+offsetI) * widthPrev; + + uint offsetStart = (depth + nbChannels * elem) * height; + uint offset = j + (offsetStart + i) * width; + + outs[offset] = outsPrev[offsetPrev]; +} + +kernel void crop2DBackwardHalf( + const device half * delta, + constant uint * pNbChannels, + constant uint * pDimensions, + constant uint * pCropDimension, + constant uint * pCropOffsets, + constant uint * pNbBatch, + constant uint * pDirty, + device half * deltaPrev, + uint2 id [[ thread_position_in_grid ]]) +{ + uint height, width; + uint heightPrev, widthPrev; + uint nbChannels; + uint cropDimension; + uint offsetI, offsetJ; + uint nbBatch; + uint dirty; + + if (pNbChannels && pDimensions && pNbBatch && + pCropDimension && pCropOffsets && pDirty && + delta && deltaPrev) + { + width = pDimensions[0]; + height = pDimensions[1]; + nbChannels = *pNbChannels; + cropDimension = *pCropDimension; + offsetJ = pCropOffsets[0]; + offsetI = pCropOffsets[1]; + widthPrev = width + cropDimension; + heightPrev = height + cropDimension; + nbBatch = *pNbBatch; + dirty = *pDirty; + } + else + return ; + + uint depth = id[0] / widthPrev; + uint elem = id[1] / heightPrev; + uint i = id[1] % heightPrev; + uint j = id[0] % widthPrev; + + if (i * elem >= heightPrev * nbBatch || + j * depth >= widthPrev * nbChannels) + { + return ; + } + + uint offsetStartPrev = (depth + nbChannels * elem) * heightPrev; + uint offsetPrev = j + (offsetStartPrev + i) * widthPrev; + + if (dirty && + (i < offsetI || i >= height + offsetI || + j < offsetJ || j >= width + offsetJ)) + { + deltaPrev[offsetPrev] = 0.0; + } + else if (dirty) + { + uint offsetStart = (depth + nbChannels * elem) * height; + uint offset = j-offsetJ + (offsetStart + i-offsetI) * width; + + deltaPrev[offsetPrev] = delta[offset]; + } + else if (i >= offsetI && i < height + offsetI && + j >= offsetJ && j < width + offsetJ) + { + uint offsetStart = (depth + nbChannels * elem) * height; + uint offset = j-offsetJ + (offsetStart + i-offsetI) * width; + + deltaPrev[offsetPrev] += delta[offset]; + } +} + +kernel void resizeBilinearPadForwardHalf( + const device half * outsPrev, + constant uint * pNbChannels, + constant uint * pDimensions, + constant uint * pDimensionsPrev, + constant uint * pDimensionsResize, + constant uint * pPadDimensions, + constant float * pPadValue, + constant uint * pNbBatch, + device half * outs, + uint2 id [[ thread_position_in_grid ]]) +{ + uint height, width; + uint heightPrev, widthPrev; + uint heightResize, widthResize; + uint nbChannels; 
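+    // Output layout: a bilinearly resized copy of the previous layer,
+    // surrounded by a constant border of padStartI/padEndI rows and
+    // padStartJ/padEndJ columns filled with padValue.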
+ uint padStartI, padEndI; + uint padStartJ, padEndJ; + float padValue; + uint nbBatch; + + if (pNbChannels && pDimensions && pDimensionsPrev && pDimensionsResize && + pPadDimensions && pPadValue && pNbBatch && outsPrev && outs) + { + width = pDimensions[0]; + height = pDimensions[1]; + widthPrev = pDimensionsPrev[0]; + heightPrev = pDimensionsPrev[1]; + widthResize = pDimensionsResize[0]; + heightResize = pDimensionsResize[1]; + padStartI = pPadDimensions[0]; + padEndI = pPadDimensions[1]; + padStartJ = pPadDimensions[2]; + padEndJ = pPadDimensions[3]; + padValue = *pPadValue; + nbChannels = *pNbChannels; + nbBatch = *pNbBatch; + } + else + return ; + + uint depth = id[0] / width; + uint elem = id[1] / height; + uint i = id[1] % height; + uint j = id[0] % width; + + if (i * elem >= height * nbBatch || + j * depth >= width * nbChannels) + { + return ; + } + + float ratioInOutI = float(heightPrev - 1) / float(heightResize - 1); + float ratioInOutJ = float(widthPrev - 1) / float(widthResize - 1); + + uint offsetStart = (depth + nbChannels * elem) * height; + uint offset = j + (offsetStart + i) * width; + + if (i < padStartI || i >= height - padEndI || + j < padStartJ || j >= width - padEndJ) + { + outs[offset] = padValue; + } + else + { + float I = i-padStartI; + float J = j-padStartJ; + + float iPrev = I * ratioInOutI; + float jPrev = J * ratioInOutJ; + + uint iPrevInf = floor(iPrev); + uint iPrevSup = ceil(iPrev); + uint jPrevInf = floor(jPrev); + uint jPrevSup = ceil(jPrev); + + float iWeight = ratioInOutI * I - iPrevInf; + float jWeight = ratioInOutJ * J - jPrevInf; + + uint offsetStartPrev = (depth + nbChannels * elem) * heightPrev; + uint offsetPrev11 = jPrevInf + (offsetStartPrev + iPrevInf) * widthPrev; + uint offsetPrev12 = jPrevSup + (offsetStartPrev + iPrevInf) * widthPrev; + uint offsetPrev21 = jPrevInf + (offsetStartPrev + iPrevSup) * widthPrev; + uint offsetPrev22 = jPrevSup + (offsetStartPrev + iPrevSup) * widthPrev; + + float out = outsPrev[offsetPrev11] * (1.0 - iWeight) * (1.0 - jWeight); + out += outsPrev[offsetPrev12] * (1.0 - iWeight) * jWeight; + out += outsPrev[offsetPrev21] * iWeight * (1.0 - jWeight); + out += outsPrev[offsetPrev22] * iWeight * jWeight; + + outs[offset] = out; + } +} + +kernel void resizeBilinearPadBackwardHalf( + const device half * delta, + constant uint * pNbChannels, + constant uint * pDimensions, + constant uint * pDimensionsPrev, + constant uint * pDimensionsResize, + constant uint * pPadDimensions, + constant uint * pNbBatch, + device half * deltaPrev, + uint2 id [[ thread_position_in_grid ]]) +{ + uint height, width; + uint heightPrev, widthPrev; + uint heightResize, widthResize; + uint nbChannels; + uint padStartI, padEndI; + uint padStartJ, padEndJ; + uint nbBatch; + + if (pNbChannels && pDimensions && pDimensionsPrev && pDimensionsResize && + pPadDimensions && pNbBatch && delta && deltaPrev) + { + width = pDimensions[0]; + height = pDimensions[1]; + widthPrev = pDimensionsPrev[0]; + heightPrev = pDimensionsPrev[1]; + widthResize = pDimensionsResize[0]; + heightResize = pDimensionsResize[1]; + padStartI = pPadDimensions[0]; + padEndI = pPadDimensions[1]; + padStartJ = pPadDimensions[2]; + padEndJ = pPadDimensions[3]; + nbChannels = *pNbChannels; + nbBatch = *pNbBatch; + } + else + return ; + + uint depth = id[0] / widthPrev; + uint elem = id[1] / heightPrev; + uint i = id[1] % heightPrev; + uint j = id[0] % widthPrev; + + if (i * elem >= heightPrev * nbBatch || + j * depth >= widthPrev * nbChannels) + { + return ; + } + + float ratioInOutI 
= float(heightPrev - 1) / float(heightResize - 1); + float ratioInOutJ = float(widthPrev - 1) / float(widthResize - 1); + + uint offsetStart = (depth + nbChannels * elem) * height; + uint offsetStartPrev = (depth + nbChannels * elem) * heightPrev; + uint offsetPrev = j + (offsetStartPrev + i) * widthPrev; + + float kLow = (i-1.0) / ratioInOutI; + float kHigh = (i+1.0) / ratioInOutI; + float lLow = (j-1.0) / ratioInOutJ; + float lHigh = (j+1.0) / ratioInOutJ; + + int kStart = ceil(kLow); + int kEnd = floor(kHigh); + int lStart = ceil(lLow); + int lEnd = floor(lHigh); + + for (int k = kStart; k <= kEnd; k++) { + for (int l = lStart; l <= lEnd; l++) + { + if (k >= 0 && k < (int)heightResize && + l >= 0 && l < (int)widthResize) + { + float kPrev = k * ratioInOutI; + float lPrev = l * ratioInOutJ; + + uint kPrevInf = floor(kPrev); + uint kPrevSup = ceil(kPrev); + uint lPrevInf = floor(lPrev); + uint lPrevSup = ceil(lPrev); + + float kWeight = ratioInOutI * k - kPrevInf; + float lWeight = ratioInOutJ * l - lPrevInf; + + if (kPrevInf == i && lPrevInf == j) + { + uint offset = l+padStartJ + + (offsetStart + k+padStartI) * width; + float deltaCur = delta[offset]; + + deltaPrev[offsetPrev] += + deltaCur * (1.0 - kWeight) * (1.0 - lWeight); + } + else if (kPrevInf == i && lPrevSup == j) + { + uint offset = l+padStartJ + + (offsetStart + k+padStartI) * width; + float deltaCur = delta[offset]; + + deltaPrev[offsetPrev] += deltaCur * (1.0 - kWeight) * lWeight; + } + else if (kPrevSup == i && lPrevInf == j) + { + uint offset = l+padStartJ + + (offsetStart + k+padStartI) * width; + float deltaCur = delta[offset]; + + deltaPrev[offsetPrev] += deltaCur * kWeight * (1.0 - lWeight); + } + else if (kPrevSup == i && lPrevSup == j) + { + uint offset = l+padStartJ + + (offsetStart + k+padStartI) * width; + float deltaCur = delta[offset]; + + deltaPrev[offsetPrev] += deltaCur * kWeight * lWeight; + } + } + }} +} + +kernel void rotate2DForwardHalf( + const device half * outsPrev, + constant uint * pNbChannels, + constant uint * pDimensions, + constant float * pAngle, + constant float * pPadValue, + constant uint * pNbBatch, + device half * outs, + uint2 id [[ thread_position_in_grid ]]) +{ + uint height, width; + uint nbChannels; + float angle, padValue; + uint nbBatch; + + if (pNbChannels && pDimensions && pAngle && pPadValue && pNbBatch && + outsPrev && outs) + { + width = pDimensions[0]; + height = pDimensions[1]; + angle = *pAngle; + padValue = *pPadValue; + nbChannels = *pNbChannels; + nbBatch = *pNbBatch; + } + else + return ; + + uint depth = id[0] / width; + uint elem = id[1] / height; + uint i = id[1] % height; + uint j = id[0] % width; + + if (i * elem >= height * nbBatch || + j * depth >= width * nbChannels) + { + return ; + } + + float centerI = float(height - 1) / 2.0; + float centerJ = float(width - 1) / 2.0; + + uint offsetStart = (depth + nbChannels * elem) * height; + uint offset = j + (offsetStart + i) * width; + + float prevJ = + cos(-angle) * (float(j) - centerJ) + + sin(-angle) * (float(i) - centerI) + centerJ; + float prevI = + cos(-angle) * (float(i) - centerI) - + sin(-angle) * (float(j) - centerJ) + centerI; + + if (round(prevJ) < 0 || round(prevJ) >= float(width) || + round(prevI) < 0 || round(prevI) >= float(height)) + { + outs[offset] = padValue; + } + else + { + uint offsetPrev = round(prevJ) + (offsetStart + round(prevI)) * width; + outs[offset] = outsPrev[offsetPrev]; + } +} + +kernel void rotate2DBackwardHalf( + const device half * delta, + constant uint * pNbChannels, + constant 
uint * pDimensions, + constant float * pAngle, + constant uint * pNbBatch, + device half * deltaPrev, + uint2 id [[ thread_position_in_grid ]]) +{ + uint height, width; + uint nbChannels; + float angle; + uint nbBatch; + + if (pNbChannels && pDimensions && pAngle && pNbBatch && + delta && deltaPrev) + { + width = pDimensions[0]; + height = pDimensions[1]; + angle = *pAngle; + nbChannels = *pNbChannels; + nbBatch = *pNbBatch; + } + else + return ; + + uint depth = id[0] / width; + uint elem = id[1] / height; + uint i = id[1] % height; + uint j = id[0] % width; + + if (i * elem >= height * nbBatch || + j * depth >= width * nbChannels) + { + return ; + } + + float centerI = float(height - 1) / 2.0; + float centerJ = float(width - 1) / 2.0; + + uint offsetStart = (depth + nbChannels * elem) * height; + uint offsetPrev = j + (offsetStart + i) * width; + + float rotJ = + cos(angle) * (float(j) - centerJ) + + sin(angle) * (float(i) - centerI) + centerJ; + float rotI = + cos(angle) * (float(i) - centerI) - + sin(angle) * (float(j) - centerJ) + centerI; + + for (int k = floor(rotI); k <= ceil(rotI); k++) { + for (int l = floor(rotJ); l <= ceil(rotJ); l++) + { + float prevL = + cos(-angle) * (float(l) - centerJ) + + sin(-angle) * (float(k) - centerI) + centerJ; + float prevK = + cos(-angle) * (float(k) - centerI) - + sin(-angle) * (float(l) - centerJ) + centerI; + + if (round(prevL) == j && round(prevK) == i && + l >= 0 && l < (int)width && k >= 0 && k < (int)height) + { + uint offset = l + (offsetStart + k) * width; + deltaPrev[offsetPrev] += delta[offset]; + } + }} +} + +kernel void resizeBilinearCropForwardHalf( + const device half * outsPrev, + constant uint * pNbChannels, + constant uint * pDimensions, + constant uint * pDimensionsPrev, + constant uint * pDimensions2Resize, + constant uint * pCropOffsets, + constant uint * pNbBatch, + device half * outs, + uint2 id [[ thread_position_in_grid ]]) +{ + uint height, width; + uint heightPrev, widthPrev; + uint height2Resize, width2Resize; + uint offsetI, offsetJ; + uint nbChannels; + uint nbBatch; + + if (pNbChannels && pDimensions && pDimensionsPrev && pDimensions2Resize && + pCropOffsets && pNbBatch && outsPrev && outs) + { + width = pDimensions[0]; + height = pDimensions[1]; + widthPrev = pDimensionsPrev[0]; + heightPrev = pDimensionsPrev[1]; + width2Resize = pDimensions2Resize[0]; + height2Resize = pDimensions2Resize[1]; + offsetJ = pCropOffsets[0]; + offsetI = pCropOffsets[1]; + nbChannels = *pNbChannels; + nbBatch = *pNbBatch; + } + else + return ; + + uint depth = id[0] / width; + uint elem = id[1] / height; + uint i = id[1] % height; + uint j = id[0] % width; + + if (i * elem >= height * nbBatch || + j * depth >= width * nbChannels) + { + return ; + } + + float ratioInOutI = float(height2Resize - 1) / float(height - 1); + float ratioInOutJ = float(width2Resize - 1) / float(width - 1); + + uint offsetStart = (depth + nbChannels * elem) * height; + uint offset = j + (offsetStart + i) * width; + + float iPrev = i * ratioInOutI; + float jPrev = j * ratioInOutJ; + + uint iPrevInf = floor(iPrev); + uint iPrevSup = ceil(iPrev); + uint jPrevInf = floor(jPrev); + uint jPrevSup = ceil(jPrev); + + float iWeight = ratioInOutI * i - iPrevInf; + float jWeight = ratioInOutJ * j - jPrevInf; + + uint offsetStartPrev = (depth + nbChannels * elem) * heightPrev; + uint offsetPrev11 = jPrevInf+offsetJ + + (offsetStartPrev + iPrevInf+offsetI) * widthPrev; + uint offsetPrev12 = jPrevSup+offsetJ + + (offsetStartPrev + iPrevInf+offsetI) * widthPrev; + uint 
offsetPrev21 = jPrevInf+offsetJ + + (offsetStartPrev + iPrevSup+offsetI) * widthPrev; + uint offsetPrev22 = jPrevSup+offsetJ + + (offsetStartPrev + iPrevSup+offsetI) * widthPrev; + + float out = outsPrev[offsetPrev11] * (1.0 - iWeight) * (1.0 - jWeight); + out += outsPrev[offsetPrev12] * (1.0 - iWeight) * jWeight; + out += outsPrev[offsetPrev21] * iWeight * (1.0 - jWeight); + out += outsPrev[offsetPrev22] * iWeight * jWeight; + + outs[offset] = out; +} + +kernel void resizeBilinearCropBackwardHalf( + const device half * delta, + constant uint * pNbChannels, + constant uint * pDimensions, + constant uint * pDimensionsPrev, + constant uint * pDimensions2Resize, + constant uint * pCropOffsets, + constant uint * pNbBatch, + device half * deltaPrev, + uint2 id [[ thread_position_in_grid ]]) +{ + uint height, width; + uint heightPrev, widthPrev; + uint height2Resize, width2Resize; + uint offsetI, offsetJ; + uint nbChannels; + uint nbBatch; + + if (pNbChannels && pDimensions && pDimensionsPrev && pDimensions2Resize && + pCropOffsets && pNbBatch && delta && deltaPrev) + { + width = pDimensions[0]; + height = pDimensions[1]; + widthPrev = pDimensionsPrev[0]; + heightPrev = pDimensionsPrev[1]; + width2Resize = pDimensions2Resize[0]; + height2Resize = pDimensions2Resize[1]; + offsetJ = pCropOffsets[0]; + offsetI = pCropOffsets[1]; + nbChannels = *pNbChannels; + nbBatch = *pNbBatch; + } + else + return ; + + uint depth = id[0] / widthPrev; + uint elem = id[1] / heightPrev; + uint i = id[1] % heightPrev; + uint j = id[0] % widthPrev; + + if (i * elem >= heightPrev * nbBatch || + j * depth >= widthPrev * nbChannels) + { + return ; + } + if (i < offsetI || i >= height2Resize + offsetI || + j < offsetJ || j >= width2Resize + offsetJ) + { + return ; + } + + float ratioInOutI = float(height2Resize - 1) / float(height - 1); + float ratioInOutJ = float(width2Resize - 1) / float(width - 1); + + uint offsetStart = (depth + nbChannels * elem) * height; + uint offsetStartPrev = (depth + nbChannels * elem) * heightPrev; + uint offsetPrev = j + (offsetStartPrev + i) * widthPrev; + + float I = i-offsetI; + float J = j-offsetJ; + + float kLow = (I-1.0) / ratioInOutI; + float kHigh = (I+1.0) / ratioInOutI; + float lLow = (J-1.0) / ratioInOutJ; + float lHigh = (J+1.0) / ratioInOutJ; + + int kStart = ceil(kLow); + int kEnd = floor(kHigh); + int lStart = ceil(lLow); + int lEnd = floor(lHigh); + + for (int k = kStart; k <= kEnd; k++) { + for (int l = lStart; l <= lEnd; l++) + { + if (k >= 0 && k < (int)height && + l >= 0 && l < (int)width) + { + float kPrev = k * ratioInOutI; + float lPrev = l * ratioInOutJ; + + uint kPrevInf = floor(kPrev); + uint kPrevSup = ceil(kPrev); + uint lPrevInf = floor(lPrev); + uint lPrevSup = ceil(lPrev); + + float kWeight = ratioInOutI * k - kPrevInf; + float lWeight = ratioInOutJ * l - lPrevInf; + + if (kPrevInf == I && lPrevInf == J) + { + uint offset = l + (offsetStart + k) * width; + float deltaCur = delta[offset]; + + deltaPrev[offsetPrev] += + deltaCur * (1.0 - kWeight) * (1.0 - lWeight); + } + else if (kPrevInf == I && lPrevSup == J) + { + uint offset = l + (offsetStart + k) * width; + float deltaCur = delta[offset]; + + deltaPrev[offsetPrev] += deltaCur * (1.0 - kWeight) * lWeight; + } + else if (kPrevSup == I && lPrevInf == J) + { + uint offset = l + (offsetStart + k) * width; + float deltaCur = delta[offset]; + + deltaPrev[offsetPrev] += deltaCur * kWeight * (1.0 - lWeight); + } + else if (kPrevSup == I && lPrevSup == J) + { + uint offset = l + (offsetStart + k) * width; + float 
deltaCur = delta[offset]; + + deltaPrev[offsetPrev] += deltaCur * kWeight * lWeight; + } + } + }} +} + +kernel void concat02DForwardHalf( + const device half * outsPrev, + constant uint * pGlobalOffset, + constant uint * pNbChannels, + constant uint * pDimensions, + constant uint * pNbBatch, + device half * outs, + uint2 id [[ thread_position_in_grid ]]) +{ + uint height, width; + uint nbChannels; + uint nbBatch; + uint globalOffset; + + if (pGlobalOffset && pNbChannels && pDimensions && + pNbBatch && outsPrev && outs) + { + width = pDimensions[0]; + height = pDimensions[1]; + nbChannels = *pNbChannels; + nbBatch = *pNbBatch; + globalOffset = *pGlobalOffset; + } + else + return ; + + uint depth = id[0] / width; + uint elem = id[1] / height; + uint i = id[1] % height; + uint j = id[0] % width; + + if (i * elem >= height * nbBatch || + j * depth >= width * nbChannels) + { + return ; + } + + uint offsetStartPrev = (depth + nbChannels * elem) * height; + uint offsetStart = (depth + nbChannels * (globalOffset+elem)) * height; + + uint offsetPrev = j + (offsetStartPrev + i) * width; + uint offset = j + (offsetStart + i) * width; + + outs[offset] = outsPrev[offsetPrev]; +} + +kernel void concat02DBackwardHalf( + const device half * delta, + constant uint * pGlobalOffset, + constant uint * pNbChannels, + constant uint * pDimensions, + constant uint * pNbBatch, + constant uint * pDirty, + device half * deltaPrev, + uint2 id [[ thread_position_in_grid ]]) +{ + uint height, width; + uint nbChannels; + uint nbBatch; + uint globalOffset; + uint dirty; + + if (pGlobalOffset && pNbChannels && pDimensions && + pNbBatch && pDirty && delta && deltaPrev) + { + width = pDimensions[0]; + height = pDimensions[1]; + nbChannels = *pNbChannels; + nbBatch = *pNbBatch; + globalOffset = *pGlobalOffset; + dirty = *pDirty; + } + else + return ; + + uint depth = id[0] / width; + uint elem = id[1] / height; + uint i = id[1] % height; + uint j = id[0] % width; + + if (i * elem >= height * nbBatch || + j * depth >= width * nbChannels) + { + return ; + } + + uint offsetStartPrev = (depth + nbChannels * elem) * height; + uint offsetStart = (depth + nbChannels * (globalOffset+elem)) * height; + + uint offsetPrev = j + (offsetStartPrev + i) * width; + uint offset = j + (offsetStart + i) * width; + + if (dirty) + { + deltaPrev[offsetPrev] = delta[offset]; + } + else + { + deltaPrev[offsetPrev] += delta[offset]; + } +} + +kernel void concat12DForwardHalf( + const device half * outsPrev, + constant uint * pGlobalOffset, + constant uint * pNbChannels, + constant uint * pNbChannelsPrev, + constant uint * pDimensions, + constant uint * pNbBatch, + device half * outs, + uint2 id [[ thread_position_in_grid ]]) +{ + uint height, width; + uint nbChannels; + uint nbChannelsPrev; + uint nbBatch; + uint globalOffset; + + if (pGlobalOffset && pNbChannels && pNbChannelsPrev && pDimensions && + pNbBatch && outsPrev && outs) + { + width = pDimensions[0]; + height = pDimensions[1]; + nbChannels = *pNbChannels; + nbChannelsPrev = *pNbChannelsPrev; + nbBatch = *pNbBatch; + globalOffset = *pGlobalOffset; + } + else + return ; + + uint depthPrev = id[0] / width; + uint elem = id[1] / height; + uint i = id[1] % height; + uint j = id[0] % width; + + if (i * elem >= height * nbBatch || + j * depthPrev >= width * nbChannelsPrev) + { + return ; + } + + uint offsetStartPrev = (depthPrev + nbChannelsPrev * elem) * height; + uint offsetStart = (globalOffset+depthPrev + nbChannels * elem) * height; + + uint offsetPrev = j + (offsetStartPrev + i) * width; + 
uint offset = j + (offsetStart + i) * width;
+
+    outs[offset] = outsPrev[offsetPrev];
+}
+
+kernel void concat12DBackwardHalf(
+    const device half * delta,
+    constant uint * pGlobalOffset,
+    constant uint * pNbChannels,
+    constant uint * pNbChannelsPrev,
+    constant uint * pDimensions,
+    constant uint * pNbBatch,
+    constant uint * pDirty,
+    device half * deltaPrev,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint height, width;
+    uint nbChannels;
+    uint nbChannelsPrev;
+    uint nbBatch;
+    uint globalOffset;
+    uint dirty;
+
+    if (pGlobalOffset && pNbChannels && pNbChannelsPrev && pDimensions &&
+        pNbBatch && pDirty && delta && deltaPrev)
+    {
+        width = pDimensions[0];
+        height = pDimensions[1];
+        nbChannels = *pNbChannels;
+        nbChannelsPrev = *pNbChannelsPrev;
+        nbBatch = *pNbBatch;
+        globalOffset = *pGlobalOffset;
+        dirty = *pDirty;
+    }
+    else
+        return ;
+
+    uint depthPrev = id[0] / width;
+    uint elem = id[1] / height;
+    uint i = id[1] % height;
+    uint j = id[0] % width;
+
+    if (i * elem >= height * nbBatch ||
+        j * depthPrev >= width * nbChannelsPrev)
+    {
+        return ;
+    }
+
+    uint offsetStartPrev = (depthPrev + nbChannelsPrev * elem) * height;
+    uint offsetStart = (globalOffset+depthPrev + nbChannels * elem) * height;
+
+    uint offsetPrev = j + (offsetStartPrev + i) * width;
+    uint offset = j + (offsetStart + i) * width;
+
+    if (dirty)
+    {
+        deltaPrev[offsetPrev] = delta[offset];
+    }
+    else
+    {
+        deltaPrev[offsetPrev] += delta[offset];
+    }
+}
+
+kernel void constant2DForwardHalf(
+    const device half * weights,
+    constant uint * pNbChannels,
+    constant uint * pDimensions,
+    constant uint * pNbBatch,
+    device half * outs,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint height, width;
+    uint nbChannels;
+    uint nbBatch;
+
+    if (pNbChannels && pDimensions && pNbBatch && weights && outs)
+    {
+        width = pDimensions[0];
+        height = pDimensions[1];
+        nbChannels = *pNbChannels;
+        nbBatch = *pNbBatch;
+    }
+    else
+        return ;
+
+    uint depth = id[0] / width;
+    uint elem = id[1] / height;
+    uint i = id[1] % height;
+    uint j = id[0] % width;
+
+    if (i * elem >= height * nbBatch ||
+        j * depth >= width * nbChannels)
+    {
+        return ;
+    }
+
+    uint offsetStart = (depth + nbChannels * elem) * height;
+    uint offset = j + (offsetStart + i) * width;
+
+    outs[offset] = weights[depth];
+}
+
+kernel void MSE2DLossHalf(
+    const device half * outs,
+    const device half * groundTruth,
+    constant uint * pNbChannels,
+    constant uint * pDimensions,
+    constant uint * pNbBatch,
+    device half * losses,
+    uint id [[ thread_position_in_grid ]])
+{
+    uint height, width;
+    uint nbChannels;
+    uint nbBatch;
+
+    if (pNbChannels && pDimensions && pNbBatch && outs && groundTruth && losses)
+    {
+        width = pDimensions[0];
+        height = pDimensions[1];
+        nbChannels = *pNbChannels;
+        nbBatch = *pNbBatch;
+    }
+    else
+        return ;
+
+    uint elem = id;
+    if (elem >= nbBatch)
+    {
+        return ;
+    }
+
+    float tmp = 0.0;
+    for (uint depth=0; depth<nbChannels; depth++) {
+    for (uint i=0; i<height; i++){
+    for (uint j=0; j<width; j++)
+    {
+        uint offsetStart = (depth + nbChannels * elem) * height;
+        uint offset = j + (offsetStart + i) * width;
+
+        float gt = groundTruth[offset];
+        float out = outs[offset];
+        float diff = out - gt;
+
+        tmp += diff * diff;
+    }}
+    }
+
+    losses[elem] = tmp;
+}
+
+kernel void MSE2DLossDerivativeHalf(
+    const device half * outs,
+    const device half * groundTruth,
+    constant uint * pNbChannels,
+    constant uint * pDimensions,
+    constant float * pCoeff,
+    constant uint * pNbBatch,
+    constant uint * pDirty,
+    device half * deltaPrev,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint height, width;
+    uint nbChannels;
+    float coeff;
+    uint nbBatch;
+    uint dirty;
+
+    if (pNbChannels && pDimensions && pCoeff && pNbBatch && pDirty &&
+        outs && groundTruth && deltaPrev)
+    {
+        width = pDimensions[0];
+        height = pDimensions[1];
+        nbChannels = *pNbChannels;
+        coeff = *pCoeff;
+        nbBatch = *pNbBatch;
+        dirty = *pDirty;
+    }
+    else
+        return ;
+
+    uint depth = id[0] / width;
+    uint elem = id[1] / height;
+    uint i = id[1] % height;
+    uint j = id[0] % width;
+
+    if (i * elem >= height * nbBatch ||
+        j * depth >= width * nbChannels)
+    {
+        return ;
+    }
+
+    uint offsetStart = (depth + nbChannels * elem) * height;
+    uint offset = j + (offsetStart + i) * width;
+
+    float gt = groundTruth[offset];
+    float out = outs[offset];
+    float diff = out - gt;
+
+    if (dirty)
+    {
+        deltaPrev[offset] = 2 * coeff * diff /
+            float(nbBatch * nbChannels * height * width);
+    }
+    else
+    {
+        deltaPrev[offset] += 2 * coeff * diff /
+            float(nbBatch * nbChannels * height * width);
+    }
+}
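A minimal CPU reference for the pair of kernels above (plain C++; the function name and flat buffer layout are illustrative assumptions, not part of the patch): the derivative kernel computes the gradient of coeff times the mean squared error, i.e. 2 * coeff * (out - gt) / N with N = nbBatch * nbChannels * height * width.

    #include <cstddef>

    // Gradient of coeff * mean((out - gt)^2) over a flat buffer of N values,
    // matching the dirty == 1 branch of MSE2DLossDerivativeHalf above.
    void mse2DGradientRef(
        const float * outs, const float * groundTruth, float * grad,
        float coeff, size_t nbElems)
    {
        for (size_t k = 0; k < nbElems; k++)
        {
            grad[k] = 2.0f * coeff * (outs[k] - groundTruth[k]) / float(nbElems);
        }
    }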
+kernel void selfCorrelate2DForwardHalf(
+    const device half * outsPrev,
+    constant uint * pNbChannelsPrev,
+    constant uint * pDimensionsPrev,
+    constant uint * pNbBatch,
+    device half * outs,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint heightPrev, widthPrev;
+    uint nbChannelsPrev;
+    uint nbBatch;
+
+    if (pNbChannelsPrev && pDimensionsPrev && pNbBatch &&
+        outsPrev && outs)
+    {
+        widthPrev = pDimensionsPrev[0];
+        heightPrev = pDimensionsPrev[1];
+        nbChannelsPrev = *pNbChannelsPrev;
+        nbBatch = *pNbBatch;
+    }
+    else
+        return ;
+
+    uint channel1 = id[0] / nbChannelsPrev;
+    uint channel2 = id[0] % nbChannelsPrev;
+    uint elem = id[1];
+
+    if (channel1 * channel2 >= nbChannelsPrev * nbChannelsPrev ||
+        elem >= nbBatch)
+    {
+        return ;
+    }
+
+    uint offsetStart1 = (channel1 + nbChannelsPrev * elem) * heightPrev;
+    uint offsetStart2 = (channel2 + nbChannelsPrev * elem) * heightPrev;
+
+    float correlation = 0.0;
+    for (uint i=0; i<heightPrev; i++){
+    for (uint j=0; j<widthPrev; j++)
+    {
+        uint offset1 = j + (offsetStart1 + i) * widthPrev;
+        uint offset2 = j + (offsetStart2 + i) * widthPrev;
+
+        correlation += outsPrev[offset1] * outsPrev[offset2];
+    }}
+
+    uint offset = channel2 + nbChannelsPrev * channel1 +
+        nbChannelsPrev * nbChannelsPrev * elem;
+    outs[offset] = correlation;
+}
+
+kernel void selfCorrelate2DBackwardHalf(
+    const device half * delta,
+    const device half * outsPrev,
+    constant uint * pNbChannelsPrev,
+    constant uint * pDimensionsPrev,
+    constant uint * pNbBatch,
+    device half * deltaPrev,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint heightPrev, widthPrev;
+    uint nbChannelsPrev;
+    uint nbBatch;
+
+    if (pNbChannelsPrev && pDimensionsPrev && pNbBatch &&
+        delta && outsPrev && deltaPrev)
+    {
+        widthPrev = pDimensionsPrev[0];
+        heightPrev = pDimensionsPrev[1];
+        nbChannelsPrev = *pNbChannelsPrev;
+        nbBatch = *pNbBatch;
+    }
+    else
+        return ;
+
+    uint depthPrev = id[0] / widthPrev;
+    uint elem = id[1] / heightPrev;
+    uint i = id[1] % heightPrev;
+    uint j = id[0] % widthPrev;
+
+    if (i * elem >= heightPrev * nbBatch ||
+        j * depthPrev >= widthPrev * nbChannelsPrev)
+    {
+        return ;
+    }
+
+    float correlation = 0.0;
+    for (uint col=0; col<nbChannelsPrev; col++)
+    {
+        uint offset1 = col + nbChannelsPrev * depthPrev +
+            nbChannelsPrev * nbChannelsPrev * elem;
+        uint offset2 = depthPrev + nbChannelsPrev * col +
+            nbChannelsPrev * nbChannelsPrev * elem;
+
+        uint offsetStart1 = (col + nbChannelsPrev * elem) * heightPrev;
+        uint offsetPrev1 = j + (offsetStart1 + i) * widthPrev;
+
+        correlation += outsPrev[offsetPrev1] * (delta[offset1] + delta[offset2]);
+    }
+
+    uint offsetStartPrev = (depthPrev + nbChannelsPrev * elem) * heightPrev;
+    uint offsetPrev = j + (offsetStartPrev + i) * widthPrev;
+
+    deltaPrev[offsetPrev] += correlation;
+}
+
+kernel void normalize12DForwardHalf(
+    const device half * outsPrev,
+    constant uint * pNbChannels,
+    constant uint * pDimensions,
+    constant uint * pNbBatch,
+    device half * outs,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint height, width;
+    uint nbChannels;
+    uint nbBatch;
+
+    if (pNbChannels && pDimensions && pNbBatch && outsPrev && outs)
+    {
+        width = pDimensions[0];
+        height = pDimensions[1];
+        nbChannels = *pNbChannels;
+        nbBatch = *pNbBatch;
+    }
+    else
+        return ;
+
+    uint depth = id[0] / width;
+    uint elem = id[1] / height;
+    uint i = id[1] % height;
+    uint j = id[0] % width;
+
+    if (i * elem >= height * nbBatch ||
+        j * depth >= width * nbChannels)
+    {
+        return ;
+    }
+
+    float norm = 0.0;
+    for (uint depth1=0; depth1<nbChannels; depth1++)
+    {
+        uint offsetStart1 = (depth1 + nbChannels * elem) * height;
+        uint offset1 = j + (offsetStart1 + i) * width;
+
+        float outPrev = outsPrev[offset1];
+        norm += outPrev * outPrev;
+    }
+    norm = sqrt(norm);
+
+    uint offsetStart = (depth + nbChannels * elem) * height;
+    uint offset = j + (offsetStart + i) * width;
+
+    outs[offset] = outsPrev[offset] / max(norm, 1e-12);
+}
+
+kernel void normalize12DBackwardHalf(
+    const device half * delta,
+    const device half * outsPrev,
+    constant uint * pNbChannels,
+    constant uint * pDimensions,
+    constant uint * pNbBatch,
+    constant uint * pDirty,
+    device half * deltaPrev,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint height, width;
+    uint nbChannels;
+    uint nbBatch;
+    uint dirty;
+
+    if (pNbChannels && pDimensions && pNbBatch && pDirty &&
+        delta && outsPrev && deltaPrev)
+    {
+        width = pDimensions[0];
+        height = pDimensions[1];
+        nbChannels = *pNbChannels;
+        nbBatch = *pNbBatch;
+        dirty = *pDirty;
+    }
+    else
+        return ;
+
+    uint depth = id[0] / width;
+    uint elem = id[1] / height;
+    uint i = id[1] % height;
+    uint j = id[0] % width;
+
+    if (i * elem >= height * nbBatch ||
+        j * depth >= width * nbChannels)
+    {
+        return ;
+    }
+
+    float normTmp = 0.0;
+    for (uint depth1=0; depth1<nbChannels; depth1++)
+    {
+        uint offsetStart1 = (depth1 + nbChannels * elem) * height;
+        uint offset1 = j + (offsetStart1 + i) * width;
+
+        float outPrev1 = outsPrev[offset1];
+        normTmp += outPrev1 * outPrev1;
+    }
+    float norm = sqrt(normTmp);
+
+    uint offsetStart = (depth + nbChannels * elem) * height;
+    uint offset = j + (offsetStart + i) * width;
+
+    float deltaCur = delta[offset];
+    float outPrev = outsPrev[offset];
+
+    float newValue = 0.0;
+    if (norm > 1e-12)
+    {
+        for (uint depth1=0; depth1<nbChannels; depth1++)
+        {
+            uint offsetStart1 = (depth1 + nbChannels * elem) * height;
+            uint offset1 = j + (offsetStart1 + i) * width;
+
+            newValue += delta[offset1] * outsPrev[offset1];
+        }
+        newValue = deltaCur / norm - newValue * outPrev / (norm * normTmp);
+    }
+    else
+    {
+        newValue = deltaCur / 1e-12;
+    }
+
+    if (dirty)
+    {
+        deltaPrev[offset] = newValue;
+    }
+    else
+    {
+        deltaPrev[offset] += newValue;
+    }
+}
+
+kernel void computeSquaredNorm122DHalf(
+    const device half * outsPrev,
+    constant uint * pNbChannels,
+    constant uint * pDimensions,
+    constant uint * pNbThreadgroups,
+    constant uint * pNbBatch,
+    device half * squaredNorms,
+    uint2 groupId [[ threadgroup_position_in_grid ]],
+    uint2 threadId [[ thread_position_in_threadgroup ]],
+    uint2 id [[ thread_position_in_grid ]])
+{
+    constexpr uint threadsPerThreadgroup = 64;
+    threadgroup float normShared[threadsPerThreadgroup];
+
+    uint height, width;
+    uint nbChannels;
+    uint nbThreadgroups;
+    uint nbBatch;
+
+    if (pNbChannels && pDimensions && pNbThreadgroups && pNbBatch &&
+        outsPrev && squaredNorms)
+    {
+        width = pDimensions[0];
+        height = pDimensions[1];
+        nbChannels = *pNbChannels;
+        nbThreadgroups = *pNbThreadgroups;
+        nbBatch = *pNbBatch;
+    }
+    else
+        return ;
+
+    uint elem = id[1];
+    uint remains = id[0];
+    uint depth = remains / (height * width);
+    remains = remains % (height * width);
+    uint i = remains / width;
+    uint j = remains % width;
+
+    if (depth * i * j >= nbChannels * height * width ||
+        elem >= nbBatch)
+    {
+        return ;
+    }
+
+    uint offsetStart = (depth + nbChannels * elem) * height;
+    uint offset = j + (offsetStart + i) * width;
+
+    float outPrev = outsPrev[offset];
+    normShared[threadId[0]] = outPrev * outPrev;
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+
+    for (uint stride=threadsPerThreadgroup/2; stride>0; stride>>=1)
+    {
+        uint index = threadId[0] + groupId[0] * threadsPerThreadgroup;
+        if (threadId[0] < stride &&
+            (index + stride) < nbChannels * height * width)
+        {
+            normShared[threadId[0]] += normShared[threadId[0] + stride];
+        }
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+    }
+
+    if (threadId[0] == 0)
+    {
+        uint offset = elem * nbThreadgroups + groupId[0];
+        squaredNorms[offset] = normShared[0];
+    }
+}
+
+kernel void normalize122DForwardHalf(
+    const device half * outsPrev,
+    const device half * squaredNorms,
+    constant uint * pNbChannels,
+    constant uint * pDimensions,
+    constant uint * pNbThreadgroups,
+    constant uint * pNbBatch,
+    device half * outs,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint height, width;
+    uint nbChannels;
+    uint nbThreadgroups;
+    uint nbBatch;
+
+    if (pNbChannels && pDimensions && pNbThreadgroups && pNbBatch &&
+        outsPrev && squaredNorms && outs)
+    {
+        width = pDimensions[0];
+        height = pDimensions[1];
+        nbChannels = *pNbChannels;
+        nbThreadgroups = *pNbThreadgroups;
+        nbBatch = *pNbBatch;
+    }
+    else
+        return ;
+
+    uint depth = id[0] / width;
+    uint elem = id[1] / height;
+    uint i = id[1] % height;
+    uint j = id[0] % width;
+
+    if (i * elem >= height * nbBatch ||
+        j * depth >= width * nbChannels)
+    {
+        return ;
+    }
+
+    uint offsetStart = (depth + nbChannels * elem) * height;
+    uint offset = j + (offsetStart + i) * width;
+
+    float norm = sqrt(squaredNorms[elem]);
+    float outPrev = outsPrev[offset];
+
+    outs[offset] = outPrev / max(norm, 1e-12);
+}
+
+kernel void computeDeltaTmp122DHalf(
+    const device half * delta,
+    const device half * outsPrev,
+    const device half * squaredNorms,
+    constant uint * pNbChannels,
+    constant uint * pDimensions,
+    constant uint * pNbThreadgroups,
+    constant uint * pNbBatch,
+    device half * deltaTmp,
+    uint2 groupId [[ threadgroup_position_in_grid ]],
+    uint2 threadId [[ thread_position_in_threadgroup ]],
+    uint2 id [[
thread_position_in_grid ]]) +{ + constexpr uint threadsPerThreadgroup = 64; + threadgroup float deltaShared[threadsPerThreadgroup]; + + uint height, width; + uint nbChannels; + uint nbThreadgroups; + uint nbBatch; + + if (pNbChannels && pDimensions && pNbThreadgroups && pNbBatch && + delta && outsPrev && squaredNorms && deltaTmp) + { + width = pDimensions[0]; + height = pDimensions[1]; + nbChannels = *pNbChannels; + nbThreadgroups = *pNbThreadgroups; + nbBatch = *pNbBatch; + } + else + return ; + + uint elem = id[1]; + uint remains = id[0]; + uint depth = remains / (height * width); + remains = remains % (height * width); + uint i = remains / width; + uint j = remains % width; + + if (depth * i * j >= nbChannels * height * width || + elem >= nbBatch) + { + return ; + } + + float norm = sqrt(squaredNorms[elem]); + if (norm > 1e-12) + { + uint offsetStart = (depth + nbChannels * elem) * height; + uint offset = j + (offsetStart + i) * width; + + float deltaCur = delta[offset]; + float outPrev = outsPrev[offset]; + + deltaShared[threadId[0]] = outPrev * deltaCur; + threadgroup_barrier(mem_flags::mem_threadgroup); + + for (uint stride=threadsPerThreadgroup/2; stride>0; stride>>=1) + { + uint index = threadId[0] + groupId[0] * threadsPerThreadgroup; + if (threadId[0] < stride && + (index + stride) < nbChannels * height * width) + { + deltaShared[threadId[0]] += deltaShared[threadId[0] + stride]; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + } + + if (threadId[0] == 0) + { + uint offset = elem * nbThreadgroups + groupId[0]; + deltaTmp[offset] = deltaShared[0]; + } + } +} + +kernel void normalize122DBackwardHalf( + const device half * delta, + const device half * outsPrev, + const device half * squaredNorms, + const device half * deltaTmp, + constant uint * pNbChannels, + constant uint * pDimensions, + constant uint * pNbThreadgroups, + constant uint * pNbBatch, + constant uint * pDirty, + device half * deltaPrev, + uint2 id [[ thread_position_in_grid ]]) +{ + uint height, width; + uint nbChannels; + uint nbThreadgroups; + uint nbBatch; + uint dirty; + + if (pNbChannels && pDimensions && pNbThreadgroups && pNbBatch && pDirty && + delta && outsPrev && squaredNorms && deltaTmp && deltaPrev) + { + width = pDimensions[0]; + height = pDimensions[1]; + nbChannels = *pNbChannels; + nbThreadgroups = *pNbThreadgroups; + nbBatch = *pNbBatch; + dirty = *pDirty; + } + else + return ; + + uint depth = id[0] / width; + uint elem = id[1] / height; + uint i = id[1] % height; + uint j = id[0] % width; + + if (i * elem >= height * nbBatch || + j * depth >= width * nbChannels) + { + return ; + } + + float norm = sqrt(squaredNorms[elem]); + float deltaCurTmp = deltaTmp[elem]; + float normTmp = pow(norm, 3); + + uint offsetStart = (depth + nbChannels * elem) * height; + uint offset = j + (offsetStart + i) * width; + + float outPrev = outsPrev[offset]; + float deltaCur = delta[offset]; + + float newValue = 0.0; + if (norm > 1e-12) + { + newValue = deltaCur / norm - deltaCurTmp * outPrev / normTmp; + } + else + { + newValue = deltaCur / 1e-12; + } + + if (dirty) + { + deltaPrev[offset] = newValue; + } + else + { + deltaPrev[offset] += newValue; + } +} + +kernel void similarBatchError2DLossHalf( + const device half * outs, + constant uint * pNbChannels, + constant uint * pDimensions, + constant uint * pNbBatch, + device half * losses, + uint2 id [[ thread_position_in_grid ]]) +{ + uint height, width; + uint nbChannels; + uint nbBatch; + + if (pNbChannels && pDimensions && pNbBatch && outs && losses) + { + 
width = pDimensions[0];
+        height = pDimensions[1];
+        nbChannels = *pNbChannels;
+        nbBatch = *pNbBatch;
+    }
+    else
+        return ;
+
+    uint elem1 = id[0];
+    uint elem2 = id[1];
+
+    if (elem1 >= nbBatch || elem2 >= nbBatch)
+    {
+        return ;
+    }
+
+    if (elem1 == elem2)
+    {
+        losses[elem2 + nbBatch * elem1] = 0.0;
+    }
+    else
+    {
+        float sum = 0.0;
+        for (uint i=0; i<height; i++){
+        for (uint j=0; j<width; j++)
+        {
+            uint offsetStart1 = nbChannels * elem1 * height;
+            uint offset1 = j + (offsetStart1 + i) * width;
+
+            uint offsetStart2 = nbChannels * elem2 * height;
+            uint offset2 = j + (offsetStart2 + i) * width;
+
+            sum += outs[offset1] * outs[offset2];
+        }}
+
+        losses[elem2 + nbBatch * elem1] = sum;
+    }
+}
+
+kernel void similarBatchError2DLossDerivativeHalf(
+    const device half * outs,
+    constant uint * pNbChannels,
+    constant uint * pDimensions,
+    constant float * pCoeff,
+    constant uint * pNbBatch,
+    constant uint * pDirty,
+    device half * deltaPrev,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint height, width;
+    uint nbChannels;
+    float coeff;
+    uint nbBatch;
+    uint dirty;
+
+    if (pNbChannels && pDimensions && pCoeff && pNbBatch && pDirty &&
+        outs && deltaPrev)
+    {
+        width = pDimensions[0];
+        height = pDimensions[1];
+        nbChannels = *pNbChannels;
+        coeff = *pCoeff;
+        nbBatch = *pNbBatch;
+        dirty = *pDirty;
+    }
+    else
+        return ;
+
+    uint elem = id[1];
+    uint i = id[0] / width;
+    uint j = id[0] % width;
+
+    if (i * j >= width * height || elem >= nbBatch)
+    {
+        return ;
+    }
+
+    float sum = 0.0;
+    for (uint elem1=0; elem1<nbBatch; elem1++) {
+    if (elem1 != elem)
+    {
+        uint offsetStart1 = nbChannels * elem1 * height;
+        uint offset1 = j + (offsetStart1 + i) * width;
+
+        sum += outs[offset1];
+    }}
+
+    uint offset = j + (nbChannels * elem * height + i) * width;
+
+    if (dirty)
+    {
+        deltaPrev[offset] = 2 * coeff * sum /
+            float(nbBatch * height * width);
+    }
+    else
+    {
+        deltaPrev[offset] += 2 * coeff * sum /
+            float(nbBatch * height * width);
+    }
+}
+
+kernel void similarError2DLossDerivativeHalf(
+    const device half * outs,
+    constant uint * pGlobalOffset,
+    constant uint * pNbChannels,
+    constant uint * pDimensions,
+    constant float * pCoeff,
+    constant uint * pNbBatch,
+    constant uint * pNbBatchPrev,
+    constant uint * pDirty,
+    device half * deltaPrev,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint height, width;
+    uint nbChannels;
+    float coeff;
+    uint globalOffset;
+    uint nbBatch;
+    uint nbBatchPrev;
+    uint dirty;
+
+    if (pGlobalOffset && pNbChannels && pDimensions && pCoeff &&
+        pNbBatch && pNbBatchPrev && pDirty && outs && deltaPrev)
+    {
+        width = pDimensions[0];
+        height = pDimensions[1];
+        nbChannels = *pNbChannels;
+        coeff = *pCoeff;
+        globalOffset = *pGlobalOffset;
+        nbBatch = *pNbBatch;
+        nbBatchPrev = *pNbBatchPrev;
+        dirty = *pDirty;
+    }
+    else
+        return ;
+
+    uint elem = id[1];
+    uint i = id[0] / width;
+    uint j = id[0] % width;
+
+    if (i * j >= width * height || elem >= nbBatchPrev)
+    {
+        return ;
+    }
+
+    float sum = 0.0;
+    for (uint elem1=0; elem1<nbBatch; elem1++) {
+    if (elem1 != globalOffset + elem)
+    {
+        uint offsetStart1 = nbChannels * elem1 * height;
+        uint offset1 = j + (offsetStart1 + i) * width;
+
+        sum += outs[offset1];
+    }}
+
+    uint offsetPrev = j + (nbChannels * elem * height + i) * width;
+
+    if (dirty)
+    {
+        deltaPrev[offsetPrev] = 2 * coeff * sum /
+            float(nbBatch * height * width);
+    }
+    else
+    {
+        deltaPrev[offsetPrev] += 2 * coeff * sum /
+            float(nbBatch * height * width);
+    }
+}
+
+kernel void flipHorizontal2DForwardHalf(
+    const device half * outsPrev,
+    constant uint * pDoFlip,
+    constant uint * pNbChannels,
+    constant uint * pDimensions,
+    constant uint * pNbBatch,
+    device half * outs,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint doFlip;
+    uint height, width;
+    uint nbChannels;
+    uint nbBatch;
+
+    if (pDoFlip && pNbChannels && pDimensions && pNbBatch &&
+        outsPrev && outs)
+    {
+        doFlip = *pDoFlip;
+        width = pDimensions[0];
+        height = pDimensions[1];
+        nbChannels = *pNbChannels;
+        nbBatch = *pNbBatch;
+    }
+    else
+        return ;
+
+    uint depth = id[0] / width;
+    uint elem = id[1] / height;
+    uint i = id[1] % height;
+    uint j = id[0] % width;
+
+    if (i * elem >= height * nbBatch ||
+        j * depth >= width * nbChannels)
+    {
+        return ;
+    }
+
+    uint offsetStart = (depth + nbChannels * elem) * height;
+    uint offset1 = j + (offsetStart + i) * width;
+    uint offset2 = offset1;
+    if (doFlip)
+    {
+        offset2 = width-1-j + (offsetStart + i) * width;
+    }
+
+    outs[offset1] = outsPrev[offset2];
+}
+
+kernel void flipHorizontal2DBackwardHalf(
+    const device half * delta,
+    constant uint * pDoFlip,
+    constant uint * pNbChannels,
+    constant uint * pDimensions,
+    constant uint * pNbBatch,
+    constant uint * pDirty,
+    device half * deltaPrev,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint doFlip;
+    uint height, width;
+    uint nbChannels;
+    uint nbBatch;
+    uint dirty;
+
+    if (pDoFlip && pNbChannels && pDimensions && pNbBatch && pDirty &&
+        delta && deltaPrev)
+    {
+        doFlip = *pDoFlip;
+        width = pDimensions[0];
+        height = pDimensions[1];
+        nbChannels = *pNbChannels;
+        nbBatch = *pNbBatch;
+        dirty = *pDirty;
+    }
+    else
+        return ;
+
+    uint depth = id[0] / width;
+    uint elem = id[1] / height;
+    uint i = id[1] % height;
+    uint j = id[0] % width;
+
+    if (i * elem >= height * nbBatch ||
+        j * depth >= width * nbChannels)
+    {
+        return ;
+    }
+
+    uint offsetStart = (depth + nbChannels * elem) * height;
+    uint offset1 = j + (offsetStart + i) * width;
+    uint offset2 = offset1;
+    if (doFlip)
+    {
+        offset2 = width-1-j + (offsetStart + i) * width;
+    }
+
+    if (dirty)
+    {
+        deltaPrev[offset1] = delta[offset2];
+    }
+    else
+    {
+        deltaPrev[offset1] += delta[offset2];
+    }
+}
+
+kernel void flipVertical2DForwardHalf(
+    const device half * outsPrev,
+    constant uint * pDoFlip,
+    constant uint * pNbChannels,
+    constant uint * pDimensions,
+    constant uint * pNbBatch,
+    device half * outs,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint doFlip;
+    uint height, width;
+    uint nbChannels;
+    uint nbBatch;
+
+    if (pDoFlip && pNbChannels && pDimensions && pNbBatch &&
+        outsPrev && outs)
+    {
+        doFlip = *pDoFlip;
+        width = pDimensions[0];
+        height = pDimensions[1];
+        nbChannels = *pNbChannels;
+        nbBatch = *pNbBatch;
+    }
+    else
+        return ;
+
+    uint depth = id[0] / width;
+    uint elem = id[1] / height;
+    uint i = id[1] % height;
+    uint j = id[0] % width;
+
+    if (i * elem >= height * nbBatch ||
+        j * depth >= width * nbChannels)
+    {
+        return ;
+    }
+
+    uint offsetStart = (depth + nbChannels * elem) * height;
+    uint offset1 = j + (offsetStart + i) * width;
+    uint offset2 = offset1;
+    if (doFlip)
+    {
+        offset2 = j + (offsetStart + height-1-i) * width;
+    }
+
+    outs[offset1] = outsPrev[offset2];
+}
+
+kernel void flipVertical2DBackwardHalf(
+    const device half * delta,
+    constant uint * pDoFlip,
+    constant uint * pNbChannels,
+    constant uint * pDimensions,
+    constant uint * pNbBatch,
+    constant uint * pDirty,
+    device half * deltaPrev,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint doFlip;
+    uint height, width;
+    uint nbChannels;
+    uint nbBatch;
+    uint dirty;
+
+    if (pDoFlip && pNbChannels && pDimensions && pNbBatch && pDirty &&
+        delta &&
deltaPrev) + { + doFlip = *pDoFlip; + width = pDimensions[0]; + height = pDimensions[1]; + nbChannels = *pNbChannels; + nbBatch = *pNbBatch; + dirty = *pDirty; + } + else + return ; + + uint depth = id[0] / width; + uint elem = id[1] / height; + uint i = id[1] % height; + uint j = id[0] % width; + + if (i * elem >= height * nbBatch || + j * depth >= width * nbChannels) + { + return ; + } + + uint offsetStart = (depth + nbChannels * elem) * height; + uint offset1 = j + (offsetStart + i) * width; + uint offset2 = offset1; + if (doFlip) + { + offset2 = j + (offsetStart + height-1-i) * width; + } + + if (dirty) + { + deltaPrev[offset1] = delta[offset2]; + } + else + { + deltaPrev[offset1] += delta[offset2]; + } +} + +kernel void colorJitterHSVForwardHalf( + const device half * outsPrev, + constant float * pNoise, + constant uint * pDimensions, + constant uint * pNbBatch, + device half * outs, + uint2 id [[ thread_position_in_grid ]]) +{ + float noiseH, noiseS, noiseV; + uint height, width; + uint nbBatch; + + if (pNoise && pDimensions && pNbBatch && outsPrev && outs) + { + noiseH = pNoise[0]; + noiseS = pNoise[1]; + noiseV = pNoise[2]; + width = pDimensions[0]; + height = pDimensions[1]; + nbBatch = *pNbBatch; + } + else + return ; + + uint elem = id[1]; + uint row = id[0] / width; + uint col = id[0] % width; + + if (row * col >= height * width || + elem >= nbBatch) + { + return ; + } + + uint offsetStartR = (0 + 3 * elem) * height; + uint offsetStartG = (1 + 3 * elem) * height; + uint offsetStartB = (2 + 3 * elem) * height; + + uint offsetR = col + (offsetStartR + row) * width; + uint offsetG = col + (offsetStartG + row) * width; + uint offsetB = col + (offsetStartB + row) * width; + + float r = outsPrev[offsetR]; + float g = outsPrev[offsetG]; + float b = outsPrev[offsetB]; + + float maxValue = max(max(r, g), b); + float minValue = min(min(r, g), b); + float delta = maxValue - minValue; + + float h; + if (delta == 0) + { + h = 0.0; + } + else if (maxValue == r) + { + h = (g - b) / delta; + } + else if (maxValue == g) + { + h = (g - b) / delta + 2.0; + } + else + { + h = (g - b) / delta + 4.0; + } + h *= 60.0; + + float s = 0.0; + if (maxValue != 0) + { + s = delta / maxValue; + } + + float v = maxValue; + + h += noiseH; h = max(h, 0.0); h = min(h, 360.0); + s += noiseS; s = max(s, 0.0); s = min(s, 1.0); + v += noiseV; v = max(v, 0.0); v = min(v, 1.0); + + if (s == 0.0) + { + r = v; g = v; b = v; + } + + float angle = h; + float sector = angle / 60; // Sector + float i = floor(sector); + float f = sector - i; // Factorial part of h + + float p = v * (1 - s); + float q = v * (1 - (s * f)); + float t = v * (1 - (s * (1 - f))); + + if (i == 0) + { + r = v; g = t; b = p; + } + else if (i == 1) + { + r = q; g = v; b = p; + } + else if (i == 2) + { + r = p; g = v; b = t; + } + else if (i == 3) + { + r = p; g = q; b = v; + } + else if (i == 4) + { + r = t; g = p; b = v; + } + else + { + r = v; g = p; b = q; + } + + outs[offsetR] = r; + outs[offsetG] = g; + outs[offsetB] = b; +} + +kernel void BCE2DLossHalf( + const device half * outs, + const device half * groundTruth, + constant uint * pNbChannels, + constant uint * pDimensions, + constant uint * pNbBatch, + device half * losses, + uint id [[ thread_position_in_grid ]]) +{ + uint height, width; + uint nbChannels; + uint nbBatch; + + if (pNbChannels && pDimensions && pNbBatch && outs && groundTruth && losses) + { + width = pDimensions[0]; + height = pDimensions[1]; + nbChannels = *pNbChannels; + nbBatch = *pNbBatch; + } + else + return ; + + uint 
elem = id;
+    if (elem >= nbBatch)
+    {
+        return ;
+    }
+
+    float tmp = 0.0;
+    for (uint depth=0; depth<nbChannels; depth++) {
+    for (uint i=0; i<height; i++){
+    for (uint j=0; j<width; j++)
+    {
+        uint offsetStart = (depth + nbChannels * elem) * height;
+        uint offset = j + (offsetStart + i) * width;
+
+        float gt = groundTruth[offset];
+        float out = outs[offset];
+        float tmp1 = log(out);
+        float tmp2 = log(1 - out);
+
+        tmp -= (gt * tmp1 + (1 - gt) * tmp2);
+    }}
+    }
+
+    losses[elem] = tmp;
+}
+
+kernel void BCE2DLossDerivativeHalf(
+    const device half * outs,
+    const device half * groundTruth,
+    constant uint * pNbChannels,
+    constant uint * pDimensions,
+    constant float * pCoeff,
+    constant uint * pNbBatch,
+    constant uint * pDirty,
+    device half * deltaPrev,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint height, width;
+    uint nbChannels;
+    float coeff;
+    uint nbBatch;
+    uint dirty;
+
+    if (pNbChannels && pDimensions && pCoeff && pNbBatch && pDirty &&
+        outs && groundTruth && deltaPrev)
+    {
+        width = pDimensions[0];
+        height = pDimensions[1];
+        nbChannels = *pNbChannels;
+        coeff = *pCoeff;
+        nbBatch = *pNbBatch;
+        dirty = *pDirty;
+    }
+    else
+        return ;
+
+    uint depth = id[0] / width;
+    uint elem = id[1] / height;
+    uint i = id[1] % height;
+    uint j = id[0] % width;
+
+    if (i * elem >= height * nbBatch ||
+        j * depth >= width * nbChannels)
+    {
+        return ;
+    }
+
+    uint offsetStart = (depth + nbChannels * elem) * height;
+    uint offset = j + (offsetStart + i) * width;
+
+    float gt = groundTruth[offset];
+    float out = outs[offset];
+    float derivative = 0.0;
+
+    if (gt == 1.0)
+    {
+        derivative = -1 / out;
+    }
+    else if (gt == 0.0)
+    {
+        derivative = 1 / (1 - out);
+    }
+
+    if (dirty)
+    {
+        deltaPrev[offset] = coeff * derivative /
+            float(nbBatch * nbChannels * height * width);
+    }
+    else
+    {
+        deltaPrev[offset] += coeff * derivative /
+            float(nbBatch * nbChannels * height * width);
+    }
+}
+
+kernel void BCESigmoid2DLossHalf(
+    const device half * outs,
+    const device half * groundTruth,
+    constant uint * pNbChannels,
+    constant uint * pDimensions,
+    constant uint * pNbBatch,
+    device half * losses,
+    uint id [[ thread_position_in_grid ]])
+{
+    uint height, width;
+    uint nbChannels;
+    uint nbBatch;
+
+    if (pNbChannels && pDimensions && pNbBatch && outs && groundTruth && losses)
+    {
+        width = pDimensions[0];
+        height = pDimensions[1];
+        nbChannels = *pNbChannels;
+        nbBatch = *pNbBatch;
+    }
+    else
+        return ;
+
+    uint elem = id;
+    if (elem >= nbBatch)
+    {
+        return ;
+    }
+
+    float tmp = 0.0;
+    for (uint depth=0; depth<nbChannels; depth++) {
+    for (uint i=0; i<height; i++){
+    for (uint j=0; j<width; j++)
+    {
+        uint offsetStart = (depth + nbChannels * elem) * height;
+        uint offset = j + (offsetStart + i) * width;
+
+        float gt = groundTruth[offset];
+        float out = outs[offset];
+        float value;
+
+        if (out > 0)
+        {
+            value = (1 - gt) * out;
+            value += log(1 + exp(-out));
+        }
+        else
+        {
+            value = -out * gt;
+            value += log(exp(out) + 1);
+        }
+
+        tmp += value;
+    }}
+    }
+
+    losses[elem] = tmp;
+}
+
+kernel void BCESigmoid2DLossDerivativeHalf(
+    const device half * outs,
+    const device half * groundTruth,
+    constant uint * pNbChannels,
+    constant uint * pDimensions,
+    constant float * pCoeff,
+    constant uint * pNbBatch,
+    constant uint * pDirty,
+    device half * deltaPrev,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint height, width;
+    uint nbChannels;
+    float coeff;
+    uint nbBatch;
+    uint dirty;
+
+    if (pNbChannels && pDimensions && pNbBatch && pCoeff && pDirty &&
+        outs && groundTruth && deltaPrev)
+    {
+        width = pDimensions[0];
+        height = pDimensions[1];
+        nbChannels = *pNbChannels;
+        coeff = *pCoeff;
+        nbBatch = *pNbBatch;
+        dirty = *pDirty;
+    }
+    else
+        return ;
+
+    uint depth = id[0] / width;
+    uint elem = id[1] / height;
+    uint i = id[1] % height;
+    uint j = id[0] % width;
+
+    if (i * elem >= height * nbBatch ||
+        j * depth >= width * nbChannels)
+    {
+        return ;
+    }
+
+    uint offsetStart = (depth + nbChannels * elem) * height;
+    uint offset = j + (offsetStart + i) * width;
+
+    float gt = groundTruth[offset];
+    float out = outs[offset];
+    float value;
+
+    if (out >= 0)
+    {
+        value = 1.0 / (1.0 + exp(-out));
+    }
+    else
+    {
+        value = exp(out) / (1.0 + exp(out));
+    }
+
+    if (dirty)
+    {
+        deltaPrev[offset] = coeff * (value - gt) /
+            float(nbBatch * nbChannels * height * width);
+    }
+    else
+    {
+        deltaPrev[offset] += coeff * (value - gt) /
+            float(nbBatch * nbChannels * height * width);
+    }
+}
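The two branches in BCESigmoid2DLossHalf above are the usual overflow-safe form of sigmoid cross-entropy on logits. A compact CPU reference (plain C++; function names are illustrative, not part of the patch):

    #include <cmath>

    // Stable sigmoid cross-entropy on a logit: equals (1-gt)*out + log(1+exp(-out))
    // for out > 0 and -out*gt + log(1+exp(out)) otherwise, i.e. the kernel's branches.
    float bceWithLogitsRef(float out, float gt)
    {
        return fmaxf(out, 0.0f) - out * gt + log1pf(expf(-fabsf(out)));
    }

    // Its derivative is sigmoid(out) - gt, with the same overflow guard as
    // BCESigmoid2DLossDerivativeHalf above.
    float bceWithLogitsGradRef(float out, float gt)
    {
        float sig = out >= 0.0f ? 1.0f / (1.0f + expf(-out))
                                : expf(out) / (1.0f + expf(out));
        return sig - gt;
    }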
+kernel void layerCAM2DForwardHalf(
+    const device half * outsPrev,
+    const device half * deltaPrev,
+    constant uint * pNbChannelsPrev,
+    constant uint * pDimensions,
+    constant uint * pKeepPositive,
+    constant uint * pNbBatch,
+    device half * outs,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint height, width;
+    uint nbBatch;
+    uint nbChannelsPrev;
+    uint keepPositive;
+
+    if (pNbChannelsPrev && pDimensions && pKeepPositive && pNbBatch &&
+        outsPrev && outs)
+    {
+        width = pDimensions[0];
+        height = pDimensions[1];
+        nbChannelsPrev = *pNbChannelsPrev;
+        keepPositive = *pKeepPositive;
+        nbBatch = *pNbBatch;
+    }
+    else
+        return ;
+
+    uint elem = id[1];
+    uint i = id[0] / width;
+    uint j = id[0] % width;
+
+    if (i * j >= height * width || elem >= nbBatch)
+    {
+        return ;
+    }
+
+    float sum = 0.0;
+    for (uint depthPrev=0; depthPrev<nbChannelsPrev; depthPrev++)
+    {
+        uint offsetStartPrev = (depthPrev + nbChannelsPrev * elem) * height;
+        uint offsetPrev = j + (offsetStartPrev + i) * width;
+
+        float deltaPrevTmp = deltaPrev[offsetPrev];
+        if (!keepPositive)
+        {
+            deltaPrevTmp = -deltaPrevTmp;
+        }
+        if (deltaPrevTmp < 0)
+        {
+            deltaPrevTmp = 0.0;
+        }
+
+        sum += deltaPrevTmp * outsPrev[offsetPrev];
+    }
+
+    uint offset = j + (elem * height + i) * width;
+    outs[offset] = sum;
+}
diff --git a/Sources/GrAIdient/Metal/Kernel/LayerMerge.metal b/Sources/GrAIdient/Metal/Kernel/LayerMergeFloat.metal
rename from Sources/GrAIdient/Metal/Kernel/LayerMerge.metal
rename to Sources/GrAIdient/Metal/Kernel/LayerMergeFloat.metal
--- a/Sources/GrAIdient/Metal/Kernel/LayerMerge.metal
+++ b/Sources/GrAIdient/Metal/Kernel/LayerMergeFloat.metal
@@ -8,7 +8,7 @@
 #include <metal_stdlib>
 using namespace metal;
 
-kernel void sum1(
+kernel void sum1Float(
     const device float * ins,
     constant uint * pNbElems,
     device float * outs,
@@ -31,7 +31,7 @@ kernel void sum1(
     outs[id] = ins[id];
 }
 
-kernel void sum14(
+kernel void sum14Float(
     const device float4 * ins,
     constant uint * pNbElems,
     device float4 * outs,
@@ -54,7 +54,7 @@ kernel void sum14(
     outs[id] = ins[id];
 }
 
-kernel void sum2(
+kernel void sum2Float(
     const device float * ins,
     constant uint * pNbElems,
     device float * outs,
@@ -77,7 +77,7 @@ kernel void sum2(
     outs[id] += ins[id];
 }
 
-kernel void sum24(
+kernel void sum24Float(
     const device float4 * ins,
     constant uint * pNbElems,
     device float4 * outs,
@@ -100,7 +100,7 @@ kernel void sum24(
     outs[id] += ins[id];
 }
 
-kernel void multiplyForward(
+kernel void multiplyForwardFloat(
     const device float * outsPrev,
     constant uint * pNbElems,
     device float * outs,
@@ -123,7 +123,7 @@ kernel void multiplyForward(
     outs[id] *= outsPrev[id];
 }
 
-kernel void multiplyBackward(
+kernel void multiplyBackwardFloat(
     const device float * outs,
     const device float * delta,
     constant uint * pNbElems,
diff --git a/Sources/GrAIdient/Metal/Kernel/LayerMergeHalf.metal b/Sources/GrAIdient/Metal/Kernel/LayerMergeHalf.metal
new file mode 100644
index 00000000..d3ca0403
--- /dev/null
+++ b/Sources/GrAIdient/Metal/Kernel/LayerMergeHalf.metal
@@ -0,0 +1,161 @@
+//
+// LayerMerge.metal
+// GrAIdient
+//
+// Created by Jean-François Reboud on 14/10/2022.
+//
+
+#include <metal_stdlib>
+using namespace metal;
+
+kernel void sum1Half(
+    const device half * ins,
+    constant uint * pNbElems,
+    device half * outs,
+    uint id [[ thread_position_in_grid ]])
+{
+    uint nbElems;
+
+    if (pNbElems && ins && outs)
+    {
+        nbElems = pNbElems[0];
+    }
+    else
+        return ;
+
+    if (id >= nbElems)
+    {
+        return ;
+    }
+
+    outs[id] = ins[id];
+}
+
+kernel void sum14Half(
+    const device half4 * ins,
+    constant uint * pNbElems,
+    device half4 * outs,
+    uint id [[ thread_position_in_grid ]])
+{
+    uint nbElems;
+
+    if (pNbElems && ins && outs)
+    {
+        nbElems = pNbElems[0];
+    }
+    else
+        return ;
+
+    if (id * 4 >= nbElems)
+    {
+        return ;
+    }
+
+    outs[id] = ins[id];
+}
+
+kernel void sum2Half(
+    const device half * ins,
+    constant uint * pNbElems,
+    device half * outs,
+    uint id [[ thread_position_in_grid ]])
+{
+    uint nbElems;
+
+    if (pNbElems && ins && outs)
+    {
+        nbElems = pNbElems[0];
+    }
+    else
+        return ;
+
+    if (id >= nbElems)
+    {
+        return ;
+    }
+
+    outs[id] += ins[id];
+}
+
+kernel void sum24Half(
+    const device half4 * ins,
+    constant uint * pNbElems,
+    device half4 * outs,
+    uint id [[ thread_position_in_grid ]])
+{
+    uint nbElems;
+
+    if (pNbElems && ins && outs)
+    {
+        nbElems = pNbElems[0];
+    }
+    else
+        return ;
+
+    if (id * 4 >= nbElems)
+    {
+        return ;
+    }
+
+    outs[id] += ins[id];
+}
+
+kernel void multiplyForwardHalf(
+    const device half * outsPrev,
+    constant uint * pNbElems,
+    device half * outs,
+    uint id [[ thread_position_in_grid ]])
+{
+    uint nbElems;
+
+    if (pNbElems && outsPrev && outs)
+    {
+        nbElems = pNbElems[0];
+    }
+    else
+        return ;
+
+    if (id >= nbElems)
+    {
+        return ;
+    }
+
+    outs[id] *= outsPrev[id];
+}
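Elementwise-multiply backward is the product rule: for out = a * b, d(loss)/da = delta * b. A CPU sketch (plain C++; it assumes, as the host code appears to arrange, that the outs argument of the backward kernel holds the product of the other operands; it also illustrates the dirty convention used throughout the patch, overwrite a stale gradient versus accumulate):

    // Product-rule backward for one operand of an elementwise multiply.
    void multiplyBackwardRef(
        const float * otherProduct, const float * delta, float * deltaPrev,
        bool dirty, int nbElems)
    {
        for (int k = 0; k < nbElems; k++)
        {
            float g = delta[k] * otherProduct[k];
            if (dirty) { deltaPrev[k] = g; }   // first writer: overwrite
            else       { deltaPrev[k] += g; }  // later writers: accumulate
        }
    }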
+kernel void multiplyBackwardHalf(
+    const device half * outs,
+    const device half * delta,
+    constant uint * pNbElems,
+    constant uint * pDirty,
+    device half * deltaPrev,
+    uint id [[ thread_position_in_grid ]])
+{
+    uint nbElems;
+    uint dirty;
+
+    if (pNbElems && pDirty && outs && delta && deltaPrev)
+    {
+        nbElems = pNbElems[0];
+        dirty = *pDirty;
+    }
+    else
+        return ;
+
+    if (id >= nbElems)
+    {
+        return ;
+    }
+
+    float tmp = outs[id];
+    float deltaCur = delta[id];
+
+    if (dirty)
+    {
+        deltaPrev[id] = deltaCur * tmp;
+    }
+    else
+    {
+        deltaPrev[id] += deltaCur * tmp;
+    }
+}
diff --git a/Sources/GrAIdient/Metal/Kernel/LayerNorm.metal b/Sources/GrAIdient/Metal/Kernel/LayerNormFloat.metal
similarity index 96%
rename from Sources/GrAIdient/Metal/Kernel/LayerNorm.metal
rename to Sources/GrAIdient/Metal/Kernel/LayerNormFloat.metal
index 7049fea2..51a25688 100644
--- a/Sources/GrAIdient/Metal/Kernel/LayerNorm.metal
+++ b/Sources/GrAIdient/Metal/Kernel/LayerNormFloat.metal
@@ -8,7 +8,7 @@
 #include <metal_stdlib>
 using namespace metal;
 
-kernel void computeLayerNormSeqμ(
+kernel void computeLayerNormSeqμFloat(
     const device float * tmps,
     constant uint * pNbNeurons,
     constant uint * pNbBatch,
@@ -48,7 +48,7 @@ kernel void computeLayerNormSeqμ(
     μ[seq + sequence * elem] = sum / nbElems;
 }
 
-kernel void computeLayerNormSeqμ4(
+kernel void computeLayerNormSeqμ4Float(
     const device float4 * tmps,
     constant uint * pNbNeurons,
     constant uint * pNbBatch,
@@ -89,7 +89,7 @@ kernel void computeLayerNormSeqμ4(
     μ[seq + sequence * elem] = (sum[0] + sum[1] + sum[2] + sum[3]) / nbElems;
 }
 
-kernel void computeLayerNormSeqσ2(
+kernel void computeLayerNormSeqσ2Float(
     const device float * tmps,
     const device float * μ,
     constant uint * pNbNeurons,
@@ -132,7 +132,7 @@ kernel void computeLayerNormSeqσ2(
     σ2[seq + sequence * elem] = sum / nbElems;
 }
 
-kernel void computeLayerNormSeqσ24(
+kernel void computeLayerNormSeqσ24Float(
     const device float4 * tmps,
     const device float * μ,
     constant uint * pNbNeurons,
@@ -176,7 +176,7 @@ kernel void computeLayerNormSeqσ24(
     σ2[seq + sequence * elem] = (sum[0] + sum[1] + sum[2] + sum[3]) / nbElems;
 }
 
-kernel void forwardLayerNormSeq(
+kernel void forwardLayerNormSeqFloat(
     const device float * β,
     const device float * Ɣ,
     const device float * μ,
@@ -221,7 +221,7 @@ kernel void forwardLayerNormSeq(
     tmps[offset] = Ɣ[depth] * xhat + β[depth];
 }
 
-kernel void forwardLayerNormSeq4(
+kernel void forwardLayerNormSeq4Float(
     const device float4 * β,
     const device float4 * Ɣ,
     const device float * μ,
@@ -267,7 +267,7 @@ kernel void forwardLayerNormSeq4(
     tmps[offset] = Ɣ[depth] * xhat + β[depth];
 }
 
-kernel void backwardWeights1LayerNormSeq(
+kernel void backwardWeights1LayerNormSeqFloat(
     const device float * delta,
     const device float * xHat,
     const device float * Ɣ,
@@ -316,7 +316,7 @@ kernel void backwardWeights1LayerNormSeq(
     sum2[seq + sequence * elem] = tmp2;
 }
 
-kernel void backwardWeights1LayerNormSeq4(
+kernel void backwardWeights1LayerNormSeq4Float(
     const device float4 * delta,
     const device float4 * xHat,
     const device float4 * Ɣ,
@@ -365,7 +365,7 @@ kernel void backwardWeights1LayerNormSeq4(
     sum2[seq + sequence * elem] = tmp2[0] + tmp2[1] + tmp2[2] + tmp2[3];
 }
 
-kernel void backwardWeights2LayerNormSeq(
+kernel void backwardWeights2LayerNormSeqFloat(
     const device float * delta,
     const device float * xHat,
     constant uint * pNbNeurons,
@@ -424,7 +424,7 @@
     }
 }
 
-kernel void backwardWeights2LayerNormSeq4(
+kernel void backwardWeights2LayerNormSeq4Float(
     const device float4 * delta,
     const device float4 * xHat,
     constant uint * pNbNeurons,
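The backwardLayerNormSeq kernels renamed below implement the standard LayerNorm input gradient: with xHat = (x - μ)/σ, dxHat = Ɣ * delta and N values per (batch, sequence) slot, dx = (N * dxHat - Σ dxHat - xHat * Σ(dxHat * xHat)) / (N * σ), where the two sums are the sum1/sum2 buffers reduced by the backwardWeights1 kernels. A CPU reference for one slot (plain C++; names are illustrative, not part of the library):

    #include <cmath>

    // LayerNorm input gradient for one (batch, sequence) slot of N neurons,
    // overwriting delta in place as the kernels do.
    void layerNormBackwardRef(
        const float * xHat, const float * gamma, float * delta,
        float sigma2, int N)
    {
        float sum1 = 0.0f, sum2 = 0.0f;
        for (int d = 0; d < N; d++)
        {
            float dxHat = gamma[d] * delta[d];
            sum1 += dxHat;
            sum2 += dxHat * xHat[d];
        }
        float mult = 1.0f / (float(N) * sqrtf(sigma2 + 1e-5f));
        for (int d = 0; d < N; d++)
        {
            float dxHat = gamma[d] * delta[d];
            delta[d] = mult * (float(N) * dxHat - sum1 - xHat[d] * sum2);
        }
    }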
@@ -483,7 +483,7 @@
     }
 }
 
-kernel void backwardLayerNormSeq(
+kernel void backwardLayerNormSeqFloat(
     const device float * σ2,
     const device float * xHat,
     const device float * Ɣ,
@@ -532,7 +532,7 @@
     delta[offset] = mult * (tmp1 - tmp2 - tmp3);
 }
 
-kernel void backwardLayerNormSeq4(
+kernel void backwardLayerNormSeq4Float(
     const device float * σ2,
     const device float4 * xHat,
     const device float4 * Ɣ,
diff --git a/Sources/GrAIdient/Metal/Kernel/LayerNormHalf.metal b/Sources/GrAIdient/Metal/Kernel/LayerNormHalf.metal
new file mode 100644
index 00000000..cfecfa0f
--- /dev/null
+++ b/Sources/GrAIdient/Metal/Kernel/LayerNormHalf.metal
@@ -0,0 +1,583 @@
+//
+// LayerNorm.metal
+// GrAIdient
+//
+// Created by Jean-François Reboud on 09/03/2023.
+//
+
+#include <metal_stdlib>
+using namespace metal;
+
+kernel void computeLayerNormSeqμHalf(
+    const device half * tmps,
+    constant uint * pNbNeurons,
+    constant uint * pNbBatch,
+    constant uint * pSequence,
+    device half * μ,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint nbNeurons;
+    uint nbBatch;
+    uint sequence;
+
+    if (pNbNeurons && pNbBatch && pSequence && tmps && μ)
+    {
+        nbNeurons = *pNbNeurons;
+        nbBatch = *pNbBatch;
+        sequence = *pSequence;
+    }
+    else
+        return ;
+
+    uint elem = id[1];
+    uint seq = id[0];
+    if (elem >= nbBatch || seq >= sequence)
+    {
+        return ;
+    }
+
+    uint nbElems = nbNeurons;
+    float sum = 0.0;
+
+    for (uint depth=0; depth<nbNeurons; depth++)
+    {
+        uint offset = depth + nbNeurons * seq + sequence * nbNeurons * elem;
+        sum += tmps[offset];
+    }
+    μ[seq + sequence * elem] = sum / nbElems;
+}
+
+kernel void computeLayerNormSeqμ4Half(
+    const device half4 * tmps,
+    constant uint * pNbNeurons,
+    constant uint * pNbBatch,
+    constant uint * pSequence,
+    device half * μ,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint nbNeurons;
+    uint nbBatch;
+    uint sequence;
+
+    if (pNbNeurons && pNbBatch && pSequence && tmps && μ)
+    {
+        nbNeurons = *pNbNeurons;
+        nbBatch = *pNbBatch;
+        sequence = *pSequence;
+    }
+    else
+        return ;
+
+    uint elem = id[1];
+    uint seq = id[0];
+    if (elem >= nbBatch || seq >= sequence)
+    {
+        return ;
+    }
+
+    uint nbElems = nbNeurons;
+    half4 sum = 0.0;
+
+    for (uint depth=0; depth<nbNeurons/4; depth++)
+    {
+        uint offset =
+            (depth * 4 + nbNeurons * seq + sequence * nbNeurons * elem) / 4;
+        sum += tmps[offset];
+    }
+    μ[seq + sequence * elem] = (sum[0] + sum[1] + sum[2] + sum[3]) / nbElems;
+}
+
+kernel void computeLayerNormSeqσ2Half(
+    const device half * tmps,
+    const device half * μ,
+    constant uint * pNbNeurons,
+    constant uint * pNbBatch,
+    constant uint * pSequence,
+    device half * σ2,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint nbNeurons;
+    uint nbBatch;
+    uint sequence;
+
+    if (pNbNeurons && pNbBatch && pSequence && tmps && μ && σ2)
+    {
+        nbNeurons = *pNbNeurons;
+        nbBatch = *pNbBatch;
+        sequence = *pSequence;
+    }
+    else
+        return ;
+
+    uint elem = id[1];
+    uint seq = id[0];
+    if (elem >= nbBatch || seq >= sequence)
+    {
+        return ;
+    }
+
+    uint nbElems = nbNeurons;
+    float sum = 0.0;
+
+    for (uint depth=0; depth<nbNeurons; depth++)
+    {
+        uint offset = depth + nbNeurons * seq + sequence * nbNeurons * elem;
+        float tmp = tmps[offset] - μ[seq + sequence * elem];
+        sum += tmp * tmp;
+    }
+    σ2[seq + sequence * elem] = sum / nbElems;
+}
+
+kernel void computeLayerNormSeqσ24Half(
+    const device half4 * tmps,
+    const device half * μ,
+    constant uint * pNbNeurons,
+    constant uint * pNbBatch,
+    constant uint * pSequence,
+    device half * σ2,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint nbNeurons;
+    uint nbBatch;
+    uint sequence;
+
+    if (pNbNeurons && pNbBatch && pSequence && tmps && μ && σ2)
+    {
+        nbNeurons = *pNbNeurons;
+        nbBatch = *pNbBatch;
+        sequence = *pSequence;
+    }
+    else
+        return ;
+
+    uint elem = id[1];
+    uint seq = id[0];
+    if (elem >= nbBatch || seq >= sequence)
+    {
+        return ;
+    }
+
+    uint nbElems = nbNeurons;
+    half4 sum = 0.0;
+
+    for (uint depth=0; depth<nbNeurons/4; depth++)
+    {
+        uint offset =
+            (depth * 4 + nbNeurons * seq + sequence * nbNeurons * elem) / 4;
+        half4 tmp = tmps[offset] - μ[seq + sequence * elem];
+        sum += tmp * tmp;
+    }
+    σ2[seq + sequence * elem] = (sum[0] + sum[1] + sum[2] + sum[3]) / nbElems;
+}
+
+kernel void forwardLayerNormSeqHalf(
+    const device half * β,
+    const device half * Ɣ,
+    const device half * μ,
+    const device half * σ2,
+    constant uint * pNbNeurons,
+    constant uint * pNbBatch,
+    constant uint * pSequence,
+    device half * tmps,
+    device half * xHat,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint nbNeurons;
+    uint nbBatch;
+    uint sequence;
+    float Ɛ = 1e-5;
+
+    if (pNbNeurons && pNbBatch && pSequence && β && Ɣ &&
+        tmps && xHat && μ && σ2)
+    {
+        nbNeurons = *pNbNeurons;
+        nbBatch = *pNbBatch;
+        sequence = *pSequence;
+    }
+    else
+        return ;
+
+    uint depth = id[0];
+    uint elem = id[1] / sequence;
+    uint seq = id[1] % sequence;
+
+    if (depth >= nbNeurons || elem >= nbBatch || seq >= sequence)
+    {
+        return ;
+    }
+
+    uint offset = depth + nbNeurons * seq + sequence * nbNeurons * elem;
+
+    float tmp1 = tmps[offset] - μ[seq + sequence * elem];
+    float tmp2 = sqrt(σ2[seq + sequence * elem] + Ɛ);
+    float xhat = tmp1 / tmp2;
+    xHat[offset] = xhat;
+    tmps[offset] = Ɣ[depth] * xhat + β[depth];
+}
+
+kernel void forwardLayerNormSeq4Half(
+    const device half4 * β,
+    const device half4 * Ɣ,
+    const device half * μ,
+    const device half * σ2,
+    constant uint * pNbNeurons,
+    constant uint * pNbBatch,
+    constant uint * pSequence,
+    device half4 * tmps,
+    device half4 * xHat,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint nbNeurons;
+    uint nbBatch;
+    uint sequence;
+    float Ɛ = 1e-5;
+
+    if (pNbNeurons && pNbBatch && pSequence && β && Ɣ &&
+        tmps && xHat && μ && σ2)
+    {
+        nbNeurons = *pNbNeurons;
+        nbBatch = *pNbBatch;
+        sequence = *pSequence;
+    }
+    else
+        return ;
+
+    uint depth = id[0];
+    uint elem = id[1] / sequence;
+    uint seq = id[1] % sequence;
+
+    if (depth * 4 >= nbNeurons || elem >= nbBatch || seq >= sequence)
+    {
+        return ;
+    }
+
+    uint offset =
+        (depth * 4 + nbNeurons * seq + sequence * nbNeurons * elem) / 4;
+
+    half4 tmp1 = tmps[offset] - μ[seq + sequence * elem];
+    float tmp2 = sqrt(σ2[seq + sequence * elem] + Ɛ);
+    half4 xhat = tmp1 / tmp2;
+    xHat[offset] = xhat;
+    tmps[offset] = Ɣ[depth] * xhat + β[depth];
+}
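A recurring pattern in the half kernels above: reductions accumulate in float even though inputs and outputs are stored as half, since summing many half values directly loses precision once the running sum grows. A CPU sketch (plain C++; __fp16 is a clang extension standing in for Metal's half, an assumption for illustration):

    #include <cstddef>

    // Sum a buffer of half-precision values with a float accumulator,
    // mirroring `float sum = 0.0;` over `device half *` in the kernels above.
    float sumHalfBufferRef(const __fp16 * values, size_t n)
    {
        float sum = 0.0f;
        for (size_t k = 0; k < n; k++)
        {
            sum += float(values[k]);  // widen each half before adding
        }
        return sum;
    }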
+kernel void backwardWeights1LayerNormSeqHalf(
+    const device half * delta,
+    const device half * xHat,
+    const device half * Ɣ,
+    constant uint * pNbNeurons,
+    constant uint * pNbBatch,
+    constant uint * pSequence,
+    device half * sum1,
+    device half * sum2,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint nbNeurons;
+    uint nbBatch;
+    uint sequence;
+
+    if (pNbNeurons && pNbBatch && pSequence &&
+        delta && xHat && Ɣ && sum1 && sum2)
+    {
+        nbNeurons = *pNbNeurons;
+        nbBatch = *pNbBatch;
+        sequence = *pSequence;
+    }
+    else
+        return ;
+
+    uint elem = id[1];
+    uint seq = id[0];
+    if (elem >= nbBatch || seq >= sequence)
+    {
+        return ;
+    }
+
+    float tmp1 = 0.0, tmp2 = 0.0;
+    for (uint depth=0; depth<nbNeurons; depth++)
+    {
+        uint offset = depth + nbNeurons * seq + sequence * nbNeurons * elem;
+
+        float deltaTmp = delta[offset];
+        float xHatTmp = xHat[offset];
+        float dxHat = Ɣ[depth] * deltaTmp;
+
+        tmp1 += dxHat;
+        tmp2 += dxHat * xHatTmp;
+    }
+    sum1[seq + sequence * elem] = tmp1;
+    sum2[seq + sequence * elem] = tmp2;
+}
+
+kernel void backwardWeights1LayerNormSeq4Half(
+    const device half4 * delta,
+    const device half4 * xHat,
+    const device half4 * Ɣ,
+    constant uint * pNbNeurons,
+    constant uint * pNbBatch,
+    constant uint * pSequence,
+    device half * sum1,
+    device half * sum2,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint nbNeurons;
+    uint nbBatch;
+    uint sequence;
+
+    if (pNbNeurons && pNbBatch && pSequence &&
+        delta && xHat && Ɣ && sum1 && sum2)
+    {
+        nbNeurons = *pNbNeurons;
+        nbBatch = *pNbBatch;
+        sequence = *pSequence;
+    }
+    else
+        return ;
+
+    uint elem = id[1];
+    uint seq = id[0];
+    if (elem >= nbBatch || seq >= sequence)
+    {
+        return ;
+    }
+
+    half4 tmp1 = 0.0, tmp2 = 0.0;
+    for (uint depth=0; depth<nbNeurons/4; depth++)
+    {
+        uint offset =
+            (depth * 4 + nbNeurons * seq + sequence * nbNeurons * elem) / 4;
+
+        half4 deltaTmp = delta[offset];
+        half4 xHatTmp = xHat[offset];
+        half4 dxHat = Ɣ[depth] * deltaTmp;
+
+        tmp1 += dxHat;
+        tmp2 += dxHat * xHatTmp;
+    }
+    sum1[seq + sequence * elem] = tmp1[0] + tmp1[1] + tmp1[2] + tmp1[3];
+    sum2[seq + sequence * elem] = tmp2[0] + tmp2[1] + tmp2[2] + tmp2[3];
+}
+
+kernel void backwardWeights2LayerNormSeqHalf(
+    const device half * delta,
+    const device half * xHat,
+    constant uint * pNbNeurons,
+    constant uint * pNbBatch,
+    constant uint * pSequence,
+    constant uint * pAccumulate,
+    device half * dƔ,
+    device half * dβ,
+    uint id [[ thread_position_in_grid ]])
+{
+    uint nbNeurons;
+    uint nbBatch;
+    uint sequence;
+    uint accumulate;
+
+    if (pNbNeurons && pNbBatch && pSequence && pAccumulate &&
+        delta && xHat && dƔ && dβ)
+    {
+        nbNeurons = *pNbNeurons;
+        nbBatch = *pNbBatch;
+        sequence = *pSequence;
+        accumulate = *pAccumulate;
+    }
+    else
+        return ;
+
+    uint depth = id;
+    if (depth >= nbNeurons)
+    {
+        return ;
+    }
+
+    float tmp1 = 0.0, tmp2 = 0.0;
+    for (uint elem=0; elem<nbBatch; elem++) {
+    for (uint seq=0; seq<sequence; seq++)
+    {
+        uint offset = depth + nbNeurons * seq + sequence * nbNeurons * elem;
+
+        float deltaTmp = delta[offset];
+        float xHatTmp = xHat[offset];
+
+        tmp1 += deltaTmp;
+        tmp2 += deltaTmp * xHatTmp;
+    }}
+
+    if (accumulate)
+    {
+        dβ[depth] += tmp1;
+        dƔ[depth] += tmp2;
+    }
+    else
+    {
+        dβ[depth] = tmp1;
+        dƔ[depth] = tmp2;
+    }
+}
+
+kernel void backwardWeights2LayerNormSeq4Half(
+    const device half4 * delta,
+    const device half4 * xHat,
+    constant uint * pNbNeurons,
+    constant uint * pNbBatch,
+    constant uint * pSequence,
+    constant uint * pAccumulate,
+    device half4 * dƔ,
+    device half4 * dβ,
+    uint id [[ thread_position_in_grid ]])
+{
+    uint nbNeurons;
+    uint nbBatch;
+    uint sequence;
+    uint accumulate;
+
+    if (pNbNeurons && pNbBatch && pSequence && pAccumulate &&
+        delta && xHat && dƔ && dβ)
+    {
+        nbNeurons = *pNbNeurons;
+        nbBatch = *pNbBatch;
+        sequence = *pSequence;
+        accumulate = *pAccumulate;
+    }
+    else
+        return ;
+
+    uint depth = id;
+    if (depth * 4 >= nbNeurons)
+    {
+        return ;
+    }
+
+    half4 tmp1 = 0.0, tmp2 = 0.0;
+    for (uint elem=0; elem<nbBatch; elem++) {
+    for (uint seq=0; seq<sequence; seq++)
+    {
+        uint offset =
+            (depth * 4 + nbNeurons * seq + sequence * nbNeurons * elem) / 4;
+
+        half4 deltaTmp = delta[offset];
+        half4 xHatTmp = xHat[offset];
+
+        tmp1 += deltaTmp;
+        tmp2 += deltaTmp * xHatTmp;
+    }}
+
+    if (accumulate)
+    {
+        dβ[depth] += tmp1;
+        dƔ[depth] += tmp2;
+    }
+    else
+    {
+        dβ[depth] = tmp1;
+        dƔ[depth] = tmp2;
+    }
+}
+
+kernel void backwardLayerNormSeqHalf(
+    const device half * σ2,
+    const device half * xHat,
+    const device half * Ɣ,
+    const device half * sum1,
+    const device half * sum2,
+    constant uint * pNbNeurons,
+    constant uint * pNbBatch,
+    constant uint * pSequence,
+    device half * delta,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint nbNeurons;
+    uint nbBatch;
+    uint sequence;
+    float Ɛ = 1e-5;
+
+    if (pNbNeurons && pNbBatch && pSequence &&
+        σ2 && xHat && Ɣ && sum1 && sum2 && delta)
+    {
+        nbNeurons = *pNbNeurons;
+        nbBatch = *pNbBatch;
+        sequence = *pSequence;
+    }
+    else
+        return ;
+
+    uint depth = id[0];
+    uint elem = id[1] / sequence;
+    uint seq = id[1] % sequence;
+    uint nbElems = nbNeurons;
+
+    if (depth >= nbNeurons || elem >= nbBatch || seq >= sequence)
+    {
+        return ;
+    }
+
+    uint offset = depth + nbNeurons * seq + sequence * nbNeurons * elem;
+
+    float mult =
+        1.0 / ((float)nbElems * sqrt(σ2[seq + sequence * elem] + Ɛ));
+    float dxHat = Ɣ[depth] * delta[offset];
+    float tmp1 = nbElems * dxHat;
+    float tmp2 = sum1[seq + sequence * elem];
+    float tmp3 = xHat[offset] * sum2[seq + sequence * elem];
+
+    delta[offset] = mult * (tmp1 - tmp2 - tmp3);
+}
+
+kernel void backwardLayerNormSeq4Half(
+    const device half * σ2,
+    const device half4 * xHat,
+    const device half4 * Ɣ,
+    const device half * sum1,
+    const device half * sum2,
+    constant uint * pNbNeurons,
+    constant uint * pNbBatch,
+    constant uint * pSequence,
+    device half4 * delta,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint nbNeurons;
+    uint nbBatch;
+    uint sequence;
+    float Ɛ = 1e-5;
+
+    if (pNbNeurons && pNbBatch && pSequence &&
+        σ2 && xHat && Ɣ && sum1 && sum2 && delta)
+    {
+        nbNeurons = *pNbNeurons;
+        nbBatch = *pNbBatch;
+        sequence = *pSequence;
+    }
+    else
+        return ;
+
+    uint depth = id[0];
+    uint elem = id[1] / sequence;
+    uint seq = id[1] % sequence;
+    uint nbElems = nbNeurons;
+
+    if (depth * 4 >= nbNeurons || elem >= nbBatch || seq >= sequence)
+    {
+        return ;
+    }
+
+    uint offset =
+        (depth * 4 + nbNeurons * seq + sequence * nbNeurons * elem) / 4;
+
+    float mult =
+        1.0 / ((float)nbElems * sqrt(σ2[seq + sequence * elem] + Ɛ));
+    half4 dxHat = Ɣ[depth] * delta[offset];
+    half4 tmp1 = nbElems * dxHat;
+    float tmp2 = sum1[seq + sequence * elem];
+    half4 tmp3 = xHat[offset] * sum2[seq + sequence * elem];
+
+    delta[offset] = mult * (tmp1 - tmp2 - tmp3);
+}
diff --git a/Sources/GrAIdient/Metal/Kernel/LayerSeq.metal b/Sources/GrAIdient/Metal/Kernel/LayerSeqFloat.metal
similarity index 97%
rename from Sources/GrAIdient/Metal/Kernel/LayerSeq.metal
rename to Sources/GrAIdient/Metal/Kernel/LayerSeqFloat.metal
index a5957708..b0bcfb3c 100644
--- a/Sources/GrAIdient/Metal/Kernel/LayerSeq.metal
+++ b/Sources/GrAIdient/Metal/Kernel/LayerSeqFloat.metal
@@ -8,7 +8,7 @@
 #include <metal_stdlib>
 using namespace metal;
 
-kernel void avgPoolSeqForward(
+kernel void avgPoolSeqForwardFloat(
     const device float * outsPrev,
     constant uint * pNbNeurons,
     constant uint * pNbBatch,
@@ -50,7 +50,7 @@ kernel void avgPoolSeqForward(
     outs[offset] = tmp;
 }
 
-kernel void avgPoolSeqBackward(
+kernel void avgPoolSeqBackwardFloat(
     const device float * delta,
     constant uint * pNbNeurons,
     constant uint * pNbBatch,
@@ -98,7 +98,7 @@ kernel void avgPoolSeqBackward(
     }
 }
 
-kernel void selectSeqForward(
+kernel void selectSeqForwardFloat(
     const device float * outsPrev,
     constant uint * pNbNeurons,
     constant uint * pTargetSeq,
@@ -137,7 +137,7 @@ kernel void selectSeqForward(
     outs[offset] = outsPrev[offsetPrev];
 }
 
-kernel void selectSeqBackward(
+kernel void selectSeqBackwardFloat(
     const device float * delta,
     constant uint * pNbNeurons,
     constant uint * pTargetSeq,
@@ -176,7 +176,7 @@
deltaPrev[offsetPrev] += delta[offset]; } -kernel void concat1SeqForward( +kernel void concat1SeqForwardFloat( const device float * outsPrev, constant uint * pGlobalOffset, constant uint * pNbNeurons, @@ -221,7 +221,7 @@ kernel void concat1SeqForward( outs[offset] = outsPrev[offsetPrev]; } -kernel void concat1Seq4Forward( +kernel void concat1Seq4ForwardFloat( const device float4 * outsPrev, constant uint * pGlobalOffset, constant uint * pNbNeurons, @@ -266,7 +266,7 @@ kernel void concat1Seq4Forward( outs[offset] = outsPrev[offsetPrev]; } -kernel void concat1SeqBackward( +kernel void concat1SeqBackwardFloat( const device float * delta, constant uint * pGlobalOffset, constant uint * pNbNeurons, @@ -321,7 +321,7 @@ kernel void concat1SeqBackward( } } -kernel void concat1Seq4Backward( +kernel void concat1Seq4BackwardFloat( const device float4 * delta, constant uint * pGlobalOffset, constant uint * pNbNeurons, @@ -376,7 +376,7 @@ kernel void concat1Seq4Backward( } } -kernel void concat2SeqForward( +kernel void concat2SeqForwardFloat( const device float * outsPrev, constant uint * pGlobalOffset, constant uint * pNbNeurons, @@ -421,7 +421,7 @@ kernel void concat2SeqForward( outs[offset] = outsPrev[offsetPrev]; } -kernel void concat2SeqBackward( +kernel void concat2SeqBackwardFloat( const device float * delta, constant uint * pGlobalOffset, constant uint * pNbNeurons, @@ -476,7 +476,7 @@ kernel void concat2SeqBackward( } } -kernel void constant12SeqForward( +kernel void constant12SeqForwardFloat( const device float * weights, constant uint * pNbNeurons, constant uint * pNbBatch, @@ -510,7 +510,7 @@ kernel void constant12SeqForward( outs[offset] = weights[depth + nbNeurons * seq]; } -kernel void constant12Seq4Forward( +kernel void constant12Seq4ForwardFloat( const device float4 * weights, constant uint * pNbNeurons, constant uint * pNbBatch, @@ -545,7 +545,7 @@ kernel void constant12Seq4Forward( outs[offset] = weights[(depth * 4 + nbNeurons * seq) / 4]; } -kernel void constant12SeqBackward( +kernel void constant12SeqBackwardFloat( const device float * delta, constant uint * pNbNeurons, constant uint * pNbBatch, @@ -593,7 +593,7 @@ kernel void constant12SeqBackward( } } -kernel void constant12Seq4Backward( +kernel void constant12Seq4BackwardFloat( const device float4 * delta, constant uint * pNbNeurons, constant uint * pNbBatch, @@ -642,7 +642,7 @@ kernel void constant12Seq4Backward( } } -kernel void constant2SeqForward( +kernel void constant2SeqForwardFloat( const device float * weights, constant uint * pNbNeurons, constant uint * pNbBatch, @@ -676,7 +676,7 @@ kernel void constant2SeqForward( outs[offset] = weights[depth]; } -kernel void constant2Seq4Forward( +kernel void constant2Seq4ForwardFloat( const device float4 * weights, constant uint * pNbNeurons, constant uint * pNbBatch, @@ -711,7 +711,7 @@ kernel void constant2Seq4Forward( outs[offset] = weights[depth]; } -kernel void querySeqForward( +kernel void querySeqForwardFloat( const device float * query, const device float * key, constant uint * pNbHeads, @@ -772,7 +772,7 @@ kernel void querySeqForward( outs[offset] = tmp; } -kernel void querySeq4Forward( +kernel void querySeq4ForwardFloat( const device float4 * query, const device float4 * key, constant uint * pNbHeads, @@ -833,7 +833,7 @@ kernel void querySeq4Forward( outs[offset] = tmp[0] + tmp[1] + tmp[2] + tmp[3]; } -kernel void queryQuerySeqBackward( +kernel void queryQuerySeqBackwardFloat( const device float * delta, const device float * key, constant uint * pNbHeads, @@ -905,7 +905,7 @@ 
kernel void queryQuerySeqBackward( } } -kernel void queryQuerySeq4Backward( +kernel void queryQuerySeq4BackwardFloat( const device float * delta, const device float4 * key, constant uint * pNbHeads, @@ -977,7 +977,7 @@ kernel void queryQuerySeq4Backward( } } -kernel void queryKeySeqBackward( +kernel void queryKeySeqBackwardFloat( const device float * delta, const device float * query, constant uint * pNbHeads, @@ -1049,7 +1049,7 @@ kernel void queryKeySeqBackward( } } -kernel void queryKeySeq4Backward( +kernel void queryKeySeq4BackwardFloat( const device float * delta, const device float4 * query, constant uint * pNbHeads, @@ -1121,7 +1121,7 @@ kernel void queryKeySeq4Backward( } } -kernel void querySelfSeqForward( +kernel void querySelfSeqForwardFloat( const device float * outsPrev, constant uint * pNbHeads, constant uint * pNbNeurons, @@ -1191,7 +1191,7 @@ kernel void querySelfSeqForward( outs[offset] = tmp; } -kernel void querySelfSeq4Forward( +kernel void querySelfSeq4ForwardFloat( const device float4 * outsPrev, constant uint * pNbHeads, constant uint * pNbNeurons, @@ -1261,7 +1261,7 @@ kernel void querySelfSeq4Forward( outs[offset] = tmp[0] + tmp[1] + tmp[2] + tmp[3]; } -kernel void querySelfQuerySeqBackward( +kernel void querySelfQuerySeqBackwardFloat( const device float * outsPrev, const device float * delta, constant uint * pNbHeads, @@ -1342,7 +1342,7 @@ kernel void querySelfQuerySeqBackward( } } -kernel void querySelfQuerySeq4Backward( +kernel void querySelfQuerySeq4BackwardFloat( const device float4 * outsPrev, const device float * delta, constant uint * pNbHeads, @@ -1423,7 +1423,7 @@ kernel void querySelfQuerySeq4Backward( } } -kernel void querySelfKeySeqBackward( +kernel void querySelfKeySeqBackwardFloat( const device float * outsPrev, const device float * delta, constant uint * pNbHeads, @@ -1504,7 +1504,7 @@ kernel void querySelfKeySeqBackward( } } -kernel void querySelfKeySeq4Backward( +kernel void querySelfKeySeq4BackwardFloat( const device float4 * outsPrev, const device float * delta, constant uint * pNbHeads, @@ -1585,7 +1585,7 @@ kernel void querySelfKeySeq4Backward( } } -kernel void softmaxSeqForward( +kernel void softmaxSeqForwardFloat( const device float * outsPrev, constant uint * pNbHeads, constant uint * pNbNeurons, @@ -1651,7 +1651,7 @@ kernel void softmaxSeqForward( outs[offset] = exp(outPrev - cMax) / sum1; } -kernel void softmaxSeq4Forward( +kernel void softmaxSeq4ForwardFloat( const device float4 * outsPrev, constant uint * pNbHeads, constant uint * pNbNeurons, @@ -1723,7 +1723,7 @@ kernel void softmaxSeq4Forward( outs[offset] = exp(outPrev - cMax) / sum2; } -kernel void softmaxSeqBackward( +kernel void softmaxSeqBackwardFloat( const device float * outs, const device float * delta, constant uint * pNbHeads, @@ -1789,7 +1789,7 @@ kernel void softmaxSeqBackward( } } -kernel void softmaxSeq4Backward( +kernel void softmaxSeq4BackwardFloat( const device float4 * outs, const device float4 * delta, constant uint * pNbHeads, @@ -1857,7 +1857,7 @@ kernel void softmaxSeq4Backward( } } -kernel void valueSeqForward( +kernel void valueSeqForwardFloat( const device float * value, const device float * score, constant uint * pNbHeads, @@ -1915,7 +1915,7 @@ kernel void valueSeqForward( outs[offset] = tmp; } -kernel void valueSeq4Forward( +kernel void valueSeq4ForwardFloat( const device float4 * value, const device float * score, constant uint * pNbHeads, @@ -1973,7 +1973,7 @@ kernel void valueSeq4Forward( outs[offset] = tmp; } -kernel void valueValueSeqBackward( +kernel 
void valueValueSeqBackwardFloat( const device float * delta, const device float * score, constant uint * pNbHeads, @@ -2042,7 +2042,7 @@ kernel void valueValueSeqBackward( } } -kernel void valueValueSeq4Backward( +kernel void valueValueSeq4BackwardFloat( const device float4 * delta, const device float * score, constant uint * pNbHeads, @@ -2113,7 +2113,7 @@ kernel void valueValueSeq4Backward( } } -kernel void valueScoreSeqBackward( +kernel void valueScoreSeqBackwardFloat( const device float * delta, const device float * value, constant uint * pNbHeads, @@ -2184,7 +2184,7 @@ kernel void valueScoreSeqBackward( } } -kernel void valueScoreSeq4Backward( +kernel void valueScoreSeq4BackwardFloat( const device float4 * delta, const device float4 * value, constant uint * pNbHeads, @@ -2256,7 +2256,7 @@ kernel void valueScoreSeq4Backward( } } -kernel void valueSelfSeqForward( +kernel void valueSelfSeqForwardFloat( const device float * value, const device float * score, constant uint * pNbHeads, @@ -2323,7 +2323,7 @@ kernel void valueSelfSeqForward( outs[offset] = tmp; } -kernel void valueSelfSeq4Forward( +kernel void valueSelfSeq4ForwardFloat( const device float4 * value, const device float * score, constant uint * pNbHeads, @@ -2391,7 +2391,7 @@ kernel void valueSelfSeq4Forward( outs[offset] = tmp; } -kernel void valueSelfValueSeqBackward( +kernel void valueSelfValueSeqBackwardFloat( const device float * delta, const device float * score, constant uint * pNbHeads, @@ -2459,7 +2459,7 @@ kernel void valueSelfValueSeqBackward( value[offsetValue] += tmp; } -kernel void valueSelfValueSeq4Backward( +kernel void valueSelfValueSeq4BackwardFloat( const device float4 * delta, const device float * score, constant uint * pNbHeads, @@ -2528,7 +2528,7 @@ kernel void valueSelfValueSeq4Backward( value[offsetValue] += tmp; } -kernel void valueSelfScoreSeqBackward( +kernel void valueSelfScoreSeqBackwardFloat( const device float * delta, const device float * value, constant uint * pNbHeads, @@ -2607,7 +2607,7 @@ kernel void valueSelfScoreSeqBackward( } } -kernel void valueSelfScoreSeq4Backward( +kernel void valueSelfScoreSeq4BackwardFloat( const device float4 * delta, const device float4 * value, constant uint * pNbHeads, @@ -2687,7 +2687,7 @@ kernel void valueSelfScoreSeq4Backward( } } -kernel void layerCAMSeqForward( +kernel void layerCAMSeqForwardFloat( const device float * outsPrev, const device float * deltaPrev, constant uint * pNbNeuronsPrev, diff --git a/Sources/GrAIdient/Metal/Kernel/LayerSeqHalf.metal b/Sources/GrAIdient/Metal/Kernel/LayerSeqHalf.metal new file mode 100644 index 00000000..bc1c1bed --- /dev/null +++ b/Sources/GrAIdient/Metal/Kernel/LayerSeqHalf.metal @@ -0,0 +1,2745 @@ +// +// LayerSeq.metal +// GrAIdient +// +// Created by Jean-François Reboud on 27/02/2023. 
+//
+
+#include <metal_stdlib>
+using namespace metal;
+
+kernel void avgPoolSeqForwardHalf(
+    const device half * outsPrev,
+    constant uint * pNbNeurons,
+    constant uint * pNbBatch,
+    constant uint * pSequence,
+    device half * outs,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint nbNeurons;
+    uint nbBatch;
+    uint sequence;
+
+    if (pNbNeurons && pNbBatch && pSequence &&
+        outsPrev && outs)
+    {
+        nbNeurons = *pNbNeurons;
+        nbBatch = *pNbBatch;
+        sequence = *pSequence;
+    }
+    else
+        return ;
+
+    uint depth = id[0];
+    uint elem = id[1];
+
+    if (depth >= nbNeurons || elem >= nbBatch)
+    {
+        return ;
+    }
+
+    float tmp = 0.0;
+    for (uint seq=0; seq<sequence; seq++)
+    {
+        uint offsetPrev = depth +
+            nbNeurons * seq + sequence * nbNeurons * elem;
+        tmp += outsPrev[offsetPrev];
+    }
+    tmp /= sequence;
+
+    uint offset = depth + nbNeurons * elem;
+    outs[offset] = tmp;
+}
+
+kernel void avgPoolSeqBackwardHalf(
+    const device half * delta,
+    constant uint * pNbNeurons,
+    constant uint * pNbBatch,
+    constant uint * pSequence,
+    constant uint * pDirty,
+    device half * deltaPrev,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint nbNeurons;
+    uint nbBatch;
+    uint sequence;
+    uint dirty;
+
+    if (pNbNeurons && pNbBatch && pSequence && pDirty &&
+        delta && deltaPrev)
+    {
+        nbNeurons = *pNbNeurons;
+        nbBatch = *pNbBatch;
+        sequence = *pSequence;
+        dirty = *pDirty;
+    }
+    else
+        return ;
+
+    uint depth = id[0];
+    uint elem = id[1] / sequence;
+    uint seq = id[1] % sequence;
+
+    if (depth >= nbNeurons || elem >= nbBatch || seq >= sequence)
+    {
+        return ;
+    }
+
+    uint offset = depth + nbNeurons * elem;
+    float deltaCur = delta[offset];
+
+    uint offsetPrev = depth + nbNeurons * seq + sequence * nbNeurons * elem;
+    if (dirty)
+    {
+        deltaPrev[offsetPrev] = deltaCur / sequence;
+    }
+    else
+    {
+        deltaPrev[offsetPrev] += deltaCur / sequence;
+    }
+}
+
+kernel void selectSeqForwardHalf(
+    const device half * outsPrev,
+    constant uint * pNbNeurons,
+    constant uint * pTargetSeq,
+    constant uint * pNbBatch,
+    constant uint * pSequence,
+    device half * outs,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint targetSeq;
+    uint nbNeurons;
+    uint nbBatch;
+    uint sequence;
+
+    if (pTargetSeq && pNbNeurons && pNbBatch && pSequence &&
+        outsPrev && outs)
+    {
+        targetSeq = *pTargetSeq;
+        nbNeurons = *pNbNeurons;
+        nbBatch = *pNbBatch;
+        sequence = *pSequence;
+    }
+    else
+        return ;
+
+    uint depth = id[0];
+    uint elem = id[1];
+
+    if (depth >= nbNeurons || elem >= nbBatch)
+    {
+        return ;
+    }
+
+    uint offset = depth + nbNeurons * elem;
+    uint offsetPrev = depth +
+        nbNeurons * targetSeq + sequence * nbNeurons * elem;
+    outs[offset] = outsPrev[offsetPrev];
+}
+
+kernel void selectSeqBackwardHalf(
+    const device half * delta,
+    constant uint * pNbNeurons,
+    constant uint * pTargetSeq,
+    constant uint * pNbBatch,
+    constant uint * pSequence,
+    device half * deltaPrev,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint nbNeurons;
+    uint nbBatch;
+    uint sequence;
+    uint targetSeq;
+
+    if (pNbNeurons && pTargetSeq && pNbBatch && pSequence &&
+        deltaPrev && delta)
+    {
+        targetSeq = *pTargetSeq;
+        nbNeurons = *pNbNeurons;
+        nbBatch = *pNbBatch;
+        sequence = *pSequence;
+    }
+    else
+        return ;
+
+    uint depth = id[0];
+    uint elem = id[1];
+
+    if (depth >= nbNeurons || elem >= nbBatch)
+    {
+        return ;
+    }
+
+    uint offset = depth + nbNeurons * elem;
+    uint offsetPrev = depth +
+        nbNeurons * targetSeq + sequence * nbNeurons * elem;
+    deltaPrev[offsetPrev] += delta[offset];
+}
+
+kernel void concat1SeqForwardHalf(
+    const device half * outsPrev,
+    constant uint * pGlobalOffset,
+    constant uint * pNbNeurons,
+    constant uint * pNbBatch,
+    constant uint * pSequence,
+    constant uint * pSequencePrev,
+    device half * outs,
+    uint2 id [[ thread_position_in_grid ]])
+{
+    uint nbNeurons;
+    uint nbBatch;
+    uint sequence;
+    uint sequencePrev;
+    uint globalOffset;
+
+    if (pGlobalOffset && pNbNeurons &&
+        pNbBatch && pSequence && pSequencePrev && outsPrev && outs)
+    {
+        nbNeurons = *pNbNeurons;
+        nbBatch = *pNbBatch;
+        sequence = *pSequence;
+        sequencePrev = *pSequencePrev;
+        globalOffset = *pGlobalOffset;
+    }
+    else
+        return ;
+
+    uint depth = id[0];
+    uint elem = id[1] / sequencePrev;
+    uint seq = id[1] % sequencePrev;
+
+    if (depth >= nbNeurons || elem >= nbBatch || seq >= sequencePrev)
+    {
+        return ;
+    }
+
+    uint offsetPrev = depth +
+        nbNeurons * seq + sequencePrev * nbNeurons * elem;
+    uint
offset = depth + + nbNeurons * (globalOffset+seq) + sequence * nbNeurons * elem; + + outs[offset] = outsPrev[offsetPrev]; +} + +kernel void concat1Seq4ForwardHalf( + const device half4 * outsPrev, + constant uint * pGlobalOffset, + constant uint * pNbNeurons, + constant uint * pNbBatch, + constant uint * pSequence, + constant uint * pSequencePrev, + device half4 * outs, + uint2 id [[ thread_position_in_grid ]]) +{ + uint nbNeurons; + uint nbBatch; + uint sequence; + uint sequencePrev; + uint globalOffset; + + if (pGlobalOffset && pNbNeurons && + pNbBatch && pSequence && pSequencePrev && outsPrev && outs) + { + nbNeurons = *pNbNeurons; + nbBatch = *pNbBatch; + sequence = *pSequence; + sequencePrev = *pSequencePrev; + globalOffset = *pGlobalOffset; + } + else + return ; + + uint depth = id[0]; + uint elem = id[1] / sequencePrev; + uint seq = id[1] % sequencePrev; + + if (depth * 4 >= nbNeurons || elem >= nbBatch || seq >= sequencePrev) + { + return ; + } + + uint offsetPrev = (depth * 4 + + nbNeurons * seq + sequencePrev * nbNeurons * elem) / 4; + uint offset = (depth * 4 + + nbNeurons * (globalOffset+seq) + sequence * nbNeurons * elem) / 4; + + outs[offset] = outsPrev[offsetPrev]; +} + +kernel void concat1SeqBackwardHalf( + const device half * delta, + constant uint * pGlobalOffset, + constant uint * pNbNeurons, + constant uint * pNbBatch, + constant uint * pSequence, + constant uint * pSequencePrev, + constant uint * pDirty, + device half * deltaPrev, + uint2 id [[ thread_position_in_grid ]]) +{ + uint nbNeurons; + uint nbBatch; + uint sequence; + uint sequencePrev; + uint globalOffset; + uint dirty; + + if (pGlobalOffset && pNbNeurons && + pNbBatch && pSequence && pSequencePrev && pDirty && deltaPrev && delta) + { + nbNeurons = *pNbNeurons; + nbBatch = *pNbBatch; + sequence = *pSequence; + sequencePrev = *pSequencePrev; + globalOffset = *pGlobalOffset; + dirty = *pDirty; + } + else + return ; + + uint depth = id[0]; + uint elem = id[1] / sequencePrev; + uint seq = id[1] % sequencePrev; + + if (depth >= nbNeurons || elem >= nbBatch || seq >= sequencePrev) + { + return ; + } + + uint offsetPrev = depth + + nbNeurons * seq + sequencePrev * nbNeurons * elem; + uint offset = depth + + nbNeurons * (globalOffset+seq) + sequence * nbNeurons * elem; + + if (dirty) + { + deltaPrev[offsetPrev] = delta[offset]; + } + else + { + deltaPrev[offsetPrev] += delta[offset]; + } +} + +kernel void concat1Seq4BackwardHalf( + const device half4 * delta, + constant uint * pGlobalOffset, + constant uint * pNbNeurons, + constant uint * pNbBatch, + constant uint * pSequence, + constant uint * pSequencePrev, + constant uint * pDirty, + device half4 * deltaPrev, + uint2 id [[ thread_position_in_grid ]]) +{ + uint nbNeurons; + uint nbBatch; + uint sequence; + uint sequencePrev; + uint globalOffset; + uint dirty; + + if (pGlobalOffset && pNbNeurons && + pNbBatch && pSequence && pSequencePrev && pDirty && deltaPrev && delta) + { + nbNeurons = *pNbNeurons; + nbBatch = *pNbBatch; + sequence = *pSequence; + sequencePrev = *pSequencePrev; + globalOffset = *pGlobalOffset; + dirty = *pDirty; + } + else + return ; + + uint depth = id[0]; + uint elem = id[1] / sequencePrev; + uint seq = id[1] % sequencePrev; + + if (depth * 4 >= nbNeurons || elem >= nbBatch || seq >= sequencePrev) + { + return ; + } + + uint offsetPrev = (depth * 4 + + nbNeurons * seq + sequencePrev * nbNeurons * elem) / 4; + uint offset = (depth * 4 + + nbNeurons * (globalOffset+seq) + sequence * nbNeurons * elem) / 4; + + if (dirty) + { + 
deltaPrev[offsetPrev] = delta[offset]; + } + else + { + deltaPrev[offsetPrev] += delta[offset]; + } +} + +kernel void concat2SeqForwardHalf( + const device half * outsPrev, + constant uint * pGlobalOffset, + constant uint * pNbNeurons, + constant uint * pNbNeuronsPrev, + constant uint * pNbBatch, + constant uint * pSequence, + device half * outs, + uint2 id [[ thread_position_in_grid ]]) +{ + uint nbNeurons; + uint nbNeuronsPrev; + uint nbBatch; + uint sequence; + uint globalOffset; + + if (pGlobalOffset && pNbNeurons && pNbNeuronsPrev && + pNbBatch && pSequence && outsPrev && outs) + { + nbNeurons = *pNbNeurons; + nbNeuronsPrev = *pNbNeuronsPrev; + nbBatch = *pNbBatch; + sequence = *pSequence; + globalOffset = *pGlobalOffset; + } + else + return ; + + uint depth = id[0]; + uint elem = id[1] / sequence; + uint seq = id[1] % sequence; + + if (depth >= nbNeuronsPrev || elem >= nbBatch || seq >= sequence) + { + return ; + } + + uint offsetPrev = depth + + nbNeuronsPrev * seq + sequence * nbNeuronsPrev * elem; + uint offset = globalOffset+depth + + nbNeurons * seq + sequence * nbNeurons * elem; + + outs[offset] = outsPrev[offsetPrev]; +} + +kernel void concat2SeqBackwardHalf( + const device half * delta, + constant uint * pGlobalOffset, + constant uint * pNbNeurons, + constant uint * pNbNeuronsPrev, + constant uint * pNbBatch, + constant uint * pSequence, + constant uint * pDirty, + device half * deltaPrev, + uint2 id [[ thread_position_in_grid ]]) +{ + uint nbNeurons; + uint nbNeuronsPrev; + uint nbBatch; + uint sequence; + uint globalOffset; + uint dirty; + + if (pGlobalOffset && pNbNeurons && pNbNeuronsPrev && + pNbBatch && pSequence && pDirty && deltaPrev && delta) + { + nbNeurons = *pNbNeurons; + nbNeuronsPrev = *pNbNeuronsPrev; + nbBatch = *pNbBatch; + sequence = *pSequence; + globalOffset = *pGlobalOffset; + dirty = *pDirty; + } + else + return ; + + uint depth = id[0]; + uint elem = id[1] / sequence; + uint seq = id[1] % sequence; + + if (depth >= nbNeuronsPrev || elem >= nbBatch || seq >= sequence) + { + return ; + } + + uint offsetPrev = depth + + nbNeuronsPrev * seq + sequence * nbNeuronsPrev * elem; + uint offset = globalOffset+depth + + nbNeurons * seq + sequence * nbNeurons * elem; + + if (dirty) + { + deltaPrev[offsetPrev] = delta[offset]; + } + else + { + deltaPrev[offsetPrev] += delta[offset]; + } +} + +kernel void constant12SeqForwardHalf( + const device half * weights, + constant uint * pNbNeurons, + constant uint * pNbBatch, + constant uint * pSequence, + device half * outs, + uint2 id [[ thread_position_in_grid ]]) +{ + uint nbNeurons; + uint nbBatch; + uint sequence; + + if (pNbNeurons && pNbBatch && pSequence && weights && outs) + { + nbNeurons = *pNbNeurons; + nbBatch = *pNbBatch; + sequence = *pSequence; + } + else + return ; + + uint depth = id[0]; + uint elem = id[1] / sequence; + uint seq = id[1] % sequence; + + if (depth >= nbNeurons || elem >= nbBatch || seq >= sequence) + { + return ; + } + + uint offset = depth + nbNeurons * seq + sequence * nbNeurons * elem; + outs[offset] = weights[depth + nbNeurons * seq]; +} + +kernel void constant12Seq4ForwardHalf( + const device half4 * weights, + constant uint * pNbNeurons, + constant uint * pNbBatch, + constant uint * pSequence, + device half4 * outs, + uint2 id [[ thread_position_in_grid ]]) +{ + uint nbNeurons; + uint nbBatch; + uint sequence; + + if (pNbNeurons && pNbBatch && pSequence && weights && outs) + { + nbNeurons = *pNbNeurons; + nbBatch = *pNbBatch; + sequence = *pSequence; + } + else + return ; + + uint 
depth = id[0]; + uint elem = id[1] / sequence; + uint seq = id[1] % sequence; + + if (depth * 4 >= nbNeurons || elem >= nbBatch || seq >= sequence) + { + return ; + } + + uint offset = + (depth * 4 + nbNeurons * seq + sequence * nbNeurons * elem) / 4; + outs[offset] = weights[(depth * 4 + nbNeurons * seq) / 4]; +} + +kernel void constant12SeqBackwardHalf( + const device half * delta, + constant uint * pNbNeurons, + constant uint * pNbBatch, + constant uint * pSequence, + constant uint * pAccumulate, + device half * grads, + uint2 id [[ thread_position_in_grid ]]) +{ + uint nbNeurons; + uint nbBatch; + uint sequence; + uint accumulate; + + if (pNbNeurons && pNbBatch && pSequence && pAccumulate && delta && grads) + { + nbNeurons = *pNbNeurons; + nbBatch = *pNbBatch; + sequence = *pSequence; + accumulate = *pAccumulate; + } + else + return ; + + uint depth = id[0]; + uint seq = id[1]; + if (depth >= nbNeurons || seq >= sequence) + { + return ; + } + + float tmp = 0.0; + for (uint elem=0; elem= nbNeurons || seq >= sequence) + { + return ; + } + + half4 tmp = 0.0; + for (uint elem=0; elem= nbNeurons || elem >= nbBatch || seq >= sequence) + { + return ; + } + + uint offset = depth + nbNeurons * seq + sequence * nbNeurons * elem; + outs[offset] = weights[depth]; +} + +kernel void constant2Seq4ForwardHalf( + const device half4 * weights, + constant uint * pNbNeurons, + constant uint * pNbBatch, + constant uint * pSequence, + device half4 * outs, + uint2 id [[ thread_position_in_grid ]]) +{ + uint nbNeurons; + uint nbBatch; + uint sequence; + + if (pNbNeurons && pNbBatch && pSequence && weights && outs) + { + nbNeurons = *pNbNeurons; + nbBatch = *pNbBatch; + sequence = *pSequence; + } + else + return ; + + uint depth = id[0]; + uint elem = id[1] / sequence; + uint seq = id[1] % sequence; + + if (depth * 4 >= nbNeurons || elem >= nbBatch || seq >= sequence) + { + return ; + } + + uint offset = + (depth * 4 + nbNeurons * seq + sequence * nbNeurons * elem) / 4; + outs[offset] = weights[depth]; +} + +kernel void querySeqForwardHalf( + const device half * query, + const device half * key, + constant uint * pNbHeads, + constant uint * pNbNeurons, + constant uint * pNbNeuronsPrev, + constant uint * pNbBatch, + constant uint * pSequence, + device half * outs, + uint2 id [[ thread_position_in_grid ]]) +{ + uint nbHeads; + uint nbNeurons; + uint nbNeuronsPrev; + uint nbBatch; + uint sequence; + uint size; + + if (pNbHeads && pNbNeurons && pNbNeuronsPrev && pNbBatch && pSequence && + query && key && outs) + { + nbHeads = *pNbHeads; + nbNeurons = *pNbNeurons; + nbNeuronsPrev = *pNbNeuronsPrev; + nbBatch = *pNbBatch; + sequence = *pSequence; + size = nbNeuronsPrev / nbHeads; + } + else + return ; + + uint head = id[0] / sequence; + uint seqK = id[0] % sequence; + uint elem = id[1] / sequence; + uint seqQ = id[1] % sequence; + + if (head >= nbHeads || seqK >= sequence || + elem >= nbBatch || seqQ >= sequence) + { + return ; + } + + float tmp = 0.0; + for (uint j=0; j= nbHeads || seqK >= sequence || + elem >= nbBatch || seqQ >= sequence) + { + return ; + } + + half4 tmp = 0.0; + for (uint j=0; j= nbHeads || j >= size || + elem >= nbBatch || seqQ >= sequence) + { + return ; + } + + float tmp = 0.0; + for (uint seqK=0; seqK= nbHeads || j * 4 >= size || + elem >= nbBatch || seqQ >= sequence) + { + return ; + } + + half4 tmp = 0.0; + for (uint seqK=0; seqK= nbHeads || j >= size || + elem >= nbBatch || seqK >= sequence) + { + return ; + } + + float tmp = 0.0; + for (uint seqQ=0; seqQ= nbHeads || j * 4 >= size || + 
elem >= nbBatch || seqK >= sequence) + { + return ; + } + + half4 tmp = 0.0; + for (uint seqQ=0; seqQ= nbHeads || seqK >= sequence || + elem >= nbBatch || seqQ >= sequence) + { + return ; + } + + float tmp = 0.0; + for (uint j=0; j= nbHeads || seqK >= sequence || + elem >= nbBatch || seqQ >= sequence) + { + return ; + } + + half4 tmp = 0.0; + for (uint j=0; j= nbHeads || j >= size || + elem >= nbBatch || seqQ >= sequence) + { + return ; + } + + float tmp = 0.0; + for (uint seqK=0; seqK= nbHeads || j * 4 >= size || + elem >= nbBatch || seqQ >= sequence) + { + return ; + } + + half4 tmp = 0.0; + for (uint seqK=0; seqK= nbHeads || j >= size || + elem >= nbBatch || seqK >= sequence) + { + return ; + } + + float tmp = 0.0; + for (uint seqQ=0; seqQ= nbHeads || j * 4 >= size || + elem >= nbBatch || seqK >= sequence) + { + return ; + } + + half4 tmp = 0.0; + for (uint seqQ=0; seqQ= nbNeurons || elem >= nbBatch || seq >= sequence) + { + return ; + } + + float cMax = outsPrev[ + 0+head*size + nbNeurons * seq + sequence * nbNeurons * elem + ]; + for (uint j=0; j cMax) + { + cMax = outPrev; + } + } + + float sum1 = 0.0; + for (uint j=0; j= nbNeurons || elem >= nbBatch || seq >= sequence) + { + return ; + } + + float cMax = outsPrev[ + (depth * 4 + nbNeurons * seq + sequence * nbNeurons * elem) / 4 + ][0]; + for (uint j=0; j cMax) + { + cMax = max3; + } + } + + half4 sum1 = 0.0; + for (uint j=0; j= nbNeurons || elem >= nbBatch || seq >= sequence) + { + return ; + } + + uint offset = depth + nbNeurons * seq + sequence * nbNeurons * elem; + float outCur = outs[offset]; + float deltaCur = delta[offset]; + + float sum1 = 0.0; + for (uint j=0; j= nbNeurons || elem >= nbBatch || seq >= sequence) + { + return ; + } + + uint offset = + (depth * 4 + nbNeurons * seq + sequence * nbNeurons * elem) / 4; + half4 outCur = outs[offset]; + half4 deltaCur = delta[offset]; + + half4 sum1 = 0.0; + for (uint j=0; j= nbHeads || j >= size || + elem >= nbBatch || seqQ >= sequence) + { + return ; + } + + float tmp = 0.0; + for (uint seqK=0; seqK= nbHeads || j >= size || + elem >= nbBatch || seqQ >= sequence) + { + return ; + } + + half4 tmp = 0.0; + for (uint seqK=0; seqK= nbHeads || j >= size || + elem >= nbBatch || seqK >= sequence) + { + return ; + } + + float tmp = 0.0; + for (uint seqQ=0; seqQ= nbHeads || j * 4 >= size || + elem >= nbBatch || seqK >= sequence) + { + return ; + } + + half4 tmp = 0.0; + for (uint seqQ=0; seqQ= nbHeads || seqK >= sequence || + elem >= nbBatch || seqQ >= sequence) + { + return ; + } + + float tmp = 0.0; + for (uint j=0; j= nbHeads || seqK >= sequence || + elem >= nbBatch || seqQ >= sequence) + { + return ; + } + + half4 tmp = 0.0; + for (uint j=0; j= nbHeads || j >= size || + elem >= nbBatch || seqQ >= sequence) + { + return ; + } + + float tmp = 0.0; + for (uint seqK=0; seqK= nbHeads || j >= size || + elem >= nbBatch || seqQ >= sequence) + { + return ; + } + + half4 tmp = 0.0; + for (uint seqK=0; seqK= nbHeads || j >= size || + elem >= nbBatch || seqK >= sequence) + { + return ; + } + + float tmp = 0.0; + for (uint seqQ=0; seqQ= nbHeads || j * 4 >= size || + elem >= nbBatch || seqK >= sequence) + { + return ; + } + + half4 tmp = 0.0; + for (uint seqQ=0; seqQ= nbHeads || seqK >= sequence || + elem >= nbBatch || seqQ >= sequence) + { + return ; + } + + float tmp = 0.0; + for (uint j=0; j= nbHeads || seqK >= sequence || + elem >= nbBatch || seqQ >= sequence) + { + return ; + } + + half4 tmp = 0.0; + for (uint j=0; j= sequence || elem >= nbBatch) + { + return ; + } + + float sum = 0.0; + for 
(uint depthPrev=0; depthPrev using namespace metal; -kernel void clipGradients( +kernel void clipGradientsFloat( constant uint * pNbElems, constant float * pGradientNorm, constant float * pNormThreshold, @@ -36,7 +36,7 @@ kernel void clipGradients( grads[id] = grads[id] * normThreshold / gradientNorm; } -kernel void multiplyGradients( +kernel void multiplyGradientsFloat( constant uint * pNbElems, constant float * pFactor, device float * grads, @@ -61,7 +61,7 @@ kernel void multiplyGradients( grads[id] = grads[id] * factor; } -kernel void weightsSGD( +kernel void weightsSGDFloat( const device float * grads, constant uint * pNbElems, constant float * pAlpha, @@ -94,7 +94,7 @@ kernel void weightsSGD( weights[id] = weights[id] - alpha * g; } -kernel void weightsMomentum( +kernel void weightsMomentumFloat( const device float * grads, constant uint * pNbElems, constant float * pAlpha, @@ -133,7 +133,7 @@ kernel void weightsMomentum( weights[id] = weights[id] - v; } -kernel void weightsAdam( +kernel void weightsAdamFloat( const device float * grads, constant uint * pNbElems, constant float * pAlpha, @@ -184,7 +184,7 @@ kernel void weightsAdam( weights[id] = weights[id] - alpha * m / (sqrt(v) + Ɛ); } -kernel void weightsAMSGrad( +kernel void weightsAMSGradFloat( const device float * grads, constant uint * pNbElems, constant float * pAlpha, @@ -239,7 +239,7 @@ kernel void weightsAMSGrad( weights[id] = weights[id] - alpha * m / (sqrt(vHat) + Ɛ); } -kernel void weightsAdamRectified( +kernel void weightsAdamRectifiedFloat( const device float * grads, constant uint * pNbElems, constant float * pAlpha, @@ -302,7 +302,7 @@ kernel void weightsAdamRectified( } } -kernel void weightsAdaBound( +kernel void weightsAdaBoundFloat( const device float * grads, constant uint * pNbElems, constant float * pAlpha, @@ -368,7 +368,7 @@ kernel void weightsAdaBound( weights[id] = weights[id] - alphaHat * m; } -kernel void weightsAMSBound( +kernel void weightsAMSBoundFloat( const device float * grads, constant uint * pNbElems, constant float * pAlpha, diff --git a/Sources/GrAIdient/Metal/Kernel/OptimizerHalf.metal b/Sources/GrAIdient/Metal/Kernel/OptimizerHalf.metal new file mode 100644 index 00000000..ea7c7ce8 --- /dev/null +++ b/Sources/GrAIdient/Metal/Kernel/OptimizerHalf.metal @@ -0,0 +1,438 @@ +// +// Optimizer.metal +// GrAIdient +// +// Created by Jean-François Reboud on 09/10/2022. 
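// Editor's note (sketch, not from the original patch): the Half optimizer
// kernels below keep weights, gradients and moment buffers in `half`, take
// hyper-parameters as `constant float`, and mostly do the update arithmetic
// in float before a single rounding store (AMSGrad/AMSBound keep half
// moments). For reference, weightsAdamHalf implements, with β1 = 0.9,
// β2 = 0.999 and Ɛ = 1e-8 hard-coded:
//
//     g ← g + λ·w                          (optional weight decay)
//     m ← β1·m + (1 − β1)·g
//     v ← β2·v + (1 − β2)·g²
//     w ← w − α · (m / (1 − β1^t)) / ( sqrt(v / (1 − β2^t)) + Ɛ )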
+// + +#include +using namespace metal; + +kernel void clipGradientsHalf( + constant uint * pNbElems, + constant float * pGradientNorm, + constant float * pNormThreshold, + device half * grads, + uint id [[ thread_position_in_grid ]]) +{ + uint nbElems; + float gradientNorm; + float normThreshold; + + if (pNbElems && pGradientNorm && pNormThreshold && grads) + { + nbElems = *pNbElems; + gradientNorm = *pGradientNorm; + normThreshold = *pNormThreshold; + } + else + return ; + + if (id >= nbElems) + { + return ; + } + + grads[id] = grads[id] * normThreshold / gradientNorm; +} + +kernel void multiplyGradientsHalf( + constant uint * pNbElems, + constant float * pFactor, + device half * grads, + uint id [[ thread_position_in_grid ]]) +{ + uint nbElems; + float factor; + + if (pNbElems && pFactor && grads) + { + nbElems = *pNbElems; + factor = *pFactor; + } + else + return ; + + if (id >= nbElems) + { + return ; + } + + grads[id] = grads[id] * factor; +} + +kernel void weightsSGDHalf( + const device half * grads, + constant uint * pNbElems, + constant float * pAlpha, + constant float * pLambda, + device half * weights, + uint id [[ thread_position_in_grid ]]) +{ + uint nbElems; + float alpha, lambda; + + if (pNbElems && pAlpha && pLambda && grads && weights) + { + nbElems = pNbElems[0]; + alpha = *pAlpha; + lambda = *pLambda; + } + else + return ; + + if (id >= nbElems) + { + return ; + } + + float g = grads[id]; + if (lambda != 0) + { + g += lambda * weights[id]; + } + weights[id] = weights[id] - alpha * g; +} + +kernel void weightsMomentumHalf( + const device half * grads, + constant uint * pNbElems, + constant float * pAlpha, + constant float * pLambda, + device half * weights, + device half * mPtr, + uint id [[ thread_position_in_grid ]]) +{ + uint nbElems; + float alpha, lambda; + float β1 = 0.9; + + if (pNbElems && pAlpha && pLambda && grads && weights && mPtr) + { + nbElems = pNbElems[0]; + alpha = *pAlpha; + lambda = *pLambda; + } + else + return ; + + if (id >= nbElems) + { + return ; + } + + float g = grads[id]; + if (lambda != 0) + { + g += lambda * weights[id]; + } + + float v = β1 * mPtr[id] + alpha * g; + mPtr[id] = v; + + weights[id] = weights[id] - v; +} + +kernel void weightsAdamHalf( + const device half * grads, + constant uint * pNbElems, + constant float * pAlpha, + constant float * pLambda, + constant float * pT, + device half * weights, + device half * mPtr, + device half * vPtr, + uint id [[ thread_position_in_grid ]]) +{ + uint nbElems; + float alpha, lambda; + float t; + float β1 = 0.9; + float β2 = 0.999; + float Ɛ = 0.00000001; + + if (pNbElems && pAlpha && pLambda && pT && + grads && weights && mPtr && vPtr) + { + nbElems = pNbElems[0]; + alpha = *pAlpha; + lambda = *pLambda; + t = *pT; + } + else + return ; + + if (id >= nbElems) + { + return ; + } + + float g = grads[id]; + if (lambda != 0) + { + g += lambda * weights[id]; + } + + float m = β1 * mPtr[id] + (1 - β1) * g; + float v = β2 * vPtr[id] + (1 - β2) * g * g; + mPtr[id] = m; + vPtr[id] = v; + + m /= (1 - pow(β1, t)); + v /= (1 - pow(β2, t)); + + weights[id] = weights[id] - alpha * m / (sqrt(v) + Ɛ); +} + +kernel void weightsAMSGradHalf( + const device half * grads, + constant uint * pNbElems, + constant float * pAlpha, + constant float * pLambda, + constant float * pT, + device half * weights, + device half * mPtr, + device half * vPtr, + device half * vHatPtr, + uint id [[ thread_position_in_grid ]]) +{ + uint nbElems; + float alpha, lambda; + float t; + float β1 = 0.9; + float β2 = 0.999; + float Ɛ = 
0.00000001; + + if (pNbElems && pAlpha && pLambda && pT && + grads && weights && mPtr && vPtr && vHatPtr) + { + nbElems = pNbElems[0]; + alpha = *pAlpha; + lambda = *pLambda; + t = *pT; + } + else + return ; + + if (id >= nbElems) + { + return ; + } + + float g = grads[id]; + if (lambda != 0) + { + g += lambda * weights[id]; + } + + half m = β1 * mPtr[id] + (1 - β1) * g; + half v = β2 * vPtr[id] + (1 - β2) * g * g; + half vHat = max(v, vHatPtr[id]); + + mPtr[id] = m; + vPtr[id] = v; + vHatPtr[id] = vHat; + + m /= (1 - pow(β1, t)); + vHat /= (1 - pow(β2, t)); + + weights[id] = weights[id] - alpha * m / (sqrt(vHat) + Ɛ); +} + +kernel void weightsAdamRectifiedHalf( + const device half * grads, + constant uint * pNbElems, + constant float * pAlpha, + constant float * pLambda, + constant float * pT, + device half * weights, + device half * mPtr, + device half * vPtr, + uint id [[ thread_position_in_grid ]]) +{ + uint nbElems; + float alpha, lambda; + float t; + float β1 = 0.9; + float β2 = 0.999; + float Ɛ = 0.00000001; + float ρinf = 2.0 / (1.0 - β2) - 1.0; + + if (pNbElems && pAlpha && pLambda && pT && + grads && weights && mPtr && vPtr) + { + nbElems = pNbElems[0]; + alpha = *pAlpha; + lambda = *pLambda; + t = *pT; + } + else + return ; + + if (id >= nbElems) + { + return ; + } + + float g = grads[id]; + if (lambda != 0) + { + g += lambda * weights[id]; + } + + float m = β1 * mPtr[id] + (1 - β1) * g; + float v = β2 * vPtr[id] + (1 - β2) * g * g; + mPtr[id] = m; + vPtr[id] = v; + + m /= (1 - pow(β1, t)); + float ρ = ρinf - 2.0 * t * pow(β2, t) / (1 - pow(β2, t)); + + if (ρ > 5.0) + { + float l = sqrt((1 - pow(β2, t)) / (v + Ɛ)); + float r = sqrt(((ρ - 4.0) * (ρ - 2.0) * ρinf) / + ((ρinf - 4.0) * (ρinf - 2.0) * ρ)); + + weights[id] = weights[id] - alpha * m * r * l; + } + else + { + weights[id] = weights[id] - alpha * m; + } +} + +kernel void weightsAdaBoundHalf( + const device half * grads, + constant uint * pNbElems, + constant float * pAlpha, + constant float * pLambda, + constant float * pT, + constant float * pLowerBound, + constant float * pUpperBound, + device half * weights, + device half * mPtr, + device half * vPtr, + uint id [[ thread_position_in_grid ]]) +{ + uint nbElems; + float alpha, lambda; + float t; + float β1 = 0.9; + float β2 = 0.999; + float Ɛ = 0.00000001; + float lowerBound; + float upperBound; + + if (pNbElems && pAlpha && pLambda && pT && pLowerBound && pUpperBound && + grads && weights && mPtr && vPtr) + { + nbElems = pNbElems[0]; + alpha = *pAlpha; + lambda = *pLambda; + t = *pT; + lowerBound = *pLowerBound; + upperBound = *pUpperBound; + } + else + return ; + + if (id >= nbElems) + { + return ; + } + + float g = grads[id]; + if (lambda != 0) + { + g += lambda * weights[id]; + } + + float m = β1 * mPtr[id] + (1 - β1) * g; + float v = β2 * vPtr[id] + (1 - β2) * g * g; + + mPtr[id] = m; + vPtr[id] = v; + + float alphaHat = alpha * + sqrt(1 - pow(β2, t)) / ((sqrt(v) + Ɛ) * (1 - pow(β1, t))); + if (alphaHat < lowerBound) + { + alphaHat = lowerBound; + } + else if (alphaHat > upperBound) + { + alphaHat = upperBound; + } + + weights[id] = weights[id] - alphaHat * m; +} + +kernel void weightsAMSBoundHalf( + const device half * grads, + constant uint * pNbElems, + constant float * pAlpha, + constant float * pLambda, + constant float * pT, + constant float * pLowerBound, + constant float * pUpperBound, + device half * weights, + device half * mPtr, + device half * vPtr, + device half * vHatPtr, + uint id [[ thread_position_in_grid ]]) +{ + uint nbElems; + float alpha, 
lambda; + float t; + float β1 = 0.9; + float β2 = 0.999; + float Ɛ = 0.00000001; + float lowerBound; + float upperBound; + + if (pNbElems && pAlpha && pLambda && pT && pLowerBound && pUpperBound && + grads && weights && mPtr && vPtr && vHatPtr) + { + nbElems = pNbElems[0]; + alpha = *pAlpha; + lambda = *pLambda; + t = *pT; + lowerBound = *pLowerBound; + upperBound = *pUpperBound; + } + else + return ; + + if (id >= nbElems) + { + return ; + } + + float g = grads[id]; + if (lambda != 0) + { + g += lambda * weights[id]; + } + + half m = β1 * mPtr[id] + (1 - β1) * g; + half v = β2 * vPtr[id] + (1 - β2) * g * g; + half vHat = max(v, vHatPtr[id]); + + mPtr[id] = m; + vPtr[id] = v; + vHatPtr[id] = vHat; + + float alphaHat = alpha * + sqrt(1 - pow(β2, t)) / ((sqrt(vHat) + Ɛ) * (1 - pow(β1, t))); + if (alphaHat < lowerBound) + { + alphaHat = lowerBound; + } + else if (alphaHat > upperBound) + { + alphaHat = upperBound; + } + + weights[id] = weights[id] - alphaHat * m; +} diff --git a/Sources/GrAIdient/Metal/Kernel/Reduce.metal b/Sources/GrAIdient/Metal/Kernel/ReduceFloat.metal similarity index 97% rename from Sources/GrAIdient/Metal/Kernel/Reduce.metal rename to Sources/GrAIdient/Metal/Kernel/ReduceFloat.metal index 4fd9fd1b..e390ae83 100644 --- a/Sources/GrAIdient/Metal/Kernel/Reduce.metal +++ b/Sources/GrAIdient/Metal/Kernel/ReduceFloat.metal @@ -8,7 +8,7 @@ #include using namespace metal; -kernel void reduceSum64( +kernel void reduceSum64Float( const device float * ins, constant uint * pDimensions, constant uint * pNbThreadgroups, @@ -62,7 +62,7 @@ kernel void reduceSum64( } } -kernel void reduceSum( +kernel void reduceSumFloat( const device float * ins, constant uint * pDimensions, device float * outs, @@ -94,7 +94,7 @@ kernel void reduceSum( outs[elem2] = sum; } -kernel void reduceMax64( +kernel void reduceMax64Float( const device float * ins, constant uint * pDimensions, constant uint * pNbThreadgroups, @@ -151,7 +151,7 @@ kernel void reduceMax64( } } -kernel void reduceMax( +kernel void reduceMaxFloat( const device float * ins, constant uint * pDimensions, device float * outs, diff --git a/Sources/GrAIdient/Metal/Kernel/ReduceHalf.metal b/Sources/GrAIdient/Metal/Kernel/ReduceHalf.metal new file mode 100644 index 00000000..99662efb --- /dev/null +++ b/Sources/GrAIdient/Metal/Kernel/ReduceHalf.metal @@ -0,0 +1,184 @@ +// +// Reduce.metal +// GrAIdient +// +// Created by Jean-François Reboud on 17/05/2023. 
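// Editor's note (sketch, not from the original patch): each reduction below
// is a two-pass scheme. reduceSum64Half runs with 64 threads per threadgroup
// along dim1; every threadgroup folds its 64-wide slice in threadgroup
// memory (the sum keeps float accumulators) and thread 0 writes one partial,
// so the host allocates dim2 × nbThreadgroups partials with
//
//     nbThreadgroups = (dim1 + 63) / 64
//
// reduceSumHalf then folds those partials serially per row; reduceMax64Half
// and reduceMaxHalf follow the same contract with max in place of +.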
+// + +#include +using namespace metal; + +kernel void reduceSum64Half( + const device half * ins, + constant uint * pDimensions, + constant uint * pNbThreadgroups, + device half * outs, + uint2 groupId [[ threadgroup_position_in_grid ]], + uint2 threadId [[ thread_position_in_threadgroup ]], + uint2 id [[ thread_position_in_grid ]]) +{ + constexpr uint threadsPerThreadgroup = 64; + threadgroup float sumShared[threadsPerThreadgroup]; + + uint dim1; + uint dim2; + uint nbThreadgroups; + + if (pDimensions && pNbThreadgroups && ins && outs) + { + dim1 = pDimensions[0]; + dim2 = pDimensions[1]; + nbThreadgroups = *pNbThreadgroups; + } + else + return ; + + uint elem1 = id[0]; + uint elem2 = id[1]; + + if (elem1 >= dim1 && elem2 >= dim2) + { + return ; + } + + uint offset = elem2 * dim1 + elem1; + sumShared[threadId[0]] = ins[offset]; + threadgroup_barrier(mem_flags::mem_threadgroup); + + for (uint stride=threadsPerThreadgroup/2; stride>0; stride>>=1) + { + uint index = threadId[0] + groupId[0] * threadsPerThreadgroup; + if (threadId[0] < stride && (index + stride) < dim1) + { + sumShared[threadId[0]] += sumShared[threadId[0] + stride]; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + } + + if (threadId[0] == 0) + { + uint offset = elem2 * nbThreadgroups + groupId[0]; + outs[offset] = sumShared[0]; + } +} + +kernel void reduceSumHalf( + const device half * ins, + constant uint * pDimensions, + device half * outs, + uint id [[ thread_position_in_grid ]]) +{ + uint dim1; + uint dim2; + + if (pDimensions && ins && outs) + { + dim1 = pDimensions[0]; + dim2 = pDimensions[1]; + } + else + return ; + + uint elem2 = id; + if (elem2 >= dim2) + { + return ; + } + + float sum = 0.0; + for (uint elem1=0; elem1= dim1 && elem2 >= dim2) + { + return ; + } + + uint offset = elem2 * dim1 + elem1; + valShared[threadId[0]] = ins[offset]; + threadgroup_barrier(mem_flags::mem_threadgroup); + + for (uint stride=threadsPerThreadgroup/2; stride>0; stride>>=1) + { + uint index = threadId[0] + groupId[0] * threadsPerThreadgroup; + if (threadId[0] < stride && (index + stride) < dim1) + { + valShared[threadId[0]] = max( + valShared[threadId[0] + stride], + valShared[threadId[0]] + ); + } + threadgroup_barrier(mem_flags::mem_threadgroup); + } + + if (threadId[0] == 0) + { + uint offset = elem2 * nbThreadgroups + groupId[0]; + outs[offset] = valShared[0]; + } +} + +kernel void reduceMaxHalf( + const device half * ins, + constant uint * pDimensions, + device half * outs, + uint id [[ thread_position_in_grid ]]) +{ + uint dim1; + uint dim2; + + if (pDimensions && ins && outs) + { + dim1 = pDimensions[0]; + dim2 = pDimensions[1]; + } + else + return ; + + uint elem2 = id; + if (elem2 >= dim2) + { + return ; + } + + half val = ins[elem2 * dim1]; + for (uint elem1=0; elem1 using namespace metal; -kernel void reset( +kernel void resetFloat( constant uint * pNbElems, device float * outs, uint id [[ thread_position_in_grid ]]) diff --git a/Sources/GrAIdient/Metal/Kernel/ResetHalf.metal b/Sources/GrAIdient/Metal/Kernel/ResetHalf.metal new file mode 100644 index 00000000..6fadea01 --- /dev/null +++ b/Sources/GrAIdient/Metal/Kernel/ResetHalf.metal @@ -0,0 +1,77 @@ +// +// Reset.metal +// GrAIdient +// +// Created by Jean-François Reboud on 14/10/2022. 
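// Editor's note (sketch, not from the original patch): besides zero-filling
// half buffers, this file carries the two element-wise casts used when a
// buffer moves between precisions: convertFloat2Half and convertHalf2Float
// copy nbElems values with one rounding per element. Half holds roughly
// three decimal digits and tops out at 65504, so e.g. (half)0.1f stores
// 0.0999756 and (half)1.0e5f overflows to +inf; the float-to-half direction
// is only lossless for values known to stay in range.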
+// + +#include +using namespace metal; + +kernel void resetHalf( + constant uint * pNbElems, + device half * outs, + uint id [[ thread_position_in_grid ]]) +{ + uint nbElems; + + if (pNbElems && outs) + { + nbElems = pNbElems[0]; + } + else + return ; + + if (id >= nbElems) + { + return ; + } + + outs[id] = 0.0; +} + +kernel void convertFloat2Half( + constant float * ins, + constant uint * pNbElems, + device half * outs, + uint id [[ thread_position_in_grid ]]) +{ + uint nbElems; + + if (pNbElems && outs) + { + nbElems = pNbElems[0]; + } + else + return ; + + if (id >= nbElems) + { + return ; + } + + outs[id] = (half)ins[id]; +} + +kernel void convertHalf2Float( + constant half * ins, + constant uint * pNbElems, + device float * outs, + uint id [[ thread_position_in_grid ]]) +{ + uint nbElems; + + if (pNbElems && outs) + { + nbElems = pNbElems[0]; + } + else + return ; + + if (id >= nbElems) + { + return ; + } + + outs[id] = (float)ins[id]; +} diff --git a/Sources/GrAIdient/Metal/Kernel/VQ2D.metal b/Sources/GrAIdient/Metal/Kernel/VQ2DFloat.metal similarity index 98% rename from Sources/GrAIdient/Metal/Kernel/VQ2D.metal rename to Sources/GrAIdient/Metal/Kernel/VQ2DFloat.metal index 720a64b6..10f74050 100644 --- a/Sources/GrAIdient/Metal/Kernel/VQ2D.metal +++ b/Sources/GrAIdient/Metal/Kernel/VQ2DFloat.metal @@ -8,7 +8,7 @@ #include using namespace metal; -kernel void vq2DForward( +kernel void vq2DForwardFloat( const device float * outsPrev, const device float * weights, constant uint * pNbChannels, @@ -83,7 +83,7 @@ kernel void vq2DForward( } } -kernel void vq2DBackward( +kernel void vq2DBackwardFloat( const device float * outsPrev, const device float * delta, const device float * weights, @@ -160,7 +160,7 @@ kernel void vq2DBackward( } } -kernel void vq2DBatchDerWeights( +kernel void vq2DBatchDerWeightsFloat( const device float * outsPrev, const device float * weights, const device int * indices, @@ -223,7 +223,7 @@ kernel void vq2DBatchDerWeights( grads[depth + nbChannels * k] += sum; } -kernel void vq2DDerWeights( +kernel void vq2DDerWeightsFloat( const device float * outsPrev, const device float * weights, const device int * indices, @@ -286,7 +286,7 @@ kernel void vq2DDerWeights( deltaWeights[depth + nbChannels * k + K * nbChannels * elem] += sum; } -kernel void vq2DReduceWeights( +kernel void vq2DReduceWeightsFloat( const device float * deltaWeights, constant uint * pNbChannels, constant uint * pK, @@ -336,7 +336,7 @@ kernel void vq2DReduceWeights( } } -kernel void vq2DLoss( +kernel void vq2DLossFloat( const device float * outsPrev, const device float * outs, const device int * indices, @@ -391,7 +391,7 @@ kernel void vq2DLoss( losses[elem] = tmp; } -kernel void vqLayerCAMMax2D( +kernel void vqLayerCAMMax2DFloat( const device float * camLayer, constant uint * pNbChannels, constant uint * pDimensions, @@ -455,7 +455,7 @@ kernel void vqLayerCAMMax2D( } } -kernel void vqGrad2DForward( +kernel void vqGrad2DForwardFloat( const device float * outsPrev, const device float * camLayer, const device float * camMax, diff --git a/Sources/GrAIdient/Metal/Kernel/VQ2DHalf.metal b/Sources/GrAIdient/Metal/Kernel/VQ2DHalf.metal new file mode 100644 index 00000000..d1edee8f --- /dev/null +++ b/Sources/GrAIdient/Metal/Kernel/VQ2DHalf.metal @@ -0,0 +1,544 @@ +// +// VQ2D.metal +// GrAIdient +// +// Created by Jean-François Reboud on 29/03/2023. 
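// Editor's note (sketch, not from the original patch): vq2DForwardHalf
// snaps each spatial position x (a nbChannels vector) to its nearest
// codebook entry,
//
//     k* = argmin_k Σ_d (x_d − w_{d,k})² ,
//
// writing the quantized vector to `outs` and k* to `indices`.
// vq2DBackwardHalf passes the incoming delta straight through and adds the
// gradient of the commitment term,
//
//     ∂/∂x [ β/N · ‖x − w_{k*}‖² ] = 2β/N · (x − w_{k*}) ,
//
// with N = nbBatch·height·width, which is the `// Commitment term.` line in
// the code below.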
+// + +#include +using namespace metal; + +kernel void vq2DForwardHalf( + const device half * outsPrev, + const device half * weights, + constant uint * pNbChannels, + constant uint * pDimensions, + constant uint * pK, + constant uint * pNbBatch, + device half * outs, + device int * indices, + uint2 id [[ thread_position_in_grid ]]) +{ + uint height, width; + uint nbChannels; + uint K; + uint nbBatch; + + if (pNbChannels && pDimensions && pK && pNbBatch && + weights && outsPrev && outs && indices) + { + width = pDimensions[0]; + height = pDimensions[1]; + nbChannels = *pNbChannels; + K = *pK; + nbBatch = *pNbBatch; + } + else + return ; + + uint elem = id[1]; + uint i = id[0] / width; + uint j = id[0] % width; + + if (i * j >= height * width || elem >= nbBatch) + { + return ; + } + + int minIndex = -1; + float minValue = 0.0; + for (uint k=0; k= 0) + { + for (uint depth=0; depth= height * nbBatch || + j * depth >= width * nbChannels) + { + return ; + } + + uint offsetStart = (depth + nbChannels * elem) * height; + uint offset = j + (offsetStart + i) * width; + + int minIndex = indices[j + (elem * height + i) * width]; + if (minIndex >= 0) + { + uint offsetWeights = depth + nbChannels * minIndex; + + float vq = weights[offsetWeights]; + float deltaCur = delta[offset]; + float outPrev = outsPrev[offset]; + + if (dirty) + { + deltaPrev[offset] = deltaCur; + } + else + { + deltaPrev[offset] += deltaCur; + } + + // Commitment term. + deltaPrev[offset] += beta / (float)(nbBatch * height * width) * + 2.0 * (outPrev - vq); + } + else if (dirty) + { + deltaPrev[offset] = 0.0; + } +} + +kernel void vq2DBatchDerWeightsHalf( + const device half * outsPrev, + const device half * weights, + const device int * indices, + constant uint * pNbChannels, + constant uint * pDimensions, + constant uint * pK, + constant float * pCoeff, + constant uint * pNbBatch, + device half * grads, + uint2 id [[ thread_position_in_grid ]]) +{ + uint height, width; + uint nbChannels; + uint K; + float coeff; + uint nbBatch; + + if (pNbChannels && pDimensions && pK && pCoeff && pNbBatch && + outsPrev && weights && indices && grads) + { + width = pDimensions[0]; + height = pDimensions[1]; + nbChannels = *pNbChannels; + K = *pK; + coeff = *pCoeff; + nbBatch = *pNbBatch; + } + else + return ; + + uint k = id[1]; + uint depth = id[0]; + + if (depth >= nbChannels || k >= K) + { + return ; + } + + float sum = 0.0; + for (uint elem=0; elem= nbChannels || elem * k >= nbBatch * K) + { + return ; + } + + float sum = 0.0; + for (uint i=0; i= nbChannels || k >= K) + { + return ; + } + + float sum = 0.0; + for (uint elem=0; elem= nbBatch) + { + return ; + } + + float tmp = 0.0; + for (uint depth=0; depth= 0) + { + uint offset = j + (offsetStart + i) * width; + + float outPrev = outsPrev[offset]; + float vq = outs[offset]; + float diff = outPrev - vq; + + tmp += diff * diff; + } + }} + } + losses[elem] = tmp; +} + +kernel void vqLayerCAMMax2DHalf( + const device half * camLayer, + constant uint * pNbChannels, + constant uint * pDimensions, + constant uint * pNbThreadgroups, + constant uint * pNbBatch, + device half * camMax, + uint2 groupId [[ threadgroup_position_in_grid ]], + uint2 threadId [[ thread_position_in_threadgroup ]], + uint2 id [[ thread_position_in_grid ]]) +{ + constexpr uint threadsPerThreadgroup = 64; + threadgroup float camShared[threadsPerThreadgroup]; + + uint height, width; + uint nbChannels; + uint nbThreadgroups; + uint nbBatch; + + if (pNbChannels && pDimensions && pNbThreadgroups && pNbBatch && + camLayer && camMax) 
+ { + width = pDimensions[0]; + height = pDimensions[1]; + nbChannels = *pNbChannels; + nbThreadgroups = *pNbThreadgroups; + nbBatch = *pNbBatch; + } + else + return ; + + uint elem = id[1]; + uint i = id[0] / width; + uint j = id[0] % width; + + if (i * j >= height * width || elem >= nbBatch) + { + return ; + } + + camShared[threadId[0]] = camLayer[j + (elem * height + i) * width]; + threadgroup_barrier(mem_flags::mem_threadgroup); + + for (uint stride=threadsPerThreadgroup/2; stride>0; stride>>=1) + { + uint index = threadId[0] + groupId[0] * threadsPerThreadgroup; + if (threadId[0] < stride && + (index + stride) < height * width) + { + camShared[threadId[0]] = max( + camShared[threadId[0] + stride], + camShared[threadId[0]] + ); + } + threadgroup_barrier(mem_flags::mem_threadgroup); + } + + if (threadId[0] == 0) + { + uint offset = elem * nbThreadgroups + groupId[0]; + camMax[offset] = camShared[0]; + } +} + +kernel void vqGrad2DForwardHalf( + const device half * outsPrev, + const device half * camLayer, + const device half * camMax, + const device half * weights, + constant uint * pNbChannels, + constant uint * pDimensions, + constant uint * pK, + constant float * pMagnitudeCoeff, + constant uint * pNbBatch, + device half * outs, + device int * indices, + uint2 id [[ thread_position_in_grid ]]) +{ + uint height, width; + uint nbChannels; + uint K; + float magnitudeCoeff; + uint nbBatch; + + if (pNbChannels && pDimensions && pK && pMagnitudeCoeff && pNbBatch && + outsPrev && camLayer && camMax && weights && outs && indices) + { + width = pDimensions[0]; + height = pDimensions[1]; + magnitudeCoeff = *pMagnitudeCoeff; + nbChannels = *pNbChannels; + K = *pK; + nbBatch = *pNbBatch; + } + else + return ; + + uint elem = id[1]; + uint i = id[0] / width; + uint j = id[0] % width; + + if (i * j >= height * width || elem >= nbBatch) + { + return ; + } + + float cam = camLayer[j + (elem * height + i) * width]; + if (cam / camMax[elem] >= magnitudeCoeff) + { + int minIndex = -1; + float minValue = 0.0; + for (uint k=0; k= 0) + { + for (uint depth=0; depth using namespace metal; -kernel void vqSeqForward( +kernel void vqSeqForwardFloat( const device float * outsPrev, const device float * weights, constant uint * pNbNeurons, @@ -79,7 +79,7 @@ kernel void vqSeqForward( } } -kernel void vqSeqBackward( +kernel void vqSeqBackwardFloat( const device float * outsPrev, const device float * delta, const device float * weights, @@ -153,7 +153,7 @@ kernel void vqSeqBackward( } } -kernel void vqSeqBatchDerWeights( +kernel void vqSeqBatchDerWeightsFloat( const device float * outsPrev, const device float * weights, const device int * indices, @@ -213,7 +213,7 @@ kernel void vqSeqBatchDerWeights( grads[depth + nbNeurons * k] += sum; } -kernel void vqSeqDerWeights( +kernel void vqSeqDerWeightsFloat( const device float * outsPrev, const device float * weights, const device int * indices, @@ -273,7 +273,7 @@ kernel void vqSeqDerWeights( deltaWeights[depth + nbNeurons * k + K * nbNeurons * elem] += sum; } -kernel void vqSeqLoss( +kernel void vqSeqLossFloat( const device float * outsPrev, const device float * outs, const device int * indices, @@ -323,7 +323,7 @@ kernel void vqSeqLoss( losses[elem] = tmp; } -kernel void vqLayerCAMMaxSeq( +kernel void vqLayerCAMMaxSeqFloat( const device float * camLayer, constant uint * pNbNeurons, constant uint * pNbThreadgroups, @@ -385,7 +385,7 @@ kernel void vqLayerCAMMaxSeq( } } -kernel void vqGradSeqForward( +kernel void vqGradSeqForwardFloat( const device float * outsPrev, const 
device float * camLayer, const device float * camMax, diff --git a/Sources/GrAIdient/Metal/Kernel/VQSeqHalf.metal b/Sources/GrAIdient/Metal/Kernel/VQSeqHalf.metal new file mode 100644 index 00000000..91ebc250 --- /dev/null +++ b/Sources/GrAIdient/Metal/Kernel/VQSeqHalf.metal @@ -0,0 +1,472 @@ +// +// VQSeq.metal +// GrAIdient +// +// Created by Jean-François Reboud on 18/06/2023. +// + +#include +using namespace metal; + +kernel void vqSeqForwardHalf( + const device half * outsPrev, + const device half * weights, + constant uint * pNbNeurons, + constant uint * pK, + constant uint * pNbBatch, + constant uint * pSequence, + device half * outs, + device int * indices, + uint2 id [[ thread_position_in_grid ]]) +{ + uint nbNeurons; + uint K; + uint nbBatch; + uint sequence; + + if (pNbNeurons && pK && pNbBatch && pSequence && + weights && outsPrev && outs && indices) + { + nbNeurons = *pNbNeurons; + K = *pK; + nbBatch = *pNbBatch; + sequence = *pSequence; + } + else + return ; + + uint elem = id[1]; + uint seq = id[0]; + + if (seq >= sequence || elem >= nbBatch) + { + return ; + } + + int minIndex = -1; + float minValue = 0.0; + for (uint k=0; k= 0) + { + for (uint depth=0; depth= sequence * nbBatch || + depth >= nbNeurons) + { + return ; + } + + uint offset = depth + nbNeurons * seq + sequence * nbNeurons * elem; + + int minIndex = indices[seq + elem * sequence]; + if (minIndex >= 0) + { + uint offsetWeights = depth + nbNeurons * minIndex; + + float vq = weights[offsetWeights]; + float deltaCur = delta[offset]; + float outPrev = outsPrev[offset]; + + if (dirty) + { + deltaPrev[offset] = deltaCur; + } + else + { + deltaPrev[offset] += deltaCur; + } + + // Commitment term. + deltaPrev[offset] += beta / (float)(nbBatch * sequence) * + 2.0 * (outPrev - vq); + } + else if (dirty) + { + deltaPrev[offset] = 0.0; + } +} + +kernel void vqSeqBatchDerWeightsHalf( + const device half * outsPrev, + const device half * weights, + const device int * indices, + constant uint * pNbNeurons, + constant uint * pK, + constant float * pCoeff, + constant uint * pNbBatch, + constant uint * pSequence, + device half * grads, + uint2 id [[ thread_position_in_grid ]]) +{ + uint nbNeurons; + uint K; + float coeff; + uint nbBatch; + uint sequence; + + if (pNbNeurons && pK && pCoeff && pNbBatch && pSequence && + outsPrev && weights && indices && grads) + { + nbNeurons = *pNbNeurons; + K = *pK; + coeff = *pCoeff; + nbBatch = *pNbBatch; + sequence = *pSequence; + } + else + return ; + + uint k = id[1]; + uint depth = id[0]; + + if (depth >= nbNeurons || k >= K) + { + return ; + } + + float sum = 0.0; + for (uint elem=0; elem= nbNeurons || elem * k >= nbBatch * K) + { + return ; + } + + float sum = 0.0; + for (uint seq=0; seq= nbBatch) + { + return ; + } + + float tmp = 0.0; + for (uint depth=0; depth= 0) + { + uint offset = + depth + nbNeurons * seq + sequence * nbNeurons * elem; + + float outPrev = outsPrev[offset]; + float vq = outs[offset]; + float diff = outPrev - vq; + + tmp += diff * diff; + } + }} + losses[elem] = tmp; +} + +kernel void vqLayerCAMMaxSeqHalf( + const device half * camLayer, + constant uint * pNbNeurons, + constant uint * pNbThreadgroups, + constant uint * pNbBatch, + constant uint * pSequence, + device half * camMax, + uint2 groupId [[ threadgroup_position_in_grid ]], + uint2 threadId [[ thread_position_in_threadgroup ]], + uint2 id [[ thread_position_in_grid ]]) +{ + constexpr uint threadsPerThreadgroup = 64; + threadgroup float camShared[threadsPerThreadgroup]; + + uint nbNeurons; + uint 
nbThreadgroups; + uint nbBatch; + uint sequence; + + if (pNbNeurons && pNbThreadgroups && pNbBatch && pSequence && + camLayer && camMax) + { + nbNeurons = *pNbNeurons; + nbThreadgroups = *pNbThreadgroups; + nbBatch = *pNbBatch; + sequence = *pSequence; + } + else + return ; + + uint elem = id[1]; + uint seq = id[0]; + + if (seq >= sequence || elem >= nbBatch) + { + return ; + } + + camShared[threadId[0]] = camLayer[seq + sequence * elem]; + threadgroup_barrier(mem_flags::mem_threadgroup); + + for (uint stride=threadsPerThreadgroup/2; stride>0; stride>>=1) + { + uint index = threadId[0] + groupId[0] * threadsPerThreadgroup; + if (threadId[0] < stride && + (index + stride) < sequence) + { + camShared[threadId[0]] = max( + camShared[threadId[0] + stride], + camShared[threadId[0]] + ); + } + threadgroup_barrier(mem_flags::mem_threadgroup); + } + + if (threadId[0] == 0) + { + uint offset = elem * nbThreadgroups + groupId[0]; + camMax[offset] = camShared[0]; + } +} + +kernel void vqGradSeqForwardHalf( + const device half * outsPrev, + const device half * camLayer, + const device half * camMax, + const device half * weights, + constant uint * pNbNeurons, + constant uint * pK, + constant float * pMagnitudeCoeff, + constant uint * pNbBatch, + constant uint * pSequence, + device half * outs, + device int * indices, + uint2 id [[ thread_position_in_grid ]]) +{ + uint nbNeurons; + uint K; + float magnitudeCoeff; + uint nbBatch; + uint sequence; + + if (pNbNeurons && pK && pMagnitudeCoeff && pNbBatch && pSequence && + outsPrev && camLayer && camMax && weights && outs && indices) + { + nbNeurons = *pNbNeurons; + K = *pK; + magnitudeCoeff = *pMagnitudeCoeff; + nbBatch = *pNbBatch; + sequence = *pSequence; + } + else + return ; + + uint elem = id[1]; + uint seq = id[0]; + + if (seq >= sequence || elem >= nbBatch) + { + return ; + } + + float cam = camLayer[seq + sequence * elem]; + if (cam / camMax[elem] >= magnitudeCoeff) + { + int minIndex = -1; + float minValue = 0.0; + for (uint k=0; k= 0) + { + for (uint depth=0; depth? = nil + /// Float16 buffer. + var _float16: MetalBuffer? = nil + + /// Get Metal buffer. + public var metal: MTLBuffer + { + get { + if GrAI.Precision.float16 + { + if _float16 == nil + { + if shared + { + _float16 = MetalSharedBuffer( + nbElems, deviceID: deviceID + ) + } + else + { + _float16 = MetalPrivateBuffer( + nbElems, deviceID: deviceID + ) + } + } + return _float16!.metal + } + else + { + if _float == nil + { + if shared + { + _float = MetalSharedBuffer( + nbElems, deviceID: deviceID + ) + } + else + { + _float = MetalPrivateBuffer( + nbElems, deviceID: deviceID + ) + } + } + return _float!.metal + } + } + } + + /// + /// Create a wrapper of Metal buffer. + /// + /// - Parameters: + /// - nbElems: The number of elements in the array. + /// - deviceID: GPU ID where the array will be sent. + /// - shared: Whether to create a shared buffer or a private one. + /// + public init(nbElems: Int, deviceID: Int, shared: Bool = false) + { + self.deviceID = deviceID + self.nbElems = nbElems + self.shared = shared + } + + /// Clean the buffers. + func reset() + { + _float = nil + _float16 = nil + } + + /// Initialize Metal buffer. 
+ public func initialize() + { + if GrAI.Precision.float16 + { + if _float16 == nil + { + if shared + { + _float16 = MetalSharedBuffer( + nbElems, deviceID: deviceID + ) + } + else + { + let buffer = MetalPrivateBuffer( + nbElems, deviceID: deviceID + ) + _float16 = buffer + _ = buffer.shared + } + } + _float16!.upload() + } + else + { + if _float == nil + { + if shared + { + _float = MetalSharedBuffer( + nbElems, deviceID: deviceID + ) + } + else + { + let buffer = MetalPrivateBuffer( + nbElems, deviceID: deviceID + ) + _float = buffer + _ = buffer.shared + } + } + _float!.upload() + } + } + + /// + /// Initialize Metal buffer. + /// + /// - Parameters: + /// - array: Input array. + /// - start: Start offset. + /// + public func initialize( + array: inout [Float], + start: Int = 0) + { + if GrAI.Precision.float16 + { + if _float16 == nil + { + if shared + { + _float16 = MetalSharedBuffer( + nbElems, deviceID: deviceID + ) + } + else + { + _float16 = MetalPrivateBuffer( + nbElems, deviceID: deviceID + ) + } + } + setupHalfBuffer( + array: &array, + out: _float16!, + start: start, + nbElems: nbElems, + deviceID: deviceID + ) + } + else + { + if _float == nil + { + if shared + { + _float = MetalSharedBuffer( + nbElems, deviceID: deviceID + ) + } + else + { + _float = MetalPrivateBuffer( + nbElems, deviceID: deviceID + ) + } + } + setupFloatBuffer( + array: &array, + out: _float!, + start: start, + nbElems: nbElems, + deviceID: deviceID + ) + } + } + + /// Retrieve Metal buffer content. + public func download() -> [Float] + { + if GrAI.Precision.float16 + { + if _float16 == nil + { + if shared + { + _float16 = MetalSharedBuffer( + nbElems, deviceID: deviceID + ) + } + else + { + _float16 = MetalPrivateBuffer( + nbElems, deviceID: deviceID + ) + } + } + return getHalfBuffer(_float16!).array + } + else + { + if _float == nil + { + if shared + { + _float = MetalSharedBuffer( + nbElems, deviceID: deviceID + ) + } + else + { + _float = MetalPrivateBuffer( + nbElems, deviceID: deviceID + ) + } + } + return [Float](_float!.download()) + } + } +} + /// Abstract array of elements that can be sent to the GPU. 
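// Editor's sketch (not part of the original patch): FloatBuffer above hides
// the float/float16 split behind a Float-only host API; which backing
// MetalBuffer gets materialized is decided lazily by the global precision
// flag. A hypothetical round trip — deviceID, sizes, and the assumption
// that GrAI.Precision.float16 is settable are the editor's:
func sketchFloatBufferRoundTrip()
{
    GrAI.Precision.float16 = true              // route through half storage
    let buffer = FloatBuffer(nbElems: 1024, deviceID: 0)

    var weights = [Float](repeating: 0.5, count: 1024)
    buffer.initialize(array: &weights)         // uploads, converting to half

    let mtlBuffer = buffer.metal               // MTLBuffer to bind to kernels
    _ = mtlBuffer

    let back = buffer.download()               // reads back as [Float]
    assert(abs(back[0] - 0.5) < 1e-3)          // 0.5 is exact in half
}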
public class MetalBuffer { diff --git a/Sources/GrAIdient/Metal/MetalConfig.swift b/Sources/GrAIdient/Metal/MetalConfig.swift index 8776d4d4..5e76ccce 100644 --- a/Sources/GrAIdient/Metal/MetalConfig.swift +++ b/Sources/GrAIdient/Metal/MetalConfig.swift @@ -7,275 +7,548 @@ let CONFIG_KERNELS = [ - "Activation": [ - "forwardReLU", - "backwardReLU", - "forwardLeakyReLU", - "backwardLeakyReLU", - "forwardSoftReLU", - "backwardSoftReLU", - "forwardSigmoid", - "backwardSigmoid", - "forwardGELUApprox", - "backwardGELUApprox", - "forwardGELU", - "backwardGELU", - ], - "Biases": [ - "reduceBiases", - ], - "BatchNorm": [ - "computeBNConvμ", - "computeBNConvσ2", - "forwardBNConvTraining", - "forwardBNConvInference", - "backwardWeightsBNConv", - "backwardBNConvTraining", - "backwardBNConvInference", - ], - "Convolution": [ - "convForward", - "conv16Forward", - "convBackward", - "conv16Backward", - "convBatchDerWeights", - "conv34BatchDerWeights", - "convBatchDerBiases", - "convDerWeights", - "convDerBiases", - "convReduceWeights", - ], - "Deconvolution": [ - "deconvForward", - "deconvBackward", - "deconvBatchDerWeights", - "deconvDerWeights", - ], - "FullyConnected": [ - "flForward", - "flBackward", - "flBatchDerWeights", - "flBatchDerBiases", - "flDerWeights", - "flDerBiases", - "flReduceWeights", - ], - "FullyConnectedPatch": [ - "flPatchForward", - "flPatchBackward", - "flPatchBatchDerWeights", - "flPatchBatchDerBiases", - "flPatchBatch4DerBiases", - "flPatchDerWeights", - "flPatchDerBiases", - "flPatchReduceWeights", - ], - "FullyConnectedSeq": [ - "flSeqForward", - "flSeq48Forward", - "flSeq4Forward", - "flSeqBackward", - "flSeq48Backward", - "flSeq4Backward", - "flSeqBatchDerWeights", - "flSeqBatch4DerWeights", - "flSeqDerWeights", - "flSeqReduceWeights", - ], - "InstanceNorm": [ - "computeInstanceNormConvμ", - "computeInstanceNormConvσ2", - "forwardInstanceNormConv", - "forwardAdaIN", - "backwardWeightsInstanceNormConv", - "backward2AdaIN", - "backwardInstanceNormConv", - "backward1AdaIN", - ], - "Layer1D": [ - "MSE1DLoss", - "MSE1DLossDerivative", - "linearErrorLoss", - "linearErrorLossDerivative", - "selectNeurons1DForward", - "selectNeurons1DBackward", - "concat1DForward", - "concat1DBackward", - "softmax1DForward", - "softmax1DBackward", - "dotProduct1DForward", - "dotProduct1DBackward", - "constant1DForward", - "BCE1DLoss", - "BCE1DLossDerivative", - "BCESigmoid1DLoss", - "BCESigmoid1DLossDerivative", - "dropout1DForward", - "dropout1DBackward", - ], - "Layer2D": [ - "avgPoolForward", - "avgPoolBackward", - "maxPoolForward", - "maxPoolBackward", - "adaptiveAvgPoolForward1", - "adaptiveAvgPoolForward2", - "adaptiveAvgPoolBackward1", - "adaptiveAvgPoolBackward2", - "selectNeurons2DForward", - "selectNeurons2DBackward", - "IRDFT2RGBForward", - "IRDFT2RGBBackward", - "decorrelateRGBForward", - "decorrelateRGBBackward", - "linearScale2DForward", - "linearScale2DBackward", - "setDataFTFrequences2D", - "pad2DForward", - "pad2DBackward", - "crop2DForward", - "crop2DBackward", - "resizeBilinearPadForward", - "resizeBilinearPadBackward", - "rotate2DForward", - "rotate2DBackward", - "resizeBilinearCropForward", - "resizeBilinearCropBackward", - "concat02DForward", - "concat02DBackward", - "concat12DForward", - "concat12DBackward", - "constant2DForward", - "MSE2DLoss", - "MSE2DLossDerivative", - "selfCorrelate2DForward", - "selfCorrelate2DBackward", - "normalize12DForward", - "normalize12DBackward", - "computeSquaredNorm122D", - "normalize122DForward", - "computeDeltaTmp122D", - 
"normalize122DBackward", - "similarBatchError2DLoss", - "similarBatchError2DLossDerivative", - "similarError2DLossDerivative", - "flipHorizontal2DForward", - "flipHorizontal2DBackward", - "flipVertical2DForward", - "flipVertical2DBackward", - "colorJitterHSVForward", - "BCE2DLoss", - "BCE2DLossDerivative", - "BCESigmoid2DLoss", - "BCESigmoid2DLossDerivative", - "layerCAM2DForward", - ], - "LayerMerge": [ - "sum1", - "sum14", - "sum2", - "sum24", - "multiplyForward", - "multiplyBackward", - ], - "LayerNorm": [ - "computeLayerNormSeqμ", - "computeLayerNormSeqμ4", - "computeLayerNormSeqσ2", - "computeLayerNormSeqσ24", - "forwardLayerNormSeq", - "forwardLayerNormSeq4", - "backwardWeights1LayerNormSeq", - "backwardWeights1LayerNormSeq4", - "backwardWeights2LayerNormSeq", - "backwardWeights2LayerNormSeq4", - "backwardLayerNormSeq", - "backwardLayerNormSeq4", - ], - "LayerSeq": [ - "avgPoolSeqForward", - "avgPoolSeqBackward", - "concat1SeqForward", - "concat1Seq4Forward", - "concat1SeqBackward", - "concat1Seq4Backward", - "concat2SeqForward", - "concat2SeqBackward", - "constant12SeqForward", - "constant12Seq4Forward", - "constant12SeqBackward", - "constant12Seq4Backward", - "constant2SeqForward", - "constant2Seq4Forward", - "querySeqForward", - "querySeq4Forward", - "queryQuerySeqBackward", - "queryQuerySeq4Backward", - "queryKeySeqBackward", - "queryKeySeq4Backward", - "querySelfSeqForward", - "querySelfSeq4Forward", - "querySelfQuerySeqBackward", - "querySelfQuerySeq4Backward", - "querySelfKeySeqBackward", - "querySelfKeySeq4Backward", - "softmaxSeqForward", - "softmaxSeq4Forward", - "softmaxSeqBackward", - "softmaxSeq4Backward", - "valueSeqForward", - "valueSeq4Forward", - "valueValueSeqBackward", - "valueValueSeq4Backward", - "valueScoreSeqBackward", - "valueScoreSeq4Backward", - "valueSelfSeqForward", - "valueSelfSeq4Forward", - "valueSelfValueSeqBackward", - "valueSelfValueSeq4Backward", - "valueSelfScoreSeqBackward", - "valueSelfScoreSeq4Backward", - "selectSeqForward", - "selectSeqBackward", - "layerCAMSeqForward", - ], - "Optimizer": [ - "clipGradients", - "multiplyGradients", - "weightsSGD", - "weightsMomentum", - "weightsAdam", - "weightsAMSGrad", - "weightsAdamRectified", - "weightsAdaBound", - "weightsAMSBound", - ], - "Reduce": [ - "reduceSum64", - "reduceSum", - "reduceMax64", - "reduceMax", - ], - "Reset": [ - "reset" - ], - "VQ2D": [ - "vq2DForward", - "vq2DBackward", - "vq2DBatchDerWeights", - "vq2DDerWeights", - "vq2DReduceWeights", - "vq2DLoss", - "vqLayerCAMMax2D", - "vqGrad2DForward" - ], - "VQSeq": [ - "vqSeqForward", - "vqSeqBackward", - "vqSeqBatchDerWeights", - "vqSeqDerWeights", - "vqSeqLoss", - "vqLayerCAMMaxSeq", - "vqGradSeqForward" - ] + "ActivationFloat": [ + "forwardReLUFloat", + "backwardReLUFloat", + "forwardLeakyReLUFloat", + "backwardLeakyReLUFloat", + "forwardSoftReLUFloat", + "backwardSoftReLUFloat", + "forwardSigmoidFloat", + "backwardSigmoidFloat", + "forwardGELUApproxFloat", + "backwardGELUApproxFloat", + "forwardGELUFloat", + "backwardGELUFloat", + ], + "ActivationHalf": [ + "forwardReLUHalf", + "backwardReLUHalf", + "forwardLeakyReLUHalf", + "backwardLeakyReLUHalf", + "forwardSoftReLUHalf", + "backwardSoftReLUHalf", + "forwardSigmoidHalf", + "backwardSigmoidHalf", + "forwardGELUApproxHalf", + "backwardGELUApproxHalf", + "forwardGELUHalf", + "backwardGELUHalf", + ], + "BiasesFloat": [ + "reduceBiasesFloat", + ], + "BiasesHalf": [ + "reduceBiasesHalf", + ], + "BatchNormFloat": [ + "computeBNConvμFloat", + "computeBNConvσ2Float", + 
"forwardBNConvTrainingFloat", + "forwardBNConvInferenceFloat", + "backwardWeightsBNConvFloat", + "backwardBNConvTrainingFloat", + "backwardBNConvInferenceFloat", + ], + "BatchNormHalf": [ + "computeBNConvμHalf", + "computeBNConvσ2Half", + "forwardBNConvTrainingHalf", + "forwardBNConvInferenceHalf", + "backwardWeightsBNConvHalf", + "backwardBNConvTrainingHalf", + "backwardBNConvInferenceHalf", + ], + "ConvolutionFloat": [ + "convForwardFloat", + "conv16ForwardFloat", + "convBackwardFloat", + "conv16BackwardFloat", + "convBatchDerWeightsFloat", + "conv34BatchDerWeightsFloat", + "convBatchDerBiasesFloat", + "convDerWeightsFloat", + "convDerBiasesFloat", + "convReduceWeightsFloat", + ], + "ConvolutionHalf": [ + "convForwardHalf", + "conv16ForwardHalf", + "convBackwardHalf", + "conv16BackwardHalf", + "convBatchDerWeightsHalf", + "conv34BatchDerWeightsHalf", + "convBatchDerBiasesHalf", + "convDerWeightsHalf", + "convDerBiasesHalf", + "convReduceWeightsHalf", + ], + "DeconvolutionFloat": [ + "deconvForwardFloat", + "deconvBackwardFloat", + "deconvBatchDerWeightsFloat", + "deconvDerWeightsFloat", + ], + "DeconvolutionHalf": [ + "deconvForwardHalf", + "deconvBackwardHalf", + "deconvBatchDerWeightsHalf", + "deconvDerWeightsHalf", + ], + "FullyConnectedFloat": [ + "flForwardFloat", + "flBackwardFloat", + "flBatchDerWeightsFloat", + "flBatchDerBiasesFloat", + "flDerWeightsFloat", + "flDerBiasesFloat", + "flReduceWeightsFloat", + ], + "FullyConnectedHalf": [ + "flForwardHalf", + "flBackwardHalf", + "flBatchDerWeightsHalf", + "flBatchDerBiasesHalf", + "flDerWeightsHalf", + "flDerBiasesHalf", + "flReduceWeightsHalf", + ], + "FullyConnectedPatchFloat": [ + "flPatchForwardFloat", + "flPatchBackwardFloat", + "flPatchBatchDerWeightsFloat", + "flPatchBatchDerBiasesFloat", + "flPatchBatch4DerBiasesFloat", + "flPatchDerWeightsFloat", + "flPatchDerBiasesFloat", + "flPatchReduceWeightsFloat", + ], + "FullyConnectedPatchHalf": [ + "flPatchForwardHalf", + "flPatchBackwardHalf", + "flPatchBatchDerWeightsHalf", + "flPatchBatchDerBiasesHalf", + "flPatchBatch4DerBiasesHalf", + "flPatchDerWeightsHalf", + "flPatchDerBiasesHalf", + "flPatchReduceWeightsHalf", + ], + "FullyConnectedSeqFloat": [ + "flSeqForwardFloat", + "flSeq48ForwardFloat", + "flSeq4ForwardFloat", + "flSeqBackwardFloat", + "flSeq48BackwardFloat", + "flSeq4BackwardFloat", + "flSeqBatchDerWeightsFloat", + "flSeqBatch4DerWeightsFloat", + "flSeqDerWeightsFloat", + "flSeqReduceWeightsFloat", + ], + "FullyConnectedSeqHalf": [ + "flSeqForwardHalf", + "flSeq48ForwardHalf", + "flSeq4ForwardHalf", + "flSeqBackwardHalf", + "flSeq48BackwardHalf", + "flSeq4BackwardHalf", + "flSeqBatchDerWeightsHalf", + "flSeqBatch4DerWeightsHalf", + "flSeqDerWeightsHalf", + "flSeqReduceWeightsHalf", + ], + "InstanceNormFloat": [ + "computeInstanceNormConvμFloat", + "computeInstanceNormConvσ2Float", + "forwardInstanceNormConvFloat", + "forwardAdaINFloat", + "backwardWeightsInstanceNormConvFloat", + "backward2AdaINFloat", + "backwardInstanceNormConvFloat", + "backward1AdaINFloat", + ], + "InstanceNormHalf": [ + "computeInstanceNormConvμHalf", + "computeInstanceNormConvσ2Half", + "forwardInstanceNormConvHalf", + "forwardAdaINHalf", + "backwardWeightsInstanceNormConvHalf", + "backward2AdaINHalf", + "backwardInstanceNormConvHalf", + "backward1AdaINHalf", + ], + "Layer1DFloat": [ + "MSE1DLossFloat", + "MSE1DLossDerivativeFloat", + "linearErrorLossFloat", + "linearErrorLossDerivativeFloat", + "selectNeurons1DForwardFloat", + "selectNeurons1DBackwardFloat", + "concat1DForwardFloat", + 
"concat1DBackwardFloat", + "softmax1DForwardFloat", + "softmax1DBackwardFloat", + "dotProduct1DForwardFloat", + "dotProduct1DBackwardFloat", + "constant1DForwardFloat", + "BCE1DLossFloat", + "BCE1DLossDerivativeFloat", + "BCESigmoid1DLossFloat", + "BCESigmoid1DLossDerivativeFloat", + "dropout1DForwardFloat", + "dropout1DBackwardFloat", + ], + "Layer1DHalf": [ + "MSE1DLossHalf", + "MSE1DLossDerivativeHalf", + "linearErrorLossHalf", + "linearErrorLossDerivativeHalf", + "selectNeurons1DForwardHalf", + "selectNeurons1DBackwardHalf", + "concat1DForwardHalf", + "concat1DBackwardHalf", + "softmax1DForwardHalf", + "softmax1DBackwardHalf", + "dotProduct1DForwardHalf", + "dotProduct1DBackwardHalf", + "constant1DForwardHalf", + "BCE1DLossHalf", + "BCE1DLossDerivativeHalf", + "BCESigmoid1DLossHalf", + "BCESigmoid1DLossDerivativeHalf", + "dropout1DForwardHalf", + "dropout1DBackwardHalf", + ], + "Layer2DFloat": [ + "avgPoolForwardFloat", + "avgPoolBackwardFloat", + "maxPoolForwardFloat", + "maxPoolBackwardFloat", + "adaptiveAvgPoolForward1Float", + "adaptiveAvgPoolForward2Float", + "adaptiveAvgPoolBackward1Float", + "adaptiveAvgPoolBackward2Float", + "selectNeurons2DForwardFloat", + "selectNeurons2DBackwardFloat", + "IRDFT2RGBForwardFloat", + "IRDFT2RGBBackwardFloat", + "decorrelateRGBForwardFloat", + "decorrelateRGBBackwardFloat", + "linearScale2DForwardFloat", + "linearScale2DBackwardFloat", + "setDataFTFrequences2DFloat", + "pad2DForwardFloat", + "pad2DBackwardFloat", + "crop2DForwardFloat", + "crop2DBackwardFloat", + "resizeBilinearPadForwardFloat", + "resizeBilinearPadBackwardFloat", + "rotate2DForwardFloat", + "rotate2DBackwardFloat", + "resizeBilinearCropForwardFloat", + "resizeBilinearCropBackwardFloat", + "concat02DForwardFloat", + "concat02DBackwardFloat", + "concat12DForwardFloat", + "concat12DBackwardFloat", + "constant2DForwardFloat", + "MSE2DLossFloat", + "MSE2DLossDerivativeFloat", + "selfCorrelate2DForwardFloat", + "selfCorrelate2DBackwardFloat", + "normalize12DForwardFloat", + "normalize12DBackwardFloat", + "computeSquaredNorm122DFloat", + "normalize122DForwardFloat", + "computeDeltaTmp122DFloat", + "normalize122DBackwardFloat", + "similarBatchError2DLossFloat", + "similarBatchError2DLossDerivativeFloat", + "similarError2DLossDerivativeFloat", + "flipHorizontal2DForwardFloat", + "flipHorizontal2DBackwardFloat", + "flipVertical2DForwardFloat", + "flipVertical2DBackwardFloat", + "colorJitterHSVForwardFloat", + "BCE2DLossFloat", + "BCE2DLossDerivativeFloat", + "BCESigmoid2DLossFloat", + "BCESigmoid2DLossDerivativeFloat", + "layerCAM2DForwardFloat", + ], + "Layer2DHalf": [ + "avgPoolForwardHalf", + "avgPoolBackwardHalf", + "maxPoolForwardHalf", + "maxPoolBackwardHalf", + "adaptiveAvgPoolForward1Half", + "adaptiveAvgPoolForward2Half", + "adaptiveAvgPoolBackward1Half", + "adaptiveAvgPoolBackward2Half", + "selectNeurons2DForwardHalf", + "selectNeurons2DBackwardHalf", + "IRDFT2RGBForwardHalf", + "IRDFT2RGBBackwardHalf", + "decorrelateRGBForwardHalf", + "decorrelateRGBBackwardHalf", + "linearScale2DForwardHalf", + "linearScale2DBackwardHalf", + "setDataFTFrequences2DHalf", + "pad2DForwardHalf", + "pad2DBackwardHalf", + "crop2DForwardHalf", + "crop2DBackwardHalf", + "resizeBilinearPadForwardHalf", + "resizeBilinearPadBackwardHalf", + "rotate2DForwardHalf", + "rotate2DBackwardHalf", + "resizeBilinearCropForwardHalf", + "resizeBilinearCropBackwardHalf", + "concat02DForwardHalf", + "concat02DBackwardHalf", + "concat12DForwardHalf", + "concat12DBackwardHalf", + "constant2DForwardHalf", + 
"MSE2DLossHalf", + "MSE2DLossDerivativeHalf", + "selfCorrelate2DForwardHalf", + "selfCorrelate2DBackwardHalf", + "normalize12DForwardHalf", + "normalize12DBackwardHalf", + "computeSquaredNorm122DHalf", + "normalize122DForwardHalf", + "computeDeltaTmp122DHalf", + "normalize122DBackwardHalf", + "similarBatchError2DLossHalf", + "similarBatchError2DLossDerivativeHalf", + "similarError2DLossDerivativeHalf", + "flipHorizontal2DForwardHalf", + "flipHorizontal2DBackwardHalf", + "flipVertical2DForwardHalf", + "flipVertical2DBackwardHalf", + "colorJitterHSVForwardHalf", + "BCE2DLossHalf", + "BCE2DLossDerivativeHalf", + "BCESigmoid2DLossHalf", + "BCESigmoid2DLossDerivativeHalf", + "layerCAM2DForwardHalf", + ], + "LayerMergeFloat": [ + "sum1Float", + "sum14Float", + "sum2Float", + "sum24Float", + "multiplyForwardFloat", + "multiplyBackwardFloat", + ], + "LayerMergeHalf": [ + "sum1Half", + "sum14Half", + "sum2Half", + "sum24Half", + "multiplyForwardHalf", + "multiplyBackwardHalf", + ], + "LayerNormFloat": [ + "computeLayerNormSeqμFloat", + "computeLayerNormSeqμ4Float", + "computeLayerNormSeqσ2Float", + "computeLayerNormSeqσ24Float", + "forwardLayerNormSeqFloat", + "forwardLayerNormSeq4Float", + "backwardWeights1LayerNormSeqFloat", + "backwardWeights1LayerNormSeq4Float", + "backwardWeights2LayerNormSeqFloat", + "backwardWeights2LayerNormSeq4Float", + "backwardLayerNormSeqFloat", + "backwardLayerNormSeq4Float", + ], + "LayerNormHalf": [ + "computeLayerNormSeqμHalf", + "computeLayerNormSeqμ4Half", + "computeLayerNormSeqσ2Half", + "computeLayerNormSeqσ24Half", + "forwardLayerNormSeqHalf", + "forwardLayerNormSeq4Half", + "backwardWeights1LayerNormSeqHalf", + "backwardWeights1LayerNormSeq4Half", + "backwardWeights2LayerNormSeqHalf", + "backwardWeights2LayerNormSeq4Half", + "backwardLayerNormSeqHalf", + "backwardLayerNormSeq4Half", + ], + "LayerSeqFloat": [ + "avgPoolSeqForwardFloat", + "avgPoolSeqBackwardFloat", + "concat1SeqForwardFloat", + "concat1Seq4ForwardFloat", + "concat1SeqBackwardFloat", + "concat1Seq4BackwardFloat", + "concat2SeqForwardFloat", + "concat2SeqBackwardFloat", + "constant12SeqForwardFloat", + "constant12Seq4ForwardFloat", + "constant12SeqBackwardFloat", + "constant12Seq4BackwardFloat", + "constant2SeqForwardFloat", + "constant2Seq4ForwardFloat", + "querySeqForwardFloat", + "querySeq4ForwardFloat", + "queryQuerySeqBackwardFloat", + "queryQuerySeq4BackwardFloat", + "queryKeySeqBackwardFloat", + "queryKeySeq4BackwardFloat", + "querySelfSeqForwardFloat", + "querySelfSeq4ForwardFloat", + "querySelfQuerySeqBackwardFloat", + "querySelfQuerySeq4BackwardFloat", + "querySelfKeySeqBackwardFloat", + "querySelfKeySeq4BackwardFloat", + "softmaxSeqForwardFloat", + "softmaxSeq4ForwardFloat", + "softmaxSeqBackwardFloat", + "softmaxSeq4BackwardFloat", + "valueSeqForwardFloat", + "valueSeq4ForwardFloat", + "valueValueSeqBackwardFloat", + "valueValueSeq4BackwardFloat", + "valueScoreSeqBackwardFloat", + "valueScoreSeq4BackwardFloat", + "valueSelfSeqForwardFloat", + "valueSelfSeq4ForwardFloat", + "valueSelfValueSeqBackwardFloat", + "valueSelfValueSeq4BackwardFloat", + "valueSelfScoreSeqBackwardFloat", + "valueSelfScoreSeq4BackwardFloat", + "selectSeqForwardFloat", + "selectSeqBackwardFloat", + "layerCAMSeqForwardFloat", + ], + "LayerSeqHalf": [ + "avgPoolSeqForwardHalf", + "avgPoolSeqBackwardHalf", + "concat1SeqForwardHalf", + "concat1Seq4ForwardHalf", + "concat1SeqBackwardHalf", + "concat1Seq4BackwardHalf", + "concat2SeqForwardHalf", + "concat2SeqBackwardHalf", + "constant12SeqForwardHalf", + 
"constant12Seq4ForwardHalf", + "constant12SeqBackwardHalf", + "constant12Seq4BackwardHalf", + "constant2SeqForwardHalf", + "constant2Seq4ForwardHalf", + "querySeqForwardHalf", + "querySeq4ForwardHalf", + "queryQuerySeqBackwardHalf", + "queryQuerySeq4BackwardHalf", + "queryKeySeqBackwardHalf", + "queryKeySeq4BackwardHalf", + "querySelfSeqForwardHalf", + "querySelfSeq4ForwardHalf", + "querySelfQuerySeqBackwardHalf", + "querySelfQuerySeq4BackwardHalf", + "querySelfKeySeqBackwardHalf", + "querySelfKeySeq4BackwardHalf", + "softmaxSeqForwardHalf", + "softmaxSeq4ForwardHalf", + "softmaxSeqBackwardHalf", + "softmaxSeq4BackwardHalf", + "valueSeqForwardHalf", + "valueSeq4ForwardHalf", + "valueValueSeqBackwardHalf", + "valueValueSeq4BackwardHalf", + "valueScoreSeqBackwardHalf", + "valueScoreSeq4BackwardHalf", + "valueSelfSeqForwardHalf", + "valueSelfSeq4ForwardHalf", + "valueSelfValueSeqBackwardHalf", + "valueSelfValueSeq4BackwardHalf", + "valueSelfScoreSeqBackwardHalf", + "valueSelfScoreSeq4BackwardHalf", + "selectSeqForwardHalf", + "selectSeqBackwardHalf", + "layerCAMSeqForwardHalf", + ], + "OptimizerFloat": [ + "clipGradientsFloat", + "multiplyGradientsFloat", + "weightsSGDFloat", + "weightsMomentumFloat", + "weightsAdamFloat", + "weightsAMSGradFloat", + "weightsAdamRectifiedFloat", + "weightsAdaBoundFloat", + "weightsAMSBoundFloat", + ], + "OptimizerHalf": [ + "clipGradientsHalf", + "multiplyGradientsHalf", + "weightsSGDHalf", + "weightsMomentumHalf", + "weightsAdamHalf", + "weightsAMSGradHalf", + "weightsAdamRectifiedHalf", + "weightsAdaBoundHalf", + "weightsAMSBoundHalf", + ], + "ReduceFloat": [ + "reduceSum64Float", + "reduceSumFloat", + "reduceMax64Float", + "reduceMaxFloat", + ], + "ReduceHalf": [ + "reduceSum64Half", + "reduceSumHalf", + "reduceMax64Half", + "reduceMaxHalf", + ], + "ResetFloat": [ + "resetFloat", + ], + "ResetHalf": [ + "resetHalf", + "convertFloat2Half", + "convertHalf2Float", + ], + "VQ2DFloat": [ + "vq2DForwardFloat", + "vq2DBackwardFloat", + "vq2DBatchDerWeightsFloat", + "vq2DDerWeightsFloat", + "vq2DReduceWeightsFloat", + "vq2DLossFloat", + "vqLayerCAMMax2DFloat", + "vqGrad2DForwardFloat", + ], + "VQ2DHalf": [ + "vq2DForwardHalf", + "vq2DBackwardHalf", + "vq2DBatchDerWeightsHalf", + "vq2DDerWeightsHalf", + "vq2DReduceWeightsHalf", + "vq2DLossHalf", + "vqLayerCAMMax2DHalf", + "vqGrad2DForwardHalf", + ], + "VQSeqFloat": [ + "vqSeqForwardFloat", + "vqSeqBackwardFloat", + "vqSeqBatchDerWeightsFloat", + "vqSeqDerWeightsFloat", + "vqSeqLossFloat", + "vqLayerCAMMaxSeqFloat", + "vqGradSeqForwardFloat", + ], + "VQSeqHalf": [ + "vqSeqForwardHalf", + "vqSeqBackwardHalf", + "vqSeqBatchDerWeightsHalf", + "vqSeqDerWeightsHalf", + "vqSeqLossHalf", + "vqLayerCAMMaxSeqHalf", + "vqGradSeqForwardHalf", + ], ] diff --git a/Sources/GrAIdient/Metal/MetalKernel.swift b/Sources/GrAIdient/Metal/MetalKernel.swift index f3ebd173..d3a834af 100644 --- a/Sources/GrAIdient/Metal/MetalKernel.swift +++ b/Sources/GrAIdient/Metal/MetalKernel.swift @@ -704,11 +704,31 @@ private class MetalDevice /// func createCommand(_ pipeline: String) -> MetalCommand { - if let pipelineTmp = _pipelines[pipeline] + var pipelineFullName = pipeline + if GrAI.Precision.float16 + { + pipelineFullName += "Half" + } + else + { + pipelineFullName += "Float" + } + + if let pipelineTmp = _pipelines[pipelineFullName] { return MetalCommand(queue: _queue, pipeline: pipelineTmp) } - fatalError("Could not find pipeline: \(pipeline).") + else if let pipelineTmp = _pipelines[pipeline] + { + return MetalCommand(queue: _queue, pipeline: 
pipelineTmp) } + else + { + fatalError( + "Could not find pipeline: " + + "\(pipelineFullName), nor \(pipeline)." + ) + } } ///
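The renamed kernels in MetalConfig.swift above are what make this lookup work: callers keep requesting base names such as "reduceSum", and the device appends the precision suffix before probing the pipeline cache. A minimal standalone sketch of that resolution order (the resolvePipeline function and the pipelines set are illustrative, not part of the library):

    // Sketch of the lookup order implemented by createCommand above;
    // `pipelines` stands in for the private _pipelines dictionary.
    func resolvePipeline(
        _ name: String, float16: Bool, pipelines: Set<String>) -> String?
    {
        let fullName = name + (float16 ? "Half" : "Float")
        if pipelines.contains(fullName)
        {
            // Built-in kernel with a precision-specific variant.
            return fullName
        }
        if pipelines.contains(name)
        {
            // Fallback for kernels registered under their exact name.
            return name
        }
        // Mirrors the fatalError in createCommand.
        return nil
    }

    // With the config above:
    // resolvePipeline("reduceSum", float16: false, pipelines: all) == "reduceSumFloat"
    // resolvePipeline("reduceSum", float16: true, pipelines: all) == "reduceSumHalf"

The fallback branch keeps custom kernels that were registered without a suffix working unchanged.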
diff --git a/Sources/GrAIdient/Utils/Buffer.swift b/Sources/GrAIdient/Utils/Buffer.swift index 37489c4d..05b2e6dd 100644 --- a/Sources/GrAIdient/Utils/Buffer.swift +++ b/Sources/GrAIdient/Utils/Buffer.swift @@ -9,46 +9,173 @@ import Foundation import Accelerate /// -/// Copy array to buffer. +/// Copy, convert and upload Float array to Half buffer. /// /// - Parameters: -/// - array: input array -/// - buffer: output buffer -/// - start: start index in `array` +/// - array: Input array. +/// - out: Output buffer. +/// - start: Start index in `array`. /// - nbElems: Number of elements to copy. +/// - deviceID: GPU device. /// -func copyFloatArrayToBuffer( +public func setupHalfBuffer( array: inout [Float], - buffer: UnsafeMutableBufferPointer<Float>, + out: MetalBuffer, start: Int, - nbElems: Int) + nbElems: Int, + deviceID: Int) +{ + let temp = MetalSharedBuffer<Float>(nbElems, deviceID: deviceID) + copyArrayToBuffer( + array: &array, + buffer: temp.buffer, + start: start, + nbElems: nbElems + ) + + temp.upload() + convertFloat2Half( + inBuffer: temp, + outBuffer: out, + nbElems: nbElems, + deviceID: deviceID + ) + + // Make sure the operation has ended before returning. + _ = out.download() +} + +/// +/// Copy and upload Float array to Float buffer. +/// +/// - Parameters: +/// - array: Input array. +/// - out: Output buffer. +/// - start: Start index in `array`. +/// - nbElems: Number of elements to copy. +/// - deviceID: GPU device. +/// +public func setupFloatBuffer( + array: inout [Float], + out: MetalBuffer<Float>, + start: Int, + nbElems: Int, + deviceID: Int) { - if #available(macOS 13.0, *) + if let out_s = out as? MetalSharedBuffer<Float> { copyArrayToBuffer( array: &array, - buffer: buffer, - start: start, + buffer: out_s.buffer, + start: start, nbElems: nbElems ) } else { - fatalError() + let out_p = out as! MetalPrivateBuffer<Float> + copyArrayToBuffer( + array: &array, + buffer: out_p.shared.buffer, + start: start, + nbElems: nbElems + ) } + out.upload() +} + +/// +/// Convert Half buffer to Float buffer and download content. +/// +/// - Parameter buffer: Input buffer. +/// +/// - Returns: Float buffer. +/// +public func getHalfBuffer( + _ buffer: MetalBuffer +) -> MetalSharedBuffer<Float> +{ + let temp = MetalSharedBuffer<Float>( + buffer.nbElems, + deviceID: buffer.deviceID + ) + convertHalf2Float( + inBuffer: buffer, + outBuffer: temp, + nbElems: buffer.nbElems, + deviceID: buffer.deviceID + ) + + _ = temp.download() + return temp +} + +/// +/// Convert a Float32 buffer into a Float16 buffer. +/// +/// - Parameters: +/// - inBuffer: Input buffer. +/// - outBuffer: Output buffer. +/// - nbElems: Number of elements. +/// - deviceID: GPU device. +/// +public func convertFloat2Half( + inBuffer: MetalBuffer<Float>, + outBuffer: MetalBuffer, + nbElems: Int, + deviceID: Int) +{ + let pNbElems: [UInt32] = [UInt32(nbElems)] + + let command = MetalKernel.get.createCommand( + "convertFloat2Half", deviceID: deviceID + ) + command.setBuffer(inBuffer.metal, atIndex: 0) + command.setBytes(pNbElems, atIndex: 1) + command.setBuffer(outBuffer.metal, atIndex: 2) + + command.dispatchThreads(nbElems) + command.enqueue() +} + +/// +/// Convert a Float16 buffer into a Float32 buffer. +/// +/// - Parameters: +/// - inBuffer: Input buffer. +/// - outBuffer: Output buffer. +/// - nbElems: Number of elements. +/// - deviceID: GPU device. +/// +public func convertHalf2Float( + inBuffer: MetalBuffer, + outBuffer: MetalBuffer<Float>, + nbElems: Int, + deviceID: Int) +{ + let pNbElems: [UInt32] = [UInt32(nbElems)] + + let command = MetalKernel.get.createCommand( + "convertHalf2Float", deviceID: deviceID + ) + command.setBuffer(inBuffer.metal, atIndex: 0) + command.setBytes(pNbElems, atIndex: 1) + command.setBuffer(outBuffer.metal, atIndex: 2) + + command.dispatchThreads(nbElems) + command.enqueue() } -@available(macOS 13.0, *) /// /// Copy array to buffer. /// /// - Parameters: -/// - array: input array -/// - buffer: output buffer -/// - start: start index in `array` +/// - array: Input array. +/// - buffer: Output buffer. +/// - start: Start index in `array`. /// - nbElems: Number of elements to copy. /// -func copyArrayToBuffer<T>( +public func copyArrayToBuffer<T>( array: inout [T], buffer: UnsafeMutableBufferPointer<T>, start: Int,
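Together, the helpers above form a round trip between [Float] arrays and GPU buffers in either precision. A usage sketch, assuming the half data lives in a MetalBuffer<UInt16> (the half element type is not visible in this hunk and is an assumption):

    import GrAIdient

    var values: [Float] = [0.5, 1.0, 1.5, 2.0]

    // Float32 path: copy into the buffer's storage, then upload.
    let floatBuf = MetalSharedBuffer<Float>(values.count, deviceID: 0)
    setupFloatBuffer(
        array: &values, out: floatBuf,
        start: 0, nbElems: values.count, deviceID: 0
    )

    // Float16 path: setupHalfBuffer stages the values in a temporary
    // Float buffer and runs the convertFloat2Half kernel.
    // MetalBuffer<UInt16> as the half storage type is an assumption.
    let halfBuf = MetalSharedBuffer<UInt16>(values.count, deviceID: 0)
    setupHalfBuffer(
        array: &values, out: halfBuf,
        start: 0, nbElems: values.count, deviceID: 0
    )

    // Read the half data back as Float32 on the CPU.
    let readable = getHalfBuffer(halfBuf)
    print(Array(readable.buffer)) // [0.5, 1.0, 1.5, 2.0]

Note that setupHalfBuffer ends with a blocking download purely as a synchronization point, so the staged conversion is guaranteed to have finished before the temporary buffer goes away.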
diff --git a/Sources/GrAIdient/Utils/Image.swift b/Sources/GrAIdient/Utils/Image.swift index 9c24c81d..bab6b6a6 100644 --- a/Sources/GrAIdient/Utils/Image.swift +++ b/Sources/GrAIdient/Utils/Image.swift @@ -44,14 +44,14 @@ public class Image /// the output buffer in the .Neuron format. /// /// - Parameters: - /// - metalBuffer: Buffer of images. + /// - imagesURL: Images on the disk. + /// - imagesBuffer: Buffer of images. /// - width: Width of the images. /// - height: Height of the images. - /// - Returns: The list of images as list of pixels. /// public static func loadImages( imagesURL: [URL], - imagesBuffer: MetalBuffer<Float>, + imagesBuffer: FloatBuffer, width: Int, height: Int) throws { @@ -61,7 +61,13 @@ throw ImageError.MissingSpace } - let bufferPtr = imagesBuffer.download() + _ = imagesBuffer.download() + + var buffer = [Float]( + repeating: 0.0, + count: batchSize * 3 * height * width + ) + for (elem, imageURL) in imagesURL.enumerated() { let image = NSImage(contentsOfFile: imageURL.path)! @@ -79,12 +85,12 @@ let offsetStart = (depth + 3 * elem) * height let offsetSet = j + (offsetStart + i) * width - bufferPtr[offsetSet] = + buffer[offsetSet] = Float(pixels[3 * offsetGet + depth]) / 255.0 } }} } - imagesBuffer.upload() + imagesBuffer.initialize(array: &buffer) } /// @@ -100,18 +106,18 @@ /// - Returns: The list of images as list of pixels. /// public static func extractPixels( - _ metalBuffer: MetalBuffer<Float>, + _ metalBuffer: FloatBuffer, width: Int, height: Int) -> [[UInt8]] { - let bufferPtr = metalBuffer.download() + let buffer = metalBuffer.download() let nbImages = metalBuffer.nbElems / (width * height * 3) var images = [[Float]]() for i in 0..<nbImages diff --git a/Tests/GrAIExamples/TransformerBenchmark.swift b/Tests/GrAIExamples/TransformerBenchmark.swift - let groundTruth = MetalSharedBuffer<Float>(_batchSize, deviceID: 0) - let gtBuffer = groundTruth.buffer + let groundTruth = FloatBuffer( + nbElems: _batchSize, deviceID: 0, shared: true + ) + var gtBuffer = [Float](repeating: 0.0, count: _batchSize) for elem in 0..<_batchSize / 2 { gtBuffer[elem] = 0.0 @@ -248,18 +252,20 @@ final class TransformerBenchmark: XCTestCase { gtBuffer[elem] = 1.0 } - groundTruth.upload() + groundTruth.initialize(array: &gtBuffer) // Initialize data once and for all. - let data = MetalPrivateBuffer<Float>( - _batchSize * 3 * _size * _size, deviceID: 0 + let data = FloatBuffer( + nbElems: _batchSize * 3 * _size * _size, deviceID: 0, shared: true + ) + var dataBuffer = [Float]( + repeating: 0.0, count: _batchSize * 3 * _size * _size ) - let dataBuffer = data.shared.buffer for i in 0..<_batchSize * 3 * _size * _size { dataBuffer[i] = Float.random(in: -1..<1) } - data.upload() + data.initialize(array: &dataBuffer) let nbEpochs = 2 let nbSteps = 20 @@ -349,8 +355,10 @@ final class TransformerBenchmark: XCTestCase let lastLayer: MSE1D = transformer.layers.last as! MSE1D // Initialize the ground truth once and for all. - let groundTruth = MetalSharedBuffer<Float>(_batchSize, deviceID: 0) - let gtBuffer = groundTruth.buffer + let groundTruth = FloatBuffer( + nbElems: _batchSize, deviceID: 0, shared: true + ) + var gtBuffer = [Float](repeating: 0.0, count: _batchSize) for elem in 0..<_batchSize / 2 { gtBuffer[elem] = 0.0 @@ -359,18 +367,20 @@ { gtBuffer[elem] = 1.0 } - groundTruth.upload() + groundTruth.initialize(array: &gtBuffer) // Initialize data once and for all. - let data = MetalPrivateBuffer<Float>( - _batchSize * 3 * _size * _size, deviceID: 0 + let data = FloatBuffer( + nbElems: _batchSize * 3 * _size * _size, deviceID: 0, shared: true + ) + var dataBuffer = [Float]( + repeating: 0.0, count: _batchSize * 3 * _size * _size ) - let dataBuffer = data.shared.buffer for i in 0..<_batchSize * 3 * _size * _size { dataBuffer[i] = Float.random(in: -1..<1) } - data.upload() + data.initialize(array: &dataBuffer) let nbEpochs = 2 let nbSteps = 20
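Every test migration in this patch repeats the pattern visible in the benchmark hunks above: direct writes through a MetalSharedBuffer pointer followed by an explicit upload() give way to filling a plain [Float] array and handing it to FloatBuffer.initialize(array:). A condensed before/after sketch (batchSize is illustrative):

    import GrAIdient

    let batchSize = 8

    // Before: write through the shared buffer's pointer, then upload.
    let gtOld = MetalSharedBuffer<Float>(batchSize, deviceID: 0)
    for elem in 0..<batchSize
    {
        gtOld.buffer[elem] = elem < batchSize / 2 ? 0.0 : 1.0
    }
    gtOld.upload()

    // After: fill a plain array and hand it to FloatBuffer, which hides
    // the storage mode and, when float16 is active, the GPU precision.
    let gtNew = FloatBuffer(nbElems: batchSize, deviceID: 0, shared: true)
    var gtBuffer = [Float](repeating: 0.0, count: batchSize)
    for elem in batchSize / 2..<batchSize
    {
        gtBuffer[elem] = 1.0
    }
    gtNew.initialize(array: &gtBuffer)

The same rewrite applies to the Transformer and VGG example tests below; only the buffer sizes differ.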
diff --git a/Tests/GrAIExamples/TransformerExample.swift b/Tests/GrAIExamples/TransformerExample.swift index 5d39e2be..bd2a08be 100644 --- a/Tests/GrAIExamples/TransformerExample.swift +++ b/Tests/GrAIExamples/TransformerExample.swift @@ -29,7 +29,9 @@ final class TransformerExample: XCTestCase { setPythonLib() _ = MetalKernel.get + GrAI.Opti.GPU = true + GrAI.Precision.float = true } /// @@ -287,17 +289,19 @@ final class TransformerExample: XCTestCase let lastLayer: MSE1D = transformer.layers.last as! MSE1D // Initialize the ground truth once and for all. - let groundTruth = MetalSharedBuffer<Float>(_batchSize, deviceID: 0) - let buffer = groundTruth.buffer + let groundTruth = FloatBuffer( + nbElems: _batchSize, deviceID: 0, shared: true + ) + var gtBuffer = [Float](repeating: 0.0, count: _batchSize) for elem in 0..<_batchSize / 2 { - buffer[elem] = 0.0 + gtBuffer[elem] = 0.0 } for elem in _batchSize / 2..<_batchSize { - buffer[elem] = 1.0 + gtBuffer[elem] = 1.0 } - groundTruth.upload() + groundTruth.initialize(array: &gtBuffer) let nbEpochs = 2 for epoch in 0..<nbEpochs diff --git a/Tests/GrAIExamples/VGGBenchmark.swift b/Tests/GrAIExamples/VGGBenchmark.swift - let groundTruth = MetalSharedBuffer<Float>(_batchSize, deviceID: 0) - let buffer = groundTruth.buffer + let groundTruth = FloatBuffer( + nbElems: _batchSize, deviceID: 0, shared: true + ) + var gtBuffer = [Float](repeating: 0.0, count: _batchSize) for elem in 0..<_batchSize / 2 { - buffer[elem] = 0.0 + gtBuffer[elem] = 0.0 } for elem in _batchSize / 2..<_batchSize { - buffer[elem] = 1.0 + gtBuffer[elem] = 1.0 } - groundTruth.upload() + groundTruth.initialize(array: &gtBuffer) // Initialize data once and for all. - let data = MetalPrivateBuffer<Float>( - _batchSize * 3 * _size * _size, deviceID: 0 + let data = FloatBuffer( + nbElems: _batchSize * 3 * _size * _size, deviceID: 0, shared: true + ) + var dataBuffer = [Float]( + repeating: 0.0, count: _batchSize * 3 * _size * _size ) - let dataBuffer = data.shared.buffer for i in 0..<_batchSize * 3 * _size * _size { dataBuffer[i] = Float.random(in: -1..<1) } - data.upload() + data.initialize(array: &dataBuffer) let nbEpochs = 1 let nbSteps = 20 @@ -328,8 +334,10 @@ final class VGGBenchmark: XCTestCase let lastLayer: MSE1D = vgg.layers.last as! MSE1D // Initialize the ground truth once and for all. - let groundTruth = MetalSharedBuffer<Float>(_batchSize, deviceID: 0) - let gtBuffer = groundTruth.buffer + let groundTruth = FloatBuffer( + nbElems: _batchSize, deviceID: 0, shared: true + ) + var gtBuffer = [Float](repeating: 0.0, count: _batchSize) for elem in 0..<_batchSize / 2 { gtBuffer[elem] = 0.0 @@ -338,18 +346,20 @@ { gtBuffer[elem] = 1.0 } - groundTruth.upload() + groundTruth.initialize(array: &gtBuffer) // Initialize data once and for all. - let data = MetalPrivateBuffer<Float>( - _batchSize * 3 * _size * _size, deviceID: 0 + let data = FloatBuffer( + nbElems: _batchSize * 3 * _size * _size, deviceID: 0, shared: true + ) + var dataBuffer = [Float]( + repeating: 0.0, count: _batchSize * 3 * _size * _size ) - let dataBuffer = data.shared.buffer for i in 0..<_batchSize * 3 * _size * _size { dataBuffer[i] = Float.random(in: -1..<1) } - data.upload() + data.initialize(array: &dataBuffer) let nbEpochs = 2 let nbSteps = 20 diff --git a/Tests/GrAIExamples/VGGExample.swift b/Tests/GrAIExamples/VGGExample.swift index 685967d3..d36fad54 100644 --- a/Tests/GrAIExamples/VGGExample.swift +++ b/Tests/GrAIExamples/VGGExample.swift @@ -29,7 +29,9 @@ final class VGGExample: XCTestCase { setPythonLib() _ = MetalKernel.get + GrAI.Opti.GPU = true + GrAI.Precision.float = true } /// @@ -396,17 +398,19 @@ final class VGGExample: XCTestCase let lastLayer: MSE1D = vgg.layers.last as! MSE1D // Initialize the ground truth once and for all. - let groundTruth = MetalSharedBuffer<Float>(_batchSize, deviceID: 0) - let buffer = groundTruth.buffer + let groundTruth = FloatBuffer( + nbElems: _batchSize, deviceID: 0, shared: true + ) + var gtBuffer = [Float](repeating: 0.0, count: _batchSize) for elem in 0..<_batchSize / 2 { - buffer[elem] = 0.0 + gtBuffer[elem] = 0.0 } for elem in _batchSize / 2..<_batchSize { - buffer[elem] = 1.0 + gtBuffer[elem] = 1.0 } - MetalKernel.get.upload([groundTruth]) + groundTruth.initialize(array: &gtBuffer) let nbEpochs = 5 for epoch in 0..<nbEpochs diff --git a/Tests/GrAITests/ImageTests.swift b/Tests/GrAITests/ImageTests.swift - let buffer = MetalSharedBuffer<Float>( - batchSize * 3 * _size * _size, deviceID: 0 + let buffer = FloatBuffer(nbElems: + batchSize * 3 * _size * _size, deviceID: 0, shared: true ) try!
Image.loadImages( diff --git a/Tests/GrAITests/Layer2DTests.swift b/Tests/GrAITests/Layer2DTests.swift index 9171ef89..3d17dc81 100644 --- a/Tests/GrAITests/Layer2DTests.swift +++ b/Tests/GrAITests/Layer2DTests.swift @@ -1843,13 +1843,13 @@ class Layer2DFlowTests: Input2DMSE1DCase func testNormalize1() throws { let trainer = _buildTrainer(model: "Normalize1", bn: false) - run(trainer) + run(trainer, diffThreshold: 0.0001) } func testNormalize12() throws { let trainer = _buildTrainer(model: "Normalize12", bn: false) - run(trainer) + run(trainer, diffThreshold: 0.0001) } func testFlipHorizontal1() throws @@ -2371,13 +2371,13 @@ class Layer2DFlowResetTests: Layer2DFlowTests override func testNormalize1() throws { let trainer = _buildTrainer(model: "Normalize1", bn: false) - run(trainer) + run(trainer, diffThreshold: 0.0001) } override func testNormalize12() throws { let trainer = _buildTrainer(model: "Normalize12", bn: false) - run(trainer) + run(trainer, diffThreshold: 0.0001) } override func testFlipHorizontal1() throws @@ -2771,13 +2771,13 @@ class Layer2DFlowReverseTests: Layer2DFlowTests override func testNormalize1() throws { let trainer = _buildTrainer(model: "Normalize1", bn: false) - run(trainer) + run(trainer, diffThreshold: 0.0001) } override func testNormalize12() throws { let trainer = _buildTrainer(model: "Normalize12", bn: false) - run(trainer) + run(trainer, diffThreshold: 0.0001) } override func testFlipHorizontal1() throws diff --git a/Tests/GrAITests/OptimizerTests.swift b/Tests/GrAITests/OptimizerTests.swift index 88c29e10..f5dc764c 100644 --- a/Tests/GrAITests/OptimizerTests.swift +++ b/Tests/GrAITests/OptimizerTests.swift @@ -18,7 +18,9 @@ class OptimizerTests: Input1DMSE1DCase { batchSize = 5 _ = MetalKernel.get + GrAI.Opti.GPU = true + GrAI.Precision.float = true setOptimizerParams(params: &optimizerParams) optimizerParams.nbLoops = 10 @@ -132,6 +134,7 @@ func testAdamRectified() throws { + optimizerParams.nbLoops = 5 setOptimizerParams(params: &optimizerParams, optimizerClass: .AdamRectified) let trainer = _buildTrainer() @@ -140,6 +143,7 @@ func testAdamRectifiedDecay() throws { + optimizerParams.nbLoops = 5 setOptimizerParams(params: &optimizerParams, optimizerClass: .AdamRectified, lambda: 1e-3)
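The reduce tests that follow gain a shared parameter so every kernel is exercised against both storage modes of the new FloatBuffer. The two constructions differ only in that flag; a sketch, assuming shared defaults to false as the FloatBuffer calls without the argument elsewhere in this patch suggest:

    import GrAIdient

    let nbElems = 4 * 16

    // CPU-visible storage.
    let sharedBuf = FloatBuffer(nbElems: nbElems, deviceID: 0, shared: true)

    // GPU-resident storage (the assumed default).
    let privateBuf = FloatBuffer(nbElems: nbElems, deviceID: 0, shared: false)

    // Both are filled the same way; initialize(array:) handles the upload.
    var values = [Float](repeating: 1.0, count: nbElems)
    sharedBuf.initialize(array: &values)
    privateBuf.initialize(array: &values)

Parameterizing the tests this way keeps a single code path on the caller's side while covering both Metal storage modes.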
diff --git a/Tests/GrAITests/ReduceTests.swift b/Tests/GrAITests/ReduceTests.swift index b658f102..e4000ab3 100644 --- a/Tests/GrAITests/ReduceTests.swift +++ b/Tests/GrAITests/ReduceTests.swift @@ -11,19 +11,19 @@ import GrAIdient /// Test reduce sum kernel. class ReduceSumTests: XCTestCase { - var _buffer: MetalSharedBuffer<Float>! = nil + var _buffer: FloatBuffer! = nil var _array = [Float]() override func setUp() { _ = MetalKernel.get + GrAI.Opti.GPU = true } - private func _testBuffer(dim1: Int, dim2: Int) + private func _testBuffer(dim1: Int, dim2: Int, shared: Bool) { _array = [Float](repeating: 0.0, count: dim1 * dim2) - _buffer = MetalSharedBuffer<Float>(dim1 * dim2, deviceID: 0) - let buffer = _buffer.buffer + _buffer = FloatBuffer(nbElems: dim1 * dim2, deviceID: 0, shared: shared) for elem1 in 0..<dim1 - var _buffer: MetalSharedBuffer<Float>! = nil + var _buffer: FloatBuffer! = nil var _array = [Float]() override func setUp() @@ -106,11 +146,10 @@ class ReduceMaxTests: XCTestCase _ = MetalKernel.get } - private func _testBuffer(dim1: Int, dim2: Int) + private func _testBuffer(dim1: Int, dim2: Int, shared: Bool) { _array = [Float](repeating: 0.0, count: dim1 * dim2) - _buffer = MetalSharedBuffer<Float>(dim1 * dim2, deviceID: 0) - let buffer = _buffer.buffer + _buffer = FloatBuffer(nbElems: dim1 * dim2, deviceID: 0, shared: shared) for elem1 in 0..<dim1 diff --git a/Tests/GrAITests/UpdateManagementTests.swift b/Tests/GrAITests/UpdateManagementTests.swift - let groundTruth = MetalSharedBuffer<Float>( - 1, deviceID: DEVICE_ID ) - groundTruth.buffer[0] = 0 - MetalKernel.get.upload([groundTruth]) + let groundTruth = FloatBuffer( + nbElems: 1, deviceID: DEVICE_ID ) + var buffer: [Float] = [0.0] + groundTruth.initialize(array: &buffer) let inputData1: [[Float]] = [[0.0]] let inputData2: [[Float]] = [[1.0]] @@ -610,11 +612,11 @@ class UpdateManagementTests: XCTestCase deviceID: DEVICE_ID ) - let groundTruth = MetalSharedBuffer<Float>( - 1, deviceID: DEVICE_ID ) - groundTruth.buffer[0] = 0 - MetalKernel.get.upload([groundTruth]) + let groundTruth = FloatBuffer( + nbElems: 1, deviceID: DEVICE_ID ) + var buffer: [Float] = [0.0] + groundTruth.initialize(array: &buffer) let inputData1: [Float] = [0.0] let inputData2: [Float] = [1.0] diff --git a/Tests/GrAITorchTests/Base/setup.py b/Tests/GrAITorchTests/Base/setup.py index aa80f954..7d7862e1 100644 --- a/Tests/GrAITorchTests/Base/setup.py +++ b/Tests/GrAITorchTests/Base/setup.py @@ -8,7 +8,7 @@ license='MIT', install_requires=[ "torch==1.13.1", - "torchvision==0.11.2", + "torchvision==0.14.1", "numpy==1.23.1", "pillow==9.2.0", ], diff --git a/Tests/GrAITorchTests/GrAITorchTests.swift b/Tests/GrAITorchTests/GrAITorchTests.swift index 16fe2128..a4e0b68f 100644 --- a/Tests/GrAITorchTests/GrAITorchTests.swift +++ b/Tests/GrAITorchTests/GrAITorchTests.swift @@ -21,7 +21,9 @@ final class GrAITorchTests: XCTestCase { setPythonLib() _ = MetalKernel.get + GrAI.Opti.GPU = true + GrAI.Precision.float = true } ///
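All the test setups in this patch pin GrAI.Opti.GPU and GrAI.Precision.float so existing expectations keep running on the Float32 kernels. Switching a whole run to half precision should then come down to flipping the precision flag before models and buffers are created; a sketch, assuming GrAI.Precision.float16 is settable the same way float is (only its getter is visible in the createCommand hunk above):

    import GrAIdient

    // Run once per process, as in the test setups above.
    _ = MetalKernel.get
    GrAI.Opti.GPU = true

    // Float32 execution: createCommand resolves "reduceSum" to "reduceSumFloat".
    GrAI.Precision.float = true

    // Float16 execution (assumed setter): the same call sites would now
    // resolve to "reduceSumHalf", with data crossing the CPU boundary
    // through setupHalfBuffer / getHalfBuffer.
    GrAI.Precision.float16 = true

Because the precision is read at kernel-dispatch and buffer-creation time, the flag must be set before any FloatBuffer or model state is allocated.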